glib/gconvert.c

   1 /* GLIB - Library of useful routines for C programming
   2  *
   3  * gconvert.c: Convert between character sets using iconv
   4  * Copyright Red Hat Inc., 2000
   5  * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
   6  *
   7  * SPDX-License-Identifier: LGPL-2.1-or-later
   8  *
   9  * This library is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * This library is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  21  */
  22
  23 #include "config.h"
  24 #include "glibconfig.h"
  25
  26 #ifndef G_OS_WIN32
  27 #include <iconv.h>
  28 #endif
  29 #include <errno.h>
  30 #include <stdio.h>
  31 #include <string.h>
  32 #include <stdlib.h>
  33
  34 #ifdef G_OS_WIN32
  35 #include "win_iconv.c"
  36 #endif
  37
  38 #ifdef G_PLATFORM_WIN32
  39 #define STRICT
  40 #include <windows.h>
  41 #undef STRICT
  42 #endif
  43
  44 #include "gconvert.h"
  45 #include "gconvertprivate.h"
  46
  47 #include "gcharsetprivate.h"
  48 #include "gslist.h"
  49 #include "gstrfuncs.h"
  50 #include "gtestutils.h"
  51 #include "gthread.h"
  52 #include "gthreadprivate.h"
  53 #include "gunicode.h"
  54 #include "gfileutils.h"
  55 #include "genviron.h"
  56
  57 #include "glibintl.h"
  58
  59
  60 /**
  61  * SECTION:conversions
  62  * @title: Character Set Conversion
  63  * @short_description: convert strings between different character sets
  64  *
   65  * The g_convert() family of functions wraps the functionality of iconv().
  66  * In addition to pure character set conversions, GLib has functions to
  67  * deal with the extra complications of encodings for file names.
  68  *
  69  * ## File Name Encodings
  70  *
  71  * Historically, UNIX has not had a defined encoding for file names:
  72  * a file name is valid as long as it does not have path separators
  73  * in it ("/"). However, displaying file names may require conversion:
  74  * from the character set in which they were created, to the character
  75  * set in which the application operates. Consider the Spanish file name
  76  * "Presentación.sxi". If the application which created it uses
  77  * ISO-8859-1 for its encoding,
  78  * |[
  79  * Character:  P  r  e  s  e  n  t  a  c  i  ó  n  .  s  x  i
  80  * Hex code:   50 72 65 73 65 6e 74 61 63 69 f3 6e 2e 73 78 69
  81  * ]|
   82  * However, if the application uses UTF-8, the actual file name on
  83  * disk would look like this:
  84  * |[
  85  * Character:  P  r  e  s  e  n  t  a  c  i  ó     n  .  s  x  i
  86  * Hex code:   50 72 65 73 65 6e 74 61 63 69 c3 b3 6e 2e 73 78 69
  87  * ]|
   88  * GLib uses UTF-8 for its strings, and GUI toolkits like GTK+ that use
   89  * GLib do the same thing. If you get a file name from the file system,
   90  * for example, from readdir() or from g_dir_read_name(), and you wish
   91  * to display the file name to the user, you will need to convert it
  92  * into UTF-8. The opposite case is when the user types the name of a
  93  * file they wish to save: the toolkit will give you that string in
  94  * UTF-8 encoding, and you will need to convert it to the character
  95  * set used for file names before you can create the file with open()
  96  * or fopen().
  97  *
  98  * By default, GLib assumes that file names on disk are in UTF-8
  99  * encoding. This is a valid assumption for file systems which
 100  * were created relatively recently: most applications use UTF-8
 101  * encoding for their strings, and that is also what they use for
 102  * the file names they create. However, older file systems may
 103  * still contain file names created in "older" encodings, such as
 104  * ISO-8859-1. In this case, for compatibility reasons, you may want
 105  * to instruct GLib to use that particular encoding for file names
 106  * rather than UTF-8. You can do this by specifying the encoding for
 107  * file names in the [`G_FILENAME_ENCODING`][G_FILENAME_ENCODING]
 108  * environment variable. For example, if your installation uses
 109  * ISO-8859-1 for file names, you can put this in your `~/.profile`:
 110  * |[
 111  * export G_FILENAME_ENCODING=ISO-8859-1
 112  * ]|
 113  * GLib provides the functions g_filename_to_utf8() and
 114  * g_filename_from_utf8() to perform the necessary conversions.
 115  * These functions convert file names from the encoding specified
 116  * in `G_FILENAME_ENCODING` to UTF-8 and vice-versa. This
 117  * [diagram][file-name-encodings-diagram] illustrates how
 118  * these functions are used to convert between UTF-8 and the
 119  * encoding for file names in the file system.
 120  *
  121  * ## Conversion between file name encodings # {#file-name-encodings-diagram}
 122  *
 123  * ![](file-name-encodings.png)
 124  *
 125  * ## Checklist for Application Writers
 126  *
 127  * This section is a practical summary of the detailed
 128  * things to do to make sure your applications process file
 129  * name encodings correctly.
 130  *
 131  * 1. If you get a file name from the file system from a function
 132  *    such as readdir() or gtk_file_chooser_get_filename(), you do
 133  *    not need to do any conversion to pass that file name to
 134  *    functions like open(), rename(), or fopen() -- those are "raw"
 135  *    file names which the file system understands.
 136  *
 137  * 2. If you need to display a file name, convert it to UTF-8 first
 138  *    by using g_filename_to_utf8(). If conversion fails, display a
 139  *    string like "Unknown file name". Do not convert this string back
 140  *    into the encoding used for file names if you wish to pass it to
 141  *    the file system; use the original file name instead.
 142  *
 143  *    For example, the document window of a word processor could display
 144  *    "Unknown file name" in its title bar but still let the user save
 145  *    the file, as it would keep the raw file name internally. This
 146  *    can happen if the user has not set the `G_FILENAME_ENCODING`
 147  *    environment variable even though they have files whose names are
 148  *    not encoded in UTF-8.
 149  *
 150  * 3. If your user interface lets the user type a file name for saving
 151  *    or renaming, convert it to the encoding used for file names in
 152  *    the file system by using g_filename_from_utf8(). Pass the converted
 153  *    file name to functions like fopen(). If conversion fails, ask the
 154  *    user to enter a different file name. This can happen if the user
 155  *    types Japanese characters when `G_FILENAME_ENCODING` is set to
 156  *    `ISO-8859-1`, for example.
 157  */
 158
/* We try to terminate strings in unknown charsets with this many zero bytes
 * to ensure that multibyte strings really are nul-terminated when we return
 * them from g_convert() and friends.
 *
 * The conversion loops in this file always keep this many bytes of the
 * output buffer in reserve (they are subtracted from the space offered to
 * iconv) and zero-fill them once the conversion loop has ended.
 */
#define NUL_TERMINATOR_LENGTH 4

/* Expands through GLib's G_DEFINE_QUARK macro; presumably this provides the
 * quark behind the G_CONVERT_ERROR domain used for every error raised in
 * this file (the macro and the domain are declared elsewhere -- confirm
 * against gconvert.h / gquark.h).
 */
G_DEFINE_QUARK (g_convert_error, g_convert_error)
 166
 167 static gboolean
 168 try_conversion (const char *to_codeset,
 169                 const char *from_codeset,
 170                 iconv_t    *cd)
 171 {
 172   *cd = iconv_open (to_codeset, from_codeset);
 173
 174   if (*cd == (iconv_t)-1 && errno == EINVAL)
 175     return FALSE;
 176   else
 177     return TRUE;
 178 }
 179
 180 static gboolean
 181 try_to_aliases (const char **to_aliases,
 182                 const char  *from_codeset,
 183                 iconv_t     *cd)
 184 {
 185   if (to_aliases)
 186     {
 187       const char **p = to_aliases;
 188       while (*p)
 189         {
 190           if (try_conversion (*p, from_codeset, cd))
 191             return TRUE;
 192
 193           p++;
 194         }
 195     }
 196
 197   return FALSE;
 198 }
 199
 200 /**
 201  * g_iconv_open: (skip)
 202  * @to_codeset: destination codeset
 203  * @from_codeset: source codeset
 204  *
 205  * Same as the standard UNIX routine iconv_open(), but
 206  * may be implemented via libiconv on UNIX flavors that lack
 207  * a native implementation.
 208  *
 209  * GLib provides g_convert() and g_locale_to_utf8() which are likely
 210  * more convenient than the raw iconv wrappers.
 211  *
 212  * Returns: a "conversion descriptor", or (GIConv)-1 if
 213  *  opening the converter failed.
 214  **/
 215 GIConv
 216 g_iconv_open (const gchar  *to_codeset,
 217               const gchar  *from_codeset)
 218 {
 219   iconv_t cd;
 220
 221   if (!try_conversion (to_codeset, from_codeset, &cd))
 222     {
 223       const char **to_aliases = _g_charset_get_aliases (to_codeset);
 224       const char **from_aliases = _g_charset_get_aliases (from_codeset);
 225
 226       if (from_aliases)
 227         {
 228           const char **p = from_aliases;
 229           while (*p)
 230             {
 231               if (try_conversion (to_codeset, *p, &cd))
 232                 goto out;
 233
 234               if (try_to_aliases (to_aliases, *p, &cd))
 235                 goto out;
 236
 237               p++;
 238             }
 239         }
 240
 241       if (try_to_aliases (to_aliases, from_codeset, &cd))
 242         goto out;
 243     }
 244
 245  out:
 246   return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd;
 247 }
 248
 249 /**
 250  * g_iconv: (skip)
 251  * @converter: conversion descriptor from g_iconv_open()
 252  * @inbuf: bytes to convert
 253  * @inbytes_left: (inout): inout parameter, bytes remaining to convert in @inbuf
 254  * @outbuf: converted output bytes
 255  * @outbytes_left: (inout): inout parameter, bytes available to fill in @outbuf
 256  *
 257  * Same as the standard UNIX routine iconv(), but
 258  * may be implemented via libiconv on UNIX flavors that lack
 259  * a native implementation.
 260  *
 261  * GLib provides g_convert() and g_locale_to_utf8() which are likely
 262  * more convenient than the raw iconv wrappers.
 263  *
 264  * Note that the behaviour of iconv() for characters which are valid in the
 265  * input character set, but which have no representation in the output character
 266  * set, is implementation defined. This function may return success (with a
 267  * positive number of non-reversible conversions as replacement characters were
 268  * used), or it may return -1 and set an error such as %EILSEQ, in such a
 269  * situation.
 270  *
 271  * Returns: count of non-reversible conversions, or -1 on error
 272  **/
 273 gsize
 274 g_iconv (GIConv   converter,
 275          gchar  **inbuf,
 276          gsize   *inbytes_left,
 277          gchar  **outbuf,
 278          gsize   *outbytes_left)
 279 {
 280   iconv_t cd = (iconv_t)converter;
 281
 282   return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
 283 }
 284
 285 /**
 286  * g_iconv_close: (skip)
 287  * @converter: a conversion descriptor from g_iconv_open()
 288  *
 289  * Same as the standard UNIX routine iconv_close(), but
 290  * may be implemented via libiconv on UNIX flavors that lack
 291  * a native implementation. Should be called to clean up
 292  * the conversion descriptor from g_iconv_open() when
 293  * you are done converting things.
 294  *
 295  * GLib provides g_convert() and g_locale_to_utf8() which are likely
 296  * more convenient than the raw iconv wrappers.
 297  *
 298  * Returns: -1 on error, 0 on success
 299  **/
 300 gint
 301 g_iconv_close (GIConv converter)
 302 {
 303   iconv_t cd = (iconv_t)converter;
 304
 305   return iconv_close (cd);
 306 }
 307
 308 static GIConv
 309 open_converter (const gchar *to_codeset,
 310                 const gchar *from_codeset,
 311                 GError     **error)
 312 {
 313   GIConv cd;
 314
 315   cd = g_iconv_open (to_codeset, from_codeset);
 316
 317   if (cd == (GIConv) -1)
 318     {
 319       /* Something went wrong.  */
 320       if (error)
 321         {
 322           if (errno == EINVAL)
 323             g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
 324                          _("Conversion from character set “%s” to “%s” is not supported"),
 325                          from_codeset, to_codeset);
 326           else
 327             g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 328                          _("Could not open converter from “%s” to “%s”"),
 329                          from_codeset, to_codeset);
 330         }
 331     }
 332
 333   return cd;
 334 }
 335
 336 static int
 337 close_converter (GIConv cd)
 338 {
 339   if (cd == (GIConv) -1)
 340     return 0;
 341
 342   return g_iconv_close (cd);
 343 }
 344
 345 /**
 346  * g_convert_with_iconv: (skip)
 347  * @str:           (array length=len) (element-type guint8):
 348  *                 the string to convert.
 349  * @len:           the length of the string in bytes, or -1 if the string is
 350  *                 nul-terminated (Note that some encodings may allow nul
 351  *                 bytes to occur inside strings. In that case, using -1
 352  *                 for the @len parameter is unsafe)
 353  * @converter:     conversion descriptor from g_iconv_open()
 354  * @bytes_read:    (out) (optional): location to store the number of bytes in
 355  *                 the input string that were successfully converted, or %NULL.
 356  *                 Even if the conversion was successful, this may be
 357  *                 less than @len if there were partial characters
 358  *                 at the end of the input. If the error
 359  *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 360  *                 stored will be the byte offset after the last valid
 361  *                 input sequence.
 362  * @bytes_written: (out) (optional): the number of bytes stored in
 363  *                 the output buffer (not including the terminating nul).
 364  * @error:         location to store the error occurring, or %NULL to ignore
 365  *                 errors. Any of the errors in #GConvertError may occur.
 366  *
 367  * Converts a string from one character set to another.
 368  *
 369  * Note that you should use g_iconv() for streaming conversions.
 370  * Despite the fact that @bytes_read can return information about partial
 371  * characters, the g_convert_... functions are not generally suitable
 372  * for streaming. If the underlying converter maintains internal state,
 373  * then this won't be preserved across successive calls to g_convert(),
 374  * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
 375  * this is the GNU C converter for CP1255 which does not emit a base
 376  * character until it knows that the next character is not a mark that
 377  * could combine with the base character.)
 378  *
 379  * Characters which are valid in the input character set, but which have no
 380  * representation in the output character set will result in a
 381  * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error. This is in contrast to the iconv()
 382  * specification, which leaves this behaviour implementation defined. Note that
 383  * this is the same error code as is returned for an invalid byte sequence in
 384  * the input character set. To get defined behaviour for conversion of
 385  * unrepresentable characters, use g_convert_with_fallback().
 386  *
 387  * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
 388  *               If the conversion was successful, a newly allocated buffer
 389  *               containing the converted string, which must be freed with
 390  *               g_free(). Otherwise %NULL and @error will be set.
 391  **/
 392 gchar*
 393 g_convert_with_iconv (const gchar *str,
 394                       gssize       len,
 395                       GIConv       converter,
 396                       gsize       *bytes_read,
 397                       gsize       *bytes_written,
 398                       GError     **error)
 399 {
 400   gchar *dest;
 401   gchar *outp;
 402   const gchar *p;
 403   gsize inbytes_remaining;
 404   gsize outbytes_remaining;
 405   gsize err;
 406   gsize outbuf_size;
 407   gboolean have_error = FALSE;
 408   gboolean done = FALSE;
 409   gboolean reset = FALSE;
 410
 411   g_return_val_if_fail (converter != (GIConv) -1, NULL);
 412
 413   if (len < 0)
 414     len = strlen (str);
 415
 416   p = str;
 417   inbytes_remaining = len;
 418   outbuf_size = len + NUL_TERMINATOR_LENGTH;
 419
 420   outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
 421   outp = dest = g_malloc (outbuf_size);
 422
 423   while (!done && !have_error)
 424     {
 425       if (reset)
 426         err = g_iconv (converter, NULL, &inbytes_remaining, &outp, &outbytes_remaining);
 427       else
 428         err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
 429
 430       if (err == (gsize) -1)
 431         {
 432           switch (errno)
 433             {
 434             case EINVAL:
 435               /* Incomplete text, do not report an error */
 436               done = TRUE;
 437               break;
 438             case E2BIG:
 439               {
 440                 gsize used = outp - dest;
 441
 442                 outbuf_size *= 2;
 443                 dest = g_realloc (dest, outbuf_size);
 444
 445                 outp = dest + used;
 446                 outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
 447               }
 448               break;
 449             case EILSEQ:
 450               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 451                                    _("Invalid byte sequence in conversion input"));
 452               have_error = TRUE;
 453               break;
 454             default:
 455               {
 456                 int errsv = errno;
 457
 458                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 459                              _("Error during conversion: %s"),
 460                              g_strerror (errsv));
 461               }
 462               have_error = TRUE;
 463               break;
 464             }
 465         }
 466       else if (err > 0)
 467         {
 468           /* @err gives the number of replacement characters used. */
 469           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 470                                _("Unrepresentable character in conversion input"));
 471           have_error = TRUE;
 472         }
 473       else
 474         {
 475           if (!reset)
 476             {
 477               /* call g_iconv with NULL inbuf to cleanup shift state */
 478               reset = TRUE;
 479               inbytes_remaining = 0;
 480             }
 481           else
 482             done = TRUE;
 483         }
 484     }
 485
 486   memset (outp, 0, NUL_TERMINATOR_LENGTH);
 487
 488   if (bytes_read)
 489     *bytes_read = p - str;
 490   else
 491     {
 492       if ((p - str) != len)
 493         {
 494           if (!have_error)
 495             {
 496               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 497                                    _("Partial character sequence at end of input"));
 498               have_error = TRUE;
 499             }
 500         }
 501     }
 502
 503   if (bytes_written)
 504     *bytes_written = outp - dest;       /* Doesn't include '\0' */
 505
 506   if (have_error)
 507     {
 508       g_free (dest);
 509       return NULL;
 510     }
 511   else
 512     return dest;
 513 }
 514
 515 /**
 516  * g_convert:
 517  * @str:           (array length=len) (element-type guint8):
 518  *                 the string to convert.
 519  * @len:           the length of the string in bytes, or -1 if the string is
 520  *                 nul-terminated (Note that some encodings may allow nul
 521  *                 bytes to occur inside strings. In that case, using -1
 522  *                 for the @len parameter is unsafe)
 523  * @to_codeset:    name of character set into which to convert @str
 524  * @from_codeset:  character set of @str.
 525  * @bytes_read:    (out) (optional): location to store the number of bytes in
 526  *                 the input string that were successfully converted, or %NULL.
 527  *                 Even if the conversion was successful, this may be
 528  *                 less than @len if there were partial characters
 529  *                 at the end of the input. If the error
 530  *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 531  *                 stored will be the byte offset after the last valid
 532  *                 input sequence.
 533  * @bytes_written: (out) (optional): the number of bytes stored in
 534  *                 the output buffer (not including the terminating nul).
 535  * @error:         location to store the error occurring, or %NULL to ignore
 536  *                 errors. Any of the errors in #GConvertError may occur.
 537  *
 538  * Converts a string from one character set to another.
 539  *
 540  * Note that you should use g_iconv() for streaming conversions.
 541  * Despite the fact that @bytes_read can return information about partial
 542  * characters, the g_convert_... functions are not generally suitable
 543  * for streaming. If the underlying converter maintains internal state,
 544  * then this won't be preserved across successive calls to g_convert(),
 545  * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
 546  * this is the GNU C converter for CP1255 which does not emit a base
 547  * character until it knows that the next character is not a mark that
 548  * could combine with the base character.)
 549  *
 550  * Using extensions such as "//TRANSLIT" may not work (or may not work
 551  * well) on many platforms.  Consider using g_str_to_ascii() instead.
 552  *
 553  * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
 554  *          If the conversion was successful, a newly allocated buffer
 555  *          containing the converted string, which must be freed with g_free().
 556  *          Otherwise %NULL and @error will be set.
 557  **/
 558 gchar*
 559 g_convert (const gchar *str,
 560            gssize       len,
 561            const gchar *to_codeset,
 562            const gchar *from_codeset,
 563            gsize       *bytes_read,
 564            gsize       *bytes_written,
 565            GError     **error)
 566 {
 567   gchar *res;
 568   GIConv cd;
 569
 570   g_return_val_if_fail (str != NULL, NULL);
 571   g_return_val_if_fail (to_codeset != NULL, NULL);
 572   g_return_val_if_fail (from_codeset != NULL, NULL);
 573
 574   cd = open_converter (to_codeset, from_codeset, error);
 575
 576   if (cd == (GIConv) -1)
 577     {
 578       if (bytes_read)
 579         *bytes_read = 0;
 580
 581       if (bytes_written)
 582         *bytes_written = 0;
 583
 584       return NULL;
 585     }
 586
 587   res = g_convert_with_iconv (str, len, cd,
 588                               bytes_read, bytes_written,
 589                               error);
 590
 591   close_converter (cd);
 592
 593   return res;
 594 }
 595
 596 /**
 597  * g_convert_with_fallback:
 598  * @str:          (array length=len) (element-type guint8):
 599  *                the string to convert.
 600  * @len:          the length of the string in bytes, or -1 if the string is
 601  *                 nul-terminated (Note that some encodings may allow nul
 602  *                 bytes to occur inside strings. In that case, using -1
 603  *                 for the @len parameter is unsafe)
 604  * @to_codeset:   name of character set into which to convert @str
 605  * @from_codeset: character set of @str.
 606  * @fallback:     UTF-8 string to use in place of characters not
 607  *                present in the target encoding. (The string must be
 608  *                representable in the target encoding).
 609  *                If %NULL, characters not in the target encoding will
 610  *                be represented as Unicode escapes \uxxxx or \Uxxxxyyyy.
 611  * @bytes_read:   (out) (optional): location to store the number of bytes in
 612  *                the input string that were successfully converted, or %NULL.
 613  *                Even if the conversion was successful, this may be
 614  *                less than @len if there were partial characters
 615  *                at the end of the input.
 616  * @bytes_written: (out) (optional): the number of bytes stored in
 617  *                 the output buffer (not including the terminating nul).
 618  * @error:        location to store the error occurring, or %NULL to ignore
 619  *                errors. Any of the errors in #GConvertError may occur.
 620  *
 621  * Converts a string from one character set to another, possibly
 622  * including fallback sequences for characters not representable
 623  * in the output. Note that it is not guaranteed that the specification
 624  * for the fallback sequences in @fallback will be honored. Some
 625  * systems may do an approximate conversion from @from_codeset
 626  * to @to_codeset in their iconv() functions,
 627  * in which case GLib will simply return that approximate conversion.
 628  *
 629  * Note that you should use g_iconv() for streaming conversions.
 630  * Despite the fact that @bytes_read can return information about partial
 631  * characters, the g_convert_... functions are not generally suitable
 632  * for streaming. If the underlying converter maintains internal state,
 633  * then this won't be preserved across successive calls to g_convert(),
 634  * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
 635  * this is the GNU C converter for CP1255 which does not emit a base
 636  * character until it knows that the next character is not a mark that
 637  * could combine with the base character.)
 638  *
 639  * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
 640  *          If the conversion was successful, a newly allocated buffer
 641  *          containing the converted string, which must be freed with g_free().
 642  *          Otherwise %NULL and @error will be set.
 643  **/
 644 gchar*
 645 g_convert_with_fallback (const gchar *str,
 646                          gssize       len,
 647                          const gchar *to_codeset,
 648                          const gchar *from_codeset,
 649                          const gchar *fallback,
 650                          gsize       *bytes_read,
 651                          gsize       *bytes_written,
 652                          GError     **error)
 653 {
 654   gchar *utf8;
 655   gchar *dest;
 656   gchar *outp;
 657   const gchar *insert_str = NULL;
 658   const gchar *p;
 659   gsize inbytes_remaining;
 660   const gchar *save_p = NULL;
 661   gsize save_inbytes = 0;
 662   gsize outbytes_remaining;
 663   gsize err;
 664   GIConv cd;
 665   gsize outbuf_size;
 666   gboolean have_error = FALSE;
 667   gboolean done = FALSE;
 668
 669   GError *local_error = NULL;
 670
 671   g_return_val_if_fail (str != NULL, NULL);
 672   g_return_val_if_fail (to_codeset != NULL, NULL);
 673   g_return_val_if_fail (from_codeset != NULL, NULL);
 674
 675   if (len < 0)
 676     len = strlen (str);
 677
 678   /* Try an exact conversion; we only proceed if this fails
 679    * due to an illegal sequence in the input string.
 680    */
 681   dest = g_convert (str, len, to_codeset, from_codeset,
 682                     bytes_read, bytes_written, &local_error);
 683   if (!local_error)
 684     return dest;
 685
 686   g_assert (dest == NULL);
 687
 688   if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
 689     {
 690       g_propagate_error (error, local_error);
 691       return NULL;
 692     }
 693   else
 694     g_error_free (local_error);
 695
 696   local_error = NULL;
 697
 698   /* No go; to proceed, we need a converter from "UTF-8" to
 699    * to_codeset, and the string as UTF-8.
 700    */
 701   cd = open_converter (to_codeset, "UTF-8", error);
 702   if (cd == (GIConv) -1)
 703     {
 704       if (bytes_read)
 705         *bytes_read = 0;
 706
 707       if (bytes_written)
 708         *bytes_written = 0;
 709
 710       return NULL;
 711     }
 712
 713   utf8 = g_convert (str, len, "UTF-8", from_codeset,
 714                     bytes_read, &inbytes_remaining, error);
 715   if (!utf8)
 716     {
 717       close_converter (cd);
 718       if (bytes_written)
 719         *bytes_written = 0;
 720       return NULL;
 721     }
 722
 723   /* Now the heart of the code. We loop through the UTF-8 string, and
 724    * whenever we hit an offending character, we form fallback, convert
 725    * the fallback to the target codeset, and then go back to
 726    * converting the original string after finishing with the fallback.
 727    *
 728    * The variables save_p and save_inbytes store the input state
 729    * for the original string while we are converting the fallback
 730    */
 731   p = utf8;
 732
 733   outbuf_size = len + NUL_TERMINATOR_LENGTH;
 734   outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
 735   outp = dest = g_malloc (outbuf_size);
 736
 737   while (!done && !have_error)
 738     {
 739       gsize inbytes_tmp = inbytes_remaining;
 740       err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
 741       inbytes_remaining = inbytes_tmp;
 742
 743       if (err == (gsize) -1)
 744         {
 745           switch (errno)
 746             {
 747             case EINVAL:
 748               g_assert_not_reached();
 749               break;
 750             case E2BIG:
 751               {
 752                 gsize used = outp - dest;
 753
 754                 outbuf_size *= 2;
 755                 dest = g_realloc (dest, outbuf_size);
 756
 757                 outp = dest + used;
 758                 outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
 759
 760                 break;
 761               }
 762             case EILSEQ:
 763               if (save_p)
 764                 {
 765                   /* Error converting fallback string - fatal
 766                    */
 767                   g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 768                                _("Cannot convert fallback “%s” to codeset “%s”"),
 769                                insert_str, to_codeset);
 770                   have_error = TRUE;
 771                   break;
 772                 }
 773               else if (p)
 774                 {
 775                   if (!fallback)
 776                     {
 777                       gunichar ch = g_utf8_get_char (p);
 778                       insert_str = g_strdup_printf (ch < 0x10000 ? "\\u%04x" : "\\U%08x",
 779                                                     ch);
 780                     }
 781                   else
 782                     insert_str = fallback;
 783
 784                   save_p = g_utf8_next_char (p);
 785                   save_inbytes = inbytes_remaining - (save_p - p);
 786                   p = insert_str;
 787                   inbytes_remaining = strlen (p);
 788                   break;
 789                 }
 790               /* if p is null */
 791               G_GNUC_FALLTHROUGH;
 792             default:
 793               {
 794                 int errsv = errno;
 795
 796                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 797                              _("Error during conversion: %s"),
 798                              g_strerror (errsv));
 799               }
 800
 801               have_error = TRUE;
 802               break;
 803             }
 804         }
 805       else
 806         {
 807           if (save_p)
 808             {
 809               if (!fallback)
 810                 g_free ((gchar *)insert_str);
 811               p = save_p;
 812               inbytes_remaining = save_inbytes;
 813               save_p = NULL;
 814             }
 815           else if (p)
 816             {
 817               /* call g_iconv with NULL inbuf to cleanup shift state */
 818               p = NULL;
 819               inbytes_remaining = 0;
 820             }
 821           else
 822             done = TRUE;
 823         }
 824     }
 825
 826   /* Cleanup
 827    */
 828   memset (outp, 0, NUL_TERMINATOR_LENGTH);
 829
 830   close_converter (cd);
 831
 832   if (bytes_written)
 833     *bytes_written = outp - dest;       /* Doesn't include '\0' */
 834
 835   g_free (utf8);
 836
 837   if (have_error)
 838     {
 839       if (save_p && !fallback)
 840         g_free ((gchar *)insert_str);
 841       g_free (dest);
 842       return NULL;
 843     }
 844   else
 845     return dest;
 846 }
 847
 848 /*
 849  * g_locale_to_utf8
 850  *
 851  *
 852  */
 853
 854 /*
 855  * Validate @string as UTF-8. @len can be negative if @string is
 856  * nul-terminated, or a non-negative value in bytes. If @string ends in an
 857  * incomplete sequence, or contains any illegal sequences or nul codepoints,
 858  * %NULL will be returned and the error set to
 859  * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
 860  * On success, @bytes_read and @bytes_written, if provided, will be set to
 861  * the number of bytes in @string up to @len or the terminating nul byte.
 862  * On error, @bytes_read will be set to the byte offset after the last valid
 863  * and non-nul UTF-8 sequence in @string, and @bytes_written will be set to 0.
 864  */
 865 static gchar *
 866 strdup_len (const gchar *string,
 867             gssize       len,
 868             gsize       *bytes_read,
 869             gsize       *bytes_written,
 870             GError     **error)
 871 {
 872   gsize real_len;
 873   const gchar *end_valid;
 874
 875   if (!g_utf8_validate (string, len, &end_valid))
 876     {
 877       if (bytes_read)
 878         *bytes_read = end_valid - string;
 879       if (bytes_written)
 880         *bytes_written = 0;
 881
 882       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 883                            _("Invalid byte sequence in conversion input"));
 884       return NULL;
 885     }
 886
 887   real_len = end_valid - string;
 888
 889   if (bytes_read)
 890     *bytes_read = real_len;
 891   if (bytes_written)
 892     *bytes_written = real_len;
 893
 894   return g_strndup (string, real_len);
 895 }
 896
/* Bit flags telling convert_checked() which nul-byte checks to perform.
 * The values are distinct bits and may be OR-ed together. */
typedef enum
{
  /* Fail if the input contains a nul byte within the given length */
  CONVERT_CHECK_NO_NULS_IN_INPUT  = 1 << 0,
  /* Fail if the converted output contains a nul byte */
  CONVERT_CHECK_NO_NULS_IN_OUTPUT = 1 << 1
} ConvertCheckFlags;
 902
 903 /*
 904  * Convert from @string in the encoding identified by @from_codeset,
 905  * returning a string in the encoding identifed by @to_codeset.
 906  * @len can be negative if @string is nul-terminated, or a non-negative
 907  * value in bytes. Flags defined in #ConvertCheckFlags can be set in @flags
 908  * to check the input, the output, or both, for embedded nul bytes.
 909  * On success, @bytes_read, if provided, will be set to the number of bytes
 910  * in @string up to @len or the terminating nul byte, and @bytes_written, if
 911  * provided, will be set to the number of output bytes written into the
 912  * returned buffer, excluding the terminating nul sequence.
 913  * On error, @bytes_read will be set to the byte offset after the last valid
 914  * sequence in @string, and @bytes_written will be set to 0.
 915  */
 916 static gchar *
 917 convert_checked (const gchar      *string,
 918                  gssize            len,
 919                  const gchar      *to_codeset,
 920                  const gchar      *from_codeset,
 921                  ConvertCheckFlags flags,
 922                  gsize            *bytes_read,
 923                  gsize            *bytes_written,
 924                  GError          **error)
 925 {
 926   gchar *out;
 927   gsize outbytes;
 928
 929   if ((flags & CONVERT_CHECK_NO_NULS_IN_INPUT) && len > 0)
 930     {
 931       const gchar *early_nul = memchr (string, '\0', len);
 932       if (early_nul != NULL)
 933         {
 934           if (bytes_read)
 935             *bytes_read = early_nul - string;
 936           if (bytes_written)
 937             *bytes_written = 0;
 938
 939           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 940                                _("Embedded NUL byte in conversion input"));
 941           return NULL;
 942         }
 943     }
 944
 945   out = g_convert (string, len, to_codeset, from_codeset,
 946                    bytes_read, &outbytes, error);
 947   if (out == NULL)
 948     {
 949       if (bytes_written)
 950         *bytes_written = 0;
 951       return NULL;
 952     }
 953
 954   if ((flags & CONVERT_CHECK_NO_NULS_IN_OUTPUT)
 955       && memchr (out, '\0', outbytes) != NULL)
 956     {
 957       g_free (out);
 958       if (bytes_written)
 959         *bytes_written = 0;
 960       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_EMBEDDED_NUL,
 961                            _("Embedded NUL byte in conversion output"));
 962       return NULL;
 963     }
 964
 965   if (bytes_written)
 966     *bytes_written = outbytes;
 967   return out;
 968 }
 969
 970 /**
 971  * g_locale_to_utf8:
 972  * @opsysstring:   (array length=len) (element-type guint8): a string in the
 973  *                 encoding of the current locale. On Windows
 974  *                 this means the system codepage.
 975  * @len:           the length of the string, or -1 if the string is
 976  *                 nul-terminated (Note that some encodings may allow nul
 977  *                 bytes to occur inside strings. In that case, using -1
 978  *                 for the @len parameter is unsafe)
 979  * @bytes_read: (out) (optional): location to store the number of bytes in the
 980  *                 input string that were successfully converted, or %NULL.
 981  *                 Even if the conversion was successful, this may be
 982  *                 less than @len if there were partial characters
 983  *                 at the end of the input. If the error
 984  *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 985  *                 stored will be the byte offset after the last valid
 986  *                 input sequence.
 987  * @bytes_written: (out) (optional): the number of bytes stored in the output
 988  *                 buffer (not including the terminating nul).
 989  * @error:         location to store the error occurring, or %NULL to ignore
 990  *                 errors. Any of the errors in #GConvertError may occur.
 991  *
 992  * Converts a string which is in the encoding used for strings by
 993  * the C runtime (usually the same as that used by the operating
 994  * system) in the [current locale][setlocale] into a UTF-8 string.
 995  *
 996  * If the source encoding is not UTF-8 and the conversion output contains a
 997  * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
 998  * function returns %NULL.
 999  * If the source encoding is UTF-8, an embedded nul character is treated with
1000  * the %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error for backward compatibility with
1001  * earlier versions of this library. Use g_convert() to produce output that
1002  * may contain embedded nul characters.
1003  *
1004  * Returns: (type utf8): The converted string, or %NULL on an error.
1005  **/
1006 gchar *
1007 g_locale_to_utf8 (const gchar  *opsysstring,
1008                   gssize        len,
1009                   gsize        *bytes_read,
1010                   gsize        *bytes_written,
1011                   GError      **error)
1012 {
1013   const char *charset;
1014
1015   if (g_get_charset (&charset))
1016     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1017   else
1018     return convert_checked (opsysstring, len, "UTF-8", charset,
1019                             CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1020                             bytes_read, bytes_written, error);
1021 }
1022
1023 /*
1024  * Do the exact same as g_locale_to_utf8 except that the charset would
1025  * be retrieved from _g_get_time_charset (which uses LC_TIME)
1026  *
1027  * Returns: The converted string, or %NULL on an error.
1028  */
1029 gchar *
1030 _g_time_locale_to_utf8 (const gchar *opsysstring,
1031                         gssize       len,
1032                         gsize       *bytes_read,
1033                         gsize       *bytes_written,
1034                         GError     **error)
1035 {
1036   const char *charset;
1037
1038   if (_g_get_time_charset (&charset))
1039     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1040   else
1041     return convert_checked (opsysstring, len, "UTF-8", charset,
1042                             CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1043                             bytes_read, bytes_written, error);
1044 }
1045
1046 /*
1047  * Do the exact same as g_locale_to_utf8 except that the charset would
1048  * be retrieved from _g_get_ctype_charset (which uses LC_CTYPE)
1049  *
1050  * Returns: The converted string, or %NULL on an error.
1051  */
1052 gchar *
1053 _g_ctype_locale_to_utf8 (const gchar *opsysstring,
1054                          gssize       len,
1055                          gsize       *bytes_read,
1056                          gsize       *bytes_written,
1057                          GError     **error)
1058 {
1059   const char *charset;
1060
1061   if (_g_get_ctype_charset (&charset))
1062     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1063   else
1064     return convert_checked (opsysstring, len, "UTF-8", charset,
1065                             CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1066                             bytes_read, bytes_written, error);
1067 }
1068
1069 /**
1070  * g_locale_from_utf8:
1071  * @utf8string:    a UTF-8 encoded string
1072  * @len:           the length of the string, or -1 if the string is
1073  *                 nul-terminated.
1074  * @bytes_read: (out) (optional): location to store the number of bytes in the
1075  *                 input string that were successfully converted, or %NULL.
1076  *                 Even if the conversion was successful, this may be
1077  *                 less than @len if there were partial characters
1078  *                 at the end of the input. If the error
1079  *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1080  *                 stored will be the byte offset after the last valid
1081  *                 input sequence.
1082  * @bytes_written: (out) (optional): the number of bytes stored in the output
1083  *                 buffer (not including the terminating nul).
1084  * @error:         location to store the error occurring, or %NULL to ignore
1085  *                 errors. Any of the errors in #GConvertError may occur.
1086  *
1087  * Converts a string from UTF-8 to the encoding used for strings by
1088  * the C runtime (usually the same as that used by the operating
1089  * system) in the [current locale][setlocale]. On Windows this means
1090  * the system codepage.
1091  *
1092  * The input string shall not contain nul characters even if the @len
1093  * argument is positive. A nul character found inside the string will result
1094  * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Use g_convert() to convert
1095  * input that may contain embedded nul characters.
1096  *
1097  * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
1098  *          A newly-allocated buffer containing the converted string,
1099  *          or %NULL on an error, and error will be set.
1100  **/
1101 gchar *
1102 g_locale_from_utf8 (const gchar *utf8string,
1103                     gssize       len,
1104                     gsize       *bytes_read,
1105                     gsize       *bytes_written,
1106                     GError     **error)
1107 {
1108   const gchar *charset;
1109
1110   if (g_get_charset (&charset))
1111     return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1112   else
1113     return convert_checked (utf8string, len, charset, "UTF-8",
1114                             CONVERT_CHECK_NO_NULS_IN_INPUT,
1115                             bytes_read, bytes_written, error);
1116 }
1117
1118 #ifndef G_PLATFORM_WIN32
1119
/* Cached answer of g_get_filename_charsets(), which keeps one instance in
 * a GPrivate and releases it with filename_charset_cache_free(). */
typedef struct _GFilenameCharsetCache GFilenameCharsetCache;

struct _GFilenameCharsetCache {
  /* Value returned by g_get_filename_charsets() */
  gboolean is_utf8;
  /* Copy of the locale charset the cache was built for; the cache is
   * rebuilt when g_get_charset() reports a different name */
  gchar *charset;
  /* NULL-terminated list handed out to callers (owned by the cache) */
  gchar **filename_charsets;
};
1127
1128 static void
1129 filename_charset_cache_free (gpointer data)
1130 {
1131   GFilenameCharsetCache *cache = data;
1132   g_free (cache->charset);
1133   g_strfreev (cache->filename_charsets);
1134   g_free (cache);
1135 }
1136
1137 /**
1138  * g_get_filename_charsets:
1139  * @filename_charsets: (out) (transfer none) (array zero-terminated=1):
1140  *    return location for the %NULL-terminated list of encoding names
1141  *
1142  * Determines the preferred character sets used for filenames.
1143  * The first character set from the @charsets is the filename encoding, the
1144  * subsequent character sets are used when trying to generate a displayable
1145  * representation of a filename, see g_filename_display_name().
1146  *
1147  * On Unix, the character sets are determined by consulting the
1148  * environment variables `G_FILENAME_ENCODING` and `G_BROKEN_FILENAMES`.
1149  * On Windows, the character set used in the GLib API is always UTF-8
1150  * and said environment variables have no effect.
1151  *
1152  * `G_FILENAME_ENCODING` may be set to a comma-separated list of
1153  * character set names. The special token "\@locale" is taken
1154  * to  mean the character set for the [current locale][setlocale].
1155  * If `G_FILENAME_ENCODING` is not set, but `G_BROKEN_FILENAMES` is,
1156  * the character set of the current locale is taken as the filename
1157  * encoding. If neither environment variable  is set, UTF-8 is taken
1158  * as the filename encoding, but the character set of the current locale
1159  * is also put in the list of encodings.
1160  *
1161  * The returned @charsets belong to GLib and must not be freed.
1162  *
1163  * Note that on Unix, regardless of the locale character set or
1164  * `G_FILENAME_ENCODING` value, the actual file names present
1165  * on a system might be in any random encoding or just gibberish.
1166  *
1167  * Returns: %TRUE if the filename encoding is UTF-8.
1168  *
1169  * Since: 2.6
1170  */
1171 gboolean
1172 g_get_filename_charsets (const gchar ***filename_charsets)
1173 {
1174   static GPrivate cache_private = G_PRIVATE_INIT (filename_charset_cache_free);
1175   GFilenameCharsetCache *cache = g_private_get (&cache_private);
1176   const gchar *charset;
1177
1178   if (!cache)
1179     cache = g_private_set_alloc0 (&cache_private, sizeof (GFilenameCharsetCache));
1180
1181   g_get_charset (&charset);
1182
1183   if (!(cache->charset && strcmp (cache->charset, charset) == 0))
1184     {
1185       const gchar *new_charset;
1186       const gchar *p;
1187       gint i;
1188
1189       g_free (cache->charset);
1190       g_strfreev (cache->filename_charsets);
1191       cache->charset = g_strdup (charset);
1192
1193       p = g_getenv ("G_FILENAME_ENCODING");
1194       if (p != NULL && p[0] != '\0')
1195         {
1196           cache->filename_charsets = g_strsplit (p, ",", 0);
1197           cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0);
1198
1199           for (i = 0; cache->filename_charsets[i]; i++)
1200             {
1201               if (strcmp ("@locale", cache->filename_charsets[i]) == 0)
1202                 {
1203                   g_get_charset (&new_charset);
1204                   g_free (cache->filename_charsets[i]);
1205                   cache->filename_charsets[i] = g_strdup (new_charset);
1206                 }
1207             }
1208         }
1209       else if (g_getenv ("G_BROKEN_FILENAMES") != NULL)
1210         {
1211           cache->filename_charsets = g_new0 (gchar *, 2);
1212           cache->is_utf8 = g_get_charset (&new_charset);
1213           cache->filename_charsets[0] = g_strdup (new_charset);
1214         }
1215       else
1216         {
1217           cache->filename_charsets = g_new0 (gchar *, 3);
1218           cache->is_utf8 = TRUE;
1219           cache->filename_charsets[0] = g_strdup ("UTF-8");
1220           if (!g_get_charset (&new_charset))
1221             cache->filename_charsets[1] = g_strdup (new_charset);
1222         }
1223     }
1224
1225   if (filename_charsets)
1226     *filename_charsets = (const gchar **)cache->filename_charsets;
1227
1228   return cache->is_utf8;
1229 }
1230
1231 #else /* G_PLATFORM_WIN32 */
1232
1233 gboolean
1234 g_get_filename_charsets (const gchar ***filename_charsets)
1235 {
1236   static const gchar *charsets[] = {
1237     "UTF-8",
1238     NULL
1239   };
1240
1241 #ifdef G_OS_WIN32
1242   /* On Windows GLib pretends that the filename charset is UTF-8 */
1243   if (filename_charsets)
1244     *filename_charsets = charsets;
1245
1246   return TRUE;
1247 #else
1248   gboolean result;
1249
1250   /* Cygwin works like before */
1251   result = g_get_charset (&(charsets[0]));
1252
1253   if (filename_charsets)
1254     *filename_charsets = charsets;
1255
1256   return result;
1257 #endif
1258 }
1259
1260 #endif /* G_PLATFORM_WIN32 */
1261
1262 static gboolean
1263 get_filename_charset (const gchar **filename_charset)
1264 {
1265   const gchar **charsets;
1266   gboolean is_utf8;
1267
1268   is_utf8 = g_get_filename_charsets (&charsets);
1269
1270   if (filename_charset)
1271     *filename_charset = charsets[0];
1272
1273   return is_utf8;
1274 }
1275
1276 /**
1277  * g_filename_to_utf8:
1278  * @opsysstring: (type filename): a string in the encoding for filenames
1279  * @len:           the length of the string, or -1 if the string is
1280  *                 nul-terminated (Note that some encodings may allow nul
1281  *                 bytes to occur inside strings. In that case, using -1
1282  *                 for the @len parameter is unsafe)
1283  * @bytes_read: (out) (optional): location to store the number of bytes in the
1284  *                 input string that were successfully converted, or %NULL.
1285  *                 Even if the conversion was successful, this may be
1286  *                 less than @len if there were partial characters
1287  *                 at the end of the input. If the error
1288  *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1289  *                 stored will be the byte offset after the last valid
1290  *                 input sequence.
1291  * @bytes_written: (out) (optional): the number of bytes stored in the output
1292  *                 buffer (not including the terminating nul).
1293  * @error:         location to store the error occurring, or %NULL to ignore
1294  *                 errors. Any of the errors in #GConvertError may occur.
1295  *
1296  * Converts a string which is in the encoding used by GLib for
1297  * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8
1298  * for filenames; on other platforms, this function indirectly depends on
1299  * the [current locale][setlocale].
1300  *
1301  * The input string shall not contain nul characters even if the @len
1302  * argument is positive. A nul character found inside the string will result
1303  * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
1304  * If the source encoding is not UTF-8 and the conversion output contains a
1305  * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
1306  * function returns %NULL. Use g_convert() to produce output that
1307  * may contain embedded nul characters.
1308  *
1309  * Returns: (type utf8): The converted string, or %NULL on an error.
1310  **/
1311 gchar*
1312 g_filename_to_utf8 (const gchar *opsysstring,
1313                     gssize       len,
1314                     gsize       *bytes_read,
1315                     gsize       *bytes_written,
1316                     GError     **error)
1317 {
1318   const gchar *charset;
1319
1320   g_return_val_if_fail (opsysstring != NULL, NULL);
1321
1322   if (get_filename_charset (&charset))
1323     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1324   else
1325     return convert_checked (opsysstring, len, "UTF-8", charset,
1326                             CONVERT_CHECK_NO_NULS_IN_INPUT |
1327                             CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1328                             bytes_read, bytes_written, error);
1329 }
1330
1331 /**
1332  * g_filename_from_utf8:
1333  * @utf8string:    (type utf8): a UTF-8 encoded string.
1334  * @len:           the length of the string, or -1 if the string is
1335  *                 nul-terminated.
1336  * @bytes_read:    (out) (optional): location to store the number of bytes in
1337  *                 the input string that were successfully converted, or %NULL.
1338  *                 Even if the conversion was successful, this may be
1339  *                 less than @len if there were partial characters
1340  *                 at the end of the input. If the error
1341  *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1342  *                 stored will be the byte offset after the last valid
1343  *                 input sequence.
1344  * @bytes_written: (out) (optional): the number of bytes stored in
1345  *                 the output buffer (not including the terminating nul).
1346  * @error:         location to store the error occurring, or %NULL to ignore
1347  *                 errors. Any of the errors in #GConvertError may occur.
1348  *
1349  * Converts a string from UTF-8 to the encoding GLib uses for
1350  * filenames. Note that on Windows GLib uses UTF-8 for filenames;
1351  * on other platforms, this function indirectly depends on the
1352  * [current locale][setlocale].
1353  *
1354  * The input string shall not contain nul characters even if the @len
1355  * argument is positive. A nul character found inside the string will result
1356  * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. If the filename encoding is
1357  * not UTF-8 and the conversion output contains a nul character, the error
1358  * %G_CONVERT_ERROR_EMBEDDED_NUL is set and the function returns %NULL.
1359  *
1360  * Returns: (type filename):
1361  *               The converted string, or %NULL on an error.
1362  **/
1363 gchar*
1364 g_filename_from_utf8 (const gchar *utf8string,
1365                       gssize       len,
1366                       gsize       *bytes_read,
1367                       gsize       *bytes_written,
1368                       GError     **error)
1369 {
1370   const gchar *charset;
1371
1372   if (get_filename_charset (&charset))
1373     return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1374   else
1375     return convert_checked (utf8string, len, charset, "UTF-8",
1376                             CONVERT_CHECK_NO_NULS_IN_INPUT |
1377                             CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1378                             bytes_read, bytes_written, error);
1379 }
1380
1381 /* Test of haystack has the needle prefix, comparing case
1382  * insensitive. haystack may be UTF-8, but needle must
1383  * contain only ascii. */
1384 static gboolean
1385 has_case_prefix (const gchar *haystack, const gchar *needle)
1386 {
1387   const gchar *h, *n;
1388
1389   /* Eat one character at a time. */
1390   h = haystack;
1391   n = needle;
1392
1393   while (*n && *h &&
1394          g_ascii_tolower (*n) == g_ascii_tolower (*h))
1395     {
1396       n++;
1397       h++;
1398     }
1399
1400   return *n == '\0';
1401 }
1402
/* Escaping modes accepted by g_escape_uri_string(). Each value is a single
 * bit that is tested against the per-character bitmasks stored in the
 * acceptable[] table; a character whose entry has the bit set is left
 * unescaped in that mode. */
typedef enum {
  UNSAFE_ALL        = 0x1,  /* Escape all unsafe characters   */
  UNSAFE_ALLOW_PLUS = 0x2,  /* Allows '+'  */
  UNSAFE_PATH       = 0x8,  /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
  UNSAFE_HOST       = 0x10, /* Allows '/' and ':' and '@' */
  UNSAFE_SLASHES    = 0x20  /* Allows all characters except for '/' and '%' */
} UnsafeCharacterSet;
1410
/* Per-character bitmasks used by the ACCEPTABLE() test in
 * g_escape_uri_string(): entry [c - 32] is AND-ed with the active
 * UnsafeCharacterSet bit, and a non-zero result means the character may
 * be copied unescaped. 0x00 (space and '%') is therefore escaped in every
 * mode, while 0x3F has the bit of every defined mode set. */
static const guchar acceptable[96] = {
  /* A table of the ASCII chars from space (32) to DEL (127) */
  /*      !    "    #    $    %    &    '    (    )    *    +    ,    -    .    / */
  0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C,
  /* 0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ? */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20,
  /* @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O */
  0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
  /* P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _ */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F,
  /* `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o */
  0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
  /* p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~  DEL */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20
};
1426
/* Uppercase hexadecimal digits indexed by nibble value; used by
 * g_escape_uri_string() to write the two digits of a %XX escape. */
static const gchar hex[] = "0123456789ABCDEF";
1428
1429 /* Note: This escape function works on file: URIs, but if you want to
1430  * escape something else, please read RFC-2396 */
1431 static gchar *
1432 g_escape_uri_string (const gchar *string,
1433                      UnsafeCharacterSet mask)
1434 {
1435 #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))
1436
1437   const gchar *p;
1438   gchar *q;
1439   gchar *result;
1440   int c;
1441   gint unacceptable;
1442   UnsafeCharacterSet use_mask;
1443
1444   g_return_val_if_fail (mask == UNSAFE_ALL
1445                         || mask == UNSAFE_ALLOW_PLUS
1446                         || mask == UNSAFE_PATH
1447                         || mask == UNSAFE_HOST
1448                         || mask == UNSAFE_SLASHES, NULL);
1449
1450   unacceptable = 0;
1451   use_mask = mask;
1452   for (p = string; *p != '\0'; p++)
1453     {
1454       c = (guchar) *p;
1455       if (!ACCEPTABLE (c))
1456         unacceptable++;
1457     }
1458
1459   result = g_malloc (p - string + unacceptable * 2 + 1);
1460
1461   use_mask = mask;
1462   for (q = result, p = string; *p != '\0'; p++)
1463     {
1464       c = (guchar) *p;
1465
1466       if (!ACCEPTABLE (c))
1467         {
1468           *q++ = '%'; /* means hex coming */
1469           *q++ = hex[c >> 4];
1470           *q++ = hex[c & 15];
1471         }
1472       else
1473         *q++ = *p;
1474     }
1475
1476   *q = '\0';
1477
1478   return result;
1479 }
1480
1481
1482 static gchar *
1483 g_escape_file_uri (const gchar *hostname,
1484                    const gchar *pathname)
1485 {
1486   char *escaped_hostname = NULL;
1487   char *escaped_path;
1488   char *res;
1489
1490 #ifdef G_OS_WIN32
1491   char *p, *backslash;
1492
1493   /* Turn backslashes into forward slashes. That's what Netscape
1494    * does, and they are actually more or less equivalent in Windows.
1495    */
1496
1497   pathname = g_strdup (pathname);
1498   p = (char *) pathname;
1499
1500   while ((backslash = strchr (p, '\\')) != NULL)
1501     {
1502       *backslash = '/';
1503       p = backslash + 1;
1504     }
1505 #endif
1506
1507   if (hostname && *hostname != '\0')
1508     {
1509       escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST);
1510     }
1511
1512   escaped_path = g_escape_uri_string (pathname, UNSAFE_PATH);
1513
1514   res = g_strconcat ("file://",
1515                      (escaped_hostname) ? escaped_hostname : "",
1516                      (*escaped_path != '/') ? "/" : "",
1517                      escaped_path,
1518                      NULL);
1519
1520 #ifdef G_OS_WIN32
1521   g_free ((char *) pathname);
1522 #endif
1523
1524   g_free (escaped_hostname);
1525   g_free (escaped_path);
1526
1527   return res;
1528 }
1529
/* Decode the two hexadecimal digits at @scanner[0] and @scanner[1] into a
 * byte value. Returns -1 when either character is not a hex digit; the
 * second character is only looked at when the first one is valid. */
static int
unescape_character (const char *scanner)
{
  int high_nibble;
  int low_nibble;

  high_nibble = g_ascii_xdigit_value (scanner[0]);
  if (high_nibble < 0)
    return -1;

  low_nibble = g_ascii_xdigit_value (scanner[1]);
  if (low_nibble < 0)
    return -1;

  return (high_nibble << 4) | low_nibble;
}
1546
1547 static gchar *
1548 g_unescape_uri_string (const char *escaped,
1549                        int         len,
1550                        const char *illegal_escaped_characters,
1551                        gboolean    ascii_must_not_be_escaped)
1552 {
1553   const gchar *in, *in_end;
1554   gchar *out, *result;
1555   int c;
1556
1557   if (escaped == NULL)
1558     return NULL;
1559
1560   if (len < 0)
1561     len = strlen (escaped);
1562
1563   result = g_malloc (len + 1);
1564
1565   out = result;
1566   for (in = escaped, in_end = escaped + len; in < in_end; in++)
1567     {
1568       c = *in;
1569
1570       if (c == '%')
1571         {
1572           /* catch partial escape sequences past the end of the substring */
1573           if (in + 3 > in_end)
1574             break;
1575
1576           c = unescape_character (in + 1);
1577
1578           /* catch bad escape sequences and NUL characters */
1579           if (c <= 0)
1580             break;
1581
1582           /* catch escaped ASCII */
1583           if (ascii_must_not_be_escaped && c <= 0x7F)
1584             break;
1585
1586           /* catch other illegal escaped characters */
1587           if (strchr (illegal_escaped_characters, c) != NULL)
1588             break;
1589
1590           in += 2;
1591         }
1592
1593       *out++ = c;
1594     }
1595
1596   g_assert (out - result <= len);
1597   *out = '\0';
1598
1599   if (in != in_end)
1600     {
1601       g_free (result);
1602       return NULL;
1603     }
1604
1605   return result;
1606 }
1607
1608 static gboolean
1609 is_asciialphanum (gunichar c)
1610 {
1611   return c <= 0x7F && g_ascii_isalnum (c);
1612 }
1613
1614 static gboolean
1615 is_asciialpha (gunichar c)
1616 {
1617   return c <= 0x7F && g_ascii_isalpha (c);
1618 }
1619
1620 /* allows an empty string */
1621 static gboolean
1622 hostname_validate (const char *hostname)
1623 {
1624   const char *p;
1625   gunichar c, first_char, last_char;
1626
1627   p = hostname;
1628   if (*p == '\0')
1629     return TRUE;
1630   do
1631     {
1632       /* read in a label */
1633       c = g_utf8_get_char (p);
1634       p = g_utf8_next_char (p);
1635       if (!is_asciialphanum (c))
1636         return FALSE;
1637       first_char = c;
1638       do
1639         {
1640           last_char = c;
1641           c = g_utf8_get_char (p);
1642           p = g_utf8_next_char (p);
1643         }
1644       while (is_asciialphanum (c) || c == '-');
1645       if (last_char == '-')
1646         return FALSE;
1647
1648       /* if that was the last label, check that it was a toplabel */
1649       if (c == '\0' || (c == '.' && *p == '\0'))
1650         return is_asciialpha (first_char);
1651     }
1652   while (c == '.');
1653   return FALSE;
1654 }
1655
1656 /**
1657  * g_filename_from_uri:
1658  * @uri: a uri describing a filename (escaped, encoded in ASCII).
1659  * @hostname: (out) (optional) (nullable): Location to store hostname for the URI.
1660  *            If there is no hostname in the URI, %NULL will be
1661  *            stored in this location.
1662  * @error: location to store the error occurring, or %NULL to ignore
1663  *         errors. Any of the errors in #GConvertError may occur.
1664  *
1665  * Converts an escaped ASCII-encoded URI to a local filename in the
1666  * encoding used for filenames.
1667  *
1668  * Returns: (type filename): a newly-allocated string holding
1669  *               the resulting filename, or %NULL on an error.
1670  **/
1671 gchar *
1672 g_filename_from_uri (const gchar *uri,
1673                      gchar      **hostname,
1674                      GError     **error)
1675 {
1676   const char *path_part;
1677   const char *host_part;
1678   char *unescaped_hostname;
1679   char *result;
1680   char *filename;
1681   int offs;
1682 #ifdef G_OS_WIN32
1683   char *p, *slash;
1684 #endif
1685
1686   if (hostname)
1687     *hostname = NULL;
1688
1689   if (!has_case_prefix (uri, "file:/"))
1690     {
1691       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1692                    _("The URI “%s” is not an absolute URI using the “file” scheme"),
1693                    uri);
1694       return NULL;
1695     }
1696
1697   path_part = uri + strlen ("file:");
1698
1699   if (strchr (path_part, '#') != NULL)
1700     {
1701       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1702                    _("The local file URI “%s” may not include a “#”"),
1703                    uri);
1704       return NULL;
1705     }
1706
1707   if (has_case_prefix (path_part, "///"))
1708     path_part += 2;
1709   else if (has_case_prefix (path_part, "//"))
1710     {
1711       path_part += 2;
1712       host_part = path_part;
1713
1714       path_part = strchr (path_part, '/');
1715
1716       if (path_part == NULL)
1717         {
1718           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1719                        _("The URI “%s” is invalid"),
1720                        uri);
1721           return NULL;
1722         }
1723
1724       unescaped_hostname = g_unescape_uri_string (host_part, path_part - host_part, "", TRUE);
1725
1726       if (unescaped_hostname == NULL ||
1727           !hostname_validate (unescaped_hostname))
1728         {
1729           g_free (unescaped_hostname);
1730           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1731                        _("The hostname of the URI “%s” is invalid"),
1732                        uri);
1733           return NULL;
1734         }
1735
1736       if (hostname)
1737         *hostname = unescaped_hostname;
1738       else
1739         g_free (unescaped_hostname);
1740     }
1741
1742   filename = g_unescape_uri_string (path_part, -1, "/", FALSE);
1743
1744   if (filename == NULL)
1745     {
1746       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1747                    _("The URI “%s” contains invalidly escaped characters"),
1748                    uri);
1749       return NULL;
1750     }
1751
1752   offs = 0;
1753 #ifdef G_OS_WIN32
1754   /* Drop localhost */
1755   if (hostname && *hostname != NULL &&
1756       g_ascii_strcasecmp (*hostname, "localhost") == 0)
1757     {
1758       g_free (*hostname);
1759       *hostname = NULL;
1760     }
1761
1762   /* Turn slashes into backslashes, because that's the canonical spelling */
1763   p = filename;
1764   while ((slash = strchr (p, '/')) != NULL)
1765     {
1766       *slash = '\\';
1767       p = slash + 1;
1768     }
1769
1770   /* Windows URIs with a drive letter can be like "file://host/c:/foo"
1771    * or "file://host/c|/foo" (some Netscape versions). In those cases, start
1772    * the filename from the drive letter.
1773    */
1774   if (g_ascii_isalpha (filename[1]))
1775     {
1776       if (filename[2] == ':')
1777         offs = 1;
1778       else if (filename[2] == '|')
1779         {
1780           filename[2] = ':';
1781           offs = 1;
1782         }
1783     }
1784 #endif
1785
1786   result = g_strdup (filename + offs);
1787   g_free (filename);
1788
1789   return result;
1790 }
1791
1792 /**
1793  * g_filename_to_uri:
1794  * @filename: (type filename): an absolute filename specified in the GLib file
1795  *     name encoding, which is the on-disk file name bytes on Unix, and UTF-8
1796  *     on Windows
1797  * @hostname: (nullable): A UTF-8 encoded hostname, or %NULL for none.
1798  * @error: location to store the error occurring, or %NULL to ignore
1799  *         errors. Any of the errors in #GConvertError may occur.
1800  *
1801  * Converts an absolute filename to an escaped ASCII-encoded URI, with the path
1802  * component following Section 3.3. of RFC 2396.
1803  *
1804  * Returns: a newly-allocated string holding the resulting
1805  *               URI, or %NULL on an error.
1806  **/
1807 gchar *
1808 g_filename_to_uri (const gchar *filename,
1809                    const gchar *hostname,
1810                    GError     **error)
1811 {
1812   char *escaped_uri;
1813
1814   g_return_val_if_fail (filename != NULL, NULL);
1815
1816   if (!g_path_is_absolute (filename))
1817     {
1818       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
1819                    _("The pathname “%s” is not an absolute path"),
1820                    filename);
1821       return NULL;
1822     }
1823
1824   if (hostname &&
1825       !(g_utf8_validate (hostname, -1, NULL)
1826         && hostname_validate (hostname)))
1827     {
1828       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1829                            _("Invalid hostname"));
1830       return NULL;
1831     }
1832
1833 #ifdef G_OS_WIN32
1834   /* Don't use localhost unnecessarily */
1835   if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0)
1836     hostname = NULL;
1837 #endif
1838
1839   escaped_uri = g_escape_file_uri (hostname, filename);
1840
1841   return escaped_uri;
1842 }
1843
1844 /**
1845  * g_uri_list_extract_uris:
1846  * @uri_list: an URI list
1847  *
1848  * Splits an URI list conforming to the text/uri-list
1849  * mime type defined in RFC 2483 into individual URIs,
1850  * discarding any comments. The URIs are not validated.
1851  *
1852  * Returns: (transfer full): a newly allocated %NULL-terminated list
1853  *   of strings holding the individual URIs. The array should be freed
1854  *   with g_strfreev().
1855  *
1856  * Since: 2.6
1857  */
1858 gchar **
1859 g_uri_list_extract_uris (const gchar *uri_list)
1860 {
1861   GPtrArray *uris;
1862   const gchar *p, *q;
1863
1864   uris = g_ptr_array_new ();
1865
1866   p = uri_list;
1867
1868   /* We don't actually try to validate the URI according to RFC
1869    * 2396, or even check for allowed characters - we just ignore
1870    * comments and trim whitespace off the ends.  We also
1871    * allow LF delimination as well as the specified CRLF.
1872    *
1873    * We do allow comments like specified in RFC 2483.
1874    */
1875   while (p)
1876     {
1877       if (*p != '#')
1878         {
1879           while (g_ascii_isspace (*p))
1880             p++;
1881
1882           q = p;
1883           while (*q && (*q != '\n') && (*q != '\r'))
1884             q++;
1885
1886           if (q > p)
1887             {
1888               q--;
1889               while (q > p && g_ascii_isspace (*q))
1890                 q--;
1891
1892               if (q > p)
1893                 g_ptr_array_add (uris, g_strndup (p, q - p + 1));
1894             }
1895         }
1896       p = strchr (p, '\n');
1897       if (p)
1898         p++;
1899     }
1900
1901   g_ptr_array_add (uris, NULL);
1902
1903   return (gchar **) g_ptr_array_free (uris, FALSE);
1904 }
1905
1906 /**
1907  * g_filename_display_basename:
1908  * @filename: (type filename): an absolute pathname in the
1909  *     GLib file name encoding
1910  *
1911  * Returns the display basename for the particular filename, guaranteed
1912  * to be valid UTF-8. The display name might not be identical to the filename,
1913  * for instance there might be problems converting it to UTF-8, and some files
1914  * can be translated in the display.
1915  *
1916  * If GLib cannot make sense of the encoding of @filename, as a last resort it
1917  * replaces unknown characters with U+FFFD, the Unicode replacement character.
1918  * You can search the result for the UTF-8 encoding of this character (which is
1919  * "\357\277\275" in octal notation) to find out if @filename was in an invalid
1920  * encoding.
1921  *
1922  * You must pass the whole absolute pathname to this functions so that
1923  * translation of well known locations can be done.
1924  *
1925  * This function is preferred over g_filename_display_name() if you know the
1926  * whole path, as it allows translation.
1927  *
1928  * Returns: a newly allocated string containing
1929  *   a rendition of the basename of the filename in valid UTF-8
1930  *
1931  * Since: 2.6
1932  **/
1933 gchar *
1934 g_filename_display_basename (const gchar *filename)
1935 {
1936   char *basename;
1937   char *display_name;
1938
1939   g_return_val_if_fail (filename != NULL, NULL);
1940
1941   basename = g_path_get_basename (filename);
1942   display_name = g_filename_display_name (basename);
1943   g_free (basename);
1944   return display_name;
1945 }
1946
1947 /**
1948  * g_filename_display_name:
1949  * @filename: (type filename): a pathname hopefully in the
1950  *     GLib file name encoding
1951  *
1952  * Converts a filename into a valid UTF-8 string. The conversion is
1953  * not necessarily reversible, so you should keep the original around
1954  * and use the return value of this function only for display purposes.
1955  * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL
1956  * even if the filename actually isn't in the GLib file name encoding.
1957  *
1958  * If GLib cannot make sense of the encoding of @filename, as a last resort it
1959  * replaces unknown characters with U+FFFD, the Unicode replacement character.
1960  * You can search the result for the UTF-8 encoding of this character (which is
1961  * "\357\277\275" in octal notation) to find out if @filename was in an invalid
1962  * encoding.
1963  *
1964  * If you know the whole pathname of the file you should use
1965  * g_filename_display_basename(), since that allows location-based
1966  * translation of filenames.
1967  *
1968  * Returns: a newly allocated string containing
1969  *   a rendition of the filename in valid UTF-8
1970  *
1971  * Since: 2.6
1972  **/
1973 gchar *
1974 g_filename_display_name (const gchar *filename)
1975 {
1976   gint i;
1977   const gchar **charsets;
1978   gchar *display_name = NULL;
1979   gboolean is_utf8;
1980
1981   is_utf8 = g_get_filename_charsets (&charsets);
1982
1983   if (is_utf8)
1984     {
1985       if (g_utf8_validate (filename, -1, NULL))
1986         display_name = g_strdup (filename);
1987     }
1988
1989   if (!display_name)
1990     {
1991       /* Try to convert from the filename charsets to UTF-8.
1992        * Skip the first charset if it is UTF-8.
1993        */
1994       for (i = is_utf8 ? 1 : 0; charsets[i]; i++)
1995         {
1996           display_name = g_convert (filename, -1, "UTF-8", charsets[i],
1997                                     NULL, NULL, NULL);
1998
1999           if (display_name)
2000             break;
2001         }
2002     }
2003
2004   /* if all conversions failed, we replace invalid UTF-8
2005    * by a question mark
2006    */
2007   if (!display_name)
2008     display_name = g_utf8_make_valid (filename, -1);
2009
2010   return display_name;
2011 }
2012
#ifdef G_OS_WIN32

/* Binary compatibility versions. Not for newly compiled code.
 *
 * Each *_utf8 entry point below simply forwards its arguments, unchanged,
 * to the function of the same name without the suffix.
 */

_GLIB_EXTERN gchar *g_filename_to_utf8_utf8   (const gchar  *opsysstring,
                                               gssize        len,
                                               gsize        *bytes_read,
                                               gsize        *bytes_written,
                                               GError      **error) G_GNUC_MALLOC;
_GLIB_EXTERN gchar *g_filename_from_utf8_utf8 (const gchar  *utf8string,
                                               gssize        len,
                                               gsize        *bytes_read,
                                               gsize        *bytes_written,
                                               GError      **error) G_GNUC_MALLOC;
_GLIB_EXTERN gchar *g_filename_from_uri_utf8  (const gchar  *uri,
                                               gchar       **hostname,
                                               GError      **error) G_GNUC_MALLOC;
_GLIB_EXTERN gchar *g_filename_to_uri_utf8    (const gchar  *filename,
                                               const gchar  *hostname,
                                               GError      **error) G_GNUC_MALLOC;

/* Forwards to g_filename_to_utf8(). */
gchar *
g_filename_to_utf8_utf8 (const gchar *opsysstring,
                         gssize       len,
                         gsize       *bytes_read,
                         gsize       *bytes_written,
                         GError     **error)
{
  gchar *converted;

  converted = g_filename_to_utf8 (opsysstring, len, bytes_read, bytes_written, error);

  return converted;
}

/* Forwards to g_filename_from_utf8(). */
gchar *
g_filename_from_utf8_utf8 (const gchar *utf8string,
                           gssize       len,
                           gsize       *bytes_read,
                           gsize       *bytes_written,
                           GError     **error)
{
  gchar *converted;

  converted = g_filename_from_utf8 (utf8string, len, bytes_read, bytes_written, error);

  return converted;
}

/* Forwards to g_filename_from_uri(). */
gchar *
g_filename_from_uri_utf8 (const gchar *uri,
                          gchar      **hostname,
                          GError     **error)
{
  gchar *local_filename;

  local_filename = g_filename_from_uri (uri, hostname, error);

  return local_filename;
}

/* Forwards to g_filename_to_uri(). */
gchar *
g_filename_to_uri_utf8 (const gchar *filename,
                        const gchar *hostname,
                        GError     **error)
{
  gchar *file_uri;

  file_uri = g_filename_to_uri (filename, hostname, error);

  return file_uri;
}

#endif