1 /* GLIB - Library of useful routines for C programming
3 * gconvert.c: Convert between character sets using iconv
4 * Copyright Red Hat Inc., 2000
5 * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
7 * SPDX-License-Identifier: LGPL-2.1-or-later
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
24 #include "glibconfig.h"
35 #include "win_iconv.c"
38 #ifdef G_PLATFORM_WIN32
45 #include "gconvertprivate.h"
47 #include "gcharsetprivate.h"
49 #include "gstrfuncs.h"
50 #include "gtestutils.h"
52 #include "gthreadprivate.h"
54 #include "gfileutils.h"
59 /* We try to terminate strings in unknown charsets with this many zero bytes
60 * to ensure that multibyte strings really are nul-terminated when we return
61 * them from g_convert() and friends.
63 #define NUL_TERMINATOR_LENGTH 4
65 G_DEFINE_QUARK (g_convert_error, g_convert_error)
68 try_conversion (const char *to_codeset,
69 const char *from_codeset,
72 *cd = iconv_open (to_codeset, from_codeset);
74 if (*cd == (iconv_t)-1 && errno == EINVAL)
77 #if defined(__FreeBSD__) && defined(ICONV_SET_ILSEQ_INVALID)
78 /* On FreeBSD request GNU iconv compatible handling of characters that cannot
79 * be represented in the destination character set.
80 * See https://cgit.freebsd.org/src/commit/?id=7c5b23111c5fd1992047922d4247c4a1ce1bb6c3
83 if (iconvctl (*cd, ICONV_SET_ILSEQ_INVALID, &value) != 0)
90 try_to_aliases (const char **to_aliases,
91 const char *from_codeset,
96 const char **p = to_aliases;
99 if (try_conversion (*p, from_codeset, cd))
110 * g_iconv_open: (skip)
111 * @to_codeset: destination codeset
112 * @from_codeset: source codeset
114 * Same as the standard UNIX routine iconv_open(), but
115 * may be implemented via libiconv on UNIX flavors that lack
116 * a native implementation.
118 * GLib provides g_convert() and g_locale_to_utf8() which are likely
119 * more convenient than the raw iconv wrappers.
121 * Returns: a "conversion descriptor", or (GIConv)-1 if
122 * opening the converter failed.
125 g_iconv_open (const gchar *to_codeset,
126 const gchar *from_codeset)
130 if (!try_conversion (to_codeset, from_codeset, &cd))
132 const char **to_aliases = _g_charset_get_aliases (to_codeset);
133 const char **from_aliases = _g_charset_get_aliases (from_codeset);
137 const char **p = from_aliases;
140 if (try_conversion (to_codeset, *p, &cd))
143 if (try_to_aliases (to_aliases, *p, &cd))
150 if (try_to_aliases (to_aliases, from_codeset, &cd))
155 return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd;
160 * @converter: conversion descriptor from g_iconv_open()
161 * @inbuf: bytes to convert
162 * @inbytes_left: (inout): inout parameter, bytes remaining to convert in @inbuf
163 * @outbuf: converted output bytes
164 * @outbytes_left: (inout): inout parameter, bytes available to fill in @outbuf
166 * Same as the standard UNIX routine iconv(), but
167 * may be implemented via libiconv on UNIX flavors that lack
168 * a native implementation.
170 * GLib provides g_convert() and g_locale_to_utf8() which are likely
171 * more convenient than the raw iconv wrappers.
173 * Note that the behaviour of iconv() for characters which are valid in the
174 * input character set, but which have no representation in the output character
175 * set, is implementation defined. This function may return success (with a
176 * positive number of non-reversible conversions as replacement characters were
177 * used), or it may return -1 and set an error such as %EILSEQ, in such a
180 * Returns: count of non-reversible conversions, or -1 on error
183 g_iconv (GIConv converter,
187 gsize *outbytes_left)
189 iconv_t cd = (iconv_t)converter;
191 return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
195 * g_iconv_close: (skip)
196 * @converter: a conversion descriptor from g_iconv_open()
198 * Same as the standard UNIX routine iconv_close(), but
199 * may be implemented via libiconv on UNIX flavors that lack
200 * a native implementation. Should be called to clean up
201 * the conversion descriptor from g_iconv_open() when
202 * you are done converting things.
204 * GLib provides g_convert() and g_locale_to_utf8() which are likely
205 * more convenient than the raw iconv wrappers.
207 * Returns: -1 on error, 0 on success
210 g_iconv_close (GIConv converter)
212 iconv_t cd = (iconv_t)converter;
214 return iconv_close (cd);
218 open_converter (const gchar *to_codeset,
219 const gchar *from_codeset,
224 cd = g_iconv_open (to_codeset, from_codeset);
226 if (cd == (GIConv) -1)
228 /* Something went wrong. */
232 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
233 _("Conversion from character set “%s” to “%s” is not supported"),
234 from_codeset, to_codeset);
236 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
237 _("Could not open converter from “%s” to “%s”"),
238 from_codeset, to_codeset);
246 close_converter (GIConv cd)
248 if (cd == (GIConv) -1)
251 return g_iconv_close (cd);
255 * g_convert_with_iconv: (skip)
256 * @str: (array length=len) (element-type guint8):
257 * the string to convert.
258 * @len: the length of the string in bytes, or -1 if the string is
259 * nul-terminated (Note that some encodings may allow nul
260 * bytes to occur inside strings. In that case, using -1
261 * for the @len parameter is unsafe)
262 * @converter: conversion descriptor from g_iconv_open()
263 * @bytes_read: (out) (optional): location to store the number of bytes in
264 * the input string that were successfully converted, or %NULL.
265 * Even if the conversion was successful, this may be
266 * less than @len if there were partial characters
267 * at the end of the input. If the error
268 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
269 * stored will be the byte offset after the last valid
271 * @bytes_written: (out) (optional): the number of bytes stored in
272 * the output buffer (not including the terminating nul).
273 * @error: location to store the error occurring, or %NULL to ignore
274 * errors. Any of the errors in #GConvertError may occur.
276 * Converts a string from one character set to another.
278 * Note that you should use g_iconv() for streaming conversions.
279 * Despite the fact that @bytes_read can return information about partial
280 * characters, the g_convert_... functions are not generally suitable
281 * for streaming. If the underlying converter maintains internal state,
282 * then this won't be preserved across successive calls to g_convert(),
283 * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
284 * this is the GNU C converter for CP1255 which does not emit a base
285 * character until it knows that the next character is not a mark that
286 * could combine with the base character.)
288 * Characters which are valid in the input character set, but which have no
289 * representation in the output character set will result in a
290 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error. This is in contrast to the iconv()
291 * specification, which leaves this behaviour implementation defined. Note that
292 * this is the same error code as is returned for an invalid byte sequence in
293 * the input character set. To get defined behaviour for conversion of
294 * unrepresentable characters, use g_convert_with_fallback().
296 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
297 * If the conversion was successful, a newly allocated buffer
298 * containing the converted string, which must be freed with
299 * g_free(). Otherwise %NULL and @error will be set.
302 g_convert_with_iconv (const gchar *str,
306 gsize *bytes_written,
312 gsize inbytes_remaining;
313 gsize outbytes_remaining;
316 gboolean have_error = FALSE;
317 gboolean done = FALSE;
318 gboolean reset = FALSE;
320 g_return_val_if_fail (converter != (GIConv) -1, NULL);
326 inbytes_remaining = len;
327 outbuf_size = len + NUL_TERMINATOR_LENGTH;
329 outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
330 outp = dest = g_malloc (outbuf_size);
332 while (!done && !have_error)
335 err = g_iconv (converter, NULL, &inbytes_remaining, &outp, &outbytes_remaining);
337 err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
339 if (err == (gsize) -1)
344 /* Incomplete text, do not report an error */
349 gsize used = outp - dest;
352 dest = g_realloc (dest, outbuf_size);
355 outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
359 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
360 _("Invalid byte sequence in conversion input"));
367 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
368 _("Error during conversion: %s"),
377 /* @err gives the number of replacement characters used. */
378 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
379 _("Unrepresentable character in conversion input"));
386 /* call g_iconv with NULL inbuf to cleanup shift state */
388 inbytes_remaining = 0;
395 memset (outp, 0, NUL_TERMINATOR_LENGTH);
398 *bytes_read = p - str;
401 if ((p - str) != len)
405 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
406 _("Partial character sequence at end of input"));
413 *bytes_written = outp - dest; /* Doesn't include '\0' */
426 * @str: (array length=len) (element-type guint8):
427 * the string to convert.
428 * @len: the length of the string in bytes, or -1 if the string is
429 * nul-terminated (Note that some encodings may allow nul
430 * bytes to occur inside strings. In that case, using -1
431 * for the @len parameter is unsafe)
432 * @to_codeset: name of character set into which to convert @str
433 * @from_codeset: character set of @str.
434 * @bytes_read: (out) (optional): location to store the number of bytes in
435 * the input string that were successfully converted, or %NULL.
436 * Even if the conversion was successful, this may be
437 * less than @len if there were partial characters
438 * at the end of the input. If the error
439 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
440 * stored will be the byte offset after the last valid
442 * @bytes_written: (out) (optional): the number of bytes stored in
443 * the output buffer (not including the terminating nul).
444 * @error: location to store the error occurring, or %NULL to ignore
445 * errors. Any of the errors in #GConvertError may occur.
447 * Converts a string from one character set to another.
449 * Note that you should use g_iconv() for streaming conversions.
450 * Despite the fact that @bytes_read can return information about partial
451 * characters, the g_convert_... functions are not generally suitable
452 * for streaming. If the underlying converter maintains internal state,
453 * then this won't be preserved across successive calls to g_convert(),
454 * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
455 * this is the GNU C converter for CP1255 which does not emit a base
456 * character until it knows that the next character is not a mark that
457 * could combine with the base character.)
459 * Using extensions such as "//TRANSLIT" may not work (or may not work
460 * well) on many platforms. Consider using g_str_to_ascii() instead.
462 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
463 * If the conversion was successful, a newly allocated buffer
464 * containing the converted string, which must be freed with g_free().
465 * Otherwise %NULL and @error will be set.
468 g_convert (const gchar *str,
470 const gchar *to_codeset,
471 const gchar *from_codeset,
473 gsize *bytes_written,
479 g_return_val_if_fail (str != NULL, NULL);
480 g_return_val_if_fail (to_codeset != NULL, NULL);
481 g_return_val_if_fail (from_codeset != NULL, NULL);
483 cd = open_converter (to_codeset, from_codeset, error);
485 if (cd == (GIConv) -1)
496 res = g_convert_with_iconv (str, len, cd,
497 bytes_read, bytes_written,
500 close_converter (cd);
506 * g_convert_with_fallback:
507 * @str: (array length=len) (element-type guint8):
508 * the string to convert.
509 * @len: the length of the string in bytes, or -1 if the string is
510 * nul-terminated (Note that some encodings may allow nul
511 * bytes to occur inside strings. In that case, using -1
512 * for the @len parameter is unsafe)
513 * @to_codeset: name of character set into which to convert @str
514 * @from_codeset: character set of @str.
515 * @fallback: UTF-8 string to use in place of characters not
516 * present in the target encoding. (The string must be
517 * representable in the target encoding).
518 * If %NULL, characters not in the target encoding will
519 * be represented as Unicode escapes \uxxxx or \Uxxxxyyyy.
520 * @bytes_read: (out) (optional): location to store the number of bytes in
521 * the input string that were successfully converted, or %NULL.
522 * Even if the conversion was successful, this may be
523 * less than @len if there were partial characters
524 * at the end of the input.
525 * @bytes_written: (out) (optional): the number of bytes stored in
526 * the output buffer (not including the terminating nul).
527 * @error: location to store the error occurring, or %NULL to ignore
528 * errors. Any of the errors in #GConvertError may occur.
530 * Converts a string from one character set to another, possibly
531 * including fallback sequences for characters not representable
532 * in the output. Note that it is not guaranteed that the specification
533 * for the fallback sequences in @fallback will be honored. Some
534 * systems may do an approximate conversion from @from_codeset
535 * to @to_codeset in their iconv() functions,
536 * in which case GLib will simply return that approximate conversion.
538 * Note that you should use g_iconv() for streaming conversions.
539 * Despite the fact that @bytes_read can return information about partial
540 * characters, the g_convert_... functions are not generally suitable
541 * for streaming. If the underlying converter maintains internal state,
542 * then this won't be preserved across successive calls to g_convert(),
543 * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
544 * this is the GNU C converter for CP1255 which does not emit a base
545 * character until it knows that the next character is not a mark that
546 * could combine with the base character.)
548 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
549 * If the conversion was successful, a newly allocated buffer
550 * containing the converted string, which must be freed with g_free().
551 * Otherwise %NULL and @error will be set.
554 g_convert_with_fallback (const gchar *str,
556 const gchar *to_codeset,
557 const gchar *from_codeset,
558 const gchar *fallback,
560 gsize *bytes_written,
566 const gchar *insert_str = NULL;
568 gsize inbytes_remaining;
569 const gchar *save_p = NULL;
570 gsize save_inbytes = 0;
571 gsize outbytes_remaining;
575 gboolean have_error = FALSE;
576 gboolean done = FALSE;
578 GError *local_error = NULL;
580 g_return_val_if_fail (str != NULL, NULL);
581 g_return_val_if_fail (to_codeset != NULL, NULL);
582 g_return_val_if_fail (from_codeset != NULL, NULL);
587 /* Try an exact conversion; we only proceed if this fails
588 * due to an illegal sequence in the input string.
590 dest = g_convert (str, len, to_codeset, from_codeset,
591 bytes_read, bytes_written, &local_error);
595 g_assert (dest == NULL);
597 if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
599 g_propagate_error (error, local_error);
603 g_error_free (local_error);
607 /* No go; to proceed, we need a converter from "UTF-8" to
608 * to_codeset, and the string as UTF-8.
610 cd = open_converter (to_codeset, "UTF-8", error);
611 if (cd == (GIConv) -1)
622 utf8 = g_convert (str, len, "UTF-8", from_codeset,
623 bytes_read, &inbytes_remaining, error);
626 close_converter (cd);
632 /* Now the heart of the code. We loop through the UTF-8 string, and
633 * whenever we hit an offending character, we form fallback, convert
634 * the fallback to the target codeset, and then go back to
635 * converting the original string after finishing with the fallback.
637 * The variables save_p and save_inbytes store the input state
638 * for the original string while we are converting the fallback
642 outbuf_size = len + NUL_TERMINATOR_LENGTH;
643 outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
644 outp = dest = g_malloc (outbuf_size);
646 while (!done && !have_error)
648 gsize inbytes_tmp = inbytes_remaining;
649 err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
650 inbytes_remaining = inbytes_tmp;
652 if (err == (gsize) -1)
657 g_assert_not_reached();
661 gsize used = outp - dest;
664 dest = g_realloc (dest, outbuf_size);
667 outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
674 /* Error converting fallback string - fatal
676 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
677 _("Cannot convert fallback “%s” to codeset “%s”"),
678 insert_str, to_codeset);
686 gunichar ch = g_utf8_get_char (p);
687 insert_str = g_strdup_printf (ch < 0x10000 ? "\\u%04x" : "\\U%08x",
691 insert_str = fallback;
693 save_p = g_utf8_next_char (p);
694 save_inbytes = inbytes_remaining - (save_p - p);
696 inbytes_remaining = strlen (p);
705 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
706 _("Error during conversion: %s"),
719 g_free ((gchar *)insert_str);
721 inbytes_remaining = save_inbytes;
726 /* call g_iconv with NULL inbuf to cleanup shift state */
728 inbytes_remaining = 0;
737 memset (outp, 0, NUL_TERMINATOR_LENGTH);
739 close_converter (cd);
742 *bytes_written = outp - dest; /* Doesn't include '\0' */
748 if (save_p && !fallback)
749 g_free ((gchar *)insert_str);
764 * Validate @string as UTF-8. @len can be negative if @string is
765 * nul-terminated, or a non-negative value in bytes. If @string ends in an
766 * incomplete sequence, or contains any illegal sequences or nul codepoints,
767 * %NULL will be returned and the error set to
768 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
769 * On success, @bytes_read and @bytes_written, if provided, will be set to
770 * the number of bytes in @string up to @len or the terminating nul byte.
771 * On error, @bytes_read will be set to the byte offset after the last valid
772 * and non-nul UTF-8 sequence in @string, and @bytes_written will be set to 0.
775 strdup_len (const gchar *string,
778 gsize *bytes_written,
782 const gchar *end_valid;
784 if (!g_utf8_validate (string, len, &end_valid))
787 *bytes_read = end_valid - string;
791 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
792 _("Invalid byte sequence in conversion input"));
796 real_len = end_valid - string;
799 *bytes_read = real_len;
801 *bytes_written = real_len;
803 return g_strndup (string, real_len);
808 CONVERT_CHECK_NO_NULS_IN_INPUT = 1 << 0,
809 CONVERT_CHECK_NO_NULS_IN_OUTPUT = 1 << 1
813 * Convert from @string in the encoding identified by @from_codeset,
814 * returning a string in the encoding identified by @to_codeset.
815 * @len can be negative if @string is nul-terminated, or a non-negative
816 * value in bytes. Flags defined in #ConvertCheckFlags can be set in @flags
817 * to check the input, the output, or both, for embedded nul bytes.
818 * On success, @bytes_read, if provided, will be set to the number of bytes
819 * in @string up to @len or the terminating nul byte, and @bytes_written, if
820 * provided, will be set to the number of output bytes written into the
821 * returned buffer, excluding the terminating nul sequence.
822 * On error, @bytes_read will be set to the byte offset after the last valid
823 * sequence in @string, and @bytes_written will be set to 0.
826 convert_checked (const gchar *string,
828 const gchar *to_codeset,
829 const gchar *from_codeset,
830 ConvertCheckFlags flags,
832 gsize *bytes_written,
838 if ((flags & CONVERT_CHECK_NO_NULS_IN_INPUT) && len > 0)
840 const gchar *early_nul = memchr (string, '\0', len);
841 if (early_nul != NULL)
844 *bytes_read = early_nul - string;
848 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
849 _("Embedded NUL byte in conversion input"));
854 out = g_convert (string, len, to_codeset, from_codeset,
855 bytes_read, &outbytes, error);
863 if ((flags & CONVERT_CHECK_NO_NULS_IN_OUTPUT)
864 && memchr (out, '\0', outbytes) != NULL)
869 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_EMBEDDED_NUL,
870 _("Embedded NUL byte in conversion output"));
875 *bytes_written = outbytes;
881 * @opsysstring: (array length=len) (element-type guint8): a string in the
882 * encoding of the current locale. On Windows
883 * this means the system codepage.
884 * @len: the length of the string, or -1 if the string is
885 * nul-terminated (Note that some encodings may allow nul
886 * bytes to occur inside strings. In that case, using -1
887 * for the @len parameter is unsafe)
888 * @bytes_read: (out) (optional): location to store the number of bytes in the
889 * input string that were successfully converted, or %NULL.
890 * Even if the conversion was successful, this may be
891 * less than @len if there were partial characters
892 * at the end of the input. If the error
893 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
894 * stored will be the byte offset after the last valid
896 * @bytes_written: (out) (optional): the number of bytes stored in the output
897 * buffer (not including the terminating nul).
898 * @error: location to store the error occurring, or %NULL to ignore
899 * errors. Any of the errors in #GConvertError may occur.
901 * Converts a string which is in the encoding used for strings by
902 * the C runtime (usually the same as that used by the operating
903 * system) in the [current locale][setlocale] into a UTF-8 string.
905 * If the source encoding is not UTF-8 and the conversion output contains a
906 * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
907 * function returns %NULL.
908 * If the source encoding is UTF-8, an embedded nul character is treated with
909 * the %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error for backward compatibility with
910 * earlier versions of this library. Use g_convert() to produce output that
911 * may contain embedded nul characters.
913 * Returns: (type utf8): The converted string, or %NULL on an error.
916 g_locale_to_utf8 (const gchar *opsysstring,
919 gsize *bytes_written,
924 if (g_get_charset (&charset))
925 return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
927 return convert_checked (opsysstring, len, "UTF-8", charset,
928 CONVERT_CHECK_NO_NULS_IN_OUTPUT,
929 bytes_read, bytes_written, error);
933 * Does exactly the same as g_locale_to_utf8() except that the charset is
934 * retrieved from _g_get_time_charset() (which uses LC_TIME)
936 * Returns: The converted string, or %NULL on an error.
939 _g_time_locale_to_utf8 (const gchar *opsysstring,
942 gsize *bytes_written,
947 if (_g_get_time_charset (&charset))
948 return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
950 return convert_checked (opsysstring, len, "UTF-8", charset,
951 CONVERT_CHECK_NO_NULS_IN_OUTPUT,
952 bytes_read, bytes_written, error);
956 * Does exactly the same as g_locale_to_utf8() except that the charset is
957 * retrieved from _g_get_ctype_charset() (which uses LC_CTYPE)
959 * Returns: The converted string, or %NULL on an error.
962 _g_ctype_locale_to_utf8 (const gchar *opsysstring,
965 gsize *bytes_written,
970 if (_g_get_ctype_charset (&charset))
971 return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
973 return convert_checked (opsysstring, len, "UTF-8", charset,
974 CONVERT_CHECK_NO_NULS_IN_OUTPUT,
975 bytes_read, bytes_written, error);
979 * g_locale_from_utf8:
980 * @utf8string: a UTF-8 encoded string
981 * @len: the length of the string, or -1 if the string is
983 * @bytes_read: (out) (optional): location to store the number of bytes in the
984 * input string that were successfully converted, or %NULL.
985 * Even if the conversion was successful, this may be
986 * less than @len if there were partial characters
987 * at the end of the input. If the error
988 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
989 * stored will be the byte offset after the last valid
991 * @bytes_written: (out) (optional): the number of bytes stored in the output
992 * buffer (not including the terminating nul).
993 * @error: location to store the error occurring, or %NULL to ignore
994 * errors. Any of the errors in #GConvertError may occur.
996 * Converts a string from UTF-8 to the encoding used for strings by
997 * the C runtime (usually the same as that used by the operating
998 * system) in the [current locale][setlocale]. On Windows this means
999 * the system codepage.
1001 * The input string shall not contain nul characters even if the @len
1002 * argument is positive. A nul character found inside the string will result
1003 * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Use g_convert() to convert
1004 * input that may contain embedded nul characters.
1006 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
1007 * A newly-allocated buffer containing the converted string,
1008 * or %NULL on an error, and error will be set.
1011 g_locale_from_utf8 (const gchar *utf8string,
1014 gsize *bytes_written,
1017 const gchar *charset;
1019 if (g_get_charset (&charset))
1020 return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1022 return convert_checked (utf8string, len, charset, "UTF-8",
1023 CONVERT_CHECK_NO_NULS_IN_INPUT,
1024 bytes_read, bytes_written, error);
1027 #ifndef G_PLATFORM_WIN32
1029 typedef struct _GFilenameCharsetCache GFilenameCharsetCache;
1031 struct _GFilenameCharsetCache {
1034 gchar **filename_charsets;
1038 filename_charset_cache_free (gpointer data)
1040 GFilenameCharsetCache *cache = data;
1041 g_free (cache->charset);
1042 g_strfreev (cache->filename_charsets);
1047 * g_get_filename_charsets:
1048 * @filename_charsets: (out) (transfer none) (array zero-terminated=1):
1049 * return location for the %NULL-terminated list of encoding names
1051 * Determines the preferred character sets used for filenames.
1052 * The first character set from @filename_charsets is the filename encoding, the
1053 * subsequent character sets are used when trying to generate a displayable
1054 * representation of a filename, see g_filename_display_name().
1056 * On Unix, the character sets are determined by consulting the
1057 * environment variables `G_FILENAME_ENCODING` and `G_BROKEN_FILENAMES`.
1058 * On Windows, the character set used in the GLib API is always UTF-8
1059 * and said environment variables have no effect.
1061 * `G_FILENAME_ENCODING` may be set to a comma-separated list of
1062 * character set names. The special token "\@locale" is taken
1063 * to mean the character set for the [current locale][setlocale].
1064 * If `G_FILENAME_ENCODING` is not set, but `G_BROKEN_FILENAMES` is,
1065 * the character set of the current locale is taken as the filename
1066 * encoding. If neither environment variable is set, UTF-8 is taken
1067 * as the filename encoding, but the character set of the current locale
1068 * is also put in the list of encodings.
1070 * The returned @filename_charsets belong to GLib and must not be freed.
1072 * Note that on Unix, regardless of the locale character set or
1073 * `G_FILENAME_ENCODING` value, the actual file names present
1074 * on a system might be in any random encoding or just gibberish.
1076 * Returns: %TRUE if the filename encoding is UTF-8.
1081 g_get_filename_charsets (const gchar ***filename_charsets)
1083 static GPrivate cache_private = G_PRIVATE_INIT (filename_charset_cache_free);
1084 GFilenameCharsetCache *cache = g_private_get (&cache_private);
1085 const gchar *charset;
1088 cache = g_private_set_alloc0 (&cache_private, sizeof (GFilenameCharsetCache));
1090 g_get_charset (&charset);
1092 if (!(cache->charset && strcmp (cache->charset, charset) == 0))
1094 const gchar *new_charset;
1098 g_free (cache->charset);
1099 g_strfreev (cache->filename_charsets);
1100 cache->charset = g_strdup (charset);
1102 p = g_getenv ("G_FILENAME_ENCODING");
1103 if (p != NULL && p[0] != '\0')
1105 cache->filename_charsets = g_strsplit (p, ",", 0);
1106 cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0);
1108 for (i = 0; cache->filename_charsets[i]; i++)
1110 if (strcmp ("@locale", cache->filename_charsets[i]) == 0)
1112 g_get_charset (&new_charset);
1113 g_free (cache->filename_charsets[i]);
1114 cache->filename_charsets[i] = g_strdup (new_charset);
1118 else if (g_getenv ("G_BROKEN_FILENAMES") != NULL)
1120 cache->filename_charsets = g_new0 (gchar *, 2);
1121 cache->is_utf8 = g_get_charset (&new_charset);
1122 cache->filename_charsets[0] = g_strdup (new_charset);
1126 cache->filename_charsets = g_new0 (gchar *, 3);
1127 cache->is_utf8 = TRUE;
1128 cache->filename_charsets[0] = g_strdup ("UTF-8");
1129 if (!g_get_charset (&new_charset))
1130 cache->filename_charsets[1] = g_strdup (new_charset);
1134 if (filename_charsets)
1135 *filename_charsets = (const gchar **)cache->filename_charsets;
1137 return cache->is_utf8;
1140 #else /* G_PLATFORM_WIN32 */
1143 g_get_filename_charsets (const gchar ***filename_charsets)
1145 static const gchar *charsets[] = {
1151 /* On Windows GLib pretends that the filename charset is UTF-8 */
1152 if (filename_charsets)
1153 *filename_charsets = charsets;
1159 /* Cygwin works like before */
1160 result = g_get_charset (&(charsets[0]));
1162 if (filename_charsets)
1163 *filename_charsets = charsets;
1169 #endif /* G_PLATFORM_WIN32 */
1172 get_filename_charset (const gchar **filename_charset)
1174 const gchar **charsets;
1177 is_utf8 = g_get_filename_charsets (&charsets);
1179 if (filename_charset)
1180 *filename_charset = charsets[0];
1186 * g_filename_to_utf8:
1187 * @opsysstring: (type filename): a string in the encoding for filenames
1188 * @len: the length of the string, or -1 if the string is
1189 * nul-terminated (Note that some encodings may allow nul
1190 * bytes to occur inside strings. In that case, using -1
1191 * for the @len parameter is unsafe)
1192 * @bytes_read: (out) (optional): location to store the number of bytes in the
1193 * input string that were successfully converted, or %NULL.
1194 * Even if the conversion was successful, this may be
1195 * less than @len if there were partial characters
1196 * at the end of the input. If the error
1197 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1198 * stored will be the byte offset after the last valid
1200 * @bytes_written: (out) (optional): the number of bytes stored in the output
1201 * buffer (not including the terminating nul).
1202 * @error: location to store the error occurring, or %NULL to ignore
1203 * errors. Any of the errors in #GConvertError may occur.
1205 * Converts a string which is in the encoding used by GLib for
1206 * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8
1207 * for filenames; on other platforms, this function indirectly depends on
1208 * the [current locale][setlocale].
1210 * The input string shall not contain nul characters even if the @len
1211 * argument is positive. A nul character found inside the string will result
1212 * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
1213 * If the source encoding is not UTF-8 and the conversion output contains a
1214 * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
1215 * function returns %NULL. Use g_convert() to produce output that
1216 * may contain embedded nul characters.
1218 * Returns: (type utf8): The converted string, or %NULL on an error.
1221 g_filename_to_utf8 (const gchar *opsysstring,
1224 gsize *bytes_written,
1227 const gchar *charset;
/* A NULL input is rejected up front and yields a NULL result. */
1229 g_return_val_if_fail (opsysstring != NULL, NULL);
/* When get_filename_charset() reports true, no charset conversion is
 * attempted: the input is handed to strdup_len() instead. */
1231 if (get_filename_charset (&charset))
1232 return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
/* Otherwise convert from the filename charset to UTF-8, with the nul checks
 * enabled for both the input and the converted output. */
1234 return convert_checked (opsysstring, len, "UTF-8", charset,
1235 CONVERT_CHECK_NO_NULS_IN_INPUT |
1236 CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1237 bytes_read, bytes_written, error);
1241 * g_filename_from_utf8:
1242 * @utf8string: (type utf8): a UTF-8 encoded string.
1243 * @len: the length of the string, or -1 if the string is
1245 * @bytes_read: (out) (optional): location to store the number of bytes in
1246 * the input string that were successfully converted, or %NULL.
1247 * Even if the conversion was successful, this may be
1248 * less than @len if there were partial characters
1249 * at the end of the input. If the error
1250 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1251 * stored will be the byte offset after the last valid
1253 * @bytes_written: (out) (optional): the number of bytes stored in
1254 * the output buffer (not including the terminating nul).
1255 * @error: location to store the error occurring, or %NULL to ignore
1256 * errors. Any of the errors in #GConvertError may occur.
1258 * Converts a string from UTF-8 to the encoding GLib uses for
1259 * filenames. Note that on Windows GLib uses UTF-8 for filenames;
1260 * on other platforms, this function indirectly depends on the
1261 * [current locale][setlocale].
1263 * The input string shall not contain nul characters even if the @len
1264 * argument is positive. A nul character found inside the string will result
1265 * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. If the filename encoding is
1266 * not UTF-8 and the conversion output contains a nul character, the error
1267 * %G_CONVERT_ERROR_EMBEDDED_NUL is set and the function returns %NULL.
1269 * Returns: (type filename):
1270 * The converted string, or %NULL on an error.
1273 g_filename_from_utf8 (const gchar *utf8string,
1276 gsize *bytes_written,
1279 const gchar *charset;
/* NOTE(review): unlike g_filename_to_utf8(), no NULL check on @utf8string is
 * visible in this excerpt -- confirm whether that is intentional. */
/* When get_filename_charset() reports true, no charset conversion is
 * attempted: the input is handed to strdup_len() instead. */
1281 if (get_filename_charset (&charset))
1282 return strdup_len (utf8string, len, bytes_read, bytes_written, error);
/* Otherwise convert from UTF-8 to the filename charset (note the argument
 * order: target charset first, then the source), with the nul checks enabled
 * for both the input and the converted output. */
1284 return convert_checked (utf8string, len, charset, "UTF-8",
1285 CONVERT_CHECK_NO_NULS_IN_INPUT |
1286 CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1287 bytes_read, bytes_written, error);
1290 /* Test whether haystack starts with the needle prefix, comparing
1291 * case-insensitively. haystack may be UTF-8, but needle must
1292 * contain only ASCII. */
1294 has_case_prefix (const gchar *haystack, const gchar *needle)
1298 /* Eat one character at a time. */
/* Case folding is ASCII-only: both sides go through g_ascii_tolower(). */
1303 g_ascii_tolower (*n) == g_ascii_tolower (*h))
/* Escaping policies understood by g_escape_uri_string(). Each value is a
 * distinct bit that is tested against the per-character bitmasks stored in
 * the acceptable[] table; a character stays unescaped under a policy when the
 * corresponding bit is set in its table entry. */
1313 UNSAFE_ALL = 0x1, /* Escape all unsafe characters */
1314 UNSAFE_ALLOW_PLUS = 0x2, /* Allows '+' */
1315 UNSAFE_PATH = 0x8, /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
1316 UNSAFE_HOST = 0x10, /* Allows '/' and ':' and '@' */
1317 UNSAFE_SLASHES = 0x20 /* Allows all characters except for '/' and '%' */
1318 } UnsafeCharacterSet;
/* Per-character escaping table, indexed by (character - 32). Each entry is a
 * bitmask of UnsafeCharacterSet values under which the character may be left
 * unescaped; 0x00 means the character is escaped under every policy (see
 * space and the percent sign), 0x3F means it is never escaped. Some entries
 * also carry bit 0x04, for which no enum constant is visible in this
 * excerpt. */
1320 static const guchar acceptable[96] = {
1321 /* A table of the ASCII chars from space (32) to DEL (127) */
1322 /* ! " # $ % & ' ( ) * + , - . / */
1323 0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C,
1324 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
1325 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20,
1326 /* @ A B C D E F G H I J K L M N O */
1327 0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
1328 /* P Q R S T U V W X Y Z [ \ ] ^ _ */
1329 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F,
1330 /* ` a b c d e f g h i j k l m n o */
1331 0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
1332 /* p q r s t u v w x y z { | } ~ DEL */
1333 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20
/* Uppercase hexadecimal digits, indexed by nibble value (0-15); presumably
 * used to emit the two digits that follow the percent sign when escaping. */
1336 static const gchar hex[] = "0123456789ABCDEF";
1338 /* Note: This escape function works on file: URIs, but if you want to
1339 * escape something else, please read RFC-2396 */
/* Returns a newly allocated copy of @string in which every byte that is not
 * acceptable under @mask is replaced by a percent escape. @mask must be
 * exactly one of the UnsafeCharacterSet values; anything else fails the
 * precondition check and yields NULL. */
1341 g_escape_uri_string (const gchar *string,
1342 UnsafeCharacterSet mask)
/* A byte is acceptable only if it lies in the table range 32..127 and its
 * table entry has the bit for the active policy (use_mask) set. */
1344 #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))
1351 UnsafeCharacterSet use_mask;
1353 g_return_val_if_fail (mask == UNSAFE_ALL
1354 || mask == UNSAFE_ALLOW_PLUS
1355 || mask == UNSAFE_PATH
1356 || mask == UNSAFE_HOST
1357 || mask == UNSAFE_SLASHES, NULL);
/* First pass: count the bytes that need escaping so the output can be
 * allocated in one go. */
1361 for (p = string; *p != '\0'; p++)
1364 if (!ACCEPTABLE (c))
/* p now sits on the terminator, so p - string is the input length. Every
 * escaped byte grows from one byte to three, hence two extra bytes each,
 * plus one byte for the terminating nul. */
1368 result = g_malloc (p - string + unacceptable * 2 + 1);
/* Second pass: copy acceptable bytes through and expand the others. */
1371 for (q = result, p = string; *p != '\0'; p++)
1375 if (!ACCEPTABLE (c))
1377 *q++ = '%'; /* means hex coming */
/* Assembles a file URI string from @hostname and @pathname. A non-empty
 * hostname is escaped with the UNSAFE_HOST policy and the path with
 * UNSAFE_PATH; a slash is inserted between them when the escaped path does
 * not already begin with one. The escaped temporaries are freed before
 * returning. The backslash rewriting below works on a g_strdup() copy of
 * @pathname that is freed afterwards; it looks platform-specific (TODO
 * confirm -- any preprocessor guard is not visible in this excerpt). */
1392 g_escape_file_uri (const gchar *hostname,
1393 const gchar *pathname)
1395 char *escaped_hostname = NULL;
1400 char *p, *backslash;
1402 /* Turn backslashes into forward slashes. That's what Netscape
1403 * does, and they are actually more or less equivalent in Windows.
1406 pathname = g_strdup (pathname);
1407 p = (char *) pathname;
1409 while ((backslash = strchr (p, '\\')) != NULL)
1416 if (hostname && *hostname != '\0')
1418 escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST);
1421 escaped_path = g_escape_uri_string (pathname, UNSAFE_PATH);
1423 res = g_strconcat ("file://",
1424 (escaped_hostname) ? escaped_hostname : "",
1425 (*escaped_path != '/') ? "/" : "",
1430 g_free ((char *) pathname);
1433 g_free (escaped_hostname);
1434 g_free (escaped_path);
/* Decodes the two hexadecimal digits at scanner[0] and scanner[1] into a
 * single value, high nibble first. Each digit is checked for a negative
 * result from g_ascii_xdigit_value(), i.e. a character that is not a hex
 * digit (the value returned in that case is not visible in this excerpt). */
1440 unescape_character (const char *scanner)
1445 first_digit = g_ascii_xdigit_value (scanner[0]);
1446 if (first_digit < 0)
1449 second_digit = g_ascii_xdigit_value (scanner[1]);
1450 if (second_digit < 0)
1453 return (first_digit << 4) | second_digit;
/* Decodes percent escapes in @escaped into a newly allocated string.
 * @illegal_escaped_characters lists characters that must not appear in
 * escaped form, and @ascii_must_not_be_escaped additionally rejects any
 * escape that decodes to a value of 0x7F or below. The length parameter is
 * not visible in this excerpt; presumably a negative value selects the
 * strlen() path below (TODO confirm). */
1457 g_unescape_uri_string (const char *escaped,
1459 const char *illegal_escaped_characters,
1460 gboolean ascii_must_not_be_escaped)
1462 const gchar *in, *in_end;
1463 gchar *out, *result;
1466 if (escaped == NULL)
1470 len = strlen (escaped);
/* Decoding never lengthens the text (a three-byte escape collapses into one
 * byte), so the input length plus a terminator is enough; the g_assert()
 * near the end checks exactly that bound. */
1472 result = g_malloc (len + 1);
1475 for (in = escaped, in_end = escaped + len; in < in_end; in++)
1481 /* catch partial escape sequences past the end of the substring */
/* A complete escape occupies three bytes: the percent sign plus the two hex
 * digits that unescape_character() reads starting at in + 1. */
1482 if (in + 3 > in_end)
1485 c = unescape_character (in + 1);
1487 /* catch bad escape sequences and NUL characters */
1491 /* catch escaped ASCII */
1492 if (ascii_must_not_be_escaped && c <= 0x7F)
1495 /* catch other illegal escaped characters */
1496 if (strchr (illegal_escaped_characters, c) != NULL)
1505 g_assert (out - result <= len);
/* True when @c is an ASCII letter or digit. The range check comes first so
 * that code points above 0x7F never reach g_ascii_isalnum(). */
1518 is_asciialphanum (gunichar c)
1520 return c <= 0x7F && g_ascii_isalnum (c);
/* True when @c is an ASCII letter. The range check comes first so that code
 * points above 0x7F never reach g_ascii_isalpha(). */
1524 is_asciialpha (gunichar c)
1526 return c <= 0x7F && g_ascii_isalpha (c);
/* Validates @hostname label by label, decoding it as UTF-8 but accepting
 * only ASCII letters, digits and hyphens inside labels. */
1529 /* allows an empty string */
1531 hostname_validate (const char *hostname)
1534 gunichar c, first_char, last_char;
1541 /* read in a label */
1542 c = g_utf8_get_char (p);
1543 p = g_utf8_next_char (p);
/* A label has to start with an ASCII letter or digit. */
1544 if (!is_asciialphanum (c))
1550 c = g_utf8_get_char (p);
1551 p = g_utf8_next_char (p);
/* Keep consuming while the label continues with letters, digits or hyphens. */
1553 while (is_asciialphanum (c) || c == '-');
/* A label must not end with a hyphen. */
1554 if (last_char == '-')
1557 /* if that was the last label, check that it was a toplabel */
/* The input ends either right here or after a single trailing dot; the final
 * label is accepted only if it began with an ASCII letter. */
1558 if (c == '\0' || (c == '.' && *p == '\0'))
1559 return is_asciialpha (first_char);
1566 * g_filename_from_uri:
1567 * @uri: a uri describing a filename (escaped, encoded in ASCII).
1568 * @hostname: (out) (optional) (nullable): Location to store hostname for the URI.
1569 * If there is no hostname in the URI, %NULL will be
1570 * stored in this location.
1571 * @error: location to store the error occurring, or %NULL to ignore
1572 * errors. Any of the errors in #GConvertError may occur.
1574 * Converts an escaped ASCII-encoded URI to a local filename in the
1575 * encoding used for filenames.
1577 * Since GLib 2.78, the query string and fragment can be present in the URI,
1578 * but are not part of the resulting filename.
1579 * We take inspiration from https://url.spec.whatwg.org/#file-state,
1580 * but we don't support the entire standard.
1582 * Returns: (type filename): a newly-allocated string holding
1583 * the resulting filename, or %NULL on an error.
1586 g_filename_from_uri (const gchar *uri,
1590 const char *past_scheme;
1591 const char *host_part;
1592 char *unescaped_hostname;
/* Only URIs that start with the file scheme followed by a slash are accepted;
 * the scheme comparison ignores ASCII case. */
1605 if (!has_case_prefix (uri, "file:/"))
1607 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1608 _("The URI “%s” is not an absolute URI using the “file” scheme"),
/* Work on a private copy of the URI from here on. */
1613 temp_uri = g_strdup (uri);
1615 past_scheme = temp_uri + strlen ("file:");
/* Locate the query string and the fragment; what is done with them is not
 * visible in this excerpt, but per the function documentation they are not
 * part of the resulting filename. */
1617 past_path = strchr (past_scheme, '?');
1618 if (past_path != NULL)
1621 past_path = strchr (past_scheme, '#');
1622 if (past_path != NULL)
/* Three slashes versus two: the two-slash form carries a host part, which is
 * handled in the branch below. */
1625 if (has_case_prefix (past_scheme, "///"))
1627 else if (has_case_prefix (past_scheme, "//"))
1630 host_part = past_scheme;
/* The host part runs up to the next slash; a URI without one is invalid. */
1632 past_scheme = strchr (past_scheme, '/');
1634 if (past_scheme == NULL)
1637 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1638 _("The URI “%s” is invalid"),
/* In the host part no character may be listed as illegal (empty string), but
 * escapes that decode to ASCII are rejected (TRUE). */
1643 unescaped_hostname = g_unescape_uri_string (host_part, past_scheme - host_part, "", TRUE);
1645 if (unescaped_hostname == NULL ||
1646 !hostname_validate (unescaped_hostname))
1648 g_free (unescaped_hostname);
1650 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1651 _("The hostname of the URI “%s” is invalid"),
/* The decoded hostname is either handed to the caller or freed. */
1657 *hostname = unescaped_hostname;
1659 g_free (unescaped_hostname);
/* In the path an escaped slash is illegal, while escaped ASCII is allowed. */
1662 filename = g_unescape_uri_string (past_scheme, -1, "/", FALSE);
1664 if (filename == NULL)
1667 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1668 _("The URI “%s” contains invalidly escaped characters"),
1675 /* Drop localhost */
1676 if (hostname && *hostname != NULL &&
1677 g_ascii_strcasecmp (*hostname, "localhost") == 0)
1683 /* Turn slashes into backslashes, because that's the canonical spelling */
1685 while ((slash = strchr (p, '/')) != NULL)
1691 /* Windows URIs with a drive letter can be like "file://host/c:/foo"
1692 * or "file://host/c|/foo" (some Netscape versions). In those cases, start
1693 * the filename from the drive letter.
1695 if (g_ascii_isalpha (filename[1]))
1697 if (filename[2] == ':')
1699 else if (filename[2] == '|')
1707 result = g_strdup (filename + offs);
1716 * g_filename_to_uri:
1717 * @filename: (type filename): an absolute filename specified in the GLib file
1718 * name encoding, which is the on-disk file name bytes on Unix, and UTF-8
1720 * @hostname: (nullable): A UTF-8 encoded hostname, or %NULL for none.
1721 * @error: location to store the error occurring, or %NULL to ignore
1722 * errors. Any of the errors in #GConvertError may occur.
1724 * Converts an absolute filename to an escaped ASCII-encoded URI, with the path
1725 * component following Section 3.3. of RFC 2396.
1727 * Returns: a newly-allocated string holding the resulting
1728 * URI, or %NULL on an error.
1731 g_filename_to_uri (const gchar *filename,
1732 const gchar *hostname,
/* A NULL filename is rejected up front and yields a NULL result. */
1737 g_return_val_if_fail (filename != NULL, NULL);
/* Relative paths cannot be expressed as a file URI here. */
1739 if (!g_path_is_absolute (filename))
1741 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
1742 _("The pathname “%s” is not an absolute path"),
/* The hostname has to be valid UTF-8 and pass hostname_validate(); the first
 * half of this condition is not visible in this excerpt. */
1748 !(g_utf8_validate (hostname, -1, NULL)
1749 && hostname_validate (hostname)))
1751 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1752 _("Invalid hostname"));
1757 /* Don't use localhost unnecessarily */
1758 if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0)
/* Escaping and assembly of the final URI is delegated to the helper. */
1762 escaped_uri = g_escape_file_uri (hostname, filename);
1768 * g_uri_list_extract_uris:
1769 * @uri_list: a URI list
1771 * Splits a URI list conforming to the text/uri-list
1772 * mime type defined in RFC 2483 into individual URIs,
1773 * discarding any comments. The URIs are not validated.
1775 * Returns: (transfer full): a newly allocated %NULL-terminated list
1776 * of strings holding the individual URIs. The array should be freed
1777 * with g_strfreev().
/* Walks @uri_list line by line and collects the entries in a GPtrArray.
 * For each line: leading whitespace is skipped, the end of the line is found
 * at the first LF or CR, trailing whitespace is trimmed, and the remaining
 * span is copied with g_strndup(). The copy length is q - p + 1, so q appears
 * to point at the last character kept rather than one past it. The scan then
 * continues from the next LF found by strchr(). A NULL entry is appended at
 * the end and the array storage is returned as the string vector. */
1782 g_uri_list_extract_uris (const gchar *uri_list)
1787 uris = g_ptr_array_new ();
1791 /* We don't actually try to validate the URI according to RFC
1792 * 2396, or even check for allowed characters - we just ignore
1793 * comments and trim whitespace off the ends. We also
1794 * allow LF delimination as well as the specified CRLF.
1796 * We do allow comments like specified in RFC 2483.
1802 while (g_ascii_isspace (*p))
1806 while (*q && (*q != '\n') && (*q != '\r'))
1812 while (q > p && g_ascii_isspace (*q))
1816 g_ptr_array_add (uris, g_strndup (p, q - p + 1));
1819 p = strchr (p, '\n');
1824 g_ptr_array_add (uris, NULL);
1826 return (gchar **) g_ptr_array_free (uris, FALSE);
1830 * g_filename_display_basename:
1831 * @filename: (type filename): an absolute pathname in the
1832 * GLib file name encoding
1834 * Returns the display basename for the particular filename, guaranteed
1835 * to be valid UTF-8. The display name might not be identical to the filename,
1836 * for instance there might be problems converting it to UTF-8, and some files
1837 * can be translated in the display.
1839 * If GLib cannot make sense of the encoding of @filename, as a last resort it
1840 * replaces unknown characters with U+FFFD, the Unicode replacement character.
1841 * You can search the result for the UTF-8 encoding of this character (which is
1842 * "\357\277\275" in octal notation) to find out if @filename was in an invalid
1845 * You must pass the whole absolute pathname to this function so that
1846 * translation of well known locations can be done.
1848 * This function is preferred over g_filename_display_name() if you know the
1849 * whole path, as it allows translation.
1851 * Returns: a newly allocated string containing
1852 * a rendition of the basename of the filename in valid UTF-8
1857 g_filename_display_basename (const gchar *filename)
/* A NULL filename is rejected up front and yields a NULL result. */
1862 g_return_val_if_fail (filename != NULL, NULL);
/* Reduce the path to its last component, then let g_filename_display_name()
 * turn that component into displayable UTF-8. */
1864 basename = g_path_get_basename (filename);
1865 display_name = g_filename_display_name (basename);
1867 return display_name;
1871 * g_filename_display_name:
1872 * @filename: (type filename): a pathname hopefully in the
1873 * GLib file name encoding
1875 * Converts a filename into a valid UTF-8 string. The conversion is
1876 * not necessarily reversible, so you should keep the original around
1877 * and use the return value of this function only for display purposes.
1878 * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL
1879 * even if the filename actually isn't in the GLib file name encoding.
1881 * If GLib cannot make sense of the encoding of @filename, as a last resort it
1882 * replaces unknown characters with U+FFFD, the Unicode replacement character.
1883 * You can search the result for the UTF-8 encoding of this character (which is
1884 * "\357\277\275" in octal notation) to find out if @filename was in an invalid
1887 * If you know the whole pathname of the file you should use
1888 * g_filename_display_basename(), since that allows location-based
1889 * translation of filenames.
1891 * Returns: a newly allocated string containing
1892 * a rendition of the filename in valid UTF-8
/* Produces a displayable UTF-8 rendition of @filename in up to three steps:
 * use the name as is when it already validates as UTF-8, otherwise try
 * g_convert() from each filename charset in turn, and finally fall back to
 * g_utf8_make_valid().
 * NOTE(review): the inline comment near the end of this function speaks of a
 * question mark, whereas the visible fallback call is g_utf8_make_valid() and
 * the function documentation describes substitution with U+FFFD -- the inline
 * wording looks out of date. */
1897 g_filename_display_name (const gchar *filename)
1900 const gchar **charsets;
1901 gchar *display_name = NULL;
1904 is_utf8 = g_get_filename_charsets (&charsets);
/* Already valid UTF-8: a plain copy is enough. The condition guarding this
 * check is not visible in this excerpt. */
1908 if (g_utf8_validate (filename, -1, NULL))
1909 display_name = g_strdup (filename);
1914 /* Try to convert from the filename charsets to UTF-8.
1915 * Skip the first charset if it is UTF-8.
1917 for (i = is_utf8 ? 1 : 0; charsets[i]; i++)
1919 display_name = g_convert (filename, -1, "UTF-8", charsets[i],
1927 /* if all conversions failed, we replace invalid UTF-8
1928 * by a question mark
1931 display_name = g_utf8_make_valid (filename, -1);
1933 return display_name;
1938 /* Binary compatibility versions. Not for newly compiled code. */
/* Prototypes for the exported variants carrying a _utf8 suffix. Their
 * definitions follow below and simply forward to the unsuffixed functions. */
1940 _GLIB_EXTERN gchar *g_filename_to_utf8_utf8 (const gchar *opsysstring,
1943 gsize *bytes_written,
1944 GError **error) G_GNUC_MALLOC;
1945 _GLIB_EXTERN gchar *g_filename_from_utf8_utf8 (const gchar *utf8string,
1948 gsize *bytes_written,
1949 GError **error) G_GNUC_MALLOC;
1950 _GLIB_EXTERN gchar *g_filename_from_uri_utf8 (const gchar *uri,
1952 GError **error) G_GNUC_MALLOC;
1953 _GLIB_EXTERN gchar *g_filename_to_uri_utf8 (const gchar *filename,
1954 const gchar *hostname,
1955 GError **error) G_GNUC_MALLOC;
/* Compatibility entry point: forwards all arguments unchanged to
 * g_filename_to_utf8(). */
1958 g_filename_to_utf8_utf8 (const gchar *opsysstring,
1961 gsize *bytes_written,
1964 return g_filename_to_utf8 (opsysstring, len, bytes_read, bytes_written, error);
/* Compatibility entry point: forwards all arguments unchanged to
 * g_filename_from_utf8(). */
1968 g_filename_from_utf8_utf8 (const gchar *utf8string,
1971 gsize *bytes_written,
1974 return g_filename_from_utf8 (utf8string, len, bytes_read, bytes_written, error);
/* Compatibility entry point: forwards all arguments unchanged to
 * g_filename_from_uri(). */
1978 g_filename_from_uri_utf8 (const gchar *uri,
1982 return g_filename_from_uri (uri, hostname, error);
/* Compatibility entry point: forwards all arguments unchanged to
 * g_filename_to_uri(). */
1986 g_filename_to_uri_utf8 (const gchar *filename,
1987 const gchar *hostname,
1990 return g_filename_to_uri (filename, hostname, error);