1 /* GLIB - Library of useful routines for C programming
3 * gconvert.c: Convert between character sets using iconv
4 * Copyright Red Hat Inc., 2000
5 * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the
19 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 * Boston, MA 02111-1307, USA.
32 #include "gprintfint.h"
33 #include "gthreadprivate.h"
36 #ifdef G_PLATFORM_WIN32
44 #if defined(USE_LIBICONV_GNU) && !defined (_LIBICONV_H)
45 #error GNU libiconv in use but included iconv.h not from libiconv
47 #if !defined(USE_LIBICONV_GNU) && defined (_LIBICONV_H)
48 #error GNU libiconv not in use but included iconv.h is from libiconv
// Returns the GQuark registered for the static string "g_convert_error".
// Presumably this is what the G_CONVERT_ERROR domain used by the g_set_error()
// calls in this file expands to -- confirm against the header.
54 g_convert_error_quark (void)
56 return g_quark_from_static_string ("g_convert_error");
// Attempts to open an iconv descriptor converting from_codeset to to_codeset
// and stores whatever iconv_open() returns through cd.
// (The remaining parameter line and the return statements are not part of
// this listing.)
60 try_conversion (const char *to_codeset,
61 const char *from_codeset,
64 *cd = iconv_open (to_codeset, from_codeset);
// Only the combination "open failed AND errno is EINVAL" is singled out here.
66 if (*cd == (iconv_t)-1 && errno == EINVAL)
// Calls try_conversion() with entries of to_aliases as the destination
// codeset while keeping from_codeset fixed; p is the cursor over the array.
// (The loop header itself is missing from this listing.)
73 try_to_aliases (const char **to_aliases,
74 const char *from_codeset,
79 const char **p = to_aliases;
82 if (try_conversion (*p, from_codeset, cd))
92 extern const char ** G_GNUC_INTERNAL _g_charset_get_aliases (const char *canonical_name);
96 * @to_codeset: destination codeset
97 * @from_codeset: source codeset
99 * Same as the standard UNIX routine iconv_open(), but
100 * may be implemented via libiconv on UNIX flavors that lack
101 * a native implementation.
103 * GLib provides g_convert() and g_locale_to_utf8() which are likely
104 * more convenient than the raw iconv wrappers.
106 * Return value: a "conversion descriptor", or (GIConv)-1 if
107 * opening the converter failed.
// Opens a converter from from_codeset to to_codeset. When the names as given
// do not work, alias lists from _g_charset_get_aliases() are consulted for
// both codesets and further combinations are tried.
110 g_iconv_open (const gchar *to_codeset,
111 const gchar *from_codeset)
// First attempt: the names exactly as passed in.
115 if (!try_conversion (to_codeset, from_codeset, &cd))
117 const char **to_aliases = _g_charset_get_aliases (to_codeset);
118 const char **from_aliases = _g_charset_get_aliases (from_codeset);
122 const char **p = from_aliases;
// A source alias paired with the original destination name ...
125 if (try_conversion (to_codeset, *p, &cd))
// ... and the same source alias paired with the destination aliases.
128 if (try_to_aliases (to_aliases, *p, &cd))
// The original source name paired with the destination aliases.
135 if (try_to_aliases (to_aliases, from_codeset, &cd))
// The failure sentinel is re-expressed in terms of the public GIConv type.
140 return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd;
145 * @converter: conversion descriptor from g_iconv_open()
146 * @inbuf: bytes to convert
147 * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf
148 * @outbuf: converted output bytes
149 * @outbytes_left: inout parameter, bytes available to fill in @outbuf
151 * Same as the standard UNIX routine iconv(), but
152 * may be implemented via libiconv on UNIX flavors that lack
153 * a native implementation.
155 * GLib provides g_convert() and g_locale_to_utf8() which are likely
156 * more convenient than the raw iconv wrappers.
158 * Return value: count of non-reversible conversions, or -1 on error
// Thin wrapper: casts the GIConv handle to iconv_t and forwards every
// argument to iconv(), returning its result unchanged.
161 g_iconv (GIConv converter,
165 gsize *outbytes_left)
167 iconv_t cd = (iconv_t)converter;
169 return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
174 * @converter: a conversion descriptor from g_iconv_open()
176 * Same as the standard UNIX routine iconv_close(), but
177 * may be implemented via libiconv on UNIX flavors that lack
178 * a native implementation. Should be called to clean up
179 * the conversion descriptor from g_iconv_open() when
180 * you are done converting things.
182 * GLib provides g_convert() and g_locale_to_utf8() which are likely
183 * more convenient than the raw iconv wrappers.
185 * Return value: -1 on error, 0 on success
// Thin wrapper: casts the GIConv handle to iconv_t and returns the result of
// iconv_close() on it.
188 g_iconv_close (GIConv converter)
190 iconv_t cd = (iconv_t)converter;
192 return iconv_close (cd);
196 #ifdef NEED_ICONV_CACHE
// Threshold that iconv_cache_size is compared against when deciding whether
// buckets should be expired.
198 #define ICONV_CACHE_SIZE (16)
// One cache entry. Its member list is not part of this listing; code elsewhere
// in the file accesses key, refcount, used and cd on it.
200 struct _iconv_cache_bucket {
// All buckets; iconv_cache_bucket_new() prepends to this list.
207 static GList *iconv_cache_list;
// Bucket key string -> bucket.
208 static GHashTable *iconv_cache;
// Open descriptor -> key string of the bucket it was handed out from.
209 static GHashTable *iconv_open_hash;
210 static guint iconv_cache_size = 0;
// Lock taken by open_converter() and close_converter() around cache access.
211 G_LOCK_DEFINE_STATIC (iconv_cache_lock);
// Sets up the cache containers. The `initialized` flag is a function-level
// static, so its value persists between calls; the test on it is not part of
// this listing.
213 /* caller *must* hold the iconv_cache_lock */
215 iconv_cache_init (void)
217 static gboolean initialized = FALSE;
222 iconv_cache_list = NULL;
// Keyed by string contents.
223 iconv_cache = g_hash_table_new (g_str_hash, g_str_equal);
// Keyed by pointer value.
224 iconv_open_hash = g_hash_table_new (g_direct_hash, g_direct_equal);
231 * iconv_cache_bucket_new:
233 * @cd: iconv descriptor
235 * Creates a new cache bucket, inserts it into the cache and
236 * increments the cache size.
238 * Returns a pointer to the newly allocated cache bucket.
// Allocates a bucket for key, registers it in iconv_cache and links it at the
// front of iconv_cache_list.
240 static struct _iconv_cache_bucket *
241 iconv_cache_bucket_new (const gchar *key, GIConv cd)
243 struct _iconv_cache_bucket *bucket;
245 bucket = g_new (struct _iconv_cache_bucket, 1);
// The bucket keeps its own copy of the key string.
246 bucket->key = g_strdup (key);
247 bucket->refcount = 1;
// The hash table key is the bucket's copy, not the caller's string.
251 g_hash_table_insert (iconv_cache, bucket->key, bucket);
253 /* FIXME: if we sorted the list so items with few refcounts were
254 first, then we could expire them faster in iconv_cache_expire_unused () */
255 iconv_cache_list = g_list_prepend (iconv_cache_list, bucket);
264 * iconv_cache_bucket_expire:
265 * @node: cache bucket's node
266 * @bucket: cache bucket
268 * Expires a single cache bucket @bucket. This should only ever be
269 * called on a bucket that currently has no used iconv descriptors
272 * @node is not a required argument. If @node is not supplied, we
273 * search for it ourselves.
// Drops bucket from the cache: removes its hash entry, unlinks and frees its
// list node, then releases the key string and closes the descriptor.
276 iconv_cache_bucket_expire (GList *node, struct _iconv_cache_bucket *bucket)
278 g_hash_table_remove (iconv_cache, bucket->key);
// Fallback lookup of the list node by bucket pointer.
281 node = g_list_find (iconv_cache_list, bucket);
283 g_assert (node != NULL);
// Manual unlink from the doubly linked list.
287 node->prev->next = node->next;
289 node->next->prev = node->prev;
// Presumably the head-of-list case: the list head moves to the successor.
293 iconv_cache_list = node->next;
295 node->next->prev = NULL;
// Frees only the list cell, not the data it points to.
298 g_list_free_1 (node);
300 g_free (bucket->key);
301 g_iconv_close (bucket->cd);
309 * iconv_cache_expire_unused:
311 * Expires as many unused cache buckets as it needs to in order to get
312 * the total number of buckets < ICONV_CACHE_SIZE.
// Scans iconv_cache_list and expires buckets whose refcount is zero, stopping
// once iconv_cache_size drops below ICONV_CACHE_SIZE or the list ends.
315 iconv_cache_expire_unused (void)
317 struct _iconv_cache_bucket *bucket;
320 node = iconv_cache_list;
321 while (node && iconv_cache_size >= ICONV_CACHE_SIZE)
// Only buckets with no outstanding references are candidates.
326 if (bucket->refcount == 0)
327 iconv_cache_bucket_expire (node, bucket);
// Cache-backed variant of open_converter (compiled under NEED_ICONV_CACHE).
// Looks the codeset pair up in iconv_cache under iconv_cache_lock and opens a
// descriptor with g_iconv_open() where needed; on failure an error in the
// G_CONVERT_ERROR domain is set. Several branch conditions are missing from
// this listing.
334 open_converter (const gchar *to_codeset,
335 const gchar *from_codeset,
338 struct _iconv_cache_bucket *bucket;
// Key format is "<from>:<to>"; the +2 covers the ':' and the terminating NUL.
// The buffer comes from g_alloca(), i.e. the stack.
343 key = g_alloca (strlen (from_codeset) + strlen (to_codeset) + 2);
344 _g_sprintf (key, "%s:%s", from_codeset, to_codeset);
346 G_LOCK (iconv_cache_lock);
348 /* make sure the cache has been initialized */
351 bucket = g_hash_table_lookup (iconv_cache, key);
356 cd = g_iconv_open (to_codeset, from_codeset);
357 if (cd == (GIConv) -1)
362 /* Apparently iconv on Solaris <= 7 segfaults if you pass in
363 * NULL for anything but inbuf; work around that. (NULL outbuf
364 * or NULL *outbuf is allowed by Unix98.)
366 gsize inbytes_left = 0;
367 gchar *outbuf = NULL;
368 gsize outbytes_left = 0;
373 /* reset the descriptor */
374 g_iconv (cd, NULL, &inbytes_left, &outbuf, &outbytes_left);
381 cd = g_iconv_open (to_codeset, from_codeset);
382 if (cd == (GIConv) -1)
// Expiry of unused buckets runs before the new bucket is created.
385 iconv_cache_expire_unused ();
387 bucket = iconv_cache_bucket_new (key, cd);
// Records descriptor -> bucket key; close_converter() looks the key up here.
390 g_hash_table_insert (iconv_open_hash, cd, bucket->key);
392 G_UNLOCK (iconv_cache_lock);
// Second unlock site; presumably the failure path -- confirm against the full file.
398 G_UNLOCK (iconv_cache_lock);
400 /* Something went wrong. */
// Two different error codes are reported below; the condition choosing
// between them is not part of this listing.
404 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
405 _("Conversion from character set '%s' to '%s' is not supported"),
406 from_codeset, to_codeset);
408 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
409 _("Could not open converter from '%s' to '%s'"),
410 from_codeset, to_codeset);
// Cache-backed variant of close_converter (compiled under NEED_ICONV_CACHE).
// Finds the bucket a descriptor belongs to via iconv_open_hash and updates
// the bucket; descriptors unknown to the cache are closed directly with a
// warning.
417 close_converter (GIConv converter)
419 struct _iconv_cache_bucket *bucket;
// The (GIConv)-1 failure sentinel is tested before the lock is taken.
425 if (cd == (GIConv) -1)
428 G_LOCK (iconv_cache_lock);
// Descriptor -> bucket key, as recorded by open_converter().
430 key = g_hash_table_lookup (iconv_open_hash, cd);
433 g_hash_table_remove (iconv_open_hash, cd);
435 bucket = g_hash_table_lookup (iconv_cache, key);
// When cd is the bucket's own descriptor, the bucket is marked as not in use.
440 if (cd == bucket->cd)
441 bucket->used = FALSE;
// An unreferenced bucket is expired only while the cache is over its limit.
445 if (!bucket->refcount && iconv_cache_size > ICONV_CACHE_SIZE)
447 /* expire this cache bucket */
// NULL node: per its header comment, iconv_cache_bucket_expire() then
// searches for the list node itself.
448 iconv_cache_bucket_expire (NULL, bucket);
453 G_UNLOCK (iconv_cache_lock);
455 g_warning ("This iconv context wasn't opened using open_converter");
457 return g_iconv_close (converter);
460 G_UNLOCK (iconv_cache_lock);
465 #else /* !NEED_ICONV_CACHE */
// Uncached variant of open_converter (the #else branch of NEED_ICONV_CACHE):
// a direct g_iconv_open() call plus error reporting in the G_CONVERT_ERROR
// domain when it yields (GIConv)-1.
468 open_converter (const gchar *to_codeset,
469 const gchar *from_codeset,
474 cd = g_iconv_open (to_codeset, from_codeset);
476 if (cd == (GIConv) -1)
478 /* Something went wrong. */
// Two different error codes are reported below; the condition choosing
// between them is not part of this listing.
482 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
483 _("Conversion from character set '%s' to '%s' is not supported"),
484 from_codeset, to_codeset);
486 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
487 _("Could not open converter from '%s' to '%s'"),
488 from_codeset, to_codeset);
// Uncached variant of close_converter: tests for the (GIConv)-1 failure
// sentinel, otherwise returns the result of g_iconv_close().
496 close_converter (GIConv cd)
498 if (cd == (GIConv) -1)
501 return g_iconv_close (cd);
504 #endif /* NEED_ICONV_CACHE */
507 * g_convert_with_iconv:
508 * @str: the string to convert
509 * @len: the length of the string, or -1 if the string is
510 * nul-terminated<footnoteref linkend="nul-unsafe"/>.
511 * @converter: conversion descriptor from g_iconv_open()
512 * @bytes_read: location to store the number of bytes in the
513 * input string that were successfully converted, or %NULL.
514 * Even if the conversion was successful, this may be
515 * less than @len if there were partial characters
516 * at the end of the input. If the error
517 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
518 * stored will be the byte offset after the last valid
520 * @bytes_written: the number of bytes stored in the output buffer (not
521 * including the terminating nul).
522 * @error: location to store the error occurring, or %NULL to ignore
523 * errors. Any of the errors in #GConvertError may occur.
525 * Converts a string from one character set to another.
527 * Note that you should use g_iconv() for streaming
528 * conversions<footnote id="streaming-state">
530 * Despite the fact that @bytes_read can return information about partial
531 * characters, the <literal>g_convert_...</literal> functions
532 * are not generally suitable for streaming. If the underlying converter
533 * being used maintains internal state, then this won't be preserved
534 * across successive calls to g_convert(), g_convert_with_iconv() or
535 * g_convert_with_fallback(). (An example of this is the GNU C converter
536 * for CP1255 which does not emit a base character until it knows that
537 * the next character is not a mark that could combine with the base
542 * Return value: If the conversion was successful, a newly allocated
543 * nul-terminated string, which must be freed with
544 * g_free(). Otherwise %NULL and @error will be set.
// Converts str through an already opened converter into a freshly allocated
// buffer, growing that buffer with g_realloc() as needed. Errors are reported
// in the G_CONVERT_ERROR domain. Much of the control flow (errno dispatch,
// cleanup, return) is missing from this listing.
547 g_convert_with_iconv (const gchar *str,
551 gsize *bytes_written,
557 const gchar *shift_p = NULL;
558 gsize inbytes_remaining;
559 gsize outbytes_remaining;
562 gboolean have_error = FALSE;
563 gboolean done = FALSE;
565 g_return_val_if_fail (converter != (GIConv) -1, NULL);
// Initial output buffer is len + 1 bytes, with one byte kept back for the NUL.
571 inbytes_remaining = len;
572 outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
574 outbytes_remaining = outbuf_size - 1; /* -1 for nul */
575 outp = dest = g_malloc (outbuf_size);
577 while (!done && !have_error)
// The (char **) cast removes const from p to match g_iconv()'s parameter type.
579 err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
581 if (err == (size_t) -1)
586 /* Incomplete text, do not report an error */
// `used` captures how much output exists before the buffer is reallocated,
// so the remaining space can be recomputed afterwards.
591 size_t used = outp - dest;
594 dest = g_realloc (dest, outbuf_size);
597 outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
602 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
603 _("Invalid byte sequence in conversion input"));
608 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
609 _("Error during conversion: %s"),
619 /* call g_iconv with NULL inbuf to cleanup shift state */
622 inbytes_remaining = 0;
// Input consumed is the distance p has advanced from the start of str.
635 *bytes_read = p - str;
// Input left unconsumed is compared against len and tied to the
// PARTIAL_INPUT error below; the surrounding conditions are not shown.
638 if ((p - str) != len)
643 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
644 _("Partial character sequence at end of input"));
651 *bytes_written = outp - dest; /* Doesn't include '\0' */
664 * @str: the string to convert
665 * @len: the length of the string, or -1 if the string is
666 * nul-terminated<footnote id="nul-unsafe">
668 Note that some encodings may allow nul bytes to
669 occur inside strings. In that case, using -1 for
670 the @len parameter is unsafe.
673 * @to_codeset: name of character set into which to convert @str
674 * @from_codeset: character set of @str.
675 * @bytes_read: location to store the number of bytes in the
676 * input string that were successfully converted, or %NULL.
677 * Even if the conversion was successful, this may be
678 * less than @len if there were partial characters
679 * at the end of the input. If the error
680 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
681 * stored will be the byte offset after the last valid
683 * @bytes_written: the number of bytes stored in the output buffer (not
684 * including the terminating nul).
685 * @error: location to store the error occurring, or %NULL to ignore
686 * errors. Any of the errors in #GConvertError may occur.
688 * Converts a string from one character set to another.
690 * Note that you should use g_iconv() for streaming
691 * conversions<footnoteref linkend="streaming-state"/>.
693 * Return value: If the conversion was successful, a newly allocated
694 * nul-terminated string, which must be freed with
695 * g_free(). Otherwise %NULL and @error will be set.
// Convenience entry point: opens a converter for the codeset pair, runs
// g_convert_with_iconv() on str, then closes the converter again.
698 g_convert (const gchar *str,
700 const gchar *to_codeset,
701 const gchar *from_codeset,
703 gsize *bytes_written,
// NULL str or codeset names are rejected up front with a NULL return.
709 g_return_val_if_fail (str != NULL, NULL);
710 g_return_val_if_fail (to_codeset != NULL, NULL);
711 g_return_val_if_fail (from_codeset != NULL, NULL);
// open_converter() receives error, so it can set it when opening fails.
713 cd = open_converter (to_codeset, from_codeset, error);
715 if (cd == (GIConv) -1)
726 res = g_convert_with_iconv (str, len, cd,
727 bytes_read, bytes_written,
730 close_converter (cd);
736 * g_convert_with_fallback:
737 * @str: the string to convert
738 * @len: the length of the string, or -1 if the string is
739 * nul-terminated<footnoteref linkend="nul-unsafe"/>.
740 * @to_codeset: name of character set into which to convert @str
741 * @from_codeset: character set of @str.
742 * @fallback: UTF-8 string to use in place of character not
743 * present in the target encoding. (The string must be
744 * representable in the target encoding).
745 If %NULL, characters not in the target encoding will
746 be represented as Unicode escapes \uxxxx or \Uxxxxyyyy.
747 * @bytes_read: location to store the number of bytes in the
748 * input string that were successfully converted, or %NULL.
749 * Even if the conversion was successful, this may be
750 * less than @len if there were partial characters
751 * at the end of the input.
752 * @bytes_written: the number of bytes stored in the output buffer (not
753 * including the terminating nul).
754 * @error: location to store the error occurring, or %NULL to ignore
755 * errors. Any of the errors in #GConvertError may occur.
757 * Converts a string from one character set to another, possibly
758 * including fallback sequences for characters not representable
759 * in the output. Note that it is not guaranteed that the specification
760 * for the fallback sequences in @fallback will be honored. Some
761 * systems may do an approximate conversion from @from_codeset
762 * to @to_codeset in their iconv() functions,
763 * in which case GLib will simply return that approximate conversion.
765 * Note that you should use g_iconv() for streaming
766 * conversions<footnoteref linkend="streaming-state"/>.
768 * Return value: If the conversion was successful, a newly allocated
769 * nul-terminated string, which must be freed with
770 * g_free(). Otherwise %NULL and @error will be set.
// Converts str like g_convert(), but when the exact conversion fails with an
// illegal-sequence error it retries via UTF-8 and substitutes a fallback
// string for each offending character. Many branch conditions and the
// cleanup and return statements are missing from this listing.
773 g_convert_with_fallback (const gchar *str,
775 const gchar *to_codeset,
776 const gchar *from_codeset,
779 gsize *bytes_written,
785 const gchar *insert_str = NULL;
787 gsize inbytes_remaining;
788 const gchar *save_p = NULL;
789 gsize save_inbytes = 0;
790 gsize outbytes_remaining;
794 gboolean have_error = FALSE;
795 gboolean done = FALSE;
797 GError *local_error = NULL;
799 g_return_val_if_fail (str != NULL, NULL);
800 g_return_val_if_fail (to_codeset != NULL, NULL);
801 g_return_val_if_fail (from_codeset != NULL, NULL);
806 /* Try an exact conversion; we only proceed if this fails
807 * due to an illegal sequence in the input string.
809 dest = g_convert (str, len, to_codeset, from_codeset,
810 bytes_read, bytes_written, &local_error);
// Any error other than ILLEGAL_SEQUENCE is handed to the caller unchanged.
814 if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
816 g_propagate_error (error, local_error);
// The ILLEGAL_SEQUENCE error itself is discarded; the code below handles it.
820 g_error_free (local_error);
824 /* No go; to proceed, we need a converter from "UTF-8" to
825 * to_codeset, and the string as UTF-8.
827 cd = open_converter (to_codeset, "UTF-8", error);
828 if (cd == (GIConv) -1)
// inbytes_remaining is passed in the bytes_written position, so it receives
// the byte length of the UTF-8 intermediate string.
839 utf8 = g_convert (str, len, "UTF-8", from_codeset,
840 bytes_read, &inbytes_remaining, error);
843 close_converter (cd);
849 /* Now the heart of the code. We loop through the UTF-8 string, and
850 * whenever we hit an offending character, we form fallback, convert
851 * the fallback to the target codeset, and then go back to
852 * converting the original string after finishing with the fallback.
854 * The variables save_p and save_inbytes store the input state
855 * for the original string while we are converting the fallback
859 outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
860 outbytes_remaining = outbuf_size - 1; /* -1 for nul */
861 outp = dest = g_malloc (outbuf_size);
863 while (!done && !have_error)
// A size_t temporary is passed to g_iconv() and copied back afterwards, so
// the gsize variable is never passed by address.
865 size_t inbytes_tmp = inbytes_remaining;
866 err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
867 inbytes_remaining = inbytes_tmp;
869 if (err == (size_t) -1)
874 g_assert_not_reached();
// Output buffer growth: `used` is taken before g_realloc() may move dest.
878 size_t used = outp - dest;
881 dest = g_realloc (dest, outbuf_size);
884 outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
891 /* Error converting fallback string - fatal
893 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
894 _("Cannot convert fallback '%s' to codeset '%s'"),
895 insert_str, to_codeset);
// Generated escape: 4 hex digits below 0x10000, otherwise 8 hex digits.
// This string is heap-allocated by g_strdup_printf().
903 gunichar ch = g_utf8_get_char (p);
904 insert_str = g_strdup_printf (ch < 0x10000 ? "\\u%04x" : "\\U%08x",
// Caller-supplied fallback: used as-is, not copied.
908 insert_str = fallback;
// The resume point is the character after the offending one; the saved byte
// count excludes that character.
910 save_p = g_utf8_next_char (p);
911 save_inbytes = inbytes_remaining - (save_p - p);
// Presumably p has been redirected to insert_str on a line not shown here --
// confirm against the full file.
913 inbytes_remaining = strlen (p);
916 /* fall thru if p is NULL */
918 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
919 _("Error during conversion: %s"),
// The const is cast away only to satisfy g_free()'s parameter type.
930 g_free ((gchar *)insert_str);
// Restores the input byte count saved before the fallback was converted.
932 inbytes_remaining = save_inbytes;
937 /* call g_iconv with NULL inbuf to cleanup shift state */
939 inbytes_remaining = 0;
950 close_converter (cd);
953 *bytes_written = outp - dest; /* Doesn't include '\0' */
// insert_str is freed only when no caller-supplied fallback is in use,
// i.e. when it was produced by g_strdup_printf() above.
959 if (save_p && !fallback)
960 g_free ((gchar *)insert_str);
// Validates string as UTF-8 and returns a copy of it, reporting the copied
// length through both bytes_read and bytes_written. Used by callers in this
// file when source and target charset are the same (UTF-8).
975 strdup_len (const gchar *string,
977 gsize *bytes_written,
// Invalid UTF-8 is reported as an ILLEGAL_SEQUENCE conversion error.
984 if (!g_utf8_validate (string, len, NULL))
991 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
992 _("Invalid byte sequence in conversion input"));
// Two ways of determining the length appear: strlen() of the whole string,
// and a scan bounded by len that also stops at the first NUL. The condition
// choosing between them is not part of this listing.
997 real_len = strlen (string);
1002 while (real_len < len && string[real_len])
1007 *bytes_read = real_len;
1009 *bytes_written = real_len;
1011 return g_strndup (string, real_len);
1016 * @opsysstring: a string in the encoding of the current locale. On Windows
1017 * this means the system codepage.
1018 * @len: the length of the string, or -1 if the string is
1019 * nul-terminated<footnoteref linkend="nul-unsafe"/>.
1020 * @bytes_read: location to store the number of bytes in the
1021 * input string that were successfully converted, or %NULL.
1022 * Even if the conversion was successful, this may be
1023 * less than @len if there were partial characters
1024 * at the end of the input. If the error
1025 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1026 * stored will be the byte offset after the last valid
1028 * @bytes_written: the number of bytes stored in the output buffer (not
1029 * including the terminating nul).
1030 * @error: location to store the error occurring, or %NULL to ignore
1031 * errors. Any of the errors in #GConvertError may occur.
1033 * Converts a string which is in the encoding used for strings by
1034 * the C runtime (usually the same as that used by the operating
1035 * system) in the current locale into a UTF-8 string.
1037 * Return value: The converted string, or %NULL on an error.
// Locale charset -> UTF-8. When g_get_charset() returns TRUE the input is
// only validated and copied by strdup_len(); otherwise g_convert() is used
// with the charset name that g_get_charset() stored.
1040 g_locale_to_utf8 (const gchar *opsysstring,
1043 gsize *bytes_written,
1046 const char *charset;
1048 if (g_get_charset (&charset))
1049 return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1051 return g_convert (opsysstring, len,
1052 "UTF-8", charset, bytes_read, bytes_written, error);
1056 * g_locale_from_utf8:
1057 * @utf8string: a UTF-8 encoded string
1058 * @len: the length of the string, or -1 if the string is
1059 * nul-terminated<footnoteref linkend="nul-unsafe"/>.
1060 * @bytes_read: location to store the number of bytes in the
1061 * input string that were successfully converted, or %NULL.
1062 * Even if the conversion was successful, this may be
1063 * less than @len if there were partial characters
1064 * at the end of the input. If the error
1065 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1066 * stored will be the byte offset after the last valid
1068 * @bytes_written: the number of bytes stored in the output buffer (not
1069 * including the terminating nul).
1070 * @error: location to store the error occurring, or %NULL to ignore
1071 * errors. Any of the errors in #GConvertError may occur.
1073 * Converts a string from UTF-8 to the encoding used for strings by
1074 * the C runtime (usually the same as that used by the operating
1075 * system) in the current locale.
1077 * Return value: The converted string, or %NULL on an error.
// UTF-8 -> locale charset. When g_get_charset() returns TRUE the input is
// only validated and copied by strdup_len(); otherwise g_convert() is used
// with the charset name that g_get_charset() stored as the target.
1080 g_locale_from_utf8 (const gchar *utf8string,
1083 gsize *bytes_written,
1086 const gchar *charset;
1088 if (g_get_charset (&charset))
1089 return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1091 return g_convert (utf8string, len,
1092 charset, "UTF-8", bytes_read, bytes_written, error);
1095 #ifndef G_PLATFORM_WIN32
// Cached result of the filename-charset computation. Only one member is
// visible in this listing; code elsewhere in the file also reads and writes
// `charset` and `is_utf8` on it.
1097 typedef struct _GFilenameCharsetCache GFilenameCharsetCache;
1099 struct _GFilenameCharsetCache {
// String vector released with g_strfreev() by filename_charset_cache_free().
1102 gchar **filename_charsets;
// Destructor for a GFilenameCharsetCache passed as an untyped pointer:
// releases the charset string and the charset vector.
// (Release of the struct itself is not visible in this listing.)
1106 filename_charset_cache_free (gpointer data)
1108 GFilenameCharsetCache *cache = data;
1109 g_free (cache->charset);
1110 g_strfreev (cache->filename_charsets);
1115 * g_get_filename_charsets:
1116 * @charsets: return location for the %NULL-terminated list of encoding names
1118 * Determines the preferred character sets used for filenames.
1119 * The first character set from the @charsets is the filename encoding, the
1120 * subsequent character sets are used when trying to generate a displayable
1121 * representation of a filename, see g_filename_display_name().
1123 * On Unix, the character sets are determined by consulting the
1124 * environment variables <envar>G_FILENAME_ENCODING</envar> and
1125 * <envar>G_BROKEN_FILENAMES</envar>. On Windows, the character set
1126 * used in the GLib API is always UTF-8 and said environment variables
1129 * <envar>G_FILENAME_ENCODING</envar> may be set to a comma-separated list
1130 * of character set names. The special token "@locale" is taken to mean the
1131 * character set for the current locale. If <envar>G_FILENAME_ENCODING</envar>
1132 * is not set, but <envar>G_BROKEN_FILENAMES</envar> is, the character set of
1133 * the current locale is taken as the filename encoding. If neither environment
1134 * variable is set, UTF-8 is taken as the filename encoding, but the character
1135 * set of the current locale is also put in the list of encodings.
1137 * The returned @charsets belong to GLib and must not be freed.
1139 * Note that on Unix, regardless of the locale character set or
1140 * <envar>G_FILENAME_ENCODING</envar> value, the actual file names present on a
1141 * system might be in any random encoding or just gibberish.
1143 * Return value: %TRUE if the filename encoding is UTF-8.
// Unix variant. Computes the filename charset list from the environment and
// keeps it in a cache stored in a GStaticPrivate slot. The cache is rebuilt
// whenever the charset reported by g_get_charset() differs from the one the
// cache was built for.
1148 g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets)
1150 static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT;
1151 GFilenameCharsetCache *cache = g_static_private_get (&cache_private);
1152 const gchar *charset;
// A zero-filled cache is created and registered together with its destructor.
1156 cache = g_new0 (GFilenameCharsetCache, 1);
1157 g_static_private_set (&cache_private, cache, filename_charset_cache_free);
1160 g_get_charset (&charset);
// Rebuild when there is no cached charset yet or it no longer matches.
1162 if (!(cache->charset && strcmp (cache->charset, charset) == 0))
1164 const gchar *new_charset;
1168 g_free (cache->charset);
1169 g_strfreev (cache->filename_charsets);
1170 cache->charset = g_strdup (charset);
// Case 1: G_FILENAME_ENCODING is set and non-empty -- a comma-separated list.
1172 p = getenv ("G_FILENAME_ENCODING");
1173 if (p != NULL && p[0] != '\0')
1175 cache->filename_charsets = g_strsplit (p, ",", 0);
// NOTE(review): is_utf8 is derived from the first entry before "@locale" is
// substituted below, so a leading "@locale" never compares equal to "UTF-8"
// here -- confirm this is intended.
1176 cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0);
1178 for (i = 0; cache->filename_charsets[i]; i++)
// Each "@locale" token is replaced by a copy of the locale charset name.
1180 if (strcmp ("@locale", cache->filename_charsets[i]) == 0)
1182 g_get_charset (&new_charset);
1183 g_free (cache->filename_charsets[i]);
1184 cache->filename_charsets[i] = g_strdup (new_charset);
// Case 2: G_BROKEN_FILENAMES is set -- a one-entry list holding the locale
// charset; the second slot stays NULL as the terminator.
1188 else if (getenv ("G_BROKEN_FILENAMES") != NULL)
1190 cache->filename_charsets = g_new0 (gchar *, 2);
1191 cache->is_utf8 = g_get_charset (&new_charset);
1192 cache->filename_charsets[0] = g_strdup (new_charset);
// Case 3: "UTF-8" first, followed by the locale charset only when
// g_get_charset() returns FALSE.
1196 cache->filename_charsets = g_new0 (gchar *, 3);
1197 cache->is_utf8 = TRUE;
1198 cache->filename_charsets[0] = g_strdup ("UTF-8");
1199 if (!g_get_charset (&new_charset))
1200 cache->filename_charsets[1] = g_strdup (new_charset);
// The caller receives a pointer into the cache, not a copy.
1204 if (filename_charsets)
1205 *filename_charsets = (const gchar **)cache->filename_charsets;
1207 return cache->is_utf8;
1210 #else /* G_PLATFORM_WIN32 */
// Variant from the #else branch of G_PLATFORM_WIN32. Hands out a static
// charsets array; the array initialiser and the preprocessor conditions
// separating the two paths below are not part of this listing.
1213 g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets)
1215 static const gchar *charsets[] = {
1221 /* On Windows GLib pretends that the filename charset is UTF-8 */
1222 if (filename_charsets)
1223 *filename_charsets = charsets;
1229 /* Cygwin works like before */
// The first slot of the static array is overwritten with whatever
// g_get_charset() reports.
1230 result = g_get_charset (&(charsets[0]));
1232 if (filename_charsets)
1233 *filename_charsets = charsets;
1239 #endif /* G_PLATFORM_WIN32 */
// Convenience helper: obtains the charset list from g_get_filename_charsets()
// and hands back only its first entry through filename_charset (when that
// pointer is non-NULL).
1242 get_filename_charset (const gchar **filename_charset)
1244 const gchar **charsets;
1247 is_utf8 = g_get_filename_charsets (&charsets);
1249 if (filename_charset)
1250 *filename_charset = charsets[0];
1255 /* This is called from g_thread_init(). It's used to
1256 * initialize some static data in a threadsafe way.
// Calls g_get_filename_charsets() once and discards both its return value and
// the list it stores in dummy.
1259 _g_convert_thread_init (void)
1261 const gchar **dummy;
1262 (void) g_get_filename_charsets (&dummy);
1266 * g_filename_to_utf8:
1267 * @opsysstring: a string in the encoding for filenames
1268 * @len: the length of the string, or -1 if the string is
1269 * nul-terminated<footnoteref linkend="nul-unsafe"/>.
1270 * @bytes_read: location to store the number of bytes in the
1271 * input string that were successfully converted, or %NULL.
1272 * Even if the conversion was successful, this may be
1273 * less than @len if there were partial characters
1274 * at the end of the input. If the error
1275 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1276 * stored will be the byte offset after the last valid
1278 * @bytes_written: the number of bytes stored in the output buffer (not
1279 * including the terminating nul).
1280 * @error: location to store the error occurring, or %NULL to ignore
1281 * errors. Any of the errors in #GConvertError may occur.
1283 * Converts a string which is in the encoding used by GLib for
1284 * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8
1287 * Return value: The converted string, or %NULL on an error.
// Filename charset -> UTF-8. When get_filename_charset() returns TRUE the
// input is only validated and copied by strdup_len(); otherwise g_convert()
// is used with the filename charset as the source.
1290 g_filename_to_utf8 (const gchar *opsysstring,
1293 gsize *bytes_written,
1296 const gchar *charset;
1298 if (get_filename_charset (&charset))
1299 return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1301 return g_convert (opsysstring, len,
1302 "UTF-8", charset, bytes_read, bytes_written, error);
1307 #undef g_filename_to_utf8
1309 /* Binary compatibility version. Not for newly compiled code. */
// Binary-compatibility definition (follows the #undef of g_filename_to_utf8).
// Unlike the primary definition it consults g_get_charset(), i.e. the locale
// charset, rather than get_filename_charset().
1312 g_filename_to_utf8 (const gchar *opsysstring,
1315 gsize *bytes_written,
1318 const gchar *charset;
1320 if (g_get_charset (&charset))
1321 return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1323 return g_convert (opsysstring, len,
1324 "UTF-8", charset, bytes_read, bytes_written, error);
1330 * g_filename_from_utf8:
1331 * @utf8string: a UTF-8 encoded string.
1332 * @len: the length of the string, or -1 if the string is
1334 * @bytes_read: location to store the number of bytes in the
1335 * input string that were successfully converted, or %NULL.
1336 * Even if the conversion was successful, this may be
1337 * less than @len if there were partial characters
1338 * at the end of the input. If the error
1339 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1340 * stored will be the byte offset after the last valid
1342 * @bytes_written: the number of bytes stored in the output buffer (not
1343 * including the terminating nul).
1344 * @error: location to store the error occurring, or %NULL to ignore
1345 * errors. Any of the errors in #GConvertError may occur.
1347 * Converts a string from UTF-8 to the encoding GLib uses for
1348 * filenames. Note that on Windows GLib uses UTF-8 for filenames.
1350 * Return value: The converted string, or %NULL on an error.
// UTF-8 -> filename charset. When get_filename_charset() returns TRUE the
// input is only validated and copied by strdup_len(); otherwise g_convert()
// is used with the filename charset as the target.
1353 g_filename_from_utf8 (const gchar *utf8string,
1356 gsize *bytes_written,
1359 const gchar *charset;
1361 if (get_filename_charset (&charset))
1362 return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1364 return g_convert (utf8string, len,
1365 charset, "UTF-8", bytes_read, bytes_written, error);
1370 #undef g_filename_from_utf8
1372 /* Binary compatibility version. Not for newly compiled code. */
// Binary-compatibility definition (follows the #undef of
// g_filename_from_utf8). Unlike the primary definition it consults
// g_get_charset(), i.e. the locale charset, rather than get_filename_charset().
1375 g_filename_from_utf8 (const gchar *utf8string,
1378 gsize *bytes_written,
1381 const gchar *charset;
1383 if (g_get_charset (&charset))
1384 return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1386 return g_convert (utf8string, len,
1387 charset, "UTF-8", bytes_read, bytes_written, error);
1392 /* Test if haystack has the needle prefix, comparing case
1393 * insensitive. haystack may be UTF-8, but needle must
1394 * contain only ascii. */
// Prefix test with ASCII-only case folding: corresponding bytes of needle (n)
// and haystack (h) are compared after g_ascii_tolower(). The loop and the
// return statement are not part of this listing.
1396 has_case_prefix (const gchar *haystack, const gchar *needle)
1400 /* Eat one character at a time. */
1405 g_ascii_tolower (*n) == g_ascii_tolower (*h))
// Escaping modes accepted by g_escape_uri_string(). Each value is a distinct
// bit; the ACCEPTABLE() macro in g_escape_uri_string() ANDs a mask of this
// type against the per-character entries of acceptable[].
// (The opening line of the enum is missing from this listing.)
1415 UNSAFE_ALL = 0x1, /* Escape all unsafe characters */
1416 UNSAFE_ALLOW_PLUS = 0x2, /* Allows '+' */
1417 UNSAFE_PATH = 0x8, /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
1418 UNSAFE_HOST = 0x10, /* Allows '/' and ':' and '@' */
1419 UNSAFE_SLASHES = 0x20 /* Allows all characters except for '/' and '%' */
1420 } UnsafeCharacterSet;
// Per-character bitmasks for the 96 characters from 32 through 127; the
// ACCEPTABLE() macro indexes the table with (character - 32). A character
// passes for a given mode when its entry shares a set bit with the mask in
// use.
1422 static const guchar acceptable[96] = {
1423 /* A table of the ASCII chars from space (32) to DEL (127) */
1424 /* ! " # $ % & ' ( ) * + , - . / */
1425 0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C,
1426 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
1427 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20,
1428 /* @ A B C D E F G H I J K L M N O */
1429 0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
1430 /* P Q R S T U V W X Y Z [ \ ] ^ _ */
1431 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F,
1432 /* ` a b c d e f g h i j k l m n o */
1433 0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
1434 /* p q r s t u v w x y z { | } ~ DEL */
1435 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20
/* Upper-case hexadecimal digits. The array holds exactly 16 characters, so
 * the string literal is stored without a terminating NUL; index it, do not
 * treat it as a C string.
 * NOTE(review): presumably used to emit the two digits after the percent
 * sign in g_escape_uri_string(); those lines are not visible in this listing. */
1438 static const gchar hex[16] = "0123456789ABCDEF";
1440 /* Note: This escape function works on file: URIs, but if you want to
1441 * escape something else, please read RFC-2396 */
/* Percent-escapes every character of string that the ACCEPTABLE() test
 * rejects and writes the outcome into result, a buffer from g_malloc().
 * A character passes ACCEPTABLE() when it lies in the range 32..127 and its
 * acceptable[] entry shares a bit with use_mask.
 * mask must be exactly one UnsafeCharacterSet value; any other value makes
 * g_return_val_if_fail() return NULL.
 * NOTE(review): the local declarations, the assignment of use_mask, the hex
 * digit output and the final return are not visible in this listing. */
1443 g_escape_uri_string (const gchar *string,
1444 UnsafeCharacterSet mask)
1446 #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))
1453 UnsafeCharacterSet use_mask;
1455 g_return_val_if_fail (mask == UNSAFE_ALL
1456 || mask == UNSAFE_ALLOW_PLUS
1457 || mask == UNSAFE_PATH
1458 || mask == UNSAFE_HOST
1459 || mask == UNSAFE_SLASHES, NULL);
/* Pass 1: walk the input and count the characters that need escaping. */
1463 for (p = string; *p != '\0'; p++)
1466 if (!ACCEPTABLE (c))
/* Input length, plus two extra bytes per escaped character, plus one more
 * byte (presumably for the terminating NUL). */
1470 result = g_malloc (p - string + unacceptable * 2 + 1);
/* Pass 2: copy the input into result, escaping as needed. */
1473 for (q = result, p = string; *p != '\0'; p++)
1477 if (!ACCEPTABLE (c))
1479 *q++ = '%'; /* means hex coming */
/* Assembles a file URI string from hostname and pathname with g_strconcat().
 * - hostname is escaped with UNSAFE_HOST, but only when it is non-NULL and
 *   non-empty; otherwise an empty string takes its place.
 * - pathname is escaped with UNSAFE_PATH; a "/" is inserted in front of the
 *   escaped path when that path does not already start with one.
 * - The backslash handling operates on a g_strdup() copy of pathname, which
 *   is released with g_free() after the URI has been assembled.
 *   NOTE(review): that handling is presumably compiled only on Windows; the
 *   guarding preprocessor lines are not visible in this listing.
 * Both escaped strings are freed after the concatenation. */
1494 g_escape_file_uri (const gchar *hostname,
1495 const gchar *pathname)
1497 char *escaped_hostname = NULL;
1502 char *p, *backslash;
1504 /* Turn backslashes into forward slashes. That's what Netscape
1505 * does, and they are actually more or less equivalent in Windows.
1508 pathname = g_strdup (pathname);
1509 p = (char *) pathname;
1511 while ((backslash = strchr (p, '\\')) != NULL)
1518 if (hostname && *hostname != '\0')
1520 escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST);
1523 escaped_path = g_escape_uri_string (pathname, UNSAFE_PATH);
1525 res = g_strconcat ("file://",
1526 (escaped_hostname) ? escaped_hostname : "",
1527 (*escaped_path != '/') ? "/" : "",
1532 g_free ((char *) pathname);
1535 g_free (escaped_hostname);
1536 g_free (escaped_path);
/* Decodes the two hexadecimal digits at scanner[0] and scanner[1] into one
 * value: the first digit becomes the high nibble, the second the low nibble.
 * g_ascii_xdigit_value() yields a negative number for a non-hex character and
 * each digit is checked for that.
 * NOTE(review): the statements executed when a digit is invalid are not
 * visible in this listing. */
1542 unescape_character (const char *scanner)
1547 first_digit = g_ascii_xdigit_value (scanner[0]);
1548 if (first_digit < 0)
1551 second_digit = g_ascii_xdigit_value (scanner[1]);
1552 if (second_digit < 0)
1555 return (first_digit << 4) | second_digit;
/* Decodes percent escapes from escaped into a freshly allocated buffer.
 * illegal_escaped_characters: characters that are checked for after decoding
 *   an escape (a match is caught by the strchr() test below).
 * ascii_must_not_be_escaped: when TRUE, an escape decoding to a value of
 *   0x7F or less is caught as well.
 * NOTE(review): the second parameter is not visible in this listing; callers
 * in this file pass either a byte count or -1 there. The statements that run
 * when one of the checks fires, and the final return, are also not visible. */
1559 g_unescape_uri_string (const char *escaped,
1561 const char *illegal_escaped_characters,
1562 gboolean ascii_must_not_be_escaped)
1564 const gchar *in, *in_end;
1565 gchar *out, *result;
1568 if (escaped == NULL)
1572 len = strlen (escaped);
/* The assertion further down states that the output never grows beyond len
 * bytes, so len + 1 bytes are enough. */
1574 result = g_malloc (len + 1);
1577 for (in = escaped, in_end = escaped + len; in < in_end; in++)
1583 /* catch partial escape sequences past the end of the substring */
1584 if (in + 3 > in_end)
/* in points at the escape introducer; the two hex digits follow it. */
1587 c = unescape_character (in + 1);
1589 /* catch bad escape sequences and NUL characters */
1593 /* catch escaped ASCII */
1594 if (ascii_must_not_be_escaped && c <= 0x7F)
1597 /* catch other illegal escaped characters */
1598 if (strchr (illegal_escaped_characters, c) != NULL)
1607 g_assert (out - result <= len);
/* True when c is within the 7-bit ASCII range and g_ascii_isalnum() accepts
 * it. The range test comes first so that g_ascii_isalnum() is only called
 * with values up to 0x7F. */
1620 is_asciialphanum (gunichar c)
1622 return c <= 0x7F && g_ascii_isalnum (c);
/* True when c is within the 7-bit ASCII range and g_ascii_isalpha() accepts
 * it. The range test comes first so that g_ascii_isalpha() is only called
 * with values up to 0x7F. */
1626 is_asciialpha (gunichar c)
1628 return c <= 0x7F && g_ascii_isalpha (c);
1631 /* allows an empty string */
/* Checks hostname label by label, reading it as UTF-8 with g_utf8_get_char().
 * Visible checks:
 * - the first character of a label is tested with is_asciialphanum();
 * - the rest of a label is consumed while characters are ASCII alphanumerics
 *   or hyphens;
 * - the last character of a label is tested for being a hyphen;
 * - when the string ends after a label, optionally followed by one trailing
 *   dot, the result is whether that final label began with an ASCII letter.
 * NOTE(review): the outer loop, the assignments of first_char and last_char
 * and the statements run when a check fails are not visible in this listing. */
1633 hostname_validate (const char *hostname)
1636 gunichar c, first_char, last_char;
1643 /* read in a label */
1644 c = g_utf8_get_char (p);
1645 p = g_utf8_next_char (p);
1646 if (!is_asciialphanum (c))
1652 c = g_utf8_get_char (p);
1653 p = g_utf8_next_char (p);
1655 while (is_asciialphanum (c) || c == '-');
1656 if (last_char == '-')
1659 /* if that was the last label, check that it was a toplabel */
1660 if (c == '\0' || (c == '.' && *p == '\0'))
1661 return is_asciialpha (first_char);
1668 * g_filename_from_uri:
1669 * @uri: a uri describing a filename (escaped, encoded in ASCII).
1670 * @hostname: Location to store hostname for the URI, or %NULL.
1671 * If there is no hostname in the URI, %NULL will be
1672 * stored in this location.
1673 * @error: location to store the error occurring, or %NULL to ignore
1674 * errors. Any of the errors in #GConvertError may occur.
1676 * Converts an escaped ASCII-encoded URI to a local filename in the
1677 * encoding used for filenames.
1679 * Return value: a newly-allocated string holding the resulting
1680 * filename, or %NULL on an error.
/* NOTE(review): several lines of this definition (return type, remaining
 * parameters, some declarations, braces, branch bodies and the final return)
 * are not visible in this listing; the comments below describe only what the
 * visible lines do. */
1683 g_filename_from_uri (const gchar *uri,
1687 const char *path_part;
1688 const char *host_part;
1689 char *unescaped_hostname;
/* Only URIs whose text begins with "file:/" are accepted; has_case_prefix()
 * makes the comparison ASCII case-insensitive. */
1700 if (!has_case_prefix (uri, "file:/"))
1702 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1703 _("The URI '%s' is not an absolute URI using the \"file\" scheme"),
/* Step over the scheme; path_part now starts at the slash after "file:". */
1708 path_part = uri + strlen ("file:");
/* A fragment marker anywhere in the remainder is reported as a bad URI. */
1710 if (strchr (path_part, '#') != NULL)
1712 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1713 _("The local file URI '%s' may not include a '#'"),
/* Distinguish a URI with three leading slashes from one with two; in the
 * latter case host_part is recorded and the next slash starts the path. */
1718 if (has_case_prefix (path_part, "///"))
1720 else if (has_case_prefix (path_part, "//"))
1723 host_part = path_part;
1725 path_part = strchr (path_part, '/');
1727 if (path_part == NULL)
1729 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1730 _("The URI '%s' is invalid"),
/* Unescape only the bytes between host_part and path_part. The final TRUE
 * asks g_unescape_uri_string() to catch escaped ASCII characters. */
1735 unescaped_hostname = g_unescape_uri_string (host_part, path_part - host_part, "", TRUE);
1737 if (unescaped_hostname == NULL ||
1738 !hostname_validate (unescaped_hostname))
1740 g_free (unescaped_hostname);
1741 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1742 _("The hostname of the URI '%s' is invalid"),
/* The host name is either stored through hostname or freed; the condition
 * selecting between the two is not visible in this listing. */
1748 *hostname = unescaped_hostname;
1750 g_free (unescaped_hostname);
/* Unescape the whole path (length -1); "/" is passed as the set of
 * characters that are checked for after decoding an escape. */
1753 filename = g_unescape_uri_string (path_part, -1, "/", FALSE);
1755 if (filename == NULL)
1757 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1758 _("The URI '%s' contains invalidly escaped characters"),
1765 /* Drop localhost */
1766 if (hostname && *hostname != NULL &&
1767 g_ascii_strcasecmp (*hostname, "localhost") == 0)
1773 /* Turn slashes into backslashes, because that's the canonical spelling */
1775 while ((slash = strchr (p, '/')) != NULL)
1781 /* Windows URIs with a drive letter can be like "file://host/c:/foo"
1782 * or "file://host/c|/foo" (some Netscape versions). In those cases, start
1783 * the filename from the drive letter.
1785 if (g_ascii_isalpha (filename[1]))
1787 if (filename[2] == ':')
1789 else if (filename[2] == '|')
1797 result = g_strdup (filename + offs);
1805 #undef g_filename_from_uri
/* Second definition of g_filename_from_uri, placed after the #undef.
 * It obtains a filename from g_filename_from_uri_utf8(), converts it with
 * g_locale_from_utf8() and frees the intermediate string.
 * NOTE(review): presumably the binary compatibility entry point for callers
 * expecting a locale-encoded filename; the guard around the conversion and
 * the return statement are not visible in this listing. */
1808 g_filename_from_uri (const gchar *uri,
1812 gchar *utf8_filename;
1813 gchar *retval = NULL;
1815 utf8_filename = g_filename_from_uri_utf8 (uri, hostname, error);
1818 retval = g_locale_from_utf8 (utf8_filename, -1, NULL, NULL, error);
1819 g_free (utf8_filename);
1827 * g_filename_to_uri:
1828 * @filename: an absolute filename specified in the GLib file name encoding,
1829 * which is the on-disk file name bytes on Unix, and UTF-8 on
1831 * @hostname: A UTF-8 encoded hostname, or %NULL for none.
1832 * @error: location to store the error occurring, or %NULL to ignore
1833 * errors. Any of the errors in #GConvertError may occur.
1835 * Converts an absolute filename to an escaped ASCII-encoded URI, with the path
1836 * component following Section 3.3. of RFC 2396.
1838 * Return value: a newly-allocated string holding the resulting
1839 * URI, or %NULL on an error.
/* Visible steps: reject a NULL filename, require an absolute path, validate
 * the host name as UTF-8 and with hostname_validate(), compare the host name
 * with "localhost" ignoring ASCII case, then build the URI with
 * g_escape_file_uri().
 * NOTE(review): the remaining parameter, the start of the host name
 * condition, the branch bodies and the return are not visible in this
 * listing. */
1842 g_filename_to_uri (const gchar *filename,
1843 const gchar *hostname,
1848 g_return_val_if_fail (filename != NULL, NULL);
/* A relative path is reported as G_CONVERT_ERROR_NOT_ABSOLUTE_PATH. */
1850 if (!g_path_is_absolute (filename))
1852 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
1853 _("The pathname '%s' is not an absolute path"),
1859 !(g_utf8_validate (hostname, -1, NULL)
1860 && hostname_validate (hostname)))
1862 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1863 _("Invalid hostname"));
1868 /* Don't use localhost unnecessarily */
1869 if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0)
/* Escaping of both components is delegated to g_escape_file_uri(). */
1873 escaped_uri = g_escape_file_uri (hostname, filename);
1880 #undef g_filename_to_uri
/* Second definition of g_filename_to_uri, placed after the #undef.
 * It converts filename with g_locale_to_utf8(), hands the outcome to
 * g_filename_to_uri_utf8() and frees the intermediate string.
 * NOTE(review): presumably the binary compatibility entry point for callers
 * passing a locale-encoded filename; the guard around the second call and
 * the return statement are not visible in this listing. */
1883 g_filename_to_uri (const gchar *filename,
1884 const gchar *hostname,
1887 gchar *utf8_filename;
1888 gchar *retval = NULL;
1890 utf8_filename = g_locale_to_utf8 (filename, -1, NULL, NULL, error);
1894 retval = g_filename_to_uri_utf8 (utf8_filename, hostname, error);
1895 g_free (utf8_filename);
1904 * g_uri_list_extract_uris:
1905 * @uri_list: a URI list
1907 * Splits a URI list conforming to the text/uri-list
1908 * mime type defined in RFC 2483 into individual URIs,
1909 * discarding any comments. The URIs are not validated.
1911 * Returns: a newly allocated %NULL-terminated list of
1912 * strings holding the individual URIs. The array should
1913 * be freed with g_strfreev().
/* Implementation outline, from the visible lines:
 * - leading whitespace of each entry is skipped, the entry runs up to the
 *   next LF or CR, and trailing whitespace is trimmed by moving q backwards;
 * - each entry is copied with g_strndup() and prepended to a GSList, so the
 *   list holds the entries in reverse order;
 * - the result array has n_uris + 1 slots; it is filled from the last slot
 *   towards the first (NULL terminator first), which restores input order;
 * - the copied strings are moved into the array, and g_slist_free() then
 *   releases only the list nodes.
 * NOTE(review): the local declarations, the comment-line test, the counting
 * of n_uris and the return are not visible in this listing. */
1918 g_uri_list_extract_uris (const gchar *uri_list)
1929 /* We don't actually try to validate the URI according to RFC
1930 * 2396, or even check for allowed characters - we just ignore
1931 * comments and trim whitespace off the ends. We also
1932 * allow LF delimination as well as the specified CRLF.
1934 * We do allow comments like specified in RFC 2483.
1940 while (g_ascii_isspace (*p))
1944 while (*q && (*q != '\n') && (*q != '\r'))
1950 while (q > p && g_ascii_isspace (*q))
1955 uris = g_slist_prepend (uris, g_strndup (p, q - p + 1));
1960 p = strchr (p, '\n');
1965 result = g_new (gchar *, n_uris + 1);
1967 result[n_uris--] = NULL;
1968 for (u = uris; u; u = u->next)
1969 result[n_uris--] = u->data;
1971 g_slist_free (uris);
1977 * g_filename_display_basename:
1978 * @filename: an absolute pathname in the GLib file name encoding
1980 * Returns the display basename for the particular filename, guaranteed
1981 * to be valid UTF-8. The display name might not be identical to the filename,
1982 * for instance there might be problems converting it to UTF-8, and some files
1983 * can be translated in the display.
1985 * If GLib can not make sense of the encoding of @filename, as a last resort it
1986 * replaces unknown characters with U+FFFD, the Unicode replacement character.
1987 * You can search the result for the UTF-8 encoding of this character (which is
1988 * "\357\277\275" in octal notation) to find out if @filename was in an invalid
1991 * You must pass the whole absolute pathname to this function so that
1992 * translation of well known locations can be done.
1994 * This function is preferred over g_filename_display_name() if you know the
1995 * whole path, as it allows translation.
1997 * Return value: a newly allocated string containing
1998 * a rendition of the basename of the filename in valid UTF-8
/* Takes the last path component with g_path_get_basename() and converts it
 * for display with g_filename_display_name(). A NULL filename makes
 * g_return_val_if_fail() return NULL.
 * NOTE(review): g_path_get_basename() returns a newly allocated string; the
 * line releasing basename is not visible in this listing, so confirm it is
 * freed before the return. */
2003 g_filename_display_basename (const gchar *filename)
2008 g_return_val_if_fail (filename != NULL, NULL);
2010 basename = g_path_get_basename (filename);
2011 display_name = g_filename_display_name (basename);
2013 return display_name;
2017 * g_filename_display_name:
2018 * @filename: a pathname hopefully in the GLib file name encoding
2020 * Converts a filename into a valid UTF-8 string. The conversion is
2021 * not necessarily reversible, so you should keep the original around
2022 * and use the return value of this function only for display purposes.
2023 * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL
2024 * even if the filename actually isn't in the GLib file name encoding.
2026 * If GLib can not make sense of the encoding of @filename, as a last resort it
2027 * replaces unknown characters with U+FFFD, the Unicode replacement character.
2028 * You can search the result for the UTF-8 encoding of this character (which is
2029 * "\357\277\275" in octal notation) to find out if @filename was in an invalid
2032 * If you know the whole pathname of the file you should use
2033 * g_filename_display_basename(), since that allows location-based
2034 * translation of filenames.
2036 * Return value: a newly allocated string containing
2037 * a rendition of the filename in valid UTF-8
/* Tries three sources for the display string, in the order shown below:
 * the filename itself when it validates as UTF-8, a conversion from each
 * filename charset in turn, and finally _g_utf8_make_valid().
 * NOTE(review): the conditions guarding each stage and some declarations are
 * not visible in this listing. */
2042 g_filename_display_name (const gchar *filename)
2045 const gchar **charsets;
2046 gchar *display_name = NULL;
/* is_utf8 is used below to skip charsets[0] when it is UTF-8. */
2049 is_utf8 = g_get_filename_charsets (&charsets);
/* Valid UTF-8 input is copied as is. */
2053 if (g_utf8_validate (filename, -1, NULL))
2054 display_name = g_strdup (filename);
2059 /* Try to convert from the filename charsets to UTF-8.
2060 * Skip the first charset if it is UTF-8.
2062 for (i = is_utf8 ? 1 : 0; charsets[i]; i++)
2064 display_name = g_convert (filename, -1, "UTF-8", charsets[i],
2072 /* if all conversions failed, we replace invalid UTF-8
2073 * by a question mark
2076 display_name = _g_utf8_make_valid (filename);
2078 return display_name;
2081 #define __G_CONVERT_C__
2082 #include "galiasdef.c"