X-Git-Url: http://review.tizen.org/git/?a=blobdiff_plain;f=glib%2Fgconvert.c;h=1d55fda1342e892ac3a32beae5c5ea923e61a192;hb=35eaf037bdfca985abf5d349e7355f1d2ed9c77b;hp=5b0bb14fe501c4147b0a8c8ef607cb6b0e87777c;hpb=11a3e72c3a352b8044db7671f8b20c681bf93f4c;p=platform%2Fupstream%2Fglib.git diff --git a/glib/gconvert.c b/glib/gconvert.c index 5b0bb14..1d55fda 100644 --- a/glib/gconvert.c +++ b/glib/gconvert.c @@ -2,7 +2,7 @@ * * gconvert.c: Convert between character sets using iconv * Copyright Red Hat Inc., 2000 - * Authors: Havoc Pennington , Owen Taylor , Owen Taylor * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -15,22 +15,23 @@ * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. + * License along with this library; if not, see . 
*/ #include "config.h" +#include "glibconfig.h" +#ifndef G_OS_WIN32 #include +#endif #include #include #include #include -#include "glib.h" -#include "gprintfint.h" -#include "gthreadinit.h" +#ifdef G_OS_WIN32 +#include "win_iconv.c" +#endif #ifdef G_PLATFORM_WIN32 #define STRICT @@ -38,26 +39,134 @@ #undef STRICT #endif +#include "gconvert.h" + +#include "gcharsetprivate.h" +#include "gslist.h" +#include "gstrfuncs.h" +#include "gtestutils.h" +#include "gthread.h" +#include "gunicode.h" +#include "gfileutils.h" + #include "glibintl.h" #if defined(USE_LIBICONV_GNU) && !defined (_LIBICONV_H) #error GNU libiconv in use but included iconv.h not from libiconv #endif -#if !defined(USE_LIBICONV_GNU) && defined (_LIBICONV_H) +#if !defined(USE_LIBICONV_GNU) && defined (_LIBICONV_H) \ + && !defined (__APPLE_CC__) && !defined (__LP_64__) #error GNU libiconv not in use but included iconv.h is from libiconv #endif -#include "galias.h" -GQuark -g_convert_error_quark (void) -{ - static GQuark quark; - if (!quark) - quark = g_quark_from_static_string ("g_convert_error"); +/** + * SECTION:conversions + * @title: Character Set Conversion + * @short_description: convert strings between different character sets + * + * The g_convert() family of function wraps the functionality of iconv(). + * In addition to pure character set conversions, GLib has functions to + * deal with the extra complications of encodings for file names. + * + * ## File Name Encodings + * + * Historically, UNIX has not had a defined encoding for file names: + * a file name is valid as long as it does not have path separators + * in it ("/"). However, displaying file names may require conversion: + * from the character set in which they were created, to the character + * set in which the application operates. Consider the Spanish file name + * "Presentación.sxi". If the application which created it uses + * ISO-8859-1 for its encoding, + * |[ + * Character: P r e s e n t a c i ó n . 
s x i + * Hex code: 50 72 65 73 65 6e 74 61 63 69 f3 6e 2e 73 78 69 + * ]| + * However, if the application uses UTF-8, the actual file name on + * disk would look like this: + * |[ + * Character: P r e s e n t a c i ó n . s x i + * Hex code: 50 72 65 73 65 6e 74 61 63 69 c3 b3 6e 2e 73 78 69 + * ]| + * GLib uses UTF-8 for its strings, and GUI toolkits like GTK+ that use + * GLib do the same thing. If you get a file name from the file system, + * for example, from readdir() or from g_dir_read_name(), and you wish + * to display the file name to the user, you will need to convert it + * into UTF-8. The opposite case is when the user types the name of a + * file he wishes to save: the toolkit will give you that string in + * UTF-8 encoding, and you will need to convert it to the character + * set used for file names before you can create the file with open() + * or fopen(). + * + * By default, GLib assumes that file names on disk are in UTF-8 + * encoding. This is a valid assumption for file systems which + * were created relatively recently: most applications use UTF-8 + * encoding for their strings, and that is also what they use for + * the file names they create. However, older file systems may + * still contain file names created in "older" encodings, such as + * ISO-8859-1. In this case, for compatibility reasons, you may want + * to instruct GLib to use that particular encoding for file names + * rather than UTF-8. You can do this by specifying the encoding for + * file names in the [`G_FILENAME_ENCODING`][G_FILENAME_ENCODING] + * environment variable. For example, if your installation uses + * ISO-8859-1 for file names, you can put this in your `~/.profile` + * |[ + * export G_FILENAME_ENCODING=ISO-8859-1 + * ]| + * GLib provides the functions g_filename_to_utf8() and + * g_filename_from_utf8() to perform the necessary conversions. + * These functions convert file names from the encoding specified + * in `G_FILENAME_ENCODING` to UTF-8 and vice-versa. 
This + * [diagram][file-name-encodings-diagram] illustrates how + * these functions are used to convert between UTF-8 and the + * encoding for file names in the file system. + * + * ## Conversion between file name encodings # {#file-name-encodings-diagram} + * + * ![](file-name-encodings.png) + * + * ## Checklist for Application Writers + * + * This section is a practical summary of the detailed + + * things to do to make sure your applications process file + * name encodings correctly. + * + * 1. If you get a file name from the file system from a function + * such as readdir() or gtk_file_chooser_get_filename(), you do + * not need to do any conversion to pass that file name to + * functions like open(), rename(), or fopen() -- those are "raw" + * file names which the file system understands. + * + * 2. If you need to display a file name, convert it to UTF-8 first + * by using g_filename_to_utf8(). If conversion fails, display a + * string like "Unknown file name". Do not convert this string back + * into the encoding used for file names if you wish to pass it to + * the file system; use the original file name instead. + * + * For example, the document window of a word processor could display + * "Unknown file name" in its title bar but still let the user save + * the file, as it would keep the raw file name internally. This + * can happen if the user has not set the `G_FILENAME_ENCODING` + * environment variable even though he has files whose names are + * not encoded in UTF-8. + * + * 3. If your user interface lets the user type a file name for saving + * or renaming, convert it to the encoding used for file names in + * the file system by using g_filename_from_utf8(). Pass the converted + * file name to functions like fopen(). If conversion fails, ask the + * user to enter a different file name. This can happen if the user + * types Japanese characters when `G_FILENAME_ENCODING` is set to + * `ISO-8859-1`, for example. 
+ */ - return quark; -} +/* We try to terminate strings in unknown charsets with this many zero bytes + * to ensure that multibyte strings really are nul-terminated when we return + * them from g_convert() and friends. + */ +#define NUL_TERMINATOR_LENGTH 4 + +G_DEFINE_QUARK (g_convert_error, g_convert_error) static gboolean try_conversion (const char *to_codeset, @@ -92,8 +201,6 @@ try_to_aliases (const char **to_aliases, return FALSE; } -extern const char **_g_charset_get_aliases (const char *canonical_name) G_GNUC_INTERNAL; - /** * g_iconv_open: * @to_codeset: destination codeset @@ -106,7 +213,7 @@ extern const char **_g_charset_get_aliases (const char *canonical_name) G_GNUC_I * GLib provides g_convert() and g_locale_to_utf8() which are likely * more convenient than the raw iconv wrappers. * - * Return value: a "conversion descriptor", or (GIConv)-1 if + * Returns: a "conversion descriptor", or (GIConv)-1 if * opening the converter failed. **/ GIConv @@ -158,9 +265,9 @@ g_iconv_open (const gchar *to_codeset, * GLib provides g_convert() and g_locale_to_utf8() which are likely * more convenient than the raw iconv wrappers. * - * Return value: count of non-reversible conversions, or -1 on error + * Returns: count of non-reversible conversions, or -1 on error **/ -size_t +gsize g_iconv (GIConv converter, gchar **inbuf, gsize *inbytes_left, @@ -185,7 +292,7 @@ g_iconv (GIConv converter, * GLib provides g_convert() and g_locale_to_utf8() which are likely * more convenient than the raw iconv wrappers. 
* - * Return value: -1 on error, 0 on success + * Returns: -1 on error, 0 on success **/ gint g_iconv_close (GIConv converter) @@ -195,278 +302,6 @@ g_iconv_close (GIConv converter) return iconv_close (cd); } - -#ifdef NEED_ICONV_CACHE - -#define ICONV_CACHE_SIZE (16) - -struct _iconv_cache_bucket { - gchar *key; - guint32 refcount; - gboolean used; - GIConv cd; -}; - -static GList *iconv_cache_list; -static GHashTable *iconv_cache; -static GHashTable *iconv_open_hash; -static guint iconv_cache_size = 0; -G_LOCK_DEFINE_STATIC (iconv_cache_lock); - -/* caller *must* hold the iconv_cache_lock */ -static void -iconv_cache_init (void) -{ - static gboolean initialized = FALSE; - - if (initialized) - return; - - iconv_cache_list = NULL; - iconv_cache = g_hash_table_new (g_str_hash, g_str_equal); - iconv_open_hash = g_hash_table_new (g_direct_hash, g_direct_equal); - - initialized = TRUE; -} - - -/** - * iconv_cache_bucket_new: - * @key: cache key - * @cd: iconv descriptor - * - * Creates a new cache bucket, inserts it into the cache and - * increments the cache size. - * - * Returns a pointer to the newly allocated cache bucket. - **/ -static struct _iconv_cache_bucket * -iconv_cache_bucket_new (const gchar *key, GIConv cd) -{ - struct _iconv_cache_bucket *bucket; - - bucket = g_new (struct _iconv_cache_bucket, 1); - bucket->key = g_strdup (key); - bucket->refcount = 1; - bucket->used = TRUE; - bucket->cd = cd; - - g_hash_table_insert (iconv_cache, bucket->key, bucket); - - /* FIXME: if we sorted the list so items with few refcounts were - first, then we could expire them faster in iconv_cache_expire_unused () */ - iconv_cache_list = g_list_prepend (iconv_cache_list, bucket); - - iconv_cache_size++; - - return bucket; -} - - -/** - * iconv_cache_bucket_expire: - * @node: cache bucket's node - * @bucket: cache bucket - * - * Expires a single cache bucket @bucket. This should only ever be - * called on a bucket that currently has no used iconv descriptors - * open. 
- * - * @node is not a required argument. If @node is not supplied, we - * search for it ourselves. - **/ -static void -iconv_cache_bucket_expire (GList *node, struct _iconv_cache_bucket *bucket) -{ - g_hash_table_remove (iconv_cache, bucket->key); - - if (node == NULL) - node = g_list_find (iconv_cache_list, bucket); - - g_assert (node != NULL); - - if (node->prev) - { - node->prev->next = node->next; - if (node->next) - node->next->prev = node->prev; - } - else - { - iconv_cache_list = node->next; - if (node->next) - node->next->prev = NULL; - } - - g_list_free_1 (node); - - g_free (bucket->key); - g_iconv_close (bucket->cd); - g_free (bucket); - - iconv_cache_size--; -} - - -/** - * iconv_cache_expire_unused: - * - * Expires as many unused cache buckets as it needs to in order to get - * the total number of buckets < ICONV_CACHE_SIZE. - **/ -static void -iconv_cache_expire_unused (void) -{ - struct _iconv_cache_bucket *bucket; - GList *node, *next; - - node = iconv_cache_list; - while (node && iconv_cache_size >= ICONV_CACHE_SIZE) - { - next = node->next; - - bucket = node->data; - if (bucket->refcount == 0) - iconv_cache_bucket_expire (node, bucket); - - node = next; - } -} - -static GIConv -open_converter (const gchar *to_codeset, - const gchar *from_codeset, - GError **error) -{ - struct _iconv_cache_bucket *bucket; - gchar *key; - GIConv cd; - - /* create our key */ - key = g_alloca (strlen (from_codeset) + strlen (to_codeset) + 2); - _g_sprintf (key, "%s:%s", from_codeset, to_codeset); - - G_LOCK (iconv_cache_lock); - - /* make sure the cache has been initialized */ - iconv_cache_init (); - - bucket = g_hash_table_lookup (iconv_cache, key); - if (bucket) - { - if (bucket->used) - { - cd = g_iconv_open (to_codeset, from_codeset); - if (cd == (GIConv) -1) - goto error; - } - else - { - /* Apparently iconv on Solaris <= 7 segfaults if you pass in - * NULL for anything but inbuf; work around that. (NULL outbuf - * or NULL *outbuf is allowed by Unix98.) 
- */ - gsize inbytes_left = 0; - gchar *outbuf = NULL; - gsize outbytes_left = 0; - - cd = bucket->cd; - bucket->used = TRUE; - - /* reset the descriptor */ - g_iconv (cd, NULL, &inbytes_left, &outbuf, &outbytes_left); - } - - bucket->refcount++; - } - else - { - cd = g_iconv_open (to_codeset, from_codeset); - if (cd == (GIConv) -1) - goto error; - - iconv_cache_expire_unused (); - - bucket = iconv_cache_bucket_new (key, cd); - } - - g_hash_table_insert (iconv_open_hash, cd, bucket->key); - - G_UNLOCK (iconv_cache_lock); - - return cd; - - error: - - G_UNLOCK (iconv_cache_lock); - - /* Something went wrong. */ - if (error) - { - if (errno == EINVAL) - g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION, - _("Conversion from character set '%s' to '%s' is not supported"), - from_codeset, to_codeset); - else - g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, - _("Could not open converter from '%s' to '%s'"), - from_codeset, to_codeset); - } - - return cd; -} - -static int -close_converter (GIConv converter) -{ - struct _iconv_cache_bucket *bucket; - const gchar *key; - GIConv cd; - - cd = converter; - - if (cd == (GIConv) -1) - return 0; - - G_LOCK (iconv_cache_lock); - - key = g_hash_table_lookup (iconv_open_hash, cd); - if (key) - { - g_hash_table_remove (iconv_open_hash, cd); - - bucket = g_hash_table_lookup (iconv_cache, key); - g_assert (bucket); - - bucket->refcount--; - - if (cd == bucket->cd) - bucket->used = FALSE; - else - g_iconv_close (cd); - - if (!bucket->refcount && iconv_cache_size > ICONV_CACHE_SIZE) - { - /* expire this cache bucket */ - iconv_cache_bucket_expire (NULL, bucket); - } - } - else - { - G_UNLOCK (iconv_cache_lock); - - g_warning ("This iconv context wasn't opened using open_converter"); - - return g_iconv_close (converter); - } - - G_UNLOCK (iconv_cache_lock); - - return 0; -} - -#else /* !NEED_ICONV_CACHE */ - static GIConv open_converter (const gchar *to_codeset, const gchar *from_codeset, @@ -504,13 +339,13 
@@ close_converter (GIConv cd) return g_iconv_close (cd); } -#endif /* NEED_ICONV_CACHE */ - /** * g_convert_with_iconv: * @str: the string to convert - * @len: the length of the string, or -1 if the string is - * nul-terminated. + * @len: the length of the string in bytes, or -1 if the string is + * nul-terminated (Note that some encodings may allow nul + * bytes to occur inside strings. In that case, using -1 + * for the @len parameter is unsafe) * @converter: conversion descriptor from g_iconv_open() * @bytes_read: location to store the number of bytes in the * input string that were successfully converted, or %NULL. @@ -522,27 +357,22 @@ close_converter (GIConv cd) * input sequence. * @bytes_written: the number of bytes stored in the output buffer (not * including the terminating nul). - * @error: location to store the error occuring, or %NULL to ignore + * @error: location to store the error occurring, or %NULL to ignore * errors. Any of the errors in #GConvertError may occur. * * Converts a string from one character set to another. * - * Note that you should use g_iconv() for streaming - * conversions - * + * Note that you should use g_iconv() for streaming conversions. * Despite the fact that @byes_read can return information about partial - * characters, the g_convert_... functions - * are not generally suitable for streaming. If the underlying converter - * being used maintains internal state, then this won't be preserved - * across successive calls to g_convert(), g_convert_with_iconv() or - * g_convert_with_fallback(). (An example of this is the GNU C converter - * for CP1255 which does not emit a base character until it knows that - * the next character is not a mark that could combine with the base - * character.) - * - * . + * characters, the g_convert_... functions are not generally suitable + * for streaming. 
If the underlying converter maintains internal state, + * then this won't be preserved across successive calls to g_convert(), + * g_convert_with_iconv() or g_convert_with_fallback(). (An example of + * this is the GNU C converter for CP1255 which does not emit a base + * character until it knows that the next character is not a mark that + * could combine with the base character.) * - * Return value: If the conversion was successful, a newly allocated + * Returns: If the conversion was successful, a newly allocated * nul-terminated string, which must be freed with * g_free(). Otherwise %NULL and @error will be set. **/ @@ -557,13 +387,13 @@ g_convert_with_iconv (const gchar *str, gchar *dest; gchar *outp; const gchar *p; - const gchar *shift_p = NULL; gsize inbytes_remaining; gsize outbytes_remaining; gsize err; gsize outbuf_size; gboolean have_error = FALSE; gboolean done = FALSE; + gboolean reset = FALSE; g_return_val_if_fail (converter != (GIConv) -1, NULL); @@ -572,16 +402,19 @@ g_convert_with_iconv (const gchar *str, p = str; inbytes_remaining = len; - outbuf_size = len + 1; /* + 1 for nul in case len == 1 */ + outbuf_size = len + NUL_TERMINATOR_LENGTH; - outbytes_remaining = outbuf_size - 1; /* -1 for nul */ + outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH; outp = dest = g_malloc (outbuf_size); while (!done && !have_error) { - err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining); + if (reset) + err = g_iconv (converter, NULL, &inbytes_remaining, &outp, &outbytes_remaining); + else + err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining); - if (err == (size_t) -1) + if (err == (gsize) -1) { switch (errno) { @@ -591,37 +424,38 @@ g_convert_with_iconv (const gchar *str, break; case E2BIG: { - size_t used = outp - dest; + gsize used = outp - dest; outbuf_size *= 2; dest = g_realloc (dest, outbuf_size); outp = dest + used; - outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */ + 
outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH; } break; case EILSEQ: - if (error) - g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, - _("Invalid byte sequence in conversion input")); + g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, + _("Invalid byte sequence in conversion input")); have_error = TRUE; break; default: - if (error) - g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, - _("Error during conversion: %s"), - g_strerror (errno)); + { + int errsv = errno; + + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, + _("Error during conversion: %s"), + g_strerror (errsv)); + } have_error = TRUE; break; } } else { - if (!shift_p) + if (!reset) { /* call g_iconv with NULL inbuf to cleanup shift state */ - shift_p = p; - p = NULL; + reset = TRUE; inbytes_remaining = 0; } else @@ -629,10 +463,7 @@ g_convert_with_iconv (const gchar *str, } } - if (shift_p) - p = shift_p; - - *outp = '\0'; + memset (outp, 0, NUL_TERMINATOR_LENGTH); if (bytes_read) *bytes_read = p - str; @@ -642,9 +473,8 @@ g_convert_with_iconv (const gchar *str, { if (!have_error) { - if (error) - g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, - _("Partial character sequence at end of input")); + g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, + _("Partial character sequence at end of input")); have_error = TRUE; } } @@ -665,17 +495,13 @@ g_convert_with_iconv (const gchar *str, /** * g_convert: * @str: the string to convert - * @len: the length of the string, or -1 if the string is - * nul-terminated - - Note that some encodings may allow nul bytes to - occur inside strings. In that case, using -1 for - the @len parameter is unsafe. - - . + * @len: the length of the string in bytes, or -1 if the string is + * nul-terminated (Note that some encodings may allow nul + * bytes to occur inside strings. 
In that case, using -1 + * for the @len parameter is unsafe) * @to_codeset: name of character set into which to convert @str * @from_codeset: character set of @str. - * @bytes_read: location to store the number of bytes in the + * @bytes_read: (out): location to store the number of bytes in the * input string that were successfully converted, or %NULL. * Even if the conversion was successful, this may be * less than @len if there were partial characters @@ -683,17 +509,27 @@ g_convert_with_iconv (const gchar *str, * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value * stored will the byte offset after the last valid * input sequence. - * @bytes_written: the number of bytes stored in the output buffer (not + * @bytes_written: (out): the number of bytes stored in the output buffer (not * including the terminating nul). - * @error: location to store the error occuring, or %NULL to ignore + * @error: location to store the error occurring, or %NULL to ignore * errors. Any of the errors in #GConvertError may occur. * * Converts a string from one character set to another. * - * Note that you should use g_iconv() for streaming - * conversions. + * Note that you should use g_iconv() for streaming conversions. + * Despite the fact that @bytes_read can return information about partial + * characters, the g_convert_... functions are not generally suitable + * for streaming. If the underlying converter maintains internal state, + * then this won't be preserved across successive calls to g_convert(), + * g_convert_with_iconv() or g_convert_with_fallback(). (An example of + * this is the GNU C converter for CP1255 which does not emit a base + * character until it knows that the next character is not a mark that + * could combine with the base character.) + * + * Using extensions such as "//TRANSLIT" may not work (or may not work + * well) on many platforms. Consider using g_str_to_ascii() instead. 
* - * Return value: If the conversion was successful, a newly allocated + * Returns: If the conversion was successful, a newly allocated * nul-terminated string, which must be freed with * g_free(). Otherwise %NULL and @error will be set. **/ @@ -738,8 +574,10 @@ g_convert (const gchar *str, /** * g_convert_with_fallback: * @str: the string to convert - * @len: the length of the string, or -1 if the string is - * nul-terminated. + * @len: the length of the string in bytes, or -1 if the string is + * nul-terminated (Note that some encodings may allow nul + * bytes to occur inside strings. In that case, using -1 + * for the @len parameter is unsafe) * @to_codeset: name of character set into which to convert @str * @from_codeset: character set of @str. * @fallback: UTF-8 string to use in place of character not @@ -754,21 +592,28 @@ g_convert (const gchar *str, * at the end of the input. * @bytes_written: the number of bytes stored in the output buffer (not * including the terminating nul). - * @error: location to store the error occuring, or %NULL to ignore + * @error: location to store the error occurring, or %NULL to ignore * errors. Any of the errors in #GConvertError may occur. * * Converts a string from one character set to another, possibly * including fallback sequences for characters not representable * in the output. Note that it is not guaranteed that the specification * for the fallback sequences in @fallback will be honored. Some - * systems may do a approximate conversion from @from_codeset + * systems may do an approximate conversion from @from_codeset * to @to_codeset in their iconv() functions, * in which case GLib will simply return that approximate conversion. * - * Note that you should use g_iconv() for streaming - * conversions. + * Note that you should use g_iconv() for streaming conversions. + * Despite the fact that @byes_read can return information about partial + * characters, the g_convert_... 
functions are not generally suitable + * for streaming. If the underlying converter maintains internal state, + * then this won't be preserved across successive calls to g_convert(), + * g_convert_with_iconv() or g_convert_with_fallback(). (An example of + * this is the GNU C converter for CP1255 which does not emit a base + * character until it knows that the next character is not a mark that + * could combine with the base character.) * - * Return value: If the conversion was successful, a newly allocated + * Returns: If the conversion was successful, a newly allocated * nul-terminated string, which must be freed with * g_free(). Otherwise %NULL and @error will be set. **/ @@ -777,7 +622,7 @@ g_convert_with_fallback (const gchar *str, gssize len, const gchar *to_codeset, const gchar *from_codeset, - gchar *fallback, + const gchar *fallback, gsize *bytes_read, gsize *bytes_written, GError **error) @@ -859,17 +704,17 @@ g_convert_with_fallback (const gchar *str, */ p = utf8; - outbuf_size = len + 1; /* + 1 for nul in case len == 1 */ - outbytes_remaining = outbuf_size - 1; /* -1 for nul */ + outbuf_size = len + NUL_TERMINATOR_LENGTH; + outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH; outp = dest = g_malloc (outbuf_size); while (!done && !have_error) { - size_t inbytes_tmp = inbytes_remaining; + gsize inbytes_tmp = inbytes_remaining; err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining); inbytes_remaining = inbytes_tmp; - if (err == (size_t) -1) + if (err == (gsize) -1) { switch (errno) { @@ -878,13 +723,13 @@ g_convert_with_fallback (const gchar *str, break; case E2BIG: { - size_t used = outp - dest; + gsize used = outp - dest; outbuf_size *= 2; dest = g_realloc (dest, outbuf_size); outp = dest + used; - outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */ + outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH; break; } @@ -918,9 +763,14 @@ g_convert_with_fallback (const gchar *str, } /* fall thru if p is NULL */ 
default: - g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, - _("Error during conversion: %s"), - g_strerror (errno)); + { + int errsv = errno; + + g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, + _("Error during conversion: %s"), + g_strerror (errsv)); + } + have_error = TRUE; break; } @@ -948,7 +798,7 @@ g_convert_with_fallback (const gchar *str, /* Cleanup */ - *outp = '\0'; + memset (outp, 0, NUL_TERMINATOR_LENGTH); close_converter (cd); @@ -991,8 +841,8 @@ strdup_len (const gchar *string, if (bytes_written) *bytes_written = 0; - g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, - _("Invalid byte sequence in conversion input")); + g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, + _("Invalid byte sequence in conversion input")); return NULL; } @@ -1019,7 +869,9 @@ strdup_len (const gchar *string, * @opsysstring: a string in the encoding of the current locale. On Windows * this means the system codepage. * @len: the length of the string, or -1 if the string is - * nul-terminated. + * nul-terminated (Note that some encodings may allow nul + * bytes to occur inside strings. In that case, using -1 + * for the @len parameter is unsafe) * @bytes_read: location to store the number of bytes in the * input string that were successfully converted, or %NULL. * Even if the conversion was successful, this may be @@ -1030,14 +882,15 @@ strdup_len (const gchar *string, * input sequence. * @bytes_written: the number of bytes stored in the output buffer (not * including the terminating nul). - * @error: location to store the error occuring, or %NULL to ignore + * @error: location to store the error occurring, or %NULL to ignore * errors. Any of the errors in #GConvertError may occur. * * Converts a string which is in the encoding used for strings by * the C runtime (usually the same as that used by the operating - * system) in the current locale into a UTF-8 string. 
+ * system) in the [current locale][setlocale] into a UTF-8 string. * - * Return value: The converted string, or %NULL on an error. + * Returns: A newly-allocated buffer containing the converted string, + * or %NULL on an error, and error will be set. **/ gchar * g_locale_to_utf8 (const gchar *opsysstring, @@ -1059,7 +912,9 @@ g_locale_to_utf8 (const gchar *opsysstring, * g_locale_from_utf8: * @utf8string: a UTF-8 encoded string * @len: the length of the string, or -1 if the string is - * nul-terminated. + * nul-terminated (Note that some encodings may allow nul + * bytes to occur inside strings. In that case, using -1 + * for the @len parameter is unsafe) * @bytes_read: location to store the number of bytes in the * input string that were successfully converted, or %NULL. * Even if the conversion was successful, this may be @@ -1070,14 +925,16 @@ g_locale_to_utf8 (const gchar *opsysstring, * input sequence. * @bytes_written: the number of bytes stored in the output buffer (not * including the terminating nul). - * @error: location to store the error occuring, or %NULL to ignore + * @error: location to store the error occurring, or %NULL to ignore * errors. Any of the errors in #GConvertError may occur. * * Converts a string from UTF-8 to the encoding used for strings by * the C runtime (usually the same as that used by the operating - * system) in the current locale. + * system) in the [current locale][setlocale]. On Windows this means + * the system codepage. * - * Return value: The converted string, or %NULL on an error. + * Returns: A newly-allocated buffer containing the converted string, + * or %NULL on an error, and error will be set. **/ gchar * g_locale_from_utf8 (const gchar *utf8string, @@ -1124,40 +981,40 @@ filename_charset_cache_free (gpointer data) * representation of a filename, see g_filename_display_name(). * * On Unix, the character sets are determined by consulting the - * environment variables G_FILENAME_ENCODING and - * G_BROKEN_FILENAMES. 
On Windows, the character set - * used in the GLib API is always UTF-8 and said environment variables - * have no effect. + * environment variables `G_FILENAME_ENCODING` and `G_BROKEN_FILENAMES`. + * On Windows, the character set used in the GLib API is always UTF-8 + * and said environment variables have no effect. * - * G_FILENAME_ENCODING may be set to a comma-separated list - * of character set names. The special token "@locale" is taken to mean the - * character set for the current locale. If G_FILENAME_ENCODING - * is not set, but G_BROKEN_FILENAMES is, the character set of - * the current locale is taken as the filename encoding. If neither environment - * variable is set, UTF-8 is taken as the filename encoding, but the character - * set of the current locale is also put in the list of encodings. + * `G_FILENAME_ENCODING` may be set to a comma-separated list of + * character set names. The special token "@locale" is taken + * to mean the character set for the [current locale][setlocale]. + * If `G_FILENAME_ENCODING` is not set, but `G_BROKEN_FILENAMES` is, + * the character set of the current locale is taken as the filename + * encoding. If neither environment variable is set, UTF-8 is taken + * as the filename encoding, but the character set of the current locale + * is also put in the list of encodings. * * The returned @charsets belong to GLib and must not be freed. * * Note that on Unix, regardless of the locale character set or - * G_FILENAME_ENCODING value, the actual file names present on a - * system might be in any random encoding or just gibberish. + * `G_FILENAME_ENCODING` value, the actual file names present + * on a system might be in any random encoding or just gibberish. * - * Return value: %TRUE if the filename encoding is UTF-8. + * Returns: %TRUE if the filename encoding is UTF-8. 
* * Since: 2.6 */ gboolean -g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets) +g_get_filename_charsets (const gchar ***filename_charsets) { - static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT; - GFilenameCharsetCache *cache = g_static_private_get (&cache_private); + static GPrivate cache_private = G_PRIVATE_INIT (filename_charset_cache_free); + GFilenameCharsetCache *cache = g_private_get (&cache_private); const gchar *charset; if (!cache) { cache = g_new0 (GFilenameCharsetCache, 1); - g_static_private_set (&cache_private, cache, filename_charset_cache_free); + g_private_set (&cache_private, cache); } g_get_charset (&charset); @@ -1213,7 +1070,7 @@ g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets) #else /* G_PLATFORM_WIN32 */ gboolean -g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets) +g_get_filename_charsets (const gchar ***filename_charsets) { static const gchar *charsets[] = { "UTF-8", @@ -1255,21 +1112,13 @@ get_filename_charset (const gchar **filename_charset) return is_utf8; } -/* This is called from g_thread_init(). It's used to - * initialize some static data in a threadsafe way. - */ -void -_g_convert_thread_init (void) -{ - const gchar **dummy; - (void) g_get_filename_charsets (&dummy); -} - /** * g_filename_to_utf8: * @opsysstring: a string in the encoding for filenames * @len: the length of the string, or -1 if the string is - * nul-terminated. + * nul-terminated (Note that some encodings may allow nul + * bytes to occur inside strings. In that case, using -1 + * for the @len parameter is unsafe) * @bytes_read: location to store the number of bytes in the * input string that were successfully converted, or %NULL. * Even if the conversion was successful, this may be @@ -1280,14 +1129,15 @@ _g_convert_thread_init (void) * input sequence. * @bytes_written: the number of bytes stored in the output buffer (not * including the terminating nul). 
- * @error: location to store the error occuring, or %NULL to ignore + * @error: location to store the error occurring, or %NULL to ignore * errors. Any of the errors in #GConvertError may occur. * * Converts a string which is in the encoding used by GLib for * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8 - * for filenames. + * for filenames; on other platforms, this function indirectly depends on + * the [current locale][setlocale]. * - * Return value: The converted string, or %NULL on an error. + * Returns: The converted string, or %NULL on an error. **/ gchar* g_filename_to_utf8 (const gchar *opsysstring, @@ -1298,6 +1148,8 @@ g_filename_to_utf8 (const gchar *opsysstring, { const gchar *charset; + g_return_val_if_fail (opsysstring != NULL, NULL); + if (get_filename_charset (&charset)) return strdup_len (opsysstring, len, bytes_read, bytes_written, error); else @@ -1305,11 +1157,14 @@ g_filename_to_utf8 (const gchar *opsysstring, "UTF-8", charset, bytes_read, bytes_written, error); } -#ifdef G_OS_WIN32 +#if defined (G_OS_WIN32) && !defined (_WIN64) #undef g_filename_to_utf8 -/* Binary compatibility version. Not for newly compiled code. */ +/* Binary compatibility version. Not for newly compiled code. Also not needed for + * 64-bit versions as there should be no old deployed binaries that would use + * the old versions. + */ gchar* g_filename_to_utf8 (const gchar *opsysstring, @@ -1320,6 +1175,8 @@ g_filename_to_utf8 (const gchar *opsysstring, { const gchar *charset; + g_return_val_if_fail (opsysstring != NULL, NULL); + if (g_get_charset (&charset)) return strdup_len (opsysstring, len, bytes_read, bytes_written, error); else @@ -1334,23 +1191,26 @@ g_filename_to_utf8 (const gchar *opsysstring, * @utf8string: a UTF-8 encoded string. * @len: the length of the string, or -1 if the string is * nul-terminated. - * @bytes_read: location to store the number of bytes in the - * input string that were successfully converted, or %NULL. 
+ * @bytes_read: (out) (allow-none): location to store the number of bytes in + * the input string that were successfully converted, or %NULL. * Even if the conversion was successful, this may be * less than @len if there were partial characters * at the end of the input. If the error * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value * stored will the byte offset after the last valid * input sequence. - * @bytes_written: the number of bytes stored in the output buffer (not + * @bytes_written: (out): the number of bytes stored in the output buffer (not * including the terminating nul). - * @error: location to store the error occuring, or %NULL to ignore + * @error: location to store the error occurring, or %NULL to ignore * errors. Any of the errors in #GConvertError may occur. * * Converts a string from UTF-8 to the encoding GLib uses for - * filenames. Note that on Windows GLib uses UTF-8 for filenames. + * filenames. Note that on Windows GLib uses UTF-8 for filenames; + * on other platforms, this function indirectly depends on the + * [current locale][setlocale]. * - * Return value: The converted string, or %NULL on an error. + * Returns: (array length=bytes_written) (element-type guint8) (transfer full): + * The converted string, or %NULL on an error. **/ gchar* g_filename_from_utf8 (const gchar *utf8string, @@ -1368,7 +1228,7 @@ g_filename_from_utf8 (const gchar *utf8string, charset, "UTF-8", bytes_read, bytes_written, error); } -#ifdef G_OS_WIN32 +#if defined (G_OS_WIN32) && !defined (_WIN64) #undef g_filename_from_utf8 @@ -1670,17 +1530,17 @@ hostname_validate (const char *hostname) /** * g_filename_from_uri: * @uri: a uri describing a filename (escaped, encoded in ASCII). - * @hostname: Location to store hostname for the URI, or %NULL. + * @hostname: (out) (allow-none): Location to store hostname for the URI, or %NULL. * If there is no hostname in the URI, %NULL will be * stored in this location. 
- * @error: location to store the error occuring, or %NULL to ignore + * @error: location to store the error occurring, or %NULL to ignore * errors. Any of the errors in #GConvertError may occur. * * Converts an escaped ASCII-encoded URI to a local filename in the * encoding used for filenames. * - * Return value: a newly-allocated string holding the resulting - * filename, or %NULL on an error. + * Returns: (type filename): a newly-allocated string holding + * the resulting filename, or %NULL on an error. **/ gchar * g_filename_from_uri (const gchar *uri, @@ -1803,7 +1663,7 @@ g_filename_from_uri (const gchar *uri, return result; } -#ifdef G_OS_WIN32 +#if defined (G_OS_WIN32) && !defined (_WIN64) #undef g_filename_from_uri @@ -1831,13 +1691,14 @@ g_filename_from_uri (const gchar *uri, * @filename: an absolute filename specified in the GLib file name encoding, * which is the on-disk file name bytes on Unix, and UTF-8 on * Windows - * @hostname: A UTF-8 encoded hostname, or %NULL for none. - * @error: location to store the error occuring, or %NULL to ignore + * @hostname: (allow-none): A UTF-8 encoded hostname, or %NULL for none. + * @error: location to store the error occurring, or %NULL to ignore * errors. Any of the errors in #GConvertError may occur. * - * Converts an absolute filename to an escaped ASCII-encoded URI. + * Converts an absolute filename to an escaped ASCII-encoded URI, with the path + * component following Section 3.3. of RFC 2396. * - * Return value: a newly-allocated string holding the resulting + * Returns: a newly-allocated string holding the resulting * URI, or %NULL on an error. 
**/ gchar * @@ -1861,8 +1722,8 @@ g_filename_to_uri (const gchar *filename, !(g_utf8_validate (hostname, -1, NULL) && hostname_validate (hostname))) { - g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, - _("Invalid hostname")); + g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, + _("Invalid hostname")); return NULL; } @@ -1877,7 +1738,7 @@ g_filename_to_uri (const gchar *filename, return escaped_uri; } -#ifdef G_OS_WIN32 +#if defined (G_OS_WIN32) && !defined (_WIN64) #undef g_filename_to_uri @@ -1910,9 +1771,9 @@ g_filename_to_uri (const gchar *filename, * mime type defined in RFC 2483 into individual URIs, * discarding any comments. The URIs are not validated. * - * Returns: a newly allocated %NULL-terminated list of - * strings holding the individual URIs. The array should - * be freed with g_strfreev(). + * Returns: (transfer full): a newly allocated %NULL-terminated list + * of strings holding the individual URIs. The array should be freed + * with g_strfreev(). 
* * Since: 2.6 */ @@ -1975,44 +1836,6 @@ g_uri_list_extract_uris (const gchar *uri_list) return result; } -static gchar * -make_valid_utf8 (const gchar *name) -{ - GString *string; - const gchar *remainder, *invalid; - gint remaining_bytes, valid_bytes; - - string = NULL; - remainder = name; - remaining_bytes = strlen (name); - - while (remaining_bytes != 0) - { - if (g_utf8_validate (remainder, remaining_bytes, &invalid)) - break; - valid_bytes = invalid - remainder; - - if (string == NULL) - string = g_string_sized_new (remaining_bytes); - - g_string_append_len (string, remainder, valid_bytes); - /* append U+FFFD REPLACEMENT CHARACTER */ - g_string_append (string, "\357\277\275"); - - remaining_bytes -= valid_bytes + 1; - remainder = invalid + 1; - } - - if (string == NULL) - return g_strdup (name); - - g_string_append (string, remainder); - - g_assert (g_utf8_validate (string->str, -1, NULL)); - - return g_string_free (string, FALSE); -} - /** * g_filename_display_basename: * @filename: an absolute pathname in the GLib file name encoding @@ -2022,7 +1845,7 @@ make_valid_utf8 (const gchar *name) * for instance there might be problems converting it to UTF-8, and some files * can be translated in the display. * - * If GLib can not make sense of the encoding of @filename, as a last resort it + * If GLib cannot make sense of the encoding of @filename, as a last resort it * replaces unknown characters with U+FFFD, the Unicode replacement character. * You can search the result for the UTF-8 encoding of this character (which is * "\357\277\275" in octal notation) to find out if @filename was in an invalid @@ -2034,7 +1857,7 @@ make_valid_utf8 (const gchar *name) * This function is preferred over g_filename_display_name() if you know the * whole path, as it allows translation. 
* - * Return value: a newly allocated string containing + * Returns: a newly allocated string containing * a rendition of the basename of the filename in valid UTF-8 * * Since: 2.6 @@ -2063,7 +1886,7 @@ g_filename_display_basename (const gchar *filename) * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL * even if the filename actually isn't in the GLib file name encoding. * - * If GLib can not make sense of the encoding of @filename, as a last resort it + * If GLib cannot make sense of the encoding of @filename, as a last resort it * replaces unknown characters with U+FFFD, the Unicode replacement character. * You can search the result for the UTF-8 encoding of this character (which is * "\357\277\275" in octal notation) to find out if @filename was in an invalid @@ -2073,7 +1896,7 @@ g_filename_display_basename (const gchar *filename) * g_filename_display_basename(), since that allows location-based * translation of filenames. * - * Return value: a newly allocated string containing + * Returns: a newly allocated string containing * a rendition of the filename in valid UTF-8 * * Since: 2.6 @@ -2113,10 +1936,7 @@ g_filename_display_name (const gchar *filename) * by a question mark */ if (!display_name) - display_name = make_valid_utf8 (filename); + display_name = _g_utf8_make_valid (filename); return display_name; } - -#define __G_CONVERT_C__ -#include "galiasdef.c"