From 87ad7806a77156be2568ab768053ecb90ea0d66f Mon Sep 17 00:00:00 2001 From: Matthias Clasen Date: Tue, 2 Nov 2004 21:29:33 +0000 Subject: [PATCH] New function to convert a filename to a UTF-8 string for display purposes. 2004-11-02 Matthias Clasen * glib/gconvert.c (g_filename_display_name): New function to convert a filename to a UTF-8 string for display purposes. (g_get_filename_charsets): New function to return the encodings which are tried when converting a filename to UTF-8. --- docs/reference/ChangeLog | 5 + docs/reference/glib/glib-sections.txt | 2 + glib/gconvert.c | 233 +++++++++++++++++++++++++--------- glib/gconvert.h | 2 + 4 files changed, 184 insertions(+), 58 deletions(-) diff --git a/docs/reference/ChangeLog b/docs/reference/ChangeLog index 4543059..16aa86c 100644 --- a/docs/reference/ChangeLog +++ b/docs/reference/ChangeLog @@ -1,3 +1,8 @@ +2004-11-02 Matthias Clasen + + * glib/glib-sections.txt: Add g_get_filename_charsets and + g_filename_display_name. + 2004-11-01 Matthias Clasen * glib/tmpl/option.sgml: Updates diff --git a/docs/reference/glib/glib-sections.txt b/docs/reference/glib/glib-sections.txt index 996ea5a..810535c 100644 --- a/docs/reference/glib/glib-sections.txt +++ b/docs/reference/glib/glib-sections.txt @@ -2079,6 +2079,8 @@ g_filename_to_utf8 g_filename_from_utf8 g_filename_from_uri g_filename_to_uri +g_get_filename_charsets +g_filename_display_name g_uri_list_extract_uris g_locale_from_utf8 GConvertError diff --git a/glib/gconvert.c b/glib/gconvert.c index 4566f88..9752ae2 100644 --- a/glib/gconvert.c +++ b/glib/gconvert.c @@ -998,7 +998,7 @@ typedef struct _GFilenameCharsetCache GFilenameCharsetCache; struct _GFilenameCharsetCache { gboolean is_utf8; gchar *charset; - gchar *filename_charset; + gchar **filename_charsets; }; static void @@ -1006,41 +1006,47 @@ filename_charset_cache_free (gpointer data) { GFilenameCharsetCache *cache = data; g_free (cache->charset); - g_free (cache->filename_charset); + g_strfreev (cache->filename_charsets); g_free (cache); } /* - * get_filename_charset: - * @charset: return location for the name of the filename encoding + * g_get_filename_charsets: + * @charsets: return location for the %NULL-terminated list of encoding names * - * Determines the preferred character set used for filenames by - * consulting the environment variables G_FILENAME_ENCODING and - * G_BROKEN_FILENAMES. + * Determines the preferred character sets used for filenames. + * The first character set from the @charsets is the filename encoding, the + * subsequent character sets are used when trying to generate a displayable + * representation of a filename, see g_filename_get_display_name(). * - * G_FILENAME_ENCODING may be set to a comma-separated list of character - * set names. The special token "@locale" is taken to mean the character set - * for the current locale. The first character set from the list is taken - * as the filename encoding. - * If G_FILENAME_ENCODING is not set, but G_BROKEN_FILENAMES is, the - * character set of the current locale is taken as the filename encoding. + * The character sets are determined by consulting the environment variables + * G_FILENAME_ENCODING and G_BROKEN_FILENAMES. * - * The returned @charset belongs to GLib and must not be freed. + * G_FILENAME_ENCODING may be set to a comma-separated list + * of character set names. The special token "@locale" is taken to mean the + * character set for the current locale. If G_FILENAME_ENCODING + * is not set, but G_BROKEN_FILENAMES is, the character set of + * the current locale is taken as the filename encoding. If neither environment + * variable is set, UTF-8 is taken as the filename encoding, but the character + * set of the current locale is also put in the list of encodings. + * + * The returned @charsets belong to GLib and must not be freed. * * Note that on Unix, regardless of the locale character set or - * G_FILENAME_ENCODING value, the actual file names present on a + * G_FILENAME_ENCODING value, the actual file names present on a * system might be in any random encoding or just gibberish. * - * Return value: %TRUE - * if the charset used for filename is UTF-8. + * Return value: %TRUE if the filename encoding is UTF-8. + * + * Since: 2.6 */ -static gboolean -get_filename_charset (const gchar **filename_charset) +gboolean +g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets) { static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT; GFilenameCharsetCache *cache = g_static_private_get (&cache_private); const gchar *charset; - + if (!cache) { cache = g_new0 (GFilenameCharsetCache, 1); @@ -1052,77 +1058,95 @@ get_filename_charset (const gchar **filename_charset) if (!(cache->charset && strcmp (cache->charset, charset) == 0)) { const gchar *new_charset; - gchar *p, *q; + gchar *p; + gint i; g_free (cache->charset); - g_free (cache->filename_charset); + g_strfreev (cache->filename_charsets); cache->charset = g_strdup (charset); p = getenv ("G_FILENAME_ENCODING"); if (p != NULL) { - q = strchr (p, ','); - if (!q) - q = p + strlen (p); + cache->filename_charsets = g_strsplit (p, ",", 0); + cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0); - if (strncmp ("@locale", p, q - p) == 0) - { - cache->is_utf8 = g_get_charset (&new_charset); - cache->filename_charset = g_strdup (new_charset); - } - else + for (i = 0; cache->filename_charsets[i]; i++) { - cache->filename_charset = g_strndup (p, q - p); - cache->is_utf8 = (strcmp (cache->filename_charset, "UTF-8") == 0); + if (strcmp ("@locale", cache->filename_charsets[i]) == 0) + { + g_get_charset (&new_charset); + g_free (cache->filename_charsets[i]); + cache->filename_charsets[i] = g_strdup (new_charset); + } } } else if (getenv ("G_BROKEN_FILENAMES") != NULL) { + cache->filename_charsets = g_new0 (gchar *, 2); cache->is_utf8 = g_get_charset (&new_charset); - cache->filename_charset = g_strdup (new_charset); + cache->filename_charsets[0] = g_strdup (new_charset); } else { - cache->filename_charset = g_strdup ("UTF-8"); + cache->filename_charsets = g_new0 (gchar *, 3); cache->is_utf8 = TRUE; + cache->filename_charsets[0] = g_strdup ("UTF-8"); + if (!g_get_charset (&new_charset)) + cache->filename_charsets[1] = g_strdup (new_charset); } } - if (filename_charset) - *filename_charset = cache->filename_charset; + if (filename_charsets) + *filename_charsets = (const gchar **)cache->filename_charsets; return cache->is_utf8; } #else /* G_PLATFORM_WIN32 */ -static gboolean -get_filename_charset (const gchar **filename_charset) +gboolean +g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets) { + static gchar *charsets[] = { + "UTF-8", + NULL + }; + #ifdef G_OS_WIN32 /* On Windows GLib pretends that the filename charset is UTF-8 */ - if (filename_charset) - *filename_charset = "UTF-8"; + if (filename_charsets) + *filename_charsets = charsets; + return TRUE; #else + gboolean result; + /* Cygwin works like before */ - g_get_charset (filename_charset); - return FALSE; + result = g_get_charset (&(charsets[0])); + + if (filename_charsets) + *filename_charsets = charsets; + + return result; #endif } -#ifdef G_OS_WIN32 +#endif /* G_PLATFORM_WIN32 */ static gboolean -old_get_filename_charset (const gchar **filename_charset) +get_filename_charset (const gchar **filename_charset) { - g_get_charset (filename_charset); - return FALSE; -} - -#endif + const gchar **charsets; + gboolean is_utf8; + + is_utf8 = g_get_filename_charsets (&charsets); -#endif /* G_PLATFORM_WIN32 */ + if (filename_charset) + *filename_charset = charsets[0]; + + return is_utf8; +} /* This is called from g_thread_init(). It's used to * initialize some static data in a threadsafe way. @@ -1130,8 +1154,8 @@ old_get_filename_charset (const gchar **filename_charset) void _g_convert_thread_init (void) { - const gchar *dummy; - (void) get_filename_charset (&dummy); + const gchar **dummy; + (void) get_filename_charsets (&dummy); } /** @@ -1188,7 +1212,7 @@ g_filename_to_utf8 (const gchar *opsysstring, { const gchar *charset; - if (old_get_filename_charset (&charset)) + if (g_get_charset (&charset)) return strdup_len (opsysstring, len, bytes_read, bytes_written, error); else return g_convert (opsysstring, len, @@ -1250,7 +1274,7 @@ g_filename_from_utf8 (const gchar *utf8string, { const gchar *charset; - if (old_get_filename_charset (&charset)) + if (g_get_charset (&charset)) return strdup_len (utf8string, len, bytes_read, bytes_written, error); else return g_convert (utf8string, len, @@ -1684,9 +1708,9 @@ g_filename_from_uri (const gchar *uri, * URI, or %NULL on an error. **/ gchar * -g_filename_to_uri (const gchar *filename, - const gchar *hostname, - GError **error) +g_filename_to_uri (const gchar *filename, + const gchar *hostname, + GError **error) { char *escaped_uri; @@ -1792,3 +1816,96 @@ g_uri_list_extract_uris (const gchar *uri_list) return result; } + +static gchar * +make_valid_utf8 (const gchar *name) +{ + GString *string; + const gchar *remainder, *invalid; + gint remaining_bytes, valid_bytes; + + string = NULL; + remainder = name; + remaining_bytes = strlen (name); + + while (remaining_bytes != 0) + { + if (g_utf8_validate (remainder, remaining_bytes, &invalid)) + break; + valid_bytes = invalid - remainder; + + if (string == NULL) + string = g_string_sized_new (remaining_bytes); + + g_string_append_len (string, remainder, valid_bytes); + g_string_append_c (string, '?'); + + remaining_bytes -= valid_bytes + 1; + remainder = invalid + 1; + } + + if (string == NULL) + return g_strdup (name); + + g_string_append (string, remainder); + g_string_append (string, " (invalid encoding)"); + + g_assert (g_utf8_validate (string->str, -1, NULL)); + + return g_string_free (string, FALSE); +} + +/** + * g_filename_display_name: + * @filename: a pathname in the GLib filename encoding + * + * Converts a filename into a valid UTF-8 string. The + * conversion is not necessarily reversible, so you + * should keep the original around and use the return + * value of this function only for display purposes. + * + * Return value: a newly allocated string containing + * a rendition of the filename in valid UTF-8 + * + * Since: 2.6 + **/ +gchar * +g_filename_display_name (const gchar *filename) +{ + gint i; + const gchar **charsets; + gchar *display_name = NULL; + gboolean is_utf8; + + is_utf8 = g_get_filename_charsets (&charsets); + + if (is_utf8) + { + if (g_utf8_validate (filename, -1, NULL)) + display_name = g_strdup (filename); + } + + if (!display_name) + { + /* Try to convert from the filename charsets to UTF-8. + * Skip the first charset if it is UTF-8. + */ + for (i = is_utf8 ? 1 : 0; charsets[i]; i++) + { + display_name = g_convert (filename, -1, "UTF-8", charsets[i], + NULL, NULL, NULL); + + if (display_name) + break; + } + } + + /* if all conversions failed, we replace invalid UTF-8 + * by a question mark + */ + if (!display_name) + display_name = make_valid_utf8 (filename); + + return display_name; +} + diff --git a/glib/gconvert.h b/glib/gconvert.h index f666e28..cc4f48b 100644 --- a/glib/gconvert.h +++ b/glib/gconvert.h @@ -121,6 +121,8 @@ gchar *g_filename_from_uri (const gchar *uri, gchar *g_filename_to_uri (const gchar *filename, const gchar *hostname, GError **error); +gchar *g_filename_display_name (const gchar *filename); +gboolean g_get_filename_charsets (G_CONST_RETURN gchar ***charsets); gchar **g_uri_list_extract_uris (const gchar *uri_list); -- 2.7.4