1 /* gcharset.c - Charset information
3 * Copyright (C) 2011 Red Hat, Inc.
5 * SPDX-License-Identifier: LGPL-2.1-or-later
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
24 #include "gcharsetprivate.h"
29 #include "gmessages.h"
30 #include "gstrfuncs.h"
32 #include "gthreadprivate.h"
37 #include "libcharset/libcharset.h"
42 #if (HAVE_LANGINFO_TIME_CODESET || HAVE_LANGINFO_CODESET)
48 #define WIN32_LEAN_AND_MEAN
/* Static lock named `aliases`, declared with G_LOCK_DEFINE_STATIC. */
52 G_LOCK_DEFINE_STATIC (aliases);
/* Body of the helper that builds the charset alias table; presumably this is
 * get_alias_hash(), which _g_charset_get_aliases() calls to obtain the table.
 * The table maps a canonical charset name to a NULL-terminated array of
 * alias strings.
 * NOTE(review): the function header, braces and several statements are not
 * present in this excerpt. */
57 static GHashTable *alias_hash = NULL;
64 alias_hash = g_hash_table_new (g_str_hash, g_str_equal);
/* The alias data is consumed two NUL-terminated strings per iteration; an
 * empty string ends the list. */
66 aliases = _g_locale_get_charset_aliases ();
67 while (*aliases != '\0')
69 const char *canonical;
71 const char **alias_array;
75 aliases += strlen (aliases) + 1;
77 aliases += strlen (aliases) + 1;
/* Fetch any array already stored for this canonical name and count its
 * entries so that the new alias can be appended after them. */
79 alias_array = g_hash_table_lookup (alias_hash, canonical);
82 while (alias_array[count])
/* Make room for one more alias plus the NULL terminator, then store the
 * array again because g_renew() may have moved it. */
86 alias_array = g_renew (const char *, alias_array, count + 2);
87 alias_array[count] = alias;
88 alias_array[count + 1] = NULL;
90 g_hash_table_insert (alias_hash, (char *)canonical, alias_array);
99 /* As an abuse of the alias table, the following routine gets
100 * the charsets that are aliases for the canonical name.
/* Returns whatever the alias table holds for canonical_name: the result of
 * g_hash_table_lookup(), which is NULL when the name has no entry. */
103 _g_charset_get_aliases (const char *canonical_name)
105 GHashTable *alias_hash = get_alias_hash ();
107 return g_hash_table_lookup (alias_hash, canonical_name);
/* Works out the charset name to use for raw_data. Callers in this file pass
 * the address of a const char * as the second argument and store the return
 * value in their is_utf8 flag.
 * The CHARSET environment variable is examined first, then the name obtained
 * from _g_locale_charset_unalias (raw_data).
 * NOTE(review): the second parameter and the statements under each `if` are
 * not present in this excerpt; confirm the exact assignments and return
 * values against the full file. */
111 g_utf8_get_charset_internal (const char *raw_data,
114 /* Allow CHARSET to override the charset of any locale category. Users should
115 * probably never be setting this — instead, just add the charset after a `.`
116 * in `LANGUAGE`/`LC_ALL`/`LC_*`/`LANG`. I can’t find any reference (in
117 * `git log`, code comments, or man pages) to this environment variable being
118 * standardised or documented or even used anywhere outside GLib. Perhaps it
119 * should eventually be removed. */
120 const char *charset = g_getenv ("CHARSET");
122 if (charset && *charset)
/* Substring test: any name containing "UTF-8" satisfies this check. */
126 if (charset && strstr (charset, "UTF-8"))
132 /* The libcharset code tries to be thread-safe without
133 * a lock, but has a memory leak and a missing memory
134 * barrier, so we lock for it
137 charset = _g_locale_charset_unalias (raw_data);
140 if (charset && *charset)
144 if (charset && strstr (charset, "UTF-8"))
150 /* Assume this for compatibility at present. */
/* Cache record that the charset getters in this file keep in GPrivate slots.
 * The code here reads and writes three members: raw, charset and is_utf8.
 * NOTE(review): the member declarations are not present in this excerpt. */
156 typedef struct _GCharsetCache GCharsetCache;
158 struct _GCharsetCache {
/* Destroy callback for a GCharsetCache; the getters in this file pass it to
 * G_PRIVATE_INIT. Frees the cached charset string.
 * NOTE(review): only the g_free() of cache->charset is visible here; confirm
 * that cache->raw and the record itself are released as well. */
165 charset_cache_free (gpointer data)
167 GCharsetCache *cache = data;
169 g_free (cache->charset);
175 * @charset: (out) (optional) (transfer none): return location for character set
178 * Obtains the character set for the [current locale][setlocale]; you
179 * might use this character set as an argument to g_convert(), to convert
180 * from the current locale's encoding to some other encoding. (Frequently
181 * g_locale_to_utf8() and g_locale_from_utf8() are nice shortcuts, though.)
183 * On Windows the character set returned by this function is the
184 * so-called system default ANSI code-page. That is the character set
185 * used by the "narrow" versions of C library and Win32 functions that
186 * handle file names. It might be different from the character set
187 * used by the C library's current locale.
189 * On Linux, the character set is found by consulting nl_langinfo() if
190 * available. If not, the environment variables `LC_ALL`, `LC_CTYPE`, `LANG`
191 * and `CHARSET` are queried in order.
193 * The return value is %TRUE if the locale's encoding is UTF-8, in that
194 * case you can perhaps avoid calling g_convert().
196 * The string returned in @charset is not allocated, and should not be
199 * Returns: %TRUE if the returned charset is UTF-8
/* Implementation notes: the result is kept in a GCharsetCache stored in a
 * GPrivate slot. The cached charset and UTF-8 flag are recomputed only when
 * the string from _g_locale_charset_raw() differs from the cached raw copy.
 * The pointer written to *charset refers to the cache's own string. */
202 g_get_charset (const char **charset)
204 static GPrivate cache_private = G_PRIVATE_INIT (charset_cache_free);
205 GCharsetCache *cache = g_private_get (&cache_private);
/* Install a zero-filled cache record in the slot (presumably only when none
 * exists yet; the guard is not shown in this excerpt). */
209 cache = g_private_set_alloc0 (&cache_private, sizeof (GCharsetCache));
212 raw = _g_locale_charset_raw ();
/* Refresh when nothing is cached yet or the raw locale string has changed. */
215 if (cache->raw == NULL || strcmp (cache->raw, raw) != 0)
217 const gchar *new_charset;
220 g_free (cache->charset);
221 cache->raw = g_strdup (raw);
222 cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset);
223 cache->charset = g_strdup (new_charset);
227 *charset = cache->charset;
229 return cache->is_utf8;
233 * Does the same as g_get_charset(), but temporarily sets the locale (LC_ALL to
234 * LC_TIME) to correctly determine the charset used for time-related conversions.
236 * Returns: %TRUE if the returned charset is UTF-8
/* Implementation notes: same caching scheme as g_get_charset(), with its own
 * GPrivate slot. The raw name comes from nl_langinfo (_NL_TIME_CODESET) when
 * HAVE_LANGINFO_TIME_CODESET is defined.
 * NOTE(review): no setlocale() call is visible in this excerpt, and the
 * #else/#endif lines separating the two `raw =` assignments are missing. */
239 _g_get_time_charset (const char **charset)
241 static GPrivate cache_private = G_PRIVATE_INIT (charset_cache_free);
242 GCharsetCache *cache = g_private_get (&cache_private);
246 cache = g_private_set_alloc0 (&cache_private, sizeof (GCharsetCache));
248 #ifdef HAVE_LANGINFO_TIME_CODESET
249 raw = nl_langinfo (_NL_TIME_CODESET);
/* Presumably the fallback branch when the macro above is not defined. */
252 raw = _g_locale_charset_raw ();
/* Refresh when nothing is cached yet or the raw string has changed. */
256 if (cache->raw == NULL || strcmp (cache->raw, raw) != 0)
258 const gchar *new_charset;
261 g_free (cache->charset);
262 cache->raw = g_strdup (raw);
263 cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset);
264 cache->charset = g_strdup (new_charset);
268 *charset = cache->charset;
270 return cache->is_utf8;
273 * Does the same as g_get_charset(), but temporarily sets the locale (LC_ALL to
274 * LC_CTYPE) to correctly determine the charset used for CTYPE-related conversions.
276 * Returns: %TRUE if the returned charset is UTF-8
/* Implementation notes: same caching scheme as g_get_charset(), with its own
 * GPrivate slot. The raw name comes from nl_langinfo (CODESET) when
 * HAVE_LANGINFO_CODESET is defined.
 * NOTE(review): no setlocale() call is visible in this excerpt, and the
 * #else/#endif lines separating the two `raw =` assignments are missing. */
279 _g_get_ctype_charset (const char **charset)
281 static GPrivate cache_private = G_PRIVATE_INIT (charset_cache_free);
282 GCharsetCache *cache = g_private_get (&cache_private);
286 cache = g_private_set_alloc0 (&cache_private, sizeof (GCharsetCache));
288 #ifdef HAVE_LANGINFO_CODESET
289 raw = nl_langinfo (CODESET);
/* Presumably the fallback branch when the macro above is not defined. */
292 raw = _g_locale_charset_raw ();
/* Refresh when nothing is cached yet or the raw string has changed. */
296 if (cache->raw == NULL || strcmp (cache->raw, raw) != 0)
298 const gchar *new_charset;
301 g_free (cache->charset);
302 cache->raw = g_strdup (raw);
303 cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset);
304 cache->charset = g_strdup (new_charset);
308 *charset = cache->charset;
310 return cache->is_utf8;
316 * Gets the character set for the current locale.
318 * Returns: a newly allocated string containing the name
319 * of the character set. This string must be freed with g_free().
/* Obtains the charset name through g_get_charset() and returns a g_strdup()
 * copy of it, so the result is a separate allocation.
 * NOTE(review): the signature line is absent from this excerpt (presumably
 * g_get_codeset; confirm against the full file). */
324 const gchar *charset;
326 g_get_charset (&charset);
328 return g_strdup (charset);
332 * g_get_console_charset:
333 * @charset: (out) (optional) (transfer none): return location for character set
336 * Obtains the character set used by the console attached to the process,
337 * which is suitable for printing output to the terminal.
339 * Usually this matches the result returned by g_get_charset(), but in
340 * environments where the locale's character set does not match the encoding
341 * of the console this function tries to guess a more suitable value instead.
343 * On Windows the character set returned by this function is the
344 * output code page used by the console associated with the calling process.
345 * If the codepage can't be determined (for example because there is no
346 * console attached) UTF-8 is assumed.
348 * The return value is %TRUE if the locale's encoding is UTF-8, in that
349 * case you can perhaps avoid calling g_convert().
351 * The string returned in @charset is not allocated, and should not be
354 * Returns: %TRUE if the returned charset is UTF-8
/* Implementation notes. The first part uses the Win32 console API and picks
 * a raw charset name from these sources, tried in the order given by its
 * inline comments: the encoding after the dot in $LANG (with any @modifier
 * cut off), the console output code page formatted as "CP<number>", and
 * finally a UTF-8 fallback. The result is cached in a GPrivate slot in the
 * same way as in g_get_charset(). The last statement is the non-Windows
 * path, which defers to g_get_charset().
 * NOTE(review): the preprocessor guards, braces and several statements are
 * not present in this excerpt. */
359 g_get_console_charset (const char **charset)
362 static GPrivate cache_private = G_PRIVATE_INIT (charset_cache_free);
363 GCharsetCache *cache = g_private_get (&cache_private);
366 char buf[2 + 20 + 1]; /* "CP" + G_MAXUINT64 (to be safe) in decimal form (20 bytes) + "\0" */
367 const gchar *raw = NULL;
370 cache = g_private_set_alloc0 (&cache_private, sizeof (GCharsetCache));
372 /* first try to query $LANG (works for Cygwin/MSYS/MSYS2 and others using mintty) */
373 locale = g_getenv ("LANG");
374 if (locale != NULL && locale[0] != '\0')
376 /* If the locale name contains an encoding after the dot, return it. */
377 const char *dot = strchr (locale, '.');
381 const char *modifier;
384 /* Look for the possible @... trailer and remove it, if any. */
385 modifier = strchr (dot, '@');
386 if (modifier == NULL)
/* Copy the part before the modifier only when it fits in buf together with
 * the terminating NUL. */
388 else if ((gsize) (modifier - dot) < sizeof (buf))
390 memcpy (buf, dot, modifier - dot);
391 buf[modifier - dot] = '\0';
396 /* next try querying console codepage using native win32 API */
399 cp = GetConsoleOutputCP ();
402 sprintf (buf, "CP%u", cp);
/* Only errors other than ERROR_INVALID_HANDLE produce a warning. */
405 else if (GetLastError () != ERROR_INVALID_HANDLE)
407 gchar *emsg = g_win32_error_message (GetLastError ());
408 g_warning ("Failed to determine console output code page: %s. "
409 "Falling back to UTF-8", emsg);
413 /* fall-back to UTF-8 if the rest failed (it's a universal default) */
/* Refresh when nothing is cached yet or the raw string has changed. */
417 if (cache->raw == NULL || strcmp (cache->raw, raw) != 0)
419 const gchar *new_charset;
422 g_free (cache->charset);
423 cache->raw = g_strdup (raw);
424 cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset);
425 cache->charset = g_strdup (new_charset);
429 *charset = cache->charset;
431 return cache->is_utf8;
433 /* assume the locale settings match the console encoding on non-Windows OSs */
434 return g_get_charset (charset);
440 /* read an alias file for the locales
 *
 * Opens `file` for reading and processes it with fgets() in chunks of at
 * most 256 bytes. For each line the first column ends at a tab, a space or
 * a ':', and the second column ends at a tab or a space. A pair is added to
 * alias_table only when the first column is not already a key there.
 * NOTE(review): the local declarations, the handling of a failed fopen()
 * and the statements inside several branches are not present in this
 * excerpt. */
442 read_aliases (const gchar *file,
443 GHashTable *alias_table)
448 fp = fopen (file,"r");
451 while (fgets (buf, 256, fp))
457 /* Line is a comment */
458 if ((buf[0] == '#') || (buf[0] == '\0'))
461 /* Reads first column */
462 for (p = buf, q = NULL; *p; p++) {
463 if ((*p == '\t') || (*p == ' ') || (*p == ':')) {
466 while ((*q == '\t') || (*q == ' ')) {
472 /* The line only had one column */
473 if (!q || *q == '\0')
476 /* Read second column */
477 for (p = q; *p; p++) {
478 if ((*p == '\t') || (*p == ' ')) {
484 /* Add to alias table if necessary */
/* An existing key is left untouched, so the first definition wins. Key and
 * value are copied because buf is overwritten by the next fgets() call. */
485 if (!g_hash_table_lookup (alias_table, buf)) {
486 g_hash_table_insert (alias_table, g_strdup (buf), g_strdup (q));
/* Resolves lang through the locale alias table. The table is created and
 * filled from /usr/share/locale/locale.alias inside a
 * g_once_init_enter()/g_once_init_leave() pair. Lookups repeat for as long
 * as the table maps the name to a different string.
 * NOTE(review): the loop body, the limit that triggers the warning and the
 * return statement are not present in this excerpt; the `said_before` flag
 * suggests the warning is emitted only once. */
495 unalias_lang (char *lang)
498 static GHashTable *alias_table = NULL;
502 if (g_once_init_enter (&alias_table))
504 GHashTable *table = g_hash_table_new (g_str_hash, g_str_equal);
505 read_aliases ("/usr/share/locale/locale.alias", table);
506 g_once_init_leave (&alias_table, table);
/* Stop when there is no entry or the entry maps the name to itself. */
510 while ((p = g_hash_table_lookup (alias_table, lang)) && (strcmp (p, lang) != 0))
515 static gboolean said_before = FALSE;
517 g_warning ("Too many alias levels for a locale, "
518 "may indicate a loop");
527 /* Mask for components of locale spec. The ordering here is from
528 * least significant to most significant
532 COMPONENT_CODESET = 1 << 0, /* set by explode_locale() when a ".codeset" part is found */
533 COMPONENT_TERRITORY = 1 << 1, /* set by explode_locale() when a "_territory" part is found */
534 COMPONENT_MODIFIER = 1 << 2 /* set by explode_locale() when an "@modifier" part is found */
537 /* Break an X/Open style locale specification into components
/* Splits locale into language, territory, codeset and modifier, and builds
 * a mask of COMPONENT_* flags for the optional parts that were found (the
 * caller receives this mask as the return value). The territory, codeset
 * and modifier strings are newly allocated and start at their separator
 * character ('_', '.' and '@' respectively); language is the text before
 * the first separator.
 * NOTE(review): the remaining parameters, the conditions around each
 * assignment and the return statement are not present in this excerpt. */
540 explode_locale (const gchar *locale,
546 const gchar *uscore_pos;
548 const gchar *dot_pos;
/* Each search starts at the previously found separator, if there is one. */
552 uscore_pos = strchr (locale, '_');
553 dot_pos = strchr (uscore_pos ? uscore_pos : locale, '.');
554 at_pos = strchr (dot_pos ? dot_pos : (uscore_pos ? uscore_pos : locale), '@');
558 mask |= COMPONENT_MODIFIER;
559 *modifier = g_strdup (at_pos);
/* Point at the end of the string so the length arithmetic below still works. */
562 at_pos = locale + strlen (locale);
566 mask |= COMPONENT_CODESET;
567 *codeset = g_strndup (dot_pos, at_pos - dot_pos);
574 mask |= COMPONENT_TERRITORY;
575 *territory = g_strndup (uscore_pos, dot_pos - uscore_pos);
/* Make the language extend up to dot_pos. */
578 uscore_pos = dot_pos;
580 *language = g_strndup (locale, uscore_pos - locale);
586 * Compute all interesting variants for a given locale name -
587 * by stripping off different components of the value.
589 * For simplicity, we assume that the locale is in
590 * X/Open format: language[_territory][.codeset][@modifier]
592 * TODO: Extend this to handle the CEN format (see the GNUlibc docs)
593 * as well. We could just copy the code from glibc wholesale
594 * but it is big, ugly, and complicated, so I'm reluctant
595 * to do so when this should handle 99% of the time...
/* Appends variants of locale to array. explode_locale() splits the name and
 * reports which optional components exist. Each generated entry is a newly
 * allocated g_strconcat() of the language followed by whichever of
 * territory, codeset and modifier are selected by the bits of `i`. A
 * combination is used only when it selects no component that is missing
 * from the locale ((i & ~mask) == 0).
 * NOTE(review): the second parameter, the assignment of `i`, and the
 * statements under the three trailing mask checks (presumably freeing the
 * component strings) are not present in this excerpt. */
598 append_locale_variants (GPtrArray *array,
601 gchar *language = NULL;
602 gchar *territory = NULL;
603 gchar *codeset = NULL;
604 gchar *modifier = NULL;
609 g_return_if_fail (locale != NULL);
611 mask = explode_locale (locale, &language, &territory, &codeset, &modifier);
613 /* Iterate through all possible combinations, from least attractive
614 * to most attractive.
616 for (j = 0; j <= mask; ++j)
620 if ((i & ~mask) == 0)
622 gchar *val = g_strconcat (language,
623 (i & COMPONENT_TERRITORY) ? territory : "",
624 (i & COMPONENT_CODESET) ? codeset : "",
625 (i & COMPONENT_MODIFIER) ? modifier : "",
627 g_ptr_array_add (array, val);
632 if (mask & COMPONENT_CODESET)
634 if (mask & COMPONENT_TERRITORY)
636 if (mask & COMPONENT_MODIFIER)
641 * g_get_locale_variants:
642 * @locale: a locale identifier
644 * Returns a list of derived variants of @locale, which can be used to
645 * e.g. construct locale-dependent filenames or search paths. The returned
646 * list is sorted from most desirable to least desirable.
647 * This function handles territory, charset and extra locale modifiers. See
648 * [`setlocale(3)`](man:setlocale) for information about locales and their format.
650 * @locale itself is guaranteed to be returned in the output.
652 * For example, if @locale is `fr_BE`, then the returned list
653 * is `fr_BE`, `fr`. If @locale is `en_GB.UTF-8@euro`, then the returned list
654 * is `en_GB.UTF-8@euro`, `en_GB.UTF-8`, `en_GB@euro`, `en_GB`, `en.UTF-8@euro`,
655 * `en.UTF-8`, `en@euro`, `en`.
657 * If you need the list of variants for the current locale,
658 * use g_get_language_names().
660 * Returns: (transfer full) (array zero-terminated=1) (element-type utf8): a newly
661 * allocated array of newly allocated strings with the locale variants. Free with
/* Implementation notes: the variants are collected in a GPtrArray that
 * starts with room for 8 pointers. A terminating NULL is appended, and
 * g_ptr_array_free() is called with FALSE so that the pointer data survives
 * and is returned as the gchar ** result. */
667 g_get_locale_variants (const gchar *locale)
671 g_return_val_if_fail (locale != NULL, NULL);
673 array = g_ptr_array_sized_new (8);
674 append_locale_variants (array, locale);
675 g_ptr_array_add (array, NULL);
677 return (gchar **) g_ptr_array_free (array, FALSE);
680 /* The following is (partly) taken from the gettext package.
681 Copyright (C) 1995, 1996, 1997, 1998 Free Software Foundation, Inc. */
/* Determines the locale specification for category_name. The environment
 * is consulted in this order: LANGUAGE, LC_ALL, the variable whose name is
 * category_name, then LANG; each value is tested for being non-NULL and
 * non-empty. Under G_PLATFORM_WIN32 the string from g_win32_getlocale() is
 * additionally interned with g_intern_string().
 * NOTE(review): the statements executed after each test (presumably returns
 * of retval) and the final fallback are not present in this excerpt. */
684 guess_category_value (const gchar *category_name)
688 /* The highest priority value is the 'LANGUAGE' environment
689 variable. This is a GNU extension. */
690 retval = g_getenv ("LANGUAGE");
691 if ((retval != NULL) && (retval[0] != '\0'))
694 /* 'LANGUAGE' is not set. So we have to proceed with the POSIX
695 methods of looking to 'LC_ALL', 'LC_xxx', and 'LANG'. On some
696 systems this can be done by the 'setlocale' function itself. */
698 /* Setting of LC_ALL overwrites all other. */
699 retval = g_getenv ("LC_ALL");
700 if ((retval != NULL) && (retval[0] != '\0'))
703 /* Next comes the name of the desired category. */
704 retval = g_getenv (category_name);
705 if ((retval != NULL) && (retval[0] != '\0'))
708 /* Last possibility is the LANG environment variable. */
709 retval = g_getenv ("LANG");
710 if ((retval != NULL) && (retval[0] != '\0'))
713 #ifdef G_PLATFORM_WIN32
714 /* g_win32_getlocale() first checks for LC_ALL, LC_MESSAGES and
715 * LANG, which we already did above. Oh well. The main point of
716 * calling g_win32_getlocale() is to get the thread's locale as used
717 * by Windows and the Microsoft C runtime (in the "English_United
718 * States" format) translated into the Unixish format.
721 char *locale = g_win32_getlocale ();
722 retval = g_intern_string (locale);
/* Cache entry stored per category name by
 * g_get_language_names_with_category(). The code in this file uses two
 * members: `languages` (a copy of the raw language list the entry was built
 * from) and `language_names` (the NULL-terminated result vector).
 * NOTE(review): the declaration of `languages` is not present in this
 * excerpt. */
731 typedef struct _GLanguageNamesCache GLanguageNamesCache;
733 struct _GLanguageNamesCache {
735 gchar **language_names;
/* Value destroy function for the per-category cache table (it is passed to
 * g_hash_table_new_full() in g_get_language_names_with_category()). Frees
 * the stored language list and the vector of language names.
 * NOTE(review): the release of the record itself is not visible in this
 * excerpt; confirm against the full file. */
739 language_names_cache_free (gpointer data)
741 GLanguageNamesCache *cache = data;
742 g_free (cache->languages);
743 g_strfreev (cache->language_names);
748 * g_get_language_names:
750 * Computes a list of applicable locale names, which can be used to
751 * e.g. construct locale-dependent filenames or search paths. The returned
752 * list is sorted from most desirable to least desirable and always contains
753 * the default locale "C".
755 * For example, if LANGUAGE=de:en_US, then the returned list is
756 * "de", "en_US", "en", "C".
758 * This function consults the environment variables `LANGUAGE`, `LC_ALL`,
759 * `LC_MESSAGES` and `LANG` to find the list of locales specified by the
762 * Returns: (array zero-terminated=1) (transfer none): a %NULL-terminated array of strings owned by GLib
763 * that must not be modified or freed.
/* Shorthand for g_get_language_names_with_category() with the category
 * name "LC_MESSAGES". */
767 const gchar * const *
768 g_get_language_names (void)
770 return g_get_language_names_with_category ("LC_MESSAGES");
774 * g_get_language_names_with_category:
775 * @category_name: a locale category name
777 * Computes a list of applicable locale names with a locale category name,
778 * which can be used to construct the fallback locale-dependent filenames
779 * or search paths. The returned list is sorted from most desirable to
780 * least desirable and always contains the default locale "C".
782 * This function consults the environment variables `LANGUAGE`, `LC_ALL`,
783 * @category_name, and `LANG` to find the list of locales specified by the
786 * g_get_language_names() returns g_get_language_names_with_category("LC_MESSAGES").
788 * Returns: (array zero-terminated=1) (transfer none): a %NULL-terminated array of strings owned by
789 * the thread g_get_language_names_with_category was called from.
790 * It must not be modified or freed. It must be copied if planned to be used in another thread.
/* Implementation notes: a hash table kept in a GPrivate slot maps each
 * category name to a GLanguageNamesCache entry. An entry is rebuilt when it
 * is missing or when its stored `languages` string differs from the current
 * guess_category_value() result; otherwise the cached vector is returned.
 * NOTE(review): some declarations, guards and cleanup statements are not
 * present in this excerpt. */
794 const gchar * const *
795 g_get_language_names_with_category (const gchar *category_name)
797 static GPrivate cache_private = G_PRIVATE_INIT ((void (*)(gpointer)) g_hash_table_unref);
798 GHashTable *cache = g_private_get (&cache_private);
799 const gchar *languages;
800 GLanguageNamesCache *name_cache;
802 g_return_val_if_fail (category_name != NULL, NULL);
/* The table owns its keys (g_free) and values (language_names_cache_free). */
806 cache = g_hash_table_new_full (g_str_hash, g_str_equal,
807 g_free, language_names_cache_free);
808 g_private_set (&cache_private, cache);
811 languages = guess_category_value (category_name);
815 name_cache = (GLanguageNamesCache *) g_hash_table_lookup (cache, category_name);
816 if (!(name_cache && name_cache->languages &&
817 strcmp (name_cache->languages, languages) == 0))
/* Drop any stale entry; the table's destroy functions release it. */
822 g_hash_table_remove (cache, category_name);
824 array = g_ptr_array_sized_new (8);
/* `languages` is a ':'-separated list; every element is unaliased and then
 * expanded into its variants. */
826 alist = g_strsplit (languages, ":", 0);
827 for (a = alist; *a; a++)
828 append_locale_variants (array, unalias_lang (*a));
/* "C" is always appended last, followed by the NULL terminator. */
830 g_ptr_array_add (array, g_strdup ("C"));
831 g_ptr_array_add (array, NULL);
833 name_cache = g_new0 (GLanguageNamesCache, 1);
834 name_cache->languages = g_strdup (languages);
/* FALSE keeps the pointer data alive; the cache entry takes it over. */
835 name_cache->language_names = (gchar **) g_ptr_array_free (array, FALSE);
836 g_hash_table_insert (cache, g_strdup (category_name), name_cache);
839 return (const gchar * const *) name_cache->language_names;