1 /* Determine a canonical name for the current locale's character encoding.
3 Copyright (C) 2000-2006, 2008-2013 Free Software Foundation, Inc.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License along
16 with this program; if not, see <http://www.gnu.org/licenses/>. */
18 /* Written by Bruno Haible <bruno@clisp.org>. */
23 #include "localcharset.h"
31 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
32 # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
35 #if defined _WIN32 || defined __WIN32__
36 # define WINDOWS_NATIVE
40 /* Assume EMX program runs on OS/2, even if compiled under DOS. */
46 #if !defined WINDOWS_NATIVE
48 # if HAVE_LANGINFO_CODESET
49 # include <langinfo.h>
51 # if 0 /* see comment below */
56 # define WIN32_LEAN_AND_MEAN
59 #elif defined WINDOWS_NATIVE
60 # define WIN32_LEAN_AND_MEAN
68 /* For MB_CUR_MAX_L */
73 #if ENABLE_RELOCATABLE
74 # include "relocatable.h"
76 # define relocate(pathname) (pathname)
81 # include "configmake.h"
84 /* Define O_NOFOLLOW to 0 on platforms where it does not exist. */
89 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
90 /* Native Windows, Cygwin, OS/2, DOS */
91 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
94 #ifndef DIRECTORY_SEPARATOR
95 # define DIRECTORY_SEPARATOR '/'
99 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
102 #if HAVE_DECL_GETC_UNLOCKED
104 # define getc getc_unlocked
107 /* The following static variable is declared 'volatile' to avoid a
108 possible multithread problem in the function get_charset_aliases. If we
109 are running in a threaded environment, and if two threads initialize
110 'charset_aliases' simultaneously, both will produce the same value,
111 and everything will be ok if the two assignments to 'charset_aliases'
112 are atomic. But I don't know what will happen if the two assignments mix. */
114 # define volatile /* empty */
116 /* Pointer to the contents of the charset.alias file, if it has already been
117 read, else NULL. Its format is:
118 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
119 static const char * volatile charset_aliases;
121 /* Return a pointer to the contents of the charset.alias file. */
123 get_charset_aliases (void)
127 cp = charset_aliases;
130 #if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__)
132 const char *base = "charset.alias";
135 /* Make it possible to override the charset.alias location. This is
136 necessary for running the testsuite before "make install". */
137 dir = getenv ("CHARSETALIASDIR");
138 if (dir == NULL || dir[0] == '\0')
139 dir = relocate (LIBDIR);
141 /* Concatenate dir and base into freshly allocated file_name. */
143 size_t dir_len = strlen (dir);
144 size_t base_len = strlen (base);
145 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
146 file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
147 if (file_name != NULL)
149 memcpy (file_name, dir, dir_len);
151 file_name[dir_len] = DIRECTORY_SEPARATOR;
152 memcpy (file_name + dir_len + add_slash, base, base_len + 1);
156 if (file_name == NULL)
157 /* Out of memory. Treat the file as empty. */
163 /* Open the file. Reject symbolic links on platforms that support
164 O_NOFOLLOW. This is a security feature. Without it, an attacker
165 could retrieve parts of the contents (namely, the tail of the
166 first line that starts with "* ") of an arbitrary file by placing
167 a symbolic link to that file under the name "charset.alias" in
168 some writable directory and defining the environment variable
169 CHARSETALIASDIR to point to that directory. */
170 fd = open (file_name,
171 O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
173 /* File not found. Treat it as empty. */
179 fp = fdopen (fd, "r");
182 /* Out of memory. Treat the file as empty. */
188 /* Parse the file's contents. */
189 char *res_ptr = NULL;
203 if (c == '\n' || c == ' ' || c == '\t')
207 /* Skip comment, to end of line. */
210 while (!(c == EOF || c == '\n'));
216 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
220 old_res_ptr = res_ptr;
223 res_size = l1 + 1 + l2 + 1;
224 res_ptr = (char *) malloc (res_size + 1);
228 res_size += l1 + 1 + l2 + 1;
229 res_ptr = (char *) realloc (res_ptr, res_size + 1);
238 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
239 strcpy (res_ptr + res_size - (l2 + 1), buf2);
246 *(res_ptr + res_size) = '\0';
258 /* To avoid the trouble of installing a file that is shared by many
259 GNU packages -- many packaging systems have problems with this --,
260 simply inline the aliases here. */
261 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
262 "ISO8859-2" "\0" "ISO-8859-2" "\0"
263 "ISO8859-4" "\0" "ISO-8859-4" "\0"
264 "ISO8859-5" "\0" "ISO-8859-5" "\0"
265 "ISO8859-7" "\0" "ISO-8859-7" "\0"
266 "ISO8859-9" "\0" "ISO-8859-9" "\0"
267 "ISO8859-13" "\0" "ISO-8859-13" "\0"
268 "ISO8859-15" "\0" "ISO-8859-15" "\0"
269 "KOI8-R" "\0" "KOI8-R" "\0"
270 "KOI8-U" "\0" "KOI8-U" "\0"
271 "CP866" "\0" "CP866" "\0"
272 "CP949" "\0" "CP949" "\0"
273 "CP1131" "\0" "CP1131" "\0"
274 "CP1251" "\0" "CP1251" "\0"
275 "eucCN" "\0" "GB2312" "\0"
276 "GB2312" "\0" "GB2312" "\0"
277 "eucJP" "\0" "EUC-JP" "\0"
278 "eucKR" "\0" "EUC-KR" "\0"
279 "Big5" "\0" "BIG5" "\0"
280 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
281 "GBK" "\0" "GBK" "\0"
282 "GB18030" "\0" "GB18030" "\0"
283 "SJIS" "\0" "SHIFT_JIS" "\0"
284 "ARMSCII-8" "\0" "ARMSCII-8" "\0"
285 "PT154" "\0" "PT154" "\0"
286 /*"ISCII-DEV" "\0" "?" "\0"*/
287 "*" "\0" "UTF-8" "\0";
291 /* To avoid the troubles of an extra file charset.alias_vms in the
292 sources of many GNU packages, simply inline the aliases here. */
293 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
294 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
295 section 10.7 "Handling Different Character Sets". */
296 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
297 "ISO8859-2" "\0" "ISO-8859-2" "\0"
298 "ISO8859-5" "\0" "ISO-8859-5" "\0"
299 "ISO8859-7" "\0" "ISO-8859-7" "\0"
300 "ISO8859-8" "\0" "ISO-8859-8" "\0"
301 "ISO8859-9" "\0" "ISO-8859-9" "\0"
303 "eucJP" "\0" "EUC-JP" "\0"
304 "SJIS" "\0" "SHIFT_JIS" "\0"
305 "DECKANJI" "\0" "DEC-KANJI" "\0"
306 "SDECKANJI" "\0" "EUC-JP" "\0"
308 "eucTW" "\0" "EUC-TW" "\0"
309 "DECHANYU" "\0" "DEC-HANYU" "\0"
310 "DECHANZI" "\0" "GB2312" "\0"
312 "DECKOREAN" "\0" "EUC-KR" "\0";
315 # if defined WINDOWS_NATIVE || defined __CYGWIN__
316 /* To avoid the troubles of installing a separate file in the same
317 directory as the DLL and of retrieving the DLL's directory at
318 runtime, simply inline the aliases here. */
320 cp = "CP936" "\0" "GBK" "\0"
321 "CP1361" "\0" "JOHAB" "\0"
322 "CP20127" "\0" "ASCII" "\0"
323 "CP20866" "\0" "KOI8-R" "\0"
324 "CP20936" "\0" "GB2312" "\0"
325 "CP21866" "\0" "KOI8-RU" "\0"
326 "CP28591" "\0" "ISO-8859-1" "\0"
327 "CP28592" "\0" "ISO-8859-2" "\0"
328 "CP28593" "\0" "ISO-8859-3" "\0"
329 "CP28594" "\0" "ISO-8859-4" "\0"
330 "CP28595" "\0" "ISO-8859-5" "\0"
331 "CP28596" "\0" "ISO-8859-6" "\0"
332 "CP28597" "\0" "ISO-8859-7" "\0"
333 "CP28598" "\0" "ISO-8859-8" "\0"
334 "CP28599" "\0" "ISO-8859-9" "\0"
335 "CP28605" "\0" "ISO-8859-15" "\0"
336 "CP38598" "\0" "ISO-8859-8" "\0"
337 "CP51932" "\0" "EUC-JP" "\0"
338 "CP51936" "\0" "GB2312" "\0"
339 "CP51949" "\0" "EUC-KR" "\0"
340 "CP51950" "\0" "EUC-TW" "\0"
341 "CP54936" "\0" "GB18030" "\0"
342 "CP65001" "\0" "UTF-8" "\0";
346 charset_aliases = cp;
352 /* Determine the current locale's character encoding, and canonicalize it
353 into one of the canonical names listed in config.charset.
354 The result must not be freed; it is statically allocated.
355 If the canonical name cannot be determined, the result is a non-canonical name. */
362 locale_charset (void)
367 #if !(defined WINDOWS_NATIVE || defined OS2)
369 # if HAVE_LANGINFO_CODESET
371 /* Most systems support nl_langinfo (CODESET) nowadays. */
372 codeset = nl_langinfo (CODESET);
375 /* Cygwin < 1.7 does not have locales. nl_langinfo (CODESET) always
376 returns "US-ASCII". Return the suffix of the locale name from the
377 environment variables (if present) or the codepage as a number. */
378 if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
381 static char buf[2 + 10 + 1];
383 locale = getenv ("LC_ALL");
384 if (locale == NULL || locale[0] == '\0')
386 locale = getenv ("LC_CTYPE");
387 if (locale == NULL || locale[0] == '\0')
388 locale = getenv ("LANG");
390 if (locale != NULL && locale[0] != '\0')
392 /* If the locale name contains an encoding after the dot, return it. */
394 const char *dot = strchr (locale, '.');
398 const char *modifier;
401 /* Look for the possible @... trailer and remove it, if any. */
402 modifier = strchr (dot, '@');
403 if (modifier == NULL)
405 if (modifier - dot < sizeof (buf))
407 memcpy (buf, dot, modifier - dot);
408 buf [modifier - dot] = '\0';
414 /* The Windows API has a function returning the locale's codepage as a
415 number: GetACP(). This encoding is used by Cygwin, unless the user
416 has set the environment variable CYGWIN=codepage:oem (which very few people do).
418 Output directed to console windows needs to be converted (to
419 GetOEMCP() if the console is using a raster font, or to
420 GetConsoleOutputCP() if it is using a TrueType font). Cygwin does
421 this conversion transparently (see winsup/cygwin/fhandler_console.cc),
422 converting to GetConsoleOutputCP(). This leads to correct results,
423 except when SetConsoleOutputCP has been called and a raster font is in use. */
425 sprintf (buf, "CP%u", GetACP ());
432 /* On old systems which lack it, use setlocale or getenv. */
433 const char *locale = NULL;
435 /* But most old systems don't have a complete set of locales. Some
436 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
437 use setlocale here; it would return "C" when it doesn't support the
438 locale name the user has set. */
440 locale = setlocale (LC_CTYPE, NULL);
442 if (locale == NULL || locale[0] == '\0')
444 locale = getenv ("LC_ALL");
445 if (locale == NULL || locale[0] == '\0')
447 locale = getenv ("LC_CTYPE");
448 if (locale == NULL || locale[0] == '\0')
449 locale = getenv ("LANG");
453 /* On some old systems, one used to set locale = "iso8859_1". On others,
454 you set it to "language_COUNTRY.charset". In any case, we resolve it
455 through the charset.alias file. */
460 #elif defined WINDOWS_NATIVE
462 static char buf[2 + 10 + 1];
464 /* The Windows API has a function returning the locale's codepage as a number: GetACP().
466 When the output goes to a console window, it needs to be provided in
467 GetOEMCP() encoding if the console is using a raster font, or in
468 GetConsoleOutputCP() encoding if it is using a TrueType font.
469 But in GUI programs and for output sent to files and pipes, GetACP()
470 encoding is the best bet. */
471 sprintf (buf, "CP%u", GetACP ());
477 static char buf[2 + 10 + 1];
481 /* Allow user to override the codeset, as set in the operating system,
482 with standard language environment variables. */
483 locale = getenv ("LC_ALL");
484 if (locale == NULL || locale[0] == '\0')
486 locale = getenv ("LC_CTYPE");
487 if (locale == NULL || locale[0] == '\0')
488 locale = getenv ("LANG");
490 if (locale != NULL && locale[0] != '\0')
492 /* If the locale name contains an encoding after the dot, return it. */
493 const char *dot = strchr (locale, '.');
497 const char *modifier;
500 /* Look for the possible @... trailer and remove it, if any. */
501 modifier = strchr (dot, '@');
502 if (modifier == NULL)
504 if (modifier - dot < sizeof (buf))
506 memcpy (buf, dot, modifier - dot);
507 buf [modifier - dot] = '\0';
512 /* Resolve through the charset.alias file. */
517 /* OS/2 has a function returning the locale's codepage as a number. */
518 if (DosQueryCp (sizeof (cp), cp, &cplen))
522 sprintf (buf, "CP%u", cp[0]);
530 /* The canonical name cannot be determined. */
534 for (aliases = get_charset_aliases ();
536 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
537 if (strcmp (codeset, aliases) == 0
538 || (aliases[0] == '*' && aliases[1] == '\0'))
540 codeset = aliases + strlen (aliases) + 1;
544 /* Don't return an empty string. GNU libc and GNU libiconv interpret
545 the empty string as denoting "the locale's character encoding",
546 thus GNU libiconv would call this function a second time. */
547 if (codeset[0] == '\0')
551 /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
552 (the default codeset) does not work when MB_CUR_MAX is 1. */
553 if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1)