1 /* unicode.c - functions to convert unicode characters */
3 /* Copyright (C) 2010-2012 Free Software Foundation, Inc.
5 This file is part of GNU Bash, the Bourne Again SHell.
7 Bash is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 Bash is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Bash. If not, see <http://www.gnu.org/licenses/>.
23 #if defined (HANDLE_MULTIBYTE)
42 # define USHORT_MAX USHRT_MAX
44 # define USHORT_MAX ((unsigned short) ~(unsigned short)0)
49 # define STREQ(a, b) ((a)[0] == (b)[0] && strcmp ((a), (b)) == 0)
52 #if defined (HAVE_LOCALE_CHARSET)
53 extern const char *locale_charset __P((void));
55 extern char *get_locale_var __P((char *));
58 static int u32init = 0;
59 static int utf8locale = 0;
60 #if defined (HAVE_ICONV)
61 static iconv_t localconv;
64 #ifndef HAVE_LOCALE_CHARSET
65 static char charsetbuf[40];
72 locale = get_locale_var ("LC_CTYPE");
73 if (locale == 0 || *locale == 0)
75 strcpy (charsetbuf, "ASCII");
78 s = strrchr (locale, '.');
81 strcpy (charsetbuf, s+1);
82 t = strchr (charsetbuf, '@');
87 strcpy (charsetbuf, locale);
95 #if defined (HAVE_ICONV)
96 if (u32init && localconv != (iconv_t)-1)
98 iconv_close (localconv);
99 localconv = (iconv_t)-1;
114 l = (x <= UCHAR_MAX) ? 1 : ((x <= USHORT_MAX) ? 2 : 4);
118 else if (x <= USHORT_MAX) /* assume unsigned short = 16 bits */
120 s[0] = (x >> 8) & 0xFF;
125 s[0] = (x >> 24) & 0xFF;
126 s[1] = (x >> 16) & 0xFF;
127 s[2] = (x >> 8) & 0xFF;
142 l = sprintf (s, "\\u%04X", wc);
144 l = sprintf (s, "\\u%08X", wc);
148 /* Convert unsigned 32-bit int to utf-8 character string */
161 else if (wc < 0x0800)
163 s[0] = (wc >> 6) | 0xc0;
164 s[1] = (wc & 0x3f) | 0x80;
167 else if (wc < 0x10000)
169 /* Technically, we could return 0 here if 0xd800 <= wc <= 0xdfff (the UTF-16 surrogate range, which has no valid UTF-8 encoding) */
170 s[0] = (wc >> 12) | 0xe0;
171 s[1] = ((wc >> 6) & 0x3f) | 0x80;
172 s[2] = (wc & 0x3f) | 0x80;
175 else if (wc < 0x200000)
177 s[0] = (wc >> 18) | 0xf0;
178 s[1] = ((wc >> 12) & 0x3f) | 0x80;
179 s[2] = ((wc >> 6) & 0x3f) | 0x80;
180 s[3] = (wc & 0x3f) | 0x80;
183 /* Strictly speaking, UTF-8 doesn't have characters longer than 4 bytes */
184 else if (wc < 0x04000000)
186 s[0] = (wc >> 24) | 0xf8;
187 s[1] = ((wc >> 18) & 0x3f) | 0x80;
188 s[2] = ((wc >> 12) & 0x3f) | 0x80;
189 s[3] = ((wc >> 6) & 0x3f) | 0x80;
190 s[4] = (wc & 0x3f) | 0x80;
193 else if (wc < 0x080000000)
195 s[0] = (wc >> 30) | 0xf8;
196 s[1] = ((wc >> 24) & 0x3f) | 0x80;
197 s[2] = ((wc >> 18) & 0x3f) | 0x80;
198 s[3] = ((wc >> 12) & 0x3f) | 0x80;
199 s[4] = ((wc >> 6) & 0x3f) | 0x80;
200 s[5] = (wc & 0x3f) | 0x80;
210 /* Convert a 32-bit unsigned int (unicode) to a UTF-16 string. Rarely used,
211 only if sizeof(wchar_t) == 2. */
222 s[0] = (unsigned short) (c & 0xFFFF);
225 else if (c >= 0x0e000 && c <= 0x010ffff)
228 s[0] = (unsigned short)((c >> 10) + 0xd800);
229 s[1] = (unsigned short)((c & 0x3ff) + 0xdc00);
236 /* Convert a single UTF-32 code point into a multibyte string and put the
237 result in S, which must be large enough (at least MB_LEN_MAX bytes). */
248 char obuf[25], *optr;
254 #if __STDC_ISO_10646__
256 if (sizeof (wchar_t) == 4 && c <= 0x7fffffff)
258 else if (sizeof (wchar_t) == 2 && c <= 0x10ffff && u32toutf16 (c, ws))
259 n = wcstombs (s, ws, MB_LEN_MAX);
267 codeset = nl_langinfo (CODESET);
268 if (STREQ (codeset, "UTF-8"))
270 n = u32toutf8 (c, s);
276 /* this is mostly from coreutils-8.5/lib/unicodeio.c */
279 # if HAVE_LOCALE_CHARSET
280 charset = locale_charset (); /* XXX - fix later */
282 charset = stub_charset ();
284 if (STREQ (charset, "UTF-8"))
288 localconv = iconv_open (charset, "UTF-8");
289 if (localconv == (iconv_t)-1)
290 /* We assume ASCII when presented with an unknown encoding. */
291 localconv = iconv_open ("ASCII", "UTF-8");
296 /* If we have a UTF-8 locale, convert to UTF-8 and return converted value. */
297 n = u32toutf8 (c, s);
301 /* If the conversion is not supported, even the ASCII requested above, we
302 bail now. Currently we return the UTF-8 conversion. We could return
304 if (localconv == (iconv_t)-1)
308 obytesleft = sizeof (obuf);
312 iconv (localconv, NULL, NULL, NULL, NULL);
314 if (iconv (localconv, (ICONV_CONST char **)&iptr, &sn, &optr, &obytesleft) == (size_t)-1)
317 /* You get ISO C99 escape sequences if iconv fails */
318 n = u32tocesc (c, s);
320 /* You get UTF-8 if iconv fails */
327 /* number of chars to be copied is optr - obuf if we want to do bounds
330 return (optr - obuf);
331 #endif /* HAVE_ICONV */
333 n = u32tocesc (c, s); /* fallback is ISO C99 escape sequences */
341 #endif /* HANDLE_MULTIBYTE */