1 /* unicode.c - functions to convert unicode characters */
3 /* Copyright (C) 2010 Free Software Foundation, Inc.
5 This file is part of GNU Bash, the Bourne Again SHell.
7 Bash is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 Bash is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with Bash. If not, see <http://www.gnu.org/licenses/>. */
23 #if defined (HANDLE_MULTIBYTE)
41 # define USHORT_MAX USHRT_MAX
43 # define USHORT_MAX ((unsigned short) ~(unsigned short)0)
48 # define STREQ(a, b) ((a)[0] == (b)[0] && strcmp ((a), (b)) == 0)
51 #if defined (HAVE_LOCALE_CHARSET)
52 extern const char *locale_charset __P((void));
54 extern char *get_locale_var __P((char *));
57 static int u32init = 0;
58 static int utf8locale = 0;
59 #if defined (HAVE_ICONV)
60 static iconv_t localconv;
63 #ifndef HAVE_LOCALE_CHARSET
69 locale = get_locale_var ("LC_CTYPE");
70 if (locale == 0 || *locale == 0)
72 s = strrchr (locale, '.');
80 else if (STREQ (locale, "UTF-8"))
97 l = (x <= UCHAR_MAX) ? 1 : ((x <= USHORT_MAX) ? 2 : 4);
101 else if (x <= USHORT_MAX) /* assume unsigned short = 16 bits */
103 s[0] = (x >> 8) & 0xFF;
108 s[0] = (x >> 24) & 0xFF;
109 s[1] = (x >> 16) & 0xFF;
110 s[2] = (x >> 8) & 0xFF;
124 l = (wc < 0x0080) ? 1 : ((wc < 0x0800) ? 2 : 3);
127 s[0] = (unsigned char)wc;
128 else if (wc < 0x0800)
130 s[0] = (wc >> 6) | 0xc0;
131 s[1] = (wc & 0x3f) | 0x80;
135 s[0] = (wc >> 12) | 0xe0;
136 s[1] = ((wc >> 6) & 0x3f) | 0x80;
137 s[2] = (wc & 0x3f) | 0x80;
143 /* Convert a single UTF-32 character into a multibyte string and put the
144 result in S, which must be large enough (at least MB_LEN_MAX bytes). */
154 char obuf[25], *optr;
162 #if __STDC_ISO_10646__
163 if (sizeof (wchar_t) == 4)
171 codeset = nl_langinfo (CODESET);
172 if (STREQ (codeset, "UTF-8"))
174 n = u32toutf8 (wc, s);
180 /* this is mostly from coreutils-8.5/lib/unicodeio.c */
183 # if HAVE_LOCALE_CHARSET
184 charset = locale_charset (); /* XXX - fix later */
186 charset = stub_charset ();
188 if (STREQ (charset, "UTF-8"))
192 localconv = iconv_open (charset, "UTF-8");
193 if (localconv == (iconv_t)-1)
194 localconv = iconv_open (charset, "ASCII");
201 n = u32toutf8 (wc, s);
205 if (localconv == (iconv_t)-1)
207 n = u32tochar (wc, s);
211 n = u32toutf8 (wc, s);
214 obytesleft = sizeof (obuf);
218 iconv (localconv, NULL, NULL, NULL, NULL);
220 if (iconv (localconv, (ICONV_CONST char **)&iptr, &sn, &optr, &obytesleft) == (size_t)-1)
221 return n; /* You get utf-8 if iconv fails */
225 /* number of chars to be copied is optr - obuf if we want to do bounds checking */
228 return (optr - obuf);
231 n = u32tochar (wc, s); /* fallback */
235 #endif /* HANDLE_MULTIBYTE */