2 * Copyright (C) 2001 Peter Harris <peter.harris@hummingbird.com>
3 * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 * Convert a string between UTF-8 and the locale's charset.
37 /* Thanks to Peter Harris <peter.harris@hummingbird.com> for this win32
/*
 * Encode the wide-character string `unicode` as UTF-8.
 *
 * The output buffer is obtained with malloc(size + 1), where `size` is
 * computed by a first pass over the input, and is terminated with a
 * 0x00 byte.  The visible encoding branches emit one, two or three
 * bytes per input value.
 *
 * NOTE(review): no branch for values above 0xFFFF (4-byte sequences)
 * or for UTF-16 surrogate pairs is visible here -- confirm whether
 * that is intentional.
 * NOTE(review): the malloc() failure check and the return statement
 * are not visible here; presumably `out` is returned and the caller
 * frees it -- confirm.
 */
44 static unsigned char *make_utf8_string(const wchar_t *unicode)
46 int size = 0, index = 0, out_index = 0;
50 /* first calculate the size of the target string */
55 } else if(c < 0x0800) {
/* one extra byte for the terminating 0x00 written at the end */
63 out = malloc(size + 1);
/* single-byte form: the value is stored unchanged */
72 out[out_index++] = (unsigned char)c;
73 } else if(c < 0x800) {
/* two-byte form: 0xc0 lead byte carries the bits above the low 6 */
74 out[out_index++] = 0xc0 | (c >> 6);
75 out[out_index++] = 0x80 | (c & 0x3f);
/* three-byte form: 0xe0 lead byte, then two 6-bit continuation bytes */
77 out[out_index++] = 0xe0 | (c >> 12);
78 out[out_index++] = 0x80 | ((c >> 6) & 0x3f);
79 out[out_index++] = 0x80 | (c & 0x3f);
/* terminate the UTF-8 string */
83 out[out_index] = 0x00;
/*
 * Decode the UTF-8 byte string `utf8` into a wide-character string.
 *
 * The output buffer is obtained with
 * malloc((size + 1) * sizeof(wchar_t)), where `size` is computed by a
 * first pass over the input.  The visible decoding branches handle a
 * byte with the top bit clear (copied as-is), a lead byte matching
 * (c & 0xe0) == 0xe0 (three-byte form) and, otherwise, a two-byte form.
 *
 * NOTE(review): (c & 0xe0) == 0xe0 also matches lead bytes 0xF0-0xFF,
 * so a 4-byte sequence would be treated as a 3-byte one; the usual
 * 3-byte lead test is (c & 0xf0) == 0xe0 -- confirm.
 * NOTE(review): no validation of continuation bytes or of truncated
 * input is visible here -- confirm against the full function.
 * NOTE(review): presumably `c` is reloaded from the input between the
 * statements of each multi-byte branch (not visible here).
 */
88 static wchar_t *make_unicode_string(const unsigned char *utf8)
90 int size = 0, index = 0, out_index = 0;
94 /* first calculate the size of the target string */
99 } else if((c & 0xe0) == 0xe0) {
/* one extra element beyond the computed size */
108 out = malloc((size + 1) * sizeof(wchar_t));
/* top bit clear: the byte is the code point */
116 if((c & 0x80) == 0) {
117 out[out_index++] = c;
118 } else if((c & 0xe0) == 0xe0) {
/* three-byte form: lead byte supplies the bits from position 12 up */
119 out[out_index] = (c & 0x1F) << 12;
121 out[out_index] |= (c & 0x3F) << 6;
123 out[out_index++] |= (c & 0x3F);
/* two-byte form: first byte supplies bits 6 and up, second the low 6 */
125 out[out_index] = (c & 0x3F) << 6;
127 out[out_index++] |= (c & 0x3F);
/*
 * Windows variant: convert `from`, interpreted in the ANSI code page
 * (CP_ACP), to UTF-8 and store the result of make_utf8_string() in *to.
 *
 * MultiByteToWideChar() is called twice: first with a NULL buffer and
 * a size of 0, with the result kept in `wchars` and used to size the
 * allocation, then again to fill the allocated buffer.
 *
 * NOTE(review): the buffer element size is sizeof(unsigned short)
 * while make_utf8_string() takes a const wchar_t pointer;
 * sizeof(wchar_t) would state the intent directly -- confirm the two
 * are the same size on every supported target.
 * NOTE(review): GetLastError() is printed with %d; the Win32
 * documentation gives its return type as DWORD -- confirm the format
 * specifier matches.
 * NOTE(review): return values and the release of `unicode` are not
 * visible here.
 */
136 int utf8_encode(const char *from, char **to)
141 wchars = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from,
142 strlen(from), NULL, 0);
146 fprintf(stderr, "Unicode translation error %d\n", GetLastError());
/*
 * strlen(from) excludes the terminator from the conversion, so one
 * extra element is allocated; calloc() zero-fills it.
 */
150 unicode = calloc(wchars + 1, sizeof(unsigned short));
153 fprintf(stderr, "Out of memory processing string to UTF8\n");
157 err = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from,
158 strlen(from), unicode, wchars);
162 fprintf(stderr, "Unicode translation error %d\n", GetLastError());
166 /* On NT-based windows systems, we could use WideCharToMultiByte(), but
167 * MS doesn't actually have a consistent API across win32.
169 *to = make_utf8_string(unicode);
/*
 * Windows variant: convert the UTF-8 string `from` to the code page
 * returned by GetConsoleCP() and store a calloc()ed buffer in *to.
 *
 * make_unicode_string() first expands the UTF-8 input to wide
 * characters.  WideCharToMultiByte() is then called twice: once with
 * a NULL output buffer, with the result kept in `chars` and used to
 * size the allocation (chars + 1 bytes), and once to fill the buffer.
 *
 * NOTE(review): this function targets GetConsoleCP() whereas
 * utf8_encode() reads its input as CP_ACP; the two code pages can
 * differ -- confirm the asymmetry is intended.
 * NOTE(review): `from` is a const char pointer but
 * make_unicode_string() takes a const unsigned char pointer.
 * NOTE(review): GetLastError() is printed with %d; the Win32
 * documentation gives its return type as DWORD -- confirm.
 * NOTE(review): return values and the release of `unicode` are not
 * visible here.
 */
175 int utf8_decode(const char *from, char **to)
180 /* On NT-based windows systems, we could use MultiByteToWideChar(CP_UTF8), but
181 * MS doesn't actually have a consistent API across win32.
183 unicode = make_unicode_string(from);
186 fprintf(stderr, "Out of memory processing string from UTF8 to UNICODE16\n");
190 chars = WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, unicode,
191 -1, NULL, 0, NULL, NULL);
195 fprintf(stderr, "Unicode translation error %d\n", GetLastError());
200 *to = calloc(chars + 1, sizeof(unsigned char));
203 fprintf(stderr, "Out of memory processing string to local charset\n");
208 err = WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, unicode,
209 -1, *to, chars, NULL, NULL);
212 fprintf(stderr, "Unicode translation error %d\n", GetLastError());
223 #else /* End win32. Rest is for real operating systems */
226 #ifdef HAVE_LANGINFO_CODESET
227 #include <langinfo.h>
/*
 * Prototype for iconvert(), which convert_buffer() calls with these
 * same six arguments.  No definition is visible in this part of the
 * file.
 * NOTE(review): presumably converts `fromlen` bytes at `from` from
 * charset `fromcode` to `tocode`, returning a new buffer through `to`
 * and its length through `tolen` -- confirm against the definition.
 */
230 int iconvert(const char *fromcode, const char *tocode,
231 const char *from, size_t fromlen,
232 char **to, size_t *tolen);
/*
 * Charset name used by utf8_encode() and utf8_decode().  It is set by
 * convert_set_charset(), which stores a strdup()ed copy; while it is
 * a null pointer those functions use "US-ASCII" instead.
 */
234 static char *current_charset = 0; /* means "US-ASCII" */
/*
 * Replace the charset name stored in current_charset.
 *
 * The visible code can take the name from nl_langinfo(CODESET) (only
 * when HAVE_LANGINFO_CODESET is defined) or from the CHARSET
 * environment variable.  The previous value is freed, and a non-empty
 * name is stored as a strdup()ed copy.
 *
 * NOTE(review): the conditions guarding the nl_langinfo() and getenv()
 * assignments are not visible here; presumably they are fallbacks used
 * only when `charset` is null -- confirm.
 * NOTE(review): confirm current_charset is reset to null between the
 * free() and the final test; otherwise a null or empty `charset`
 * would leave it pointing at freed memory.
 * NOTE(review): the strdup() result is not checked in the visible
 * lines.
 */
236 UTF8_API void convert_set_charset(const char *charset)
239 #ifdef HAVE_LANGINFO_CODESET
241 charset = nl_langinfo(CODESET);
245 charset = getenv("CHARSET");
/* release the previously stored name before replacing it */
247 free(current_charset);
/* store a private copy only when the name is non-null and non-empty */
249 if (charset && *charset)
250 current_charset = strdup(charset);
/*
 * Convert `fromlen` bytes at `from` from charset `fromcode` to charset
 * `tocode` by forwarding all six arguments to iconvert() and, in the
 * section compiled when HAVE_ICONV is not defined, to
 * charset_convert().
 *
 * NOTE(review): the preprocessor guard around the iconvert() call, the
 * handling of `ret` between the two calls and the return statement
 * are not visible here -- confirm against the full function.
 */
253 static int convert_buffer(const char *fromcode, const char *tocode,
254 const char *from, size_t fromlen,
255 char **to, size_t *tolen)
260 ret = iconvert(fromcode, tocode, from, fromlen, to, tolen);
265 #ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */
266 ret = charset_convert(fromcode, tocode, from, fromlen, to, tolen);
/*
 * Convert the NUL-terminated string `from` from charset `fromcode` to
 * charset `tocode`, returning the result through `to`.
 *
 * The input length is taken with strlen() and convert_buffer() is
 * called with a null `tolen`.  A buffer of fromlen + 1 bytes is also
 * allocated further down.
 *
 * NOTE(review): presumably that buffer belongs to a fallback path that
 * copies `from` and substitutes `replace` for bytes that cannot be
 * kept when convert_buffer() fails -- that code is not visible here;
 * confirm, along with the return values.
 */
274 static int convert_string(const char *fromcode, const char *tocode,
275 const char *from, char **to, char replace)
281 fromlen = strlen(from);
/* tolen is passed as 0: the output length is not requested */
282 ret = convert_buffer(fromcode, tocode, from, fromlen, to, 0);
288 s = malloc(fromlen + 1);
/*
 * Convert `from` from the current charset to UTF-8, returning the
 * result through `to`.
 *
 * Returns whatever convert_string() returns.  '#' is passed as the
 * `replace` character.
 */
299 UTF8_API int utf8_encode(const char *from, char **to)
/* no charset stored yet: let convert_set_charset() pick one */
303 if (!current_charset)
304 convert_set_charset(0);
/* still null after that: use US-ASCII */
305 charset = current_charset ? current_charset : "US-ASCII";
306 return convert_string(charset, "UTF-8", from, to, '#');
/*
 * Convert `from` from UTF-8 to the current charset, returning the
 * result through `to`.
 *
 * Returns whatever convert_string() returns.  '?' is passed as the
 * `replace` character.
 */
309 UTF8_API int utf8_decode(const char *from, char **to)
/* no charset stored yet: let convert_set_charset() pick one */
313 if (!current_charset)
314 convert_set_charset(0);
/* still null after that: use US-ASCII */
315 charset = current_charset ? current_charset : "US-ASCII";
316 return convert_string("UTF-8", charset, from, to, '?');