1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
4 * Copyright (C) 1999-2008 Novell, Inc. (www.novell.com)
7 * Michael Zucchi <notzed@ximian.com>
8 * Jeffery Stedfast <fejj@ximian.com>
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Library General Public
12 * License, version 2, as published by the Free Software Foundation.
14 * This library is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Library General Public License for more details.
19 * You should have received a copy of the GNU Library General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
40 #include "camel-iconv.h"
41 #include "iconv-detect.h"
45 G_LOCK_DEFINE_STATIC (iconv);
47 struct _iconv_cache_node {
48 struct _iconv_cache *parent;
56 GQueue open; /* stores iconv_cache_nodes, busy ones up front */
59 #define E_ICONV_CACHE_SIZE (16)
61 static GQueue iconv_cache_list = G_QUEUE_INIT;
62 static GHashTable *iconv_cache;
63 static GHashTable *iconv_cache_open;
65 static GHashTable *iconv_charsets = NULL;
66 static gchar *locale_charset = NULL;
67 static gchar *locale_lang = NULL;
71 const gchar *iconv_name;
72 } known_iconv_charsets[] = {
74 /* charset name, iconv-friendly charset name */
75 { "iso-8859-1", "iso-8859-1" },
76 { "iso8859-1", "iso-8859-1" },
77 /* the above mostly serves as an example for iso-style charsets,
78 * but we have code that will populate the iso-*'s if/when they
79 * show up in camel_iconv_charset_name () so I'm
80 * not going to bother putting them all in here... */
81 { "windows-cp1251", "cp1251" },
82 { "windows-1251", "cp1251" },
83 { "cp1251", "cp1251" },
84 /* the above mostly serves as an example for windows-style
85 * charsets, but we have code that will parse and convert them
86 * to their cp#### equivalents if/when they show up in
87 * camel_iconv_charset_name () so I'm not going to bother
88 * putting them all in here either... */
90 /* charset name (lowercase!), iconv-friendly name (sometimes case sensitive) */
93 /* 10646 is a special case, it's usually UCS-2 big endian */
94 /* This might need some checking but should be OK for Solaris/Linux */
95 { "iso-10646-1", "UCS-2BE" },
96 { "iso_10646-1", "UCS-2BE" },
97 { "iso10646-1", "UCS-2BE" },
98 { "iso-10646", "UCS-2BE" },
99 { "iso_10646", "UCS-2BE" },
100 { "iso10646", "UCS-2BE" },
102 { "ks_c_5601-1987", "EUC-KR" },
104 /* FIXME: Japanese/Korean/Chinese stuff needs checking */
105 { "euckr-0", "EUC-KR" },
106 { "5601", "EUC-KR" },
107 { "zh_TW-euc", "EUC-TW" },
108 { "zh_CN.euc", "gb18030" },
109 { "zh_TW-big5", "BIG5" },
110 { "euc-cn", "gb18030" },
111 { "big5-0", "BIG5" },
112 { "big5.eten-0", "BIG5" },
113 { "big5hkscs-0", "BIG5HKSCS" },
114 { "gb2312-0", "gb18030" },
115 { "gb2312.1980-0", "gb18030" },
116 { "gb-2312", "gb18030" },
117 { "gb2312", "gb18030" },
118 { "gb18030-0", "gb18030" },
121 { "eucjp-0", "eucJP" },
122 { "ujis-0", "ujis" },
123 { "jisx0208.1983-0","SJIS" },
124 { "jisx0212.1990-0","SJIS" },
130 e_strdown (gchar *str)
132 register gchar *s = str;
135 if (*s >= 'A' && *s <= 'Z')
146 register gchar *s = str;
149 if (*s >= 'a' && *s <= 'z')
158 locale_parse_lang (const gchar *locale)
160 gchar *codeset, *lang;
162 if ((codeset = strchr (locale, '.')))
163 lang = g_strndup (locale, codeset - locale);
165 lang = g_strdup (locale);
167 /* validate the language */
168 if (strlen (lang) >= 2) {
169 if (lang[2] == '-' || lang[2] == '_') {
170 /* canonicalise the lang */
173 /* validate the country code */
174 if (strlen (lang + 3) > 2) {
175 /* invalid country code */
181 } else if (lang[2] != '\0') {
182 /* invalid language */
189 /* invalid language */
195 /* NOTE: Owns the lock on return if keep is TRUE! */
197 iconv_init (gint keep)
199 gchar *from, *to, *locale;
204 if (iconv_charsets != NULL) {
210 iconv_charsets = g_hash_table_new (g_str_hash, g_str_equal);
212 for (i = 0; known_iconv_charsets[i].charset != NULL; i++) {
213 from = g_strdup (known_iconv_charsets[i].charset);
214 to = g_strdup (known_iconv_charsets[i].iconv_name);
216 g_hash_table_insert (iconv_charsets, from, to);
219 iconv_cache = g_hash_table_new (g_str_hash, g_str_equal);
220 iconv_cache_open = g_hash_table_new (NULL, NULL);
223 locale = setlocale (LC_ALL, NULL);
225 locale = g_win32_getlocale ();
228 if (!locale || !strcmp (locale, "C") || !strcmp (locale, "POSIX")) {
229 /* The locale "C" or "POSIX" is a portable locale; its
230 * LC_CTYPE part corresponds to the 7-bit ASCII character
234 locale_charset = NULL;
238 g_get_charset (&locale_charset);
239 locale_charset = g_strdup (locale_charset);
240 e_strdown (locale_charset);
243 locale_charset = g_strdup (nl_langinfo (CODESET));
244 e_strdown (locale_charset);
246 /* A locale name is typically of the form language[_terri-
247 * tory][.codeset][@modifier], where language is an ISO 639
248 * language code, territory is an ISO 3166 country code, and
249 * codeset is a character set or encoding identifier like
250 * ISO-8859-1 or UTF-8.
254 codeset = strchr (locale, '.');
258 /* ; is a hack for Debian systems and / is a hack for Solaris systems */
259 for (p = codeset; *p && !strchr ("@;/", *p); p++);
260 locale_charset = g_strndup (codeset, p - codeset);
261 e_strdown (locale_charset);
263 /* charset unknown */
264 locale_charset = NULL;
267 #endif /* !G_OS_WIN32 */
269 /* parse the locale lang */
270 locale_parse_lang (locale);
282 camel_iconv_charset_name (const gchar *charset)
284 gchar *name, *ret, *tmp;
289 name = g_alloca (strlen (charset) + 1);
290 strcpy (name, charset);
294 ret = g_hash_table_lookup (iconv_charsets, name);
300 /* Unknown, try to canonicalise some basic charset types to something that should work */
301 if (strncmp (name, "iso", 3) == 0) {
302 /* Convert iso-nnnn-n or isonnnn-n or iso_nnnn-n to iso-nnnn-n or isonnnn-n */
307 if (*tmp == '-' || *tmp == '_')
310 iso = strtoul (tmp, &p, 10);
313 /* they all become ICONV_10646 */
314 ret = g_strdup (ICONV_10646);
317 if (*tmp == '-' || *tmp == '_')
320 codepage = strtoul (tmp, &p, 10);
323 /* codepage is numeric */
326 ret = g_strdup ("IBM-921");
329 ret = g_strdup_printf (ICONV_ISO_D_FORMAT, iso, codepage);
331 /* codepage is a string - probably iso-2022-jp or something */
332 ret = g_strdup_printf (ICONV_ISO_S_FORMAT, iso, p);
335 } else if (strncmp (name, "windows-", 8) == 0) {
336 /* Convert windows-nnnnn or windows-cpnnnnn to cpnnnn */
338 if (!strncmp (tmp, "cp", 2))
340 ret = g_strdup_printf ("CP%s", tmp);
341 } else if (strncmp (name, "microsoft-", 10) == 0) {
342 /* Convert microsoft-nnnnn or microsoft-cpnnnnn to cpnnnn */
344 if (!strncmp (tmp, "cp", 2))
346 ret = g_strdup_printf ("CP%s", tmp);
348 /* Just assume it's OK enough as is, case and all */
349 ret = g_strdup (charset);
352 g_hash_table_insert (iconv_charsets, g_strdup (name), ret);
359 flush_entry (struct _iconv_cache *ic)
361 struct _iconv_cache_node *in;
363 while ((in = g_queue_pop_head (&ic->open)) != NULL) {
364 if (in->ip != (iconv_t) - 1) {
365 g_hash_table_remove (iconv_cache_open, in->ip);
366 iconv_close (in->ip);
375 /* This should run pretty quickly; it's called a lot */
377 camel_iconv_open (const gchar *oto,
380 const gchar *to, *from;
382 struct _iconv_cache *ic;
383 struct _iconv_cache_node *in;
387 if (oto == NULL || ofrom == NULL) {
392 to = camel_iconv_charset_name (oto);
393 from = camel_iconv_charset_name (ofrom);
394 tofrom = g_alloca (strlen (to) + strlen (from) + 2);
395 sprintf (tofrom, "%s%%%s", to, from);
399 ic = g_hash_table_lookup (iconv_cache, tofrom);
401 g_queue_remove (&iconv_cache_list, ic);
405 link = g_queue_peek_tail_link (&iconv_cache_list);
407 while (link != NULL && iconv_cache_list.length > E_ICONV_CACHE_SIZE) {
408 GList *prev = g_list_previous (link);
410 ic = (struct _iconv_cache *) link->data;
411 in = g_queue_peek_head (&ic->open);
413 if (in != NULL && !in->busy) {
414 cd (printf ("Flushing iconv converter '%s'\n", ic->conv));
415 g_queue_delete_link (&iconv_cache_list, link);
416 g_hash_table_remove (iconv_cache, ic->conv);
423 ic = g_malloc (sizeof (*ic));
424 g_queue_init (&ic->open);
425 ic->conv = g_strdup (tofrom);
426 g_hash_table_insert (iconv_cache, ic->conv, ic);
428 cd (printf ("Creating iconv converter '%s'\n", ic->conv));
431 g_queue_push_head (&iconv_cache_list, ic);
433 /* If we have a free iconv, use it */
434 in = g_queue_peek_tail (&ic->open);
435 if (in != NULL && !in->busy) {
436 cd (printf ("using existing iconv converter '%s'\n", ic->conv));
438 if (ip != (iconv_t) - 1) {
439 /* work around some broken iconv implementations
440 * that die if the length arguments are NULL
442 gsize buggy_iconv_len = 0;
443 gchar *buggy_iconv_buf = NULL;
445 /* resets the converter */
446 iconv (ip, &buggy_iconv_buf, &buggy_iconv_len, &buggy_iconv_buf, &buggy_iconv_len);
448 g_queue_remove (&ic->open, in);
449 g_queue_push_head (&ic->open, in);
452 cd (printf ("creating new iconv converter '%s'\n", ic->conv));
453 ip = iconv_open (to, from);
454 in = g_malloc (sizeof (*in));
457 g_queue_push_head (&ic->open, in);
458 if (ip != (iconv_t) - 1) {
459 g_hash_table_insert (iconv_cache_open, ip, in);
463 g_warning ("Could not open converter for '%s' to '%s' charset", from, to);
475 camel_iconv (iconv_t cd,
481 return iconv (cd, (gchar **) inbuf, inbytesleft, outbuf, outbytesleft);
485 camel_iconv_close (iconv_t ip)
487 struct _iconv_cache_node *in;
489 if (ip == (iconv_t) - 1)
493 in = g_hash_table_lookup (iconv_cache_open, ip);
495 cd (printf ("closing iconv converter '%s'\n", in->parent->conv));
496 g_queue_remove (&in->parent->open, in);
498 g_queue_push_tail (&in->parent->open, in);
500 g_warning ("trying to close iconv i dont know about: %p", ip);
507 camel_iconv_locale_charset (void)
511 return locale_charset;
515 camel_iconv_locale_language (void)
522 /* map CJKR charsets to their language code */
523 /* NOTE: only support charset names that will be returned by
524 * camel_iconv_charset_name() so that we don't have to keep track of all
525 * the aliases too. */
527 const gchar *charset;
529 } cjkr_lang_map[] = {
531 { "BIG5HKSCS", "zh" },
536 { "iso-2022-jp", "ja" },
547 camel_iconv_charset_language (const gchar *charset)
554 charset = camel_iconv_charset_name (charset);
555 for (i = 0; i < G_N_ELEMENTS (cjkr_lang_map); i++) {
556 if (!g_ascii_strcasecmp (cjkr_lang_map[i].charset, charset))
557 return cjkr_lang_map[i].lang;