1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
4 * Copyright 2000, 2001, Ximian, Inc.
7 * Michael Zucchi <notzed@ximian.com>
8 * Jeffery Stedfast <fejj@ximian.com>
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Library General Public
12 * License, version 2, as published by the Free Software Foundation.
14 * This library is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Library General Public License for more details.
19 * You should have received a copy of the GNU Library General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
43 #include "iconv-detect.h"
47 #ifdef G_THREADS_ENABLED
48 static GStaticMutex lock = G_STATIC_MUTEX_INIT;
49 #define LOCK() g_static_mutex_lock(&lock)
50 #define UNLOCK() g_static_mutex_unlock(&lock)
56 typedef struct _EDListNode {
57 struct _EDListNode *next;
58 struct _EDListNode *prev;
61 typedef struct _EDList {
62 struct _EDListNode *head;
63 struct _EDListNode *tail;
64 struct _EDListNode *tailpred;
67 #define E_DLIST_INITIALISER(l) { (EDListNode *)&l.tail, 0, (EDListNode *)&l.head }
69 struct _iconv_cache_node {
70 struct _iconv_cache_node *next;
71 struct _iconv_cache_node *prev;
73 struct _iconv_cache *parent;
80 struct _iconv_cache *next;
81 struct _iconv_cache *prev;
85 EDList open; /* stores iconv_cache_nodes, busy ones up front */
88 #define E_ICONV_CACHE_SIZE (16)
90 static EDList iconv_cache_list;
91 static GHashTable *iconv_cache;
92 static GHashTable *iconv_cache_open;
93 static unsigned int iconv_cache_size = 0;
95 static GHashTable *iconv_charsets = NULL;
96 static char *locale_charset = NULL;
97 static char *locale_lang = NULL;
102 } known_iconv_charsets[] = {
104 /* charset name, iconv-friendly charset name */
105 { "iso-8859-1", "iso-8859-1" },
106 { "iso8859-1", "iso-8859-1" },
107 /* the above mostly serves as an example for iso-style charsets,
108 but we have code that will populate the iso-*'s if/when they
109 show up in e_iconv_charset_name() so I'm
110 not going to bother putting them all in here... */
111 { "windows-cp1251", "cp1251" },
112 { "windows-1251", "cp1251" },
113 { "cp1251", "cp1251" },
114 /* the above mostly serves as an example for windows-style
115 charsets, but we have code that will parse and convert them
116 to their cp#### equivalents if/when they show up in
117 e_iconv_charset_name() so I'm not going to bother
118 putting them all in here either... */
120 /* charset name (lowercase!), iconv-friendly name (sometimes case sensitive) */
121 { "utf-8", "UTF-8" },
123 /* 10646 is a special case, it's usually UCS-2 big endian */
124 /* This might need some checking but should be ok for solaris/linux */
125 { "iso-10646-1", "UCS-2BE" },
126 { "iso_10646-1", "UCS-2BE" },
127 { "iso10646-1", "UCS-2BE" },
128 { "iso-10646", "UCS-2BE" },
129 { "iso_10646", "UCS-2BE" },
130 { "iso10646", "UCS-2BE" },
132 { "ks_c_5601-1987", "EUC-KR" },
134 /* FIXME: Japanese/Korean/Chinese stuff needs checking */
135 { "euckr-0", "EUC-KR" },
136 { "5601", "EUC-KR" },
137 { "zh_TW-euc", "EUC-TW" },
138 { "zh_CN.euc", "gb2312" },
139 { "zh_TW-big5", "BIG5" },
140 { "euc-cn", "gb2312" },
141 { "big5-0", "BIG5" },
142 { "big5.eten-0", "BIG5" },
143 { "big5hkscs-0", "BIG5HKSCS" },
144 { "gb2312-0", "gb2312" },
145 { "gb2312.1980-0", "gb2312" },
146 { "gb-2312", "gb2312" },
147 { "gb18030-0", "gb18030" },
150 { "eucjp-0", "eucJP" },
151 { "ujis-0", "ujis" },
152 { "jisx0208.1983-0","SJIS" },
153 { "jisx0212.1990-0","SJIS" },
160 /* Another copy of this trivial list implementation.
161 Why? This stuff gets called a lot (potentially), should run fast,
162 and g_list's are too awkward to use for this without a hassle */
163 static void e_dlist_init(EDList *v)
165 v->head = (EDListNode *)&v->tail;
167 v->tailpred = (EDListNode *)&v->head;
170 static EDListNode *e_dlist_addhead(EDList *l, EDListNode *n)
173 n->prev = (EDListNode *)&l->head;
179 static EDListNode *e_dlist_addtail(EDList *l, EDListNode *n)
181 n->next = (EDListNode *)&l->tail;
182 n->prev = l->tailpred;
183 l->tailpred->next = n;
188 static EDListNode *e_dlist_remove(EDListNode *n)
190 n->next->prev = n->prev;
191 n->prev->next = n->next;
196 /* ASCII-only case conversion helpers, written locally rather than relying on glib... */
198 e_strdown (char *str)
200 register char *s = str;
203 if (*s >= 'A' && *s <= 'Z')
214 register char *s = str;
217 if (*s >= 'a' && *s <= 'z')
227 locale_parse_lang (const char *locale)
229 char *codeset, *lang;
231 if ((codeset = strchr (locale, '.')))
232 lang = g_strndup (locale, codeset - locale);
234 lang = g_strdup (locale);
236 /* validate the language */
237 if (strlen (lang) >= 2) {
238 if (lang[2] == '-' || lang[2] == '_') {
239 /* canonicalise the lang */
242 /* validate the country code */
243 if (strlen (lang + 3) > 2) {
244 /* invalid country code */
250 } else if (lang[2] != '\0') {
251 /* invalid language */
258 /* invalid language */
264 /* NOTE: The caller owns the lock on return if keep is TRUE! */
266 e_iconv_init(int keep)
268 char *from, *to, *locale;
273 if (iconv_charsets != NULL) {
279 iconv_charsets = g_hash_table_new(g_str_hash, g_str_equal);
281 for (i = 0; known_iconv_charsets[i].charset != NULL; i++) {
282 from = g_strdup(known_iconv_charsets[i].charset);
283 to = g_strdup(known_iconv_charsets[i].iconv_name);
285 g_hash_table_insert(iconv_charsets, from, to);
288 e_dlist_init(&iconv_cache_list);
289 iconv_cache = g_hash_table_new(g_str_hash, g_str_equal);
290 iconv_cache_open = g_hash_table_new(NULL, NULL);
293 locale = setlocale (LC_ALL, NULL);
295 locale = g_win32_getlocale ();
298 if (!locale || !strcmp (locale, "C") || !strcmp (locale, "POSIX")) {
299 /* The locale "C" or "POSIX" is a portable locale; its
300 * LC_CTYPE part corresponds to the 7-bit ASCII character
304 locale_charset = NULL;
308 g_get_charset (&locale_charset);
309 locale_charset = g_strdup (locale_charset);
310 e_strdown (locale_charset);
313 locale_charset = g_strdup (nl_langinfo (CODESET));
314 e_strdown (locale_charset);
316 /* A locale name is typically of the form language[_terri-
317 * tory][.codeset][@modifier], where language is an ISO 639
318 * language code, territory is an ISO 3166 country code, and
319 * codeset is a character set or encoding identifier like
320 * ISO-8859-1 or UTF-8.
324 codeset = strchr (locale, '.');
328 /* ; is a hack for debian systems and / is a hack for Solaris systems */
329 for (p = codeset; *p && !strchr ("@;/", *p); p++);
330 locale_charset = g_strndup (codeset, p - codeset);
331 e_strdown (locale_charset);
333 /* charset unknown */
334 locale_charset = NULL;
337 #endif /* !G_OS_WIN32 */
339 /* parse the locale lang */
340 locale_parse_lang (locale);
351 const char *e_iconv_charset_name(const char *charset)
353 char *name, *ret, *tmp;
358 name = g_alloca (strlen (charset) + 1);
359 strcpy (name, charset);
363 ret = g_hash_table_lookup(iconv_charsets, name);
369 /* Unknown, try to canonicalise some basic charset types to something that should work */
370 if (strncmp(name, "iso", 3) == 0) {
371 /* Convert iso-nnnn-n or isonnnn-n or iso_nnnn-n to iso-nnnn-n or isonnnn-n */
376 if (*tmp == '-' || *tmp == '_')
379 iso = strtoul (tmp, &p, 10);
382 /* they all become ICONV_10646 */
383 ret = g_strdup (ICONV_10646);
386 if (*tmp == '-' || *tmp == '_')
389 codepage = strtoul (tmp, &p, 10);
392 /* codepage is numeric */
395 ret = g_strdup ("IBM-921");
398 ret = g_strdup_printf (ICONV_ISO_D_FORMAT, iso, codepage);
400 /* codepage is a string - probably iso-2022-jp or something */
401 ret = g_strdup_printf (ICONV_ISO_S_FORMAT, iso, p);
404 } else if (strncmp(name, "windows-", 8) == 0) {
405 /* Convert windows-nnnnn or windows-cpnnnnn to cpnnnn */
407 if (!strncmp(tmp, "cp", 2))
409 ret = g_strdup_printf("CP%s", tmp);
410 } else if (strncmp(name, "microsoft-", 10) == 0) {
411 /* Convert microsoft-nnnnn or microsoft-cpnnnnn to cpnnnn */
413 if (!strncmp(tmp, "cp", 2))
415 ret = g_strdup_printf("CP%s", tmp);
417 /* Just assume it's OK enough as is, case and all */
418 ret = g_strdup(charset);
421 g_hash_table_insert(iconv_charsets, g_strdup(name), ret);
428 flush_entry(struct _iconv_cache *ic)
430 struct _iconv_cache_node *in, *nn;
432 in = (struct _iconv_cache_node *)ic->open.head;
435 if (in->ip != (iconv_t)-1) {
436 g_hash_table_remove(iconv_cache_open, in->ip);
447 /* This should run pretty quickly, it's called a lot */
448 iconv_t e_iconv_open(const char *oto, const char *ofrom)
450 const char *to, *from;
452 struct _iconv_cache *ic;
453 struct _iconv_cache_node *in;
457 if (oto == NULL || ofrom == NULL) {
462 to = e_iconv_charset_name (oto);
463 from = e_iconv_charset_name (ofrom);
464 tofrom = g_alloca (strlen (to) + strlen (from) + 2);
465 sprintf(tofrom, "%s%%%s", to, from);
469 ic = g_hash_table_lookup(iconv_cache, tofrom);
471 e_dlist_remove((EDListNode *)ic);
473 struct _iconv_cache *last = (struct _iconv_cache *)iconv_cache_list.tailpred;
474 struct _iconv_cache *prev;
477 while (prev && iconv_cache_size > E_ICONV_CACHE_SIZE) {
478 in = (struct _iconv_cache_node *)last->open.head;
479 if (in->next && !in->busy) {
480 cd(printf("Flushing iconv converter '%s'\n", last->conv));
481 e_dlist_remove((EDListNode *)last);
482 g_hash_table_remove(iconv_cache, last->conv);
492 ic = g_malloc(sizeof(*ic));
493 e_dlist_init(&ic->open);
494 ic->conv = g_strdup(tofrom);
495 g_hash_table_insert(iconv_cache, ic->conv, ic);
497 cd(printf("Creating iconv converter '%s'\n", ic->conv));
499 e_dlist_addhead(&iconv_cache_list, (EDListNode *)ic);
501 /* If we have a free iconv, use it */
502 in = (struct _iconv_cache_node *)ic->open.tailpred;
503 if (in->prev && !in->busy) {
504 cd(printf("using existing iconv converter '%s'\n", ic->conv));
506 if (ip != (iconv_t)-1) {
507 /* work around some broken iconv implementations
508 * that die if the length arguments are NULL
510 size_t buggy_iconv_len = 0;
511 char *buggy_iconv_buf = NULL;
513 /* resets the converter */
514 iconv(ip, &buggy_iconv_buf, &buggy_iconv_len, &buggy_iconv_buf, &buggy_iconv_len);
516 e_dlist_remove((EDListNode *)in);
517 e_dlist_addhead(&ic->open, (EDListNode *)in);
520 cd(printf("creating new iconv converter '%s'\n", ic->conv));
521 ip = iconv_open(to, from);
522 in = g_malloc(sizeof(*in));
525 e_dlist_addhead(&ic->open, (EDListNode *)in);
526 if (ip != (iconv_t)-1) {
527 g_hash_table_insert(iconv_cache_open, ip, in);
531 g_warning("Could not open converter for '%s' to '%s' charset", from, to);
542 size_t e_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char ** outbuf, size_t *outbytesleft)
544 return iconv(cd, (char **) inbuf, inbytesleft, outbuf, outbytesleft);
548 e_iconv_close(iconv_t ip)
550 struct _iconv_cache_node *in;
552 if (ip == (iconv_t)-1)
556 in = g_hash_table_lookup(iconv_cache_open, ip);
558 cd(printf("closing iconv converter '%s'\n", in->parent->conv));
559 e_dlist_remove((EDListNode *)in);
561 e_dlist_addtail(&in->parent->open, (EDListNode *)in);
563 g_warning("trying to close iconv i dont know about: %p", ip);
570 const char *e_iconv_locale_charset(void)
574 return locale_charset;
579 e_iconv_locale_language (void)
581 e_iconv_init (FALSE);
586 /* map CJKR charsets to their language code */
587 /* NOTE: only support charset names that will be returned by
588 * e_iconv_charset_name() so that we don't have to keep track of all
589 * the aliases too. */
593 } cjkr_lang_map[] = {
595 { "BIG5HKSCS", "zh" },
600 { "iso-2022-jp", "ja" },
610 #define NUM_CJKR_LANGS (sizeof (cjkr_lang_map) / sizeof (cjkr_lang_map[0]))
613 e_iconv_charset_language (const char *charset)
620 charset = e_iconv_charset_name (charset);
621 for (i = 0; i < NUM_CJKR_LANGS; i++) {
622 if (!g_ascii_strcasecmp (cjkr_lang_map[i].charset, charset))
623 return cjkr_lang_map[i].lang;