1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
3 * Copyright (C) 2000-2012 Jeffrey Stedfast
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public License
7 * as published by the Free Software Foundation; either version 2.1
8 * of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free
17 * Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
29 #include <sys/types.h>
38 #if defined (WIN32) || defined (__CYGWIN__)
39 #define WIN32_LEAN_AND_MEAN
43 #include "gmime-charset-map-private.h"
44 #include "gmime-table-private.h"
45 #include "gmime-charset.h"
46 #include "gmime-iconv.h"
48 #ifdef HAVE_ICONV_DETECT_H
49 #include "iconv-detect.h"
50 #else /* use old-style detection */
51 #if defined (__aix__) || defined (__irix__) || defined (__sun__)
52 #define ICONV_ISO_INT_FORMAT "ISO%u-%u"
53 /* this one is for charsets like ISO-2022-JP, for which at least
54 Solaris wants a - after the ISO */
55 #define ICONV_ISO_STR_FORMAT "ISO-%u-%s"
56 #elif defined (__hpux__)
57 #define ICONV_ISO_INT_FORMAT "iso%u%u"
58 #define ICONV_ISO_STR_FORMAT "iso%u%s"
60 #define ICONV_ISO_INT_FORMAT "iso-%u-%u"
61 #define ICONV_ISO_STR_FORMAT "iso-%u-%s"
62 #endif /* __aix__, __irix__, __sun__ */
63 #define ICONV_10646 "iso-10646"
64 #endif /* USE_ICONV_DETECT */
68 * SECTION: gmime-charset
69 * @title: gmime-charset
70 * @short_description: Charset helper functions
73 * Charset utility functions.
77 /* a useful website on charset aliases:
78 * http://www.li18nux.org/subgroups/sa/locnameguide/v1.1draft/CodesetAliasTable-V11.html */
82 const char *iconv_name;
83 } known_iconv_charsets[] = {
84 /* charset name, iconv-friendly name (sometimes case sensitive) */
88 /* ANSI_X3.4-1968 is used on some systems and should be
89 treated the same as US-ASCII */
90 { "ANSI_X3.4-1968", NULL },
92 /* 10646 is a special case, it's usually UCS-2 big endian */
93 /* This might need some checking but should be ok for
95 { "iso-10646-1", "UCS-2BE" },
96 { "iso_10646-1", "UCS-2BE" },
97 { "iso10646-1", "UCS-2BE" },
98 { "iso-10646", "UCS-2BE" },
99 { "iso_10646", "UCS-2BE" },
100 { "iso10646", "UCS-2BE" },
102 /* Korean charsets */
103 /* Note: according to http://www.iana.org/assignments/character-sets,
104 * ks_c_5601-1987 should really map to ISO-2022-KR, but the EUC-KR
105 * mapping was given to me via a native Korean user, so I'm not sure
106 * if I should change this... perhaps they are compatible? */
107 { "ks_c_5601-1987", "EUC-KR" },
108 { "5601", "EUC-KR" },
109 { "ksc-5601", "EUC-KR" },
110 { "ksc-5601-1987", "EUC-KR" },
111 { "ksc-5601_1987", "EUC-KR" },
112 { "ks_c_5861-1992", "EUC-KR" },
113 { "euckr-0", "EUC-KR" },
115 /* Chinese charsets */
116 { "big5-0", "BIG5" },
117 { "big5.eten-0", "BIG5" },
118 { "big5hkscs-0", "BIG5HKSCS" },
119 /* Note: GBK is a superset of gb2312 (see
120 * http://en.wikipedia.org/wiki/GBK for details), so 'upgrade'
121 * gb2312 to GBK so that we can completely convert GBK text
122 * that is incorrectly tagged as gb2312 to UTF-8. */
124 { "gb-2312", "GBK" },
125 { "gb2312-0", "GBK" },
126 { "gb2312-80", "GBK" },
127 { "gb2312.1980-0", "GBK" },
128 /* euc-cn is an alias for gb2312 */
130 { "gb18030-0", "gb18030" },
133 /* Japanese charsets */
134 { "eucjp-0", "eucJP" }, /* should this map to "EUC-JP" instead? */
135 { "ujis-0", "ujis" }, /* we might want to map this to EUC-JP */
136 { "jisx0208.1983-0", "SJIS" },
137 { "jisx0212.1990-0", "SJIS" },
142 /* map CJKR charsets to their language code */
143 /* NOTE: only support charset names that will be returned by
144 * g_mime_charset_iconv_name() so that we don't have to keep track of
145 * all the aliases too. */
149 } cjkr_lang_map[] = {
151 { "BIG5HKSCS", "zh" },
156 { "iso-2022-jp", "ja" },
157 { "Shift-JIS", "ja" },
167 static GHashTable *iconv_charsets = NULL;
168 static char **user_charsets = NULL;
169 static char *locale_charset = NULL;
170 static char *locale_lang = NULL;
172 #ifdef G_THREADS_ENABLED
173 static GStaticMutex charset_lock = G_STATIC_MUTEX_INIT;
174 #define CHARSET_LOCK() g_static_mutex_lock (&charset_lock);
175 #define CHARSET_UNLOCK() g_static_mutex_unlock (&charset_lock);
177 #define CHARSET_LOCK()
178 #define CHARSET_UNLOCK()
179 #endif /* G_THREADS_ENABLED */
183 * g_mime_charset_map_shutdown:
185 * Frees internal lookup tables created in g_mime_charset_map_init().
188 g_mime_charset_map_shutdown (void)
193 g_hash_table_destroy (iconv_charsets);
194 iconv_charsets = NULL;
196 g_free (locale_charset);
197 locale_charset = NULL;
199 g_free (locale_lang);
205 locale_parse_lang (const char *locale)
207 char *codeset, *lang;
209 if ((codeset = strchr (locale, '.')))
210 lang = g_strndup (locale, (size_t) (codeset - locale));
212 lang = g_strdup (locale);
214 /* validate the language */
215 if (strlen (lang) >= 2) {
216 if (lang[2] == '-' || lang[2] == '_') {
217 /* canonicalise the lang */
218 lang[0] = g_ascii_tolower (lang[0]);
219 lang[1] = g_ascii_tolower (lang[1]);
221 /* validate the country code */
222 if (strlen (lang + 3) > 2) {
223 /* invalid country code */
227 lang[3] = g_ascii_toupper (lang[3]);
228 lang[4] = g_ascii_toupper (lang[4]);
230 } else if (lang[2] != '\0') {
231 /* invalid language */
238 /* invalid language */
246 * g_mime_charset_map_init:
248 * Initializes character set maps.
250 * Note: g_mime_init() calls this routine for you.
253 g_mime_charset_map_init (void)
255 char *charset, *iconv_name, *locale;
261 iconv_charsets = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
263 for (i = 0; known_iconv_charsets[i].charset != NULL; i++) {
264 charset = g_ascii_strdown (known_iconv_charsets[i].charset, -1);
265 iconv_name = g_strdup (known_iconv_charsets[i].iconv_name);
266 g_hash_table_insert (iconv_charsets, charset, iconv_name);
271 if ((locale_charset = nl_langinfo (CODESET)) && locale_charset[0]) {
273 /* Apparently on some versions of Cygwin, nl_langinfo(CODESET)
274 * always reports US-ASCII no matter what. */
275 if (strcmp (locale_charset, "US-ASCII") != 0) {
276 /* Guess this version of Cygwin is fixed. */
277 locale_charset = g_ascii_strdown (locale_charset, -1);
279 /* Cannot rely on US-ASCII being accurate. */
280 locale_charset = NULL;
283 locale_charset = g_ascii_strdown (locale_charset, -1);
286 locale_charset = NULL;
289 /* Apparently setlocale() is not reliable either... use getenv() instead. */
290 /*locale = setlocale (LC_ALL, NULL);*/
292 if (!(locale = getenv ("LC_ALL")) || !locale[0])
293 if (!(locale = getenv ("LC_CTYPE")) || !locale[0])
294 locale = getenv ("LANG");
296 if (!locale || !locale[0] || !strcmp (locale, "C") || !strcmp (locale, "POSIX")) {
297 /* The locale "C" or "POSIX" is a portable locale; its
298 * LC_CTYPE part corresponds to the 7-bit ASCII character
301 locale_charset = NULL;
304 /* A locale name is typically of the form language[_terri-
305 * tory][.codeset][@modifier], where language is an ISO 639
306 * language code, territory is an ISO 3166 country code, and
307 * codeset is a character set or encoding identifier like
308 * ISO-8859-1 or UTF-8.
312 if (!locale_charset) {
313 codeset = strchr (locale, '.');
317 /* ; is a hack for debian systems and / is a hack for Solaris systems */
319 while (*p && !strchr ("@;/", *p))
322 locale_charset = g_ascii_strdown (codeset, (size_t)(p - codeset));
324 /* charset unknown */
325 locale_charset = NULL;
329 locale_parse_lang (locale);
332 locale_charset = g_strdup_printf ("cp%u", GetACP ());
338 * g_mime_locale_charset:
340 * Gets the user's locale charset (or iso-8859-1 by default).
342 * Returns: the user's locale charset (or iso-8859-1 by default).
345 g_mime_locale_charset (void)
349 g_mime_charset_map_init ();
352 return locale_charset ? locale_charset : "iso-8859-1";
357 * g_mime_locale_language:
359 * Gets the user's locale language code (or %NULL by default).
361 * Returns: the user's locale language code (or %NULL by default).
364 g_mime_locale_language (void)
368 g_mime_charset_map_init ();
376 * g_mime_charset_language:
377 * @charset: charset name
379 * Attempts to find a specific language code that is specific to
380 * @charset. Currently only handles CJK and Russian/Ukrainian
381 * charset->lang mapping. Everything else will return %NULL.
383 * Returns: a language code that is specific to @charset, or %NULL on
387 g_mime_charset_language (const char *charset)
394 for (i = 0; i < G_N_ELEMENTS (cjkr_lang_map); i++) {
395 if (!g_ascii_strcasecmp (cjkr_lang_map[i].charset, charset))
396 return cjkr_lang_map[i].lang;
406 register char *s = str;
409 if (*s >= 'A' && *s <= 'Z')
418 * g_mime_charset_iconv_name:
419 * @charset: charset name
421 * Attempts to find an iconv-friendly charset name for @charset.
423 * Returns: an iconv-friendly charset name for @charset.
426 g_mime_charset_iconv_name (const char *charset)
428 char *name, *iconv_name, *buf;
433 name = g_alloca (strlen (charset) + 1);
434 strcpy (name, charset);
439 g_mime_charset_map_init ();
441 iconv_name = g_hash_table_lookup (iconv_charsets, name);
447 if (!strncmp (name, "iso", 3)) {
452 if (*buf == '-' || *buf == '_')
455 iso = strtoul (buf, &p, 10);
458 /* they all become ICONV_10646 */
459 iconv_name = g_strdup (ICONV_10646);
460 } else if (p > buf) {
462 if (*buf == '-' || *buf == '_')
465 codepage = strtoul (buf, &p, 10);
468 /* codepage is numeric */
471 iconv_name = g_strdup ("IBM-921");
474 iconv_name = g_strdup_printf (ICONV_ISO_INT_FORMAT,
477 /* codepage is a string - probably iso-2022-jp or something */
478 iconv_name = g_strdup_printf (ICONV_ISO_STR_FORMAT,
482 /* p == buf, which probably means we've
483 encountered an invalid iso charset name */
484 iconv_name = g_strdup (name);
486 } else if (!strncmp (name, "windows-", 8)) {
488 if (!strncmp (buf, "cp", 2))
491 iconv_name = g_strdup_printf ("CP%s", buf);
492 } else if (!strncmp (name, "microsoft-", 10)) {
494 if (!strncmp (buf, "cp", 2))
497 iconv_name = g_strdup_printf ("CP%s", buf);
499 /* assume charset name is ok as is? */
500 iconv_name = g_strdup (charset);
503 g_hash_table_insert (iconv_charsets, g_strdup (name), iconv_name);
511 static const char *iso_charsets[] = {
531 static const char *windows_charsets[] = {
546 * g_mime_charset_canon_name:
547 * @charset: charset name
549 * Attempts to find a canonical charset name for @charset.
551 * Note: Will normally return the same value as
552 * g_mime_charset_iconv_name() unless the system iconv does not use
553 * the canonical ISO charset names (such as using ISO8859-1 rather
554 * than the canonical form ISO-8859-1).
556 * Returns: a canonical charset name for @charset.
559 g_mime_charset_canon_name (const char *charset)
568 charset = g_mime_charset_iconv_name (charset);
569 if (g_ascii_strncasecmp (charset, "iso", 3) == 0) {
571 if (*ptr == '-' || *ptr == '_')
574 if (strncmp (ptr, "8859", 4) != 0)
578 if (*ptr == '-' || *ptr == '_')
581 iso = strtoul (ptr, &endptr, 10);
582 if (endptr == ptr || *endptr != '\0')
585 if (iso > G_N_ELEMENTS (iso_charsets))
588 return iso_charsets[iso];
589 } else if (!strncmp (charset, "CP125", 5)) {
591 if (*ptr >= '0' && *ptr <= '9')
592 return windows_charsets[*ptr - '0'];
600 * g_mime_charset_name:
601 * @charset: charset name
603 * Attempts to find an iconv-friendly charset name for @charset.
605 * Note: This function is deprecated. Use g_mime_charset_iconv_name()
608 * Returns: an iconv-friendly charset name for @charset.
611 g_mime_charset_name (const char *charset)
613 return g_mime_charset_iconv_name (charset);
618 * g_mime_charset_locale_name:
620 * Gets the user's locale charset (or iso-8859-1 by default).
622 * Note: This function is deprecated. Use g_mime_locale_charset()
625 * Returns: the user's locale charset (or iso-8859-1 by default).
628 g_mime_charset_locale_name (void)
630 return g_mime_locale_charset ();
635 * g_mime_charset_iso_to_windows:
636 * @isocharset: ISO-8859-# charset
638 * Maps the ISO-8859-# charset to the equivalent Windows-CP125#
641 * Returns: equivalent Windows charset.
644 g_mime_charset_iso_to_windows (const char *isocharset)
646 /* According to http://czyborra.com/charsets/codepages.html,
647 * the charset mapping is as follows:
649 * us-ascii maps to windows-cp1252
650 * iso-8859-1 maps to windows-cp1252
651 * iso-8859-2 maps to windows-cp1250
652 * iso-8859-3 maps to windows-cp????
653 * iso-8859-4 maps to windows-cp????
654 * iso-8859-5 maps to windows-cp1251
655 * iso-8859-6 maps to windows-cp1256
656 * iso-8859-7 maps to windows-cp1253
657 * iso-8859-8 maps to windows-cp1255
658 * iso-8859-9 maps to windows-cp1254
659 * iso-8859-10 maps to windows-cp????
660 * iso-8859-11 maps to windows-cp????
661 * iso-8859-12 maps to windows-cp????
662 * iso-8859-13 maps to windows-cp1257
665 * - I'm going to assume that since iso-8859-4 and
666 * iso-8859-13 are Baltic that it also maps to
670 isocharset = g_mime_charset_canon_name (isocharset);
672 if (!g_ascii_strcasecmp (isocharset, "iso-8859-1") || !g_ascii_strcasecmp (isocharset, "us-ascii"))
673 return "windows-cp1252";
674 else if (!g_ascii_strcasecmp (isocharset, "iso-8859-2"))
675 return "windows-cp1250";
676 else if (!g_ascii_strcasecmp (isocharset, "iso-8859-4"))
677 return "windows-cp1257";
678 else if (!g_ascii_strcasecmp (isocharset, "iso-8859-5"))
679 return "windows-cp1251";
680 else if (!g_ascii_strcasecmp (isocharset, "iso-8859-6"))
681 return "windows-cp1256";
682 else if (!g_ascii_strcasecmp (isocharset, "iso-8859-7"))
683 return "windows-cp1253";
684 else if (!g_ascii_strcasecmp (isocharset, "iso-8859-8"))
685 return "windows-cp1255";
686 else if (!g_ascii_strcasecmp (isocharset, "iso-8859-9"))
687 return "windows-cp1254";
688 else if (!g_ascii_strcasecmp (isocharset, "iso-8859-13"))
689 return "windows-cp1257";
696 * g_mime_charset_init:
697 * @charset: charset mask
699 * Initializes a charset mask structure.
702 g_mime_charset_init (GMimeCharset *charset)
704 charset->mask = (unsigned int) ~0;
710 * g_mime_charset_step:
711 * @charset: charset structure
712 * @inbuf: input text buffer (must be in UTF-8)
713 * @inlen: input buffer length
715 * Steps through the input buffer 1 unicode character (glyph) at a
716 * time (ie, not necessarily 1 byte at a time). Bitwise 'and' our
717 * @charset->mask with the mask for each glyph. This has the effect of
718 * limiting what charsets our @charset->mask can match.
721 g_mime_charset_step (GMimeCharset *charset, const char *inbuf, size_t inlen)
723 register const char *inptr = inbuf;
724 const char *inend = inbuf + inlen;
725 register unsigned int mask;
728 mask = charset->mask;
729 level = charset->level;
731 while (inptr < inend) {
732 const char *newinptr;
735 newinptr = g_utf8_next_char (inptr);
736 c = g_utf8_get_char (inptr);
737 if (newinptr == NULL || !g_unichar_validate (c)) {
744 mask &= charset_mask (c);
746 if (c >= 128 && c < 256)
747 level = MAX (level, 1);
756 charset->mask = mask;
757 charset->level = level;
761 charset_best_mask (unsigned int mask)
766 for (i = 0; i < G_N_ELEMENTS (charinfo); i++) {
767 if (charinfo[i].bit & mask) {
768 lang = g_mime_charset_language (charinfo[i].name);
770 if (!lang || (locale_lang && !strncmp (locale_lang, lang, 2)))
771 return charinfo[i].name;
780 * g_mime_charset_best_name:
781 * @charset: charset mask
783 * Gets the best charset name based on the charset mask @charset.
785 * Returns: a pointer to a string containing the best charset name that
786 * can represent the charset mask @charset.
789 g_mime_charset_best_name (GMimeCharset *charset)
791 if (charset->level == 1)
793 else if (charset->level == 2)
794 return charset_best_mask (charset->mask);
801 * g_mime_charset_best:
802 * @inbuf: a UTF-8 text buffer
803 * @inlen: input buffer length
805 * Computes the best charset to use to encode this text buffer.
807 * Returns: the charset name best suited for the input text or %NULL if
808 * it is US-ASCII safe.
811 g_mime_charset_best (const char *inbuf, size_t inlen)
813 GMimeCharset charset;
815 g_mime_charset_init (&charset);
816 g_mime_charset_step (&charset, inbuf, inlen);
818 return g_mime_charset_best_name (&charset);
823 * g_mime_charset_can_encode:
824 * @mask: a #GMimeCharset mask
825 * @charset: a charset
826 * @text: utf-8 text to check
827 * @len: length of @text
829 * Check to see if the UTF-8 @text will fit safely within @charset.
831 * Returns: %TRUE if it is safe to encode @text into @charset or %FALSE
835 g_mime_charset_can_encode (GMimeCharset *mask, const char *charset, const char *text, size_t len)
837 const unsigned char *inptr = (const unsigned char *) text;
838 const unsigned char *inend = inptr + len;
839 size_t inleft, outleft, rc;
840 const char *inbuf = text;
841 char out[256], *outbuf;
842 const char *iconv_name;
849 if (mask->level == 0 && (!charset || !g_ascii_strcasecmp (charset, "us-ascii"))) {
850 /* simple US-ASCII case - is this scan even necessary? */
851 while (inptr < inend && is_ascii (*inptr))
860 if (!g_ascii_strcasecmp (charset, "utf-8")) {
861 /* we can encode anything in utf-8 */
865 charset = g_mime_charset_iconv_name (charset);
867 if (mask->level == 1)
868 return !g_ascii_strcasecmp (charset, "iso-8859-1");
870 /* check if this is a charset that we have precalculated masking for */
871 for (i = 0; i < G_N_ELEMENTS (charinfo); i++) {
872 iconv_name = g_mime_charset_iconv_name (charinfo[i].name);
873 if (charset == iconv_name)
877 if (i < G_N_ELEMENTS (charinfo)) {
878 /* indeed we do... */
879 return (charinfo[i].bit & mask->mask);
882 /* down to the nitty gritty slow and painful way... */
883 if ((cd = g_mime_iconv_open (charset, "UTF-8")) == (iconv_t) -1)
889 outleft = sizeof (out);
893 rc = iconv (cd, (char **) &inbuf, &inleft, &outbuf, &outleft);
894 if (rc == (size_t) -1 && errno != E2BIG)
896 } while (inleft > 0);
899 outleft = sizeof (out);
903 rc = iconv (cd, NULL, NULL, &outbuf, &outleft);
906 g_mime_iconv_close (cd);
908 return rc != (size_t) -1;
913 * g_mime_set_user_charsets:
914 * @charsets: an array of user-preferred charsets
916 * Set a list of charsets for GMime to use as a hint for encoding and
917 * decoding headers. The charset list should be in order of preference
918 * (e.g. most preferred first, least preferred last).
921 g_mime_set_user_charsets (const char **charsets)
924 g_strfreev (user_charsets);
926 if (charsets == NULL || charsets[0] == NULL) {
927 user_charsets = NULL;
931 user_charsets = g_strdupv ((char **) charsets);
936 * g_mime_user_charsets:
938 * Get the list of user-preferred charsets set with
939 * g_mime_set_user_charsets().
941 * Returns: an array of user-set charsets or %NULL if none set.
944 g_mime_user_charsets (void)
946 return (const char **) user_charsets;