gmime/gmime-charset.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
   2 /*  GMime
   3  *  Copyright (C) 2000-2012 Jeffrey Stedfast
   4  *
   5  *  This library is free software; you can redistribute it and/or
   6  *  modify it under the terms of the GNU Lesser General Public License
   7  *  as published by the Free Software Foundation; either version 2.1
   8  *  of the License, or (at your option) any later version.
   9  *
  10  *  This library is distributed in the hope that it will be useful,
  11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  *  Lesser General Public License for more details.
  14  *
  15  *  You should have received a copy of the GNU Lesser General Public
  16  *  License along with this library; if not, write to the Free
  17  *  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
  18  *  02110-1301, USA.
  19  */
  20
  21
  22 #ifdef HAVE_CONFIG_H
  23 #include <config.h>
  24 #endif
  25
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29 #include <sys/types.h>
  30 #include <sys/stat.h>
  31 #include <locale.h>
  32 #include <errno.h>
  33
  34 #ifdef HAVE_CODESET
  35 #include <langinfo.h>
  36 #endif
  37
  38 #if defined (WIN32) || defined (__CYGWIN__)
  39 #define WIN32_LEAN_AND_MEAN
  40 #include <windows.h>
  41 #endif
  42
  43 #include "gmime-charset-map-private.h"
  44 #include "gmime-table-private.h"
  45 #include "gmime-charset.h"
  46 #include "gmime-iconv.h"
  47
  48 #ifdef HAVE_ICONV_DETECT_H
  49 #include "iconv-detect.h"
  50 #else /* use old-style detection */
  51 #if defined (__aix__) || defined (__irix__) || defined (__sun__)
  52 #define ICONV_ISO_INT_FORMAT "ISO%u-%u"
  53 /* this one is for charsets like ISO-2022-JP, for which at least
  54    Solaris wants a - after the ISO */
  55 #define ICONV_ISO_STR_FORMAT "ISO-%u-%s"
  56 #elif defined (__hpux__)
  57 #define ICONV_ISO_INT_FORMAT "iso%u%u"
  58 #define ICONV_ISO_STR_FORMAT "iso%u%s"
  59 #else
  60 #define ICONV_ISO_INT_FORMAT "iso-%u-%u"
  61 #define ICONV_ISO_STR_FORMAT "iso-%u-%s"
  62 #endif /* __aix__, __irix__, __sun__ */
  63 #define ICONV_10646 "iso-10646"
  64 #endif /* USE_ICONV_DETECT */
  65
  66
  67 /**
  68  * SECTION: gmime-charset
  69  * @title: gmime-charset
  70  * @short_description: Charset helper functions
  71  * @see_also:
  72  *
  73  * Charset utility functions.
  74  **/
  75
  76
/* a useful website on charset aliases:
 * http://www.li18nux.org/subgroups/sa/locnameguide/v1.1draft/CodesetAliasTable-V11.html */
  79
  80 static struct {
  81         const char *charset;
  82         const char *iconv_name;
  83 } known_iconv_charsets[] = {
  84         /* charset name, iconv-friendly name (sometimes case sensitive) */
  85         { "utf-8",           "UTF-8"      },
  86         { "utf8",            "UTF-8"      },
  87
  88         /* ANSI_X3.4-1968 is used on some systems and should be
  89            treated the same as US-ASCII */
  90         { "ANSI_X3.4-1968",  NULL         },
  91
  92         /* 10646 is a special case, its usually UCS-2 big endian */
  93         /* This might need some checking but should be ok for
  94            solaris/linux */
  95         { "iso-10646-1",     "UCS-2BE"    },
  96         { "iso_10646-1",     "UCS-2BE"    },
  97         { "iso10646-1",      "UCS-2BE"    },
  98         { "iso-10646",       "UCS-2BE"    },
  99         { "iso_10646",       "UCS-2BE"    },
 100         { "iso10646",        "UCS-2BE"    },
 101
 102         /* Korean charsets */
 103         /* Note: according to http://www.iana.org/assignments/character-sets,
 104          * ks_c_5601-1987 should really map to ISO-2022-KR, but the EUC-KR
 105          * mapping was given to me via a native Korean user, so I'm not sure
 106          * if I should change this... perhaps they are compatable? */
 107         { "ks_c_5601-1987",  "EUC-KR"     },
 108         { "5601",            "EUC-KR"     },
 109         { "ksc-5601",        "EUC-KR"     },
 110         { "ksc-5601-1987",   "EUC-KR"     },
 111         { "ksc-5601_1987",   "EUC-KR"     },
 112         { "ks_c_5861-1992",  "EUC-KR"     },
 113         { "euckr-0",         "EUC-KR"     },
 114
 115         /* Chinese charsets */
 116         { "big5-0",          "BIG5"       },
 117         { "big5.eten-0",     "BIG5"       },
 118         { "big5hkscs-0",     "BIG5HKSCS"  },
 119         /* Note: GBK is a superset of gb2312 (see
 120          * http://en.wikipedia.org/wiki/GBK for details), so 'upgrade'
 121          * gb2312 to GBK so that we can completely convert GBK text
 122          * that is incorrectly tagged as gb2312 to UTF-8. */
 123         { "gb2312",          "GBK"        },
 124         { "gb-2312",         "GBK"        },
 125         { "gb2312-0",        "GBK"        },
 126         { "gb2312-80",       "GBK"        },
 127         { "gb2312.1980-0",   "GBK"        },
 128         /* euc-cn is an alias for gb2312 */
 129         { "euc-cn",          "GBK"        },
 130         { "gb18030-0",       "gb18030"    },
 131         { "gbk-0",           "GBK"        },
 132
 133         /* Japanese charsets */
 134         { "eucjp-0",         "eucJP"      },  /* should this map to "EUC-JP" instead? */
 135         { "ujis-0",          "ujis"       },  /* we might want to map this to EUC-JP */
 136         { "jisx0208.1983-0", "SJIS"       },
 137         { "jisx0212.1990-0", "SJIS"       },
 138         { "pck",             "SJIS"       },
 139         { NULL,              NULL         }
 140 };
 141
 142 /* map CJKR charsets to their language code */
 143 /* NOTE: only support charset names that will be returned by
 144  * g_mime_charset_iconv_name() so that we don't have to keep track of
 145  * all the aliases too. */
 146 static struct {
 147         const char *charset;
 148         const char *lang;
 149 } cjkr_lang_map[] = {
 150         { "Big5",        "zh" },
 151         { "BIG5HKSCS",   "zh" },
 152         { "gb2312",      "zh" },
 153         { "gb18030",     "zh" },
 154         { "gbk",         "zh" },
 155         { "euc-tw",      "zh" },
 156         { "iso-2022-jp", "ja" },
 157         { "Shift-JIS",   "ja" },
 158         { "sjis",        "ja" },
 159         { "ujis",        "ja" },
 160         { "eucJP",       "ja" },
 161         { "euc-jp",      "ja" },
 162         { "euc-kr",      "ko" },
 163         { "koi8-r",      "ru" },
 164         { "koi8-u",      "uk" }
 165 };
 166
 167 static GHashTable *iconv_charsets = NULL;
 168 static char **user_charsets = NULL;
 169 static char *locale_charset = NULL;
 170 static char *locale_lang = NULL;
 171
 172 #ifdef G_THREADS_ENABLED
 173 static GStaticMutex charset_lock = G_STATIC_MUTEX_INIT;
 174 #define CHARSET_LOCK()   g_static_mutex_lock (&charset_lock);
 175 #define CHARSET_UNLOCK() g_static_mutex_unlock (&charset_lock);
 176 #else
 177 #define CHARSET_LOCK()
 178 #define CHARSET_UNLOCK()
 179 #endif /* G_THREADS_ENABLED */
 180
 181
 182 /**
 183  * g_mime_charset_map_shutdown:
 184  *
 185  * Frees internal lookup tables created in g_mime_charset_map_init().
 186  **/
 187 void
 188 g_mime_charset_map_shutdown (void)
 189 {
 190         if (!iconv_charsets)
 191                 return;
 192
 193         g_hash_table_destroy (iconv_charsets);
 194         iconv_charsets = NULL;
 195
 196         g_free (locale_charset);
 197         locale_charset = NULL;
 198
 199         g_free (locale_lang);
 200         locale_lang = NULL;
 201 }
 202
 203
 204 static void
 205 locale_parse_lang (const char *locale)
 206 {
 207         char *codeset, *lang;
 208
 209         if ((codeset = strchr (locale, '.')))
 210                 lang = g_strndup (locale, (size_t) (codeset - locale));
 211         else
 212                 lang = g_strdup (locale);
 213
 214         /* validate the language */
 215         if (strlen (lang) >= 2) {
 216                 if (lang[2] == '-' || lang[2] == '_') {
 217                         /* canonicalise the lang */
 218                         lang[0] = g_ascii_tolower (lang[0]);
 219                         lang[1] = g_ascii_tolower (lang[1]);
 220
 221                         /* validate the country code */
 222                         if (strlen (lang + 3) > 2) {
 223                                 /* invalid country code */
 224                                 lang[2] = '\0';
 225                         } else {
 226                                 lang[2] = '-';
 227                                 lang[3] = g_ascii_toupper (lang[3]);
 228                                 lang[4] = g_ascii_toupper (lang[4]);
 229                         }
 230                 } else if (lang[2] != '\0') {
 231                         /* invalid language */
 232                         g_free (lang);
 233                         lang = NULL;
 234                 }
 235
 236                 locale_lang = lang;
 237         } else {
 238                 /* invalid language */
 239                 locale_lang = NULL;
 240                 g_free (lang);
 241         }
 242 }
 243
 244
 245 /**
 246  * g_mime_charset_map_init:
 247  *
 248  * Initializes character set maps.
 249  *
 250  * Note: g_mime_init() calls this routine for you.
 251  **/
 252 void
 253 g_mime_charset_map_init (void)
 254 {
 255         char *charset, *iconv_name, *locale;
 256         int i;
 257
 258         if (iconv_charsets)
 259                 return;
 260
 261         iconv_charsets = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
 262
 263         for (i = 0; known_iconv_charsets[i].charset != NULL; i++) {
 264                 charset = g_ascii_strdown (known_iconv_charsets[i].charset, -1);
 265                 iconv_name = g_strdup (known_iconv_charsets[i].iconv_name);
 266                 g_hash_table_insert (iconv_charsets, charset, iconv_name);
 267         }
 268
 269 #ifndef WIN32
 270 #ifdef HAVE_CODESET
 271         if ((locale_charset = nl_langinfo (CODESET)) && locale_charset[0]) {
 272 #ifdef __CYGWIN__
 273                 /* Apparently some versions of Cygwin, nl_langinfo(CODESET)
 274                  * always reports US-ASCII no matter what. */
 275                 if (strcmp (locale_charset, "US-ASCII") != 0) {
 276                         /* Guess this version of Cygwin is fixed. */
 277                         locale_charset = g_ascii_strdown (locale_charset, -1);
 278                 } else {
 279                         /* Cannot rely on US-ASCII being accurate. */
 280                         locale_charset = NULL;
 281                 }
 282 #else
 283                 locale_charset = g_ascii_strdown (locale_charset, -1);
 284 #endif
 285         } else
 286                 locale_charset = NULL;
 287 #endif
 288
 289         /* Apparently setlocale() is not reliable either... use getenv() instead. */
 290         /*locale = setlocale (LC_ALL, NULL);*/
 291
 292         if (!(locale = getenv ("LC_ALL")) || !locale[0])
 293                 if (!(locale = getenv ("LC_CTYPE")) || !locale[0])
 294                         locale = getenv ("LANG");
 295
 296         if (!locale || !locale[0] || !strcmp (locale, "C") || !strcmp (locale, "POSIX")) {
 297                 /* The locale "C"  or  "POSIX"  is  a  portable  locale;  its
 298                  * LC_CTYPE  part  corresponds  to  the 7-bit ASCII character
 299                  * set.  */
 300
 301                 locale_charset = NULL;
 302                 locale_lang = NULL;
 303         } else {
 304                 /* A locale name is typically of  the  form  language[_terri-
 305                  * tory][.codeset][@modifier],  where  language is an ISO 639
 306                  * language code, territory is an ISO 3166 country code,  and
 307                  * codeset  is  a  character  set or encoding identifier like
 308                  * ISO-8859-1 or UTF-8.
 309                  */
 310                 char *codeset, *p;
 311
 312                 if (!locale_charset) {
 313                         codeset = strchr (locale, '.');
 314                         if (codeset) {
 315                                 codeset++;
 316
 317                                 /* ; is a hack for debian systems and / is a hack for Solaris systems */
 318                                 p = codeset;
 319                                 while (*p && !strchr ("@;/", *p))
 320                                         p++;
 321
 322                                 locale_charset = g_ascii_strdown (codeset, (size_t)(p - codeset));
 323                         } else {
 324                                 /* charset unknown */
 325                                 locale_charset = NULL;
 326                         }
 327                 }
 328
 329                 locale_parse_lang (locale);
 330         }
 331 #else /* WIN32 */
 332         locale_charset = g_strdup_printf ("cp%u", GetACP ());
 333 #endif
 334 }
 335
 336
 337 /**
 338  * g_mime_locale_charset:
 339  *
 340  * Gets the user's locale charset (or iso-8859-1 by default).
 341  *
 342  * Returns: the user's locale charset (or iso-8859-1 by default).
 343  **/
 344 const char *
 345 g_mime_locale_charset (void)
 346 {
 347         CHARSET_LOCK ();
 348         if (!iconv_charsets)
 349                 g_mime_charset_map_init ();
 350         CHARSET_UNLOCK ();
 351
 352         return locale_charset ? locale_charset : "iso-8859-1";
 353 }
 354
 355
 356 /**
 357  * g_mime_locale_language:
 358  *
 359  * Gets the user's locale language code (or %NULL by default).
 360  *
 361  * Returns: the user's locale language code (or %NULL by default).
 362  **/
 363 const char *
 364 g_mime_locale_language (void)
 365 {
 366         CHARSET_LOCK ();
 367         if (!iconv_charsets)
 368                 g_mime_charset_map_init ();
 369         CHARSET_UNLOCK ();
 370
 371         return locale_lang;
 372 }
 373
 374
 375 /**
 376  * g_mime_charset_language:
 377  * @charset: charset name
 378  *
 379  * Attempts to find a specific language code that is specific to
 380  * @charset. Currently only handles CJK and Russian/Ukranian
 381  * charset->lang mapping. Everything else will return %NULL.
 382  *
 383  * Returns: a language code that is specific to @charset, or %NULL on
 384  * fail.
 385  **/
 386 const char *
 387 g_mime_charset_language (const char *charset)
 388 {
 389         guint i;
 390
 391         if (!charset)
 392                 return NULL;
 393
 394         for (i = 0; i < G_N_ELEMENTS (cjkr_lang_map); i++) {
 395                 if (!g_ascii_strcasecmp (cjkr_lang_map[i].charset, charset))
 396                         return cjkr_lang_map[i].lang;
 397         }
 398
 399         return NULL;
 400 }
 401
 402
 403 static const char *
 404 strdown (char *str)
 405 {
 406         register char *s = str;
 407
 408         while (*s) {
 409                 if (*s >= 'A' && *s <= 'Z')
 410                         *s += 0x20;
 411                 s++;
 412         }
 413
 414         return str;
 415 }
 416
 417 /**
 418  * g_mime_charset_iconv_name:
 419  * @charset: charset name
 420  *
 421  * Attempts to find an iconv-friendly charset name for @charset.
 422  *
 423  * Returns: an iconv-friendly charset name for @charset.
 424  **/
 425 const char *
 426 g_mime_charset_iconv_name (const char *charset)
 427 {
 428         char *name, *iconv_name, *buf;
 429
 430         if (charset == NULL)
 431                 return NULL;
 432
 433         name = g_alloca (strlen (charset) + 1);
 434         strcpy (name, charset);
 435         strdown (name);
 436
 437         CHARSET_LOCK ();
 438         if (!iconv_charsets)
 439                 g_mime_charset_map_init ();
 440
 441         iconv_name = g_hash_table_lookup (iconv_charsets, name);
 442         if (iconv_name) {
 443                 CHARSET_UNLOCK ();
 444                 return iconv_name;
 445         }
 446
 447         if (!strncmp (name, "iso", 3)) {
 448                 int iso, codepage;
 449                 char *p;
 450
 451                 buf = name + 3;
 452                 if (*buf == '-' || *buf == '_')
 453                         buf++;
 454
 455                 iso = strtoul (buf, &p, 10);
 456
 457                 if (iso == 10646) {
 458                         /* they all become ICONV_10646 */
 459                         iconv_name = g_strdup (ICONV_10646);
 460                 } else if (p > buf) {
 461                         buf = p;
 462                         if (*buf == '-' || *buf == '_')
 463                                 buf++;
 464
 465                         codepage = strtoul (buf, &p, 10);
 466
 467                         if (p > buf) {
 468                                 /* codepage is numeric */
 469 #ifdef __aix__
 470                                 if (codepage == 13)
 471                                         iconv_name = g_strdup ("IBM-921");
 472                                 else
 473 #endif /* __aix__ */
 474                                         iconv_name = g_strdup_printf (ICONV_ISO_INT_FORMAT,
 475                                                                       iso, codepage);
 476                         } else {
 477                                 /* codepage is a string - probably iso-2022-jp or something */
 478                                 iconv_name = g_strdup_printf (ICONV_ISO_STR_FORMAT,
 479                                                               iso, p);
 480                         }
 481                 } else {
 482                         /* p == buf, which probably means we've
 483                            encountered an invalid iso charset name */
 484                         iconv_name = g_strdup (name);
 485                 }
 486         } else if (!strncmp (name, "windows-", 8)) {
 487                 buf = name + 8;
 488                 if (!strncmp (buf, "cp", 2))
 489                         buf += 2;
 490
 491                 iconv_name = g_strdup_printf ("CP%s", buf);
 492         } else if (!strncmp (name, "microsoft-", 10)) {
 493                 buf = name + 10;
 494                 if (!strncmp (buf, "cp", 2))
 495                         buf += 2;
 496
 497                 iconv_name = g_strdup_printf ("CP%s", buf);
 498         } else {
 499                 /* assume charset name is ok as is? */
 500                 iconv_name = g_strdup (charset);
 501         }
 502
 503         g_hash_table_insert (iconv_charsets, g_strdup (name), iconv_name);
 504
 505         CHARSET_UNLOCK ();
 506
 507         return iconv_name;
 508 }
 509
 510
 511 static const char *iso_charsets[] = {
 512         "us-ascii",
 513         "iso-8859-1",
 514         "iso-8859-2",
 515         "iso-8859-3",
 516         "iso-8859-4",
 517         "iso-8859-5",
 518         "iso-8859-6",
 519         "iso-8859-7",
 520         "iso-8859-8",
 521         "iso-8859-9",
 522         "iso-8859-10",
 523         "iso-8859-11",
 524         "iso-8859-12",
 525         "iso-8859-13",
 526         "iso-8859-14",
 527         "iso-8859-15",
 528         "iso-8859-16"
 529 };
 530
 531 static const char *windows_charsets[] = {
 532         "windows-cp1250",
 533         "windows-cp1251",
 534         "windows-cp1252",
 535         "windows-cp1253",
 536         "windows-cp1254",
 537         "windows-cp1255",
 538         "windows-cp1256",
 539         "windows-cp1257",
 540         "windows-cp1258",
 541         "windows-cp1259"
 542 };
 543
 544
 545 /**
 546  * g_mime_charset_canon_name:
 547  * @charset: charset name
 548  *
 549  * Attempts to find a canonical charset name for @charset.
 550  *
 551  * Note: Will normally return the same value as
 552  * g_mime_charset_iconv_name() unless the system iconv does not use
 553  * the canonical ISO charset names (such as using ISO8859-1 rather
 554  * than the canonical form ISO-8859-1).
 555  *
 556  * Returns: a canonical charset name for @charset.
 557  **/
 558 const char *
 559 g_mime_charset_canon_name (const char *charset)
 560 {
 561         const char *ptr;
 562         char *endptr;
 563         guint iso;
 564
 565         if (!charset)
 566                 return NULL;
 567
 568         charset = g_mime_charset_iconv_name (charset);
 569         if (g_ascii_strncasecmp (charset, "iso", 3) == 0) {
 570                 ptr = charset + 3;
 571                 if (*ptr == '-' || *ptr == '_')
 572                         ptr++;
 573
 574                 if (strncmp (ptr, "8859", 4) != 0)
 575                         return charset;
 576
 577                 ptr += 4;
 578                 if (*ptr == '-' || *ptr == '_')
 579                         ptr++;
 580
 581                 iso = strtoul (ptr, &endptr, 10);
 582                 if (endptr == ptr || *endptr != '\0')
 583                         return charset;
 584
 585                 if (iso > G_N_ELEMENTS (iso_charsets))
 586                         return charset;
 587
 588                 return iso_charsets[iso];
 589         } else if (!strncmp (charset, "CP125", 5)) {
 590                 ptr = charset + 5;
 591                 if (*ptr >= '0' && *ptr <= '9')
 592                         return windows_charsets[*ptr - '0'];
 593         }
 594
 595         return charset;
 596 }
 597
 598
 599 /**
 600  * g_mime_charset_name:
 601  * @charset: charset name
 602  *
 603  * Attempts to find an iconv-friendly charset name for @charset.
 604  *
 605  * Note: This function is deprecated. Use g_mime_charset_iconv_name()
 606  * instead.
 607  *
 608  * Returns: an iconv-friendly charset name for @charset.
 609  **/
 610 const char *
 611 g_mime_charset_name (const char *charset)
 612 {
 613         return g_mime_charset_iconv_name (charset);
 614 }
 615
 616
 617 /**
 618  * g_mime_charset_locale_name:
 619  *
 620  * Gets the user's locale charset (or iso-8859-1 by default).
 621  *
 622  * Note: This function is deprecated. Use g_mime_locale_charset()
 623  * instead.
 624  *
 625  * Returns: the user's locale charset (or iso-8859-1 by default).
 626  **/
 627 const char *
 628 g_mime_charset_locale_name (void)
 629 {
 630         return g_mime_locale_charset ();
 631 }
 632
 633
 634 /**
 635  * g_mime_charset_iso_to_windows:
 636  * @isocharset: ISO-8859-# charset
 637  *
 638  * Maps the ISO-8859-# charset to the equivalent Windows-CP125#
 639  * charset.
 640  *
 641  * Returns: equivalent Windows charset.
 642  **/
 643 const char *
 644 g_mime_charset_iso_to_windows (const char *isocharset)
 645 {
 646         /* According to http://czyborra.com/charsets/codepages.html,
 647          * the charset mapping is as follows:
 648          *
 649          * us-ascii    maps to windows-cp1252
 650          * iso-8859-1  maps to windows-cp1252
 651          * iso-8859-2  maps to windows-cp1250
 652          * iso-8859-3  maps to windows-cp????
 653          * iso-8859-4  maps to windows-cp????
 654          * iso-8859-5  maps to windows-cp1251
 655          * iso-8859-6  maps to windows-cp1256
 656          * iso-8859-7  maps to windows-cp1253
 657          * iso-8859-8  maps to windows-cp1255
 658          * iso-8859-9  maps to windows-cp1254
 659          * iso-8859-10 maps to windows-cp????
 660          * iso-8859-11 maps to windows-cp????
 661          * iso-8859-12 maps to windows-cp????
 662          * iso-8859-13 maps to windows-cp1257
 663          *
 664          * Assumptions:
 665          *  - I'm going to assume that since iso-8859-4 and
 666          *    iso-8859-13 are Baltic that it also maps to
 667          *    windows-cp1257.
 668          */
 669
 670         isocharset = g_mime_charset_canon_name (isocharset);
 671
 672         if (!g_ascii_strcasecmp (isocharset, "iso-8859-1") || !g_ascii_strcasecmp (isocharset, "us-ascii"))
 673                 return "windows-cp1252";
 674         else if (!g_ascii_strcasecmp (isocharset, "iso-8859-2"))
 675                 return "windows-cp1250";
 676         else if (!g_ascii_strcasecmp (isocharset, "iso-8859-4"))
 677                 return "windows-cp1257";
 678         else if (!g_ascii_strcasecmp (isocharset, "iso-8859-5"))
 679                 return "windows-cp1251";
 680         else if (!g_ascii_strcasecmp (isocharset, "iso-8859-6"))
 681                 return "windows-cp1256";
 682         else if (!g_ascii_strcasecmp (isocharset, "iso-8859-7"))
 683                 return "windows-cp1253";
 684         else if (!g_ascii_strcasecmp (isocharset, "iso-8859-8"))
 685                 return "windows-cp1255";
 686         else if (!g_ascii_strcasecmp (isocharset, "iso-8859-9"))
 687                 return "windows-cp1254";
 688         else if (!g_ascii_strcasecmp (isocharset, "iso-8859-13"))
 689                 return "windows-cp1257";
 690
 691         return isocharset;
 692 }
 693
 694
 695 /**
 696  * g_mime_charset_init:
 697  * @charset: charset mask
 698  *
 699  * Initializes a charset mask structure.
 700  **/
 701 void
 702 g_mime_charset_init (GMimeCharset *charset)
 703 {
 704         charset->mask = (unsigned int) ~0;
 705         charset->level = 0;
 706 }
 707
 708
 709 /**
 710  * g_mime_charset_step:
 711  * @charset: charset structure
 712  * @inbuf: input text buffer (must be in UTF-8)
 713  * @inlen: input buffer length
 714  *
 715  * Steps through the input buffer 1 unicode character (glyph) at a
 716  * time (ie, not necessarily 1 byte at a time). Bitwise 'and' our
 717  * @charset->mask with the mask for each glyph. This has the effect of
 718  * limiting what charsets our @charset->mask can match.
 719  **/
 720 void
 721 g_mime_charset_step (GMimeCharset *charset, const char *inbuf, size_t inlen)
 722 {
 723         register const char *inptr = inbuf;
 724         const char *inend = inbuf + inlen;
 725         register unsigned int mask;
 726         register int level;
 727
 728         mask = charset->mask;
 729         level = charset->level;
 730
 731         while (inptr < inend) {
 732                 const char *newinptr;
 733                 gunichar c;
 734
 735                 newinptr = g_utf8_next_char (inptr);
 736                 c = g_utf8_get_char (inptr);
 737                 if (newinptr == NULL || !g_unichar_validate (c)) {
 738                         inptr++;
 739                         continue;
 740                 }
 741
 742                 inptr = newinptr;
 743                 if (c <= 0xffff) {
 744                         mask &= charset_mask (c);
 745
 746                         if (c >= 128 && c < 256)
 747                                 level = MAX (level, 1);
 748                         else if (c >= 256)
 749                                 level = 2;
 750                 } else {
 751                         mask = 0;
 752                         level = 2;
 753                 }
 754         }
 755
 756         charset->mask = mask;
 757         charset->level = level;
 758 }
 759
 760 static const char *
 761 charset_best_mask (unsigned int mask)
 762 {
 763         const char *lang;
 764         guint i;
 765
 766         for (i = 0; i < G_N_ELEMENTS (charinfo); i++) {
 767                 if (charinfo[i].bit & mask) {
 768                         lang = g_mime_charset_language (charinfo[i].name);
 769
 770                         if (!lang || (locale_lang && !strncmp (locale_lang, lang, 2)))
 771                                 return charinfo[i].name;
 772                 }
 773         }
 774
 775         return "UTF-8";
 776 }
 777
 778
 779 /**
 780  * g_mime_charset_best_name:
 781  * @charset: charset mask
 782  *
 783  * Gets the best charset name based on the charset mask @charset.
 784  *
 785  * Returns: a pointer to a string containing the best charset name that
 786  * can represent the charset mask @charset.
 787  **/
 788 const char *
 789 g_mime_charset_best_name (GMimeCharset *charset)
 790 {
 791         if (charset->level == 1)
 792                 return "iso-8859-1";
 793         else if (charset->level == 2)
 794                 return charset_best_mask (charset->mask);
 795         else
 796                 return NULL;
 797 }
 798
 799
 800 /**
 801  * g_mime_charset_best:
 802  * @inbuf: a UTF-8 text buffer
 803  * @inlen: input buffer length
 804  *
 805  * Computes the best charset to use to encode this text buffer.
 806  *
 807  * Returns: the charset name best suited for the input text or %NULL if
 808  * it is US-ASCII safe.
 809  **/
 810 const char *
 811 g_mime_charset_best (const char *inbuf, size_t inlen)
 812 {
 813         GMimeCharset charset;
 814
 815         g_mime_charset_init (&charset);
 816         g_mime_charset_step (&charset, inbuf, inlen);
 817
 818         return g_mime_charset_best_name (&charset);
 819 }
 820
 821
 822 /**
 823  * g_mime_charset_can_encode:
 824  * @mask: a #GMimeCharset mask
 825  * @charset: a charset
 826  * @text: utf-8 text to check
 827  * @len: length of @text
 828  *
 829  * Check to see if the UTF-8 @text will fit safely within @charset.
 830  *
 831  * Returns: %TRUE if it is safe to encode @text into @charset or %FALSE
 832  * otherwise.
 833  **/
 834 gboolean
 835 g_mime_charset_can_encode (GMimeCharset *mask, const char *charset, const char *text, size_t len)
 836 {
 837         const unsigned char *inptr = (const unsigned char *) text;
 838         const unsigned char *inend = inptr + len;
 839         size_t inleft, outleft, rc;
 840         const char *inbuf = text;
 841         char out[256], *outbuf;
 842         const char *iconv_name;
 843         iconv_t cd;
 844         guint i;
 845
 846         if (len == 0)
 847                 return TRUE;
 848
 849         if (mask->level == 0 && (!charset || !g_ascii_strcasecmp (charset, "us-ascii"))) {
 850                 /* simple US-ASCII case - is this scan even necessary? */
 851                 while (inptr < inend && is_ascii (*inptr))
 852                         inptr++;
 853
 854                 if (inptr == inend)
 855                         return TRUE;
 856
 857                 return FALSE;
 858         }
 859
 860         if (!g_ascii_strcasecmp (charset, "utf-8")) {
 861                 /* we can encode anything in utf-8 */
 862                 return TRUE;
 863         }
 864
 865         charset = g_mime_charset_iconv_name (charset);
 866
 867         if (mask->level == 1)
 868                 return !g_ascii_strcasecmp (charset, "iso-8859-1");
 869
 870         /* check if this is a charset that we have precalculated masking for */
 871         for (i = 0; i < G_N_ELEMENTS (charinfo); i++) {
 872                 iconv_name = g_mime_charset_iconv_name (charinfo[i].name);
 873                 if (charset == iconv_name)
 874                         break;
 875         }
 876
 877         if (i < G_N_ELEMENTS (charinfo)) {
 878                 /* indeed we do... */
 879                 return (charinfo[i].bit & mask->mask);
 880         }
 881
 882         /* down to the nitty gritty slow and painful way... */
 883         if ((cd = g_mime_iconv_open (charset, "UTF-8")) == (iconv_t) -1)
 884                 return FALSE;
 885
 886         inleft = len;
 887
 888         do {
 889                 outleft = sizeof (out);
 890                 outbuf = out;
 891                 errno = 0;
 892
 893                 rc = iconv (cd, (char **) &inbuf, &inleft, &outbuf, &outleft);
 894                 if (rc == (size_t) -1 && errno != E2BIG)
 895                         break;
 896         } while (inleft > 0);
 897
 898         if (inleft == 0) {
 899                 outleft = sizeof (out);
 900                 outbuf = out;
 901                 errno = 0;
 902
 903                 rc = iconv (cd, NULL, NULL, &outbuf, &outleft);
 904         }
 905
 906         g_mime_iconv_close (cd);
 907
 908         return rc != (size_t) -1;
 909 }
 910
 911
 912 /**
 913  * g_mime_set_user_charsets:
 914  * @charsets: an array of user-preferred charsets
 915  *
 916  * Set a list of charsets for GMime to use as a hint for encoding and
 917  * decoding headers. The charset list should be in order of preference
 918  * (e.g. most preferred first, least preferred last).
 919  **/
 920 void
 921 g_mime_set_user_charsets (const char **charsets)
 922 {
 923         if (user_charsets)
 924                 g_strfreev (user_charsets);
 925
 926         if (charsets == NULL || charsets[0] == NULL) {
 927                 user_charsets = NULL;
 928                 return;
 929         }
 930
 931         user_charsets = g_strdupv ((char **) charsets);
 932 }
 933
 934
 935 /**
 936  * g_mime_user_charsets:
 937  *
 938  * Get the list of user-preferred charsets set with
 939  * g_mime_set_user_charsets().
 940  *
 941  * Returns: an array of user-set charsets or %NULL if none set.
 942  **/
 943 const char **
 944 g_mime_user_charsets (void)
 945 {
 946         return (const char **) user_charsets;
 947 }