camel/camel-charset-map.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8; -*- */
   2 /*
   3  * Authors:
   4  *   Michael Zucchi <notzed@ximian.com>
   5  *   Jeffrey Stedfast <fejj@ximian.com>
   6  *   Dan Winship <danw@ximian.com>
   7  *
   8  * Copyright 2000-2003 Ximian, Inc. (www.ximian.com)
   9  *
  10  * This program is free software; you can redistribute it and/or
  11  * modify it under the terms of version 2 of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
  22  * USA
  23  */
  24
  25
  26 #ifdef HAVE_CONFIG_H
  27 #include <config.h>
  28 #endif
  29
  30 #include <errno.h>
  31 #include <stdio.h>
  32 #include <stdlib.h>
  33 #include <string.h>
  34
  35 /*
  36   if you want to build the charset map, compile this with something like:
  37     gcc -DBUILD_MAP camel-charset-map.c `glib-config --cflags`
  38   (plus any -I/-L/-l flags you need for iconv), then run it as
  39     ./a.out > camel-charset-map-private.h
  40
  41   Note that the big-endian variant isn't tested...
  42
  43   The tables generated work like this:
  44
  45    An indirect array for each page of unicode character
  46    Each array element has an indirect pointer to one of the bytes of
  47    the generated bitmask.
  48 */
  49
  50 #ifdef BUILD_MAP
  51 #include <iconv.h>
  52 #include <glib.h>
  53
  54 static struct {
  55         char *name;
  56         unsigned int bit;       /* assigned bit */
  57 } tables[] = {
  58         /* These are the 8bit character sets (other than iso-8859-1,
  59          * which is special-cased) which are supported by both other
  60          * mailers and the GNOME environment. Note that the order
  61          * they're listed in is the order they'll be tried in, so put
  62          * the more-popular ones first.
  63          */
  64         { "iso-8859-2", 0 },    /* Central/Eastern European */
  65         { "iso-8859-4", 0 },    /* Baltic */
  66         { "koi8-r", 0 },        /* Russian */
  67         { "koi8-u", 0 },        /* Ukranian */
  68         { "iso-8859-5", 0 },    /* Least-popular Russian encoding */
  69         { "iso-8859-7", 0 },    /* Greek */
  70         { "iso-8859-8", 0 },    /* Hebrew; Visual */
  71         { "iso-8859-9", 0 },    /* Turkish */
  72         { "iso-8859-13", 0 },   /* Baltic again */
  73         { "iso-8859-15", 0 },   /* New-and-improved iso-8859-1, but most
  74                                  * programs that support this support UTF8
  75                                  */
  76         { "windows-1251", 0 },  /* Russian */
  77         { 0, 0 }
  78 };
  79
/*
 * One entry per 16-bit code point (256 pages of 256 characters). main()
 * ORs into each entry the bit of every charset in tables[] that was found
 * to contain that character.
 */
unsigned int encoding_map[256 * 256];

/*
 * Name of the iconv target encoding used by main(): 4-byte UCS in the
 * host's byte order, so the converted output can be read directly as an
 * array of guint32 values.
 */
#if G_BYTE_ORDER == G_BIG_ENDIAN
#define UCS "UCS-4BE"
#else
#define UCS "UCS-4LE"
#endif
  87
  88 int main (void)
  89 {
  90         int i, j;
  91         int max, min;
  92         int bit = 0x01;
  93         int k;
  94         int bytes;
  95         iconv_t cd;
  96         char in[128];
  97         guint32 out[128];
  98         char *inptr, *outptr;
  99         size_t inlen, outlen;
 100
 101         /* dont count the terminator */
 102         bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8;
 103
 104         for (i = 0; i < 128; i++)
 105                 in[i] = i + 128;
 106
 107         for (j = 0; tables[j].name; j++) {
 108                 cd = iconv_open (UCS, tables[j].name);
 109                 if (cd == (iconv_t)-1)
 110                         exit (1);
 111                 inptr = in;
 112                 outptr = (char *)(out);
 113                 inlen = sizeof (in);
 114                 outlen = sizeof (out);
 115                 while (iconv (cd, &inptr, &inlen, &outptr, &outlen) == -1) {
 116                         if (errno == EILSEQ) {
 117                                 inptr++;
 118                                 inlen--;
 119                         } else {
 120                                 printf ("%s\n", strerror (errno));
 121                                 exit (1);
 122                         }
 123                 }
 124                 iconv_close (cd);
 125
 126                 for (i = 0; i < 128 - outlen / 4; i++) {
 127                         encoding_map[i] |= bit;
 128                         encoding_map[out[i]] |= bit;
 129                 }
 130
 131                 tables[j].bit = bit;
 132                 bit <<= 1;
 133         }
 134
 135         printf("/* This file is automatically generated: DO NOT EDIT */\n\n");
 136
 137         for (i=0;i<256;i++) {
 138                 /* first, do we need this block? */
 139                 for (k=0;k<bytes;k++) {
 140                         for (j=0;j<256;j++) {
 141                                 if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0)
 142                                         break;
 143                         }
 144                         if (j < 256) {
 145                                 /* yes, dump it */
 146                                 printf("static const unsigned char m%02x%x[256] = {\n\t", i, k);
 147                                 for (j=0;j<256;j++) {
 148                                         printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff );
 149                                         if (((j+1)&7) == 0 && j<255)
 150                                                 printf("\n\t");
 151                                 }
 152                                 printf("\n};\n\n");
 153                         }
 154                 }
 155         }
 156
 157         printf("static const struct {\n");
 158         for (k=0;k<bytes;k++) {
 159                 printf("\tconst unsigned char *bits%d;\n", k);
 160         }
 161         printf("} camel_charmap[256] = {\n\t");
 162         for (i=0;i<256;i++) {
 163                 /* first, do we need this block? */
 164                 printf("{ ");
 165                 for (k=0;k<bytes;k++) {
 166                         for (j=0;j<256;j++) {
 167                                 if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0)
 168                                         break;
 169                         }
 170                         if (j < 256) {
 171                                 printf("m%02x%x, ", i, k);
 172                         } else {
 173                                 printf("NULL, ");
 174                         }
 175                 }
 176                 printf("}, ");
 177                 if (((i+1)&7) == 0 && i<255)
 178                         printf("\n\t");
 179         }
 180         printf("\n};\n\n");
 181
 182         printf("static const struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
 183         for (j=0;tables[j].name;j++) {
 184                 printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit);
 185         }
 186         printf("};\n\n");
 187
 188         printf("#define charset_mask(x) \\\n");
 189         for (k=0;k<bytes;k++) {
 190                 if (k!=0)
 191                         printf("\t| ");
 192                 else
 193                         printf("\t");
 194                 printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8);
 195                 if (k<bytes-1)
 196                         printf("\t\\\n");
 197         }
 198         printf("\n\n");
 199
 200         return 0;
 201 }
 202
 203 #else
 204
 205 #include <glib.h>
 206 #include <locale.h>
 207 #ifdef HAVE_CODESET
 208 #include <langinfo.h>
 209 #endif
 210
 211 #include "camel-charset-map.h"
 212 #include "camel-charset-map-private.h"
 213 #include "camel-utf8.h"
 214
 215 #include <libedataserver/e-iconv.h>
 216
 217 void
 218 camel_charset_init (CamelCharset *c)
 219 {
 220         c->mask = (unsigned int) ~0;
 221         c->level = 0;
 222 }
 223
 224 void
 225 camel_charset_step (CamelCharset *cc, const char *in, int len)
 226 {
 227         const unsigned char *inptr = (const unsigned char *) in;
 228         const unsigned char *inend = inptr + len;
 229         register unsigned int mask;
 230         register int level;
 231         register guint32 c;
 232
 233         mask = cc->mask;
 234         level = cc->level;
 235
 236         /* check what charset a given string will fit in */
 237         while ((c = camel_utf8_getc_limit(&inptr, inend)) != 0xffff) {
 238                 if (c < 0xffff) {
 239                         mask &= charset_mask(c);
 240
 241                         if (c>=128 && c<256)
 242                                 level = MAX(level, 1);
 243                         else if (c>=256)
 244                                 level = 2;
 245                 } else {
 246                         mask = 0;
 247                         level = 2;
 248                         break;
 249                 }
 250         }
 251
 252         cc->mask = mask;
 253         cc->level = level;
 254 }
 255
 256 /* gets the best charset from the mask of chars in it */
 257 static const char *
 258 camel_charset_best_mask(unsigned int mask)
 259 {
 260         const char *locale_lang, *lang;
 261         int i;
 262
 263         locale_lang = e_iconv_locale_language ();
 264         for (i = 0; i < G_N_ELEMENTS (camel_charinfo); i++) {
 265                 if (camel_charinfo[i].bit & mask) {
 266                         lang = e_iconv_charset_language (camel_charinfo[i].name);
 267
 268                         if (!locale_lang || (lang && !strncmp (locale_lang, lang, 2)))
 269                                 return camel_charinfo[i].name;
 270                 }
 271         }
 272
 273         return "UTF-8";
 274 }
 275
 276 const char *
 277 camel_charset_best_name (CamelCharset *charset)
 278 {
 279         if (charset->level == 1)
 280                 return "ISO-8859-1";
 281         else if (charset->level == 2)
 282                 return camel_charset_best_mask (charset->mask);
 283         else
 284                 return NULL;
 285 }
 286
 287 /* finds the minimum charset for this string NULL means US-ASCII */
 288 const char *
 289 camel_charset_best (const char *in, int len)
 290 {
 291         CamelCharset charset;
 292
 293         camel_charset_init (&charset);
 294         camel_charset_step (&charset, in, len);
 295         return camel_charset_best_name (&charset);
 296 }
 297
 298
 299 /**
 300  * camel_charset_iso_to_windows:
 301  * @isocharset: a canonicalised ISO charset
 302  *
 303  * Returns the equivalent Windows charset.
 304  **/
 305 const char *
 306 camel_charset_iso_to_windows (const char *isocharset)
 307 {
 308         /* According to http://czyborra.com/charsets/codepages.html,
 309          * the charset mapping is as follows:
 310          *
 311          * us-ascii    maps to windows-cp1252
 312          * iso-8859-1  maps to windows-cp1252
 313          * iso-8859-2  maps to windows-cp1250
 314          * iso-8859-3  maps to windows-cp????
 315          * iso-8859-4  maps to windows-cp????
 316          * iso-8859-5  maps to windows-cp1251
 317          * iso-8859-6  maps to windows-cp1256
 318          * iso-8859-7  maps to windows-cp1253
 319          * iso-8859-8  maps to windows-cp1255
 320          * iso-8859-9  maps to windows-cp1254
 321          * iso-8859-10 maps to windows-cp????
 322          * iso-8859-11 maps to windows-cp????
 323          * iso-8859-12 maps to windows-cp????
 324          * iso-8859-13 maps to windows-cp1257
 325          *
 326          * Assumptions:
 327          *  - I'm going to assume that since iso-8859-4 and
 328          *    iso-8859-13 are Baltic that it also maps to
 329          *    windows-cp1257.
 330          */
 331
 332         if (!g_ascii_strcasecmp (isocharset, "iso-8859-1") || !g_ascii_strcasecmp (isocharset, "us-ascii"))
 333                 return "windows-cp1252";
 334         else if (!g_ascii_strcasecmp (isocharset, "iso-8859-2"))
 335                 return "windows-cp1250";
 336         else if (!g_ascii_strcasecmp (isocharset, "iso-8859-4"))
 337                 return "windows-cp1257";
 338         else if (!g_ascii_strcasecmp (isocharset, "iso-8859-5"))
 339                 return "windows-cp1251";
 340         else if (!g_ascii_strcasecmp (isocharset, "iso-8859-6"))
 341                 return "windows-cp1256";
 342         else if (!g_ascii_strcasecmp (isocharset, "iso-8859-7"))
 343                 return "windows-cp1253";
 344         else if (!g_ascii_strcasecmp (isocharset, "iso-8859-8"))
 345                 return "windows-cp1255";
 346         else if (!g_ascii_strcasecmp (isocharset, "iso-8859-9"))
 347                 return "windows-cp1254";
 348         else if (!g_ascii_strcasecmp (isocharset, "iso-8859-13"))
 349                 return "windows-cp1257";
 350
 351         return isocharset;
 352 }
 353
 354 #endif /* !BUILD_MAP */