gmime/charset-map.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
   2 /*  GMime
   3  *  Copyright (C) 2000-2012 Jeffrey Stedfast
   4  *
   5  *  This library is free software; you can redistribute it and/or
   6  *  modify it under the terms of the GNU Lesser General Public License
   7  *  as published by the Free Software Foundation; either version 2.1
   8  *  of the License, or (at your option) any later version.
   9  *
  10  *  This library is distributed in the hope that it will be useful,
  11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  *  Lesser General Public License for more details.
  14  *
  15  *  You should have received a copy of the GNU Lesser General Public
  16  *  License along with this library; if not, write to the Free
  17  *  Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
  18  *  02110-1301, USA.
  19  */
  20
  21
  22 #ifdef HAVE_CONFIG_H
  23 #include <config.h>
  24 #endif
  25
  26 #include <glib.h>
  27
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30 #include <string.h>
  31 #include <sys/types.h>
  32 #include <sys/stat.h>
  33 #include <locale.h>
  34 #include <iconv.h>
  35 #include <errno.h>
  36
  37 #ifdef HAVE_CODESET
  38 #include <langinfo.h>
  39 #endif
  40
  41
  42 static struct {
  43         char *name;        /* charset name */
  44         int multibyte;     /* charset type */
  45         unsigned int bit;  /* assigned bit */
  46 } tables[] = {
  47         /* These are the 8bit character sets (other than iso-8859-1,
  48          * which is special-cased) which are supported by both other
  49          * mailers and the GNOME environment. Note that the order
  50          * they're listed in is the order they'll be tried in, so put
  51          * the more-popular ones first.
  52          */
  53         { "iso-8859-2",   0, 0 },  /* Central/Eastern European */
  54         { "iso-8859-4",   0, 0 },  /* Baltic */
  55         { "koi8-r",       0, 0 },  /* Russian */
  56         { "koi8-u",       0, 0 },  /* Ukranian */
  57         { "iso-8859-5",   0, 0 },  /* Least-popular Russian encoding */
  58         { "iso-8859-6",   0, 0 },  /* Arabic */
  59         { "iso-8859-7",   0, 0 },  /* Greek */
  60         { "iso-8859-8",   0, 0 },  /* Hebrew; Visual */
  61         { "iso-8859-9",   0, 0 },  /* Turkish */
  62         { "iso-8859-13",  0, 0 },  /* Baltic again */
  63         { "iso-8859-15",  0, 0 },  /* New-and-improved iso-8859-1, but most
  64                                     * programs that support this support UTF8
  65                                     */
  66         { "windows-1251", 0, 0 },  /* Russian */
  67
  68         /* These are the multibyte character sets which are commonly
  69          * supported by other mail clients. Note: order for multibyte
  70          * charsets does not affect priority unlike the 8bit charsets
  71          * listed above.
  72          */
  73         { "iso-2022-jp",  1, 0 },  /* Japanese designed for use over the Net */
  74         { "Shift-JIS",    1, 0 },  /* Japanese as used by Windows and MacOS systems */
  75         { "euc-jp",       1, 0 },  /* Japanese traditionally used on Unix systems */
  76         { "euc-kr",       1, 0 },  /* Korean */
  77         { "gb2312",       1, 0 },  /* Simplified Chinese */
  78         { "Big5",         1, 0 },  /* Traditional Chinese */
  79         { "euc-tw",       1, 0 },
  80         { NULL,           0, 0 }
  81 };
  82
/* One entry per 16-bit code point, addressed as (page * 256 + offset).
 * main() ORs into each entry the tables[].bit of every charset found
 * able to represent that code point.
 */
unsigned int encoding_map[256 * 256];
  84
/* iconv name for 4-byte UCS in the host's byte order, so that main()
 * can exchange code points with iconv directly as guint32 values.
 */
#if G_BYTE_ORDER == G_BIG_ENDIAN
#define UCS "UCS-4BE"
#else
#define UCS "UCS-4LE"
#endif
  90
  91 static guint
  92 block_hash (gconstpointer v)
  93 {
  94         const signed char *p = v;
  95         guint32 h = *p++;
  96         int i;
  97
  98         for (i = 0; i < 256; i++)
  99                 h = (h << 5) - h + *p++;
 100
 101         return h;
 102 }
 103
 104 static int
 105 block_equal (gconstpointer v1, gconstpointer v2)
 106 {
 107         return !memcmp (v1, v2, 256);
 108 }
 109
 110 int main (int argc, char **argv)
 111 {
 112         unsigned char *block = NULL;
 113         unsigned int bit = 0x01;
 114         GHashTable *table_hash;
 115         size_t inleft, outleft;
 116         char *inbuf, *outbuf;
 117         guint32 out[128], c;
 118         unsigned int i;
 119         char in[128];
 120         iconv_t cd;
 121         int bytes;
 122         int j, k;
 123
 124         /* dont count the terminator */
 125         bytes = ((sizeof (tables) / sizeof (tables[0])) + 7 - 1) / 8;
 126         g_assert (bytes <= 4);
 127
 128         for (i = 0; i < 128; i++)
 129                 in[i] = i + 128;
 130
 131         for (j = 0; tables[j].name && !tables[j].multibyte; j++) {
 132                 cd = iconv_open (UCS, tables[j].name);
 133                 inbuf = in;
 134                 inleft = sizeof (in);
 135                 outbuf = (char *) out;
 136                 outleft = sizeof (out);
 137                 while (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) == (size_t) -1) {
 138                         if (errno == EILSEQ) {
 139                                 inbuf++;
 140                                 inleft--;
 141                         } else {
 142                                 g_warning ("iconv (%s->UCS4, ..., %zu, ..., %zu): %s",
 143                                            tables[j].name, inleft, outleft,
 144                                            g_strerror (errno));
 145                                 exit (1);
 146                         }
 147                 }
 148                 iconv_close (cd);
 149
 150                 for (i = 0; i < 128 - outleft / 4; i++) {
 151                         encoding_map[i] |= bit;
 152                         encoding_map[out[i]] |= bit;
 153                 }
 154
 155                 tables[j].bit = bit;
 156                 bit <<= 1;
 157         }
 158
 159         /* Mutibyte tables */
 160         for ( ; tables[j].name && tables[j].multibyte; j++) {
 161                 cd = iconv_open (tables[j].name, UCS);
 162                 if (cd == (iconv_t) -1)
 163                         continue;
 164
 165                 for (c = 128, i = 0; c < 65535 && i < 65535; c++) {
 166                         inbuf = (char *) &c;
 167                         inleft = sizeof (c);
 168                         outbuf = in;
 169                         outleft = sizeof (in);
 170
 171                         if (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) != (size_t) -1) {
 172                                 /* this is a legal character in charset table[j].name */
 173                                 iconv (cd, NULL, NULL, &outbuf, &outleft);
 174                                 encoding_map[i++] |= bit;
 175                                 encoding_map[c] |= bit;
 176                         } else {
 177                                 /* reset the iconv descriptor */
 178                                 iconv (cd, NULL, NULL, NULL, NULL);
 179                         }
 180                 }
 181
 182                 iconv_close (cd);
 183
 184                 tables[j].bit = bit;
 185                 bit <<= 1;
 186         }
 187
 188         printf ("/* This file is automatically generated: DO NOT EDIT */\n\n");
 189
 190         table_hash = g_hash_table_new_full (block_hash, block_equal, g_free, g_free);
 191
 192         for (i = 0; i < 256; i++) {
 193                 for (k = 0; k < bytes; k++) {
 194                         char name[32], *alias;
 195                         int has_bits = FALSE;
 196
 197                         if (!block) {
 198                                 /* we reuse malloc'd blocks that are not added to the
 199                                  * hash table to avoid unnecessary malloc/free's */
 200                                 block = g_malloc (256);
 201                         }
 202
 203                         for (j = 0; j < 256; j++) {
 204                                 if ((block[j] = (encoding_map[i * 256 + j] >> (k * 8)) & 0xff))
 205                                         has_bits = TRUE;
 206                         }
 207
 208                         if (!has_bits)
 209                                 continue;
 210
 211                         sprintf (name, "m%02x%x", i, k);
 212
 213                         if ((alias = g_hash_table_lookup (table_hash, block))) {
 214                                 /* this block is identical to an earlier block, just alias it */
 215                                 printf ("#define %s %s\n\n", name, alias);
 216                         } else {
 217                                 /* unique block, dump it */
 218                                 g_hash_table_insert (table_hash, block, g_strdup (name));
 219
 220                                 printf ("static unsigned char %s[256] = {\n\t", name);
 221                                 for (j = 0; j < 256; j++) {
 222                                         printf ("0x%02x, ", block[j]);
 223                                         if (((j + 1) & 7) == 0 && j < 255)
 224                                                 printf ("\n\t");
 225                                 }
 226                                 printf ("\n};\n\n");
 227
 228                                 /* force the next loop to malloc a new block */
 229                                 block = NULL;
 230                         }
 231                 }
 232         }
 233
 234         g_hash_table_destroy (table_hash);
 235         g_free (block);
 236
 237         printf ("static const struct {\n");
 238         for (k = 0; k < bytes; k++)
 239                 printf ("\tunsigned char *bits%d;\n", k);
 240
 241         printf ("} charmap[256] = {\n\t");
 242         for (i = 0; i < 256; i++) {
 243                 printf ("{ ");
 244                 for (k = 0; k < bytes; k++) {
 245                         for (j = 0; j < 256; j++) {
 246                                 if ((encoding_map[i * 256 + j] & (0xff << (k * 8))) != 0)
 247                                         break;
 248                         }
 249
 250                         if (j < 256)
 251                                 printf ("m%02x%x, ", i, k);
 252                         else
 253                                 printf ("NULL, ");
 254                 }
 255
 256                 printf ("}, ");
 257                 if (((i + 1) & 3) == 0 && i < 255)
 258                         printf ("\n\t");
 259         }
 260         printf ("\n};\n\n");
 261
 262         printf ("static const struct {\n\tconst char *name;\n\tunsigned int bit;\n} charinfo[] = {\n");
 263         for (j = 0; tables[j].name; j++)
 264                 printf ("\t{ \"%s\", 0x%08x },\n", tables[j].name, tables[j].bit);
 265         printf ("};\n\n");
 266
 267         printf ("#define charset_mask(x) \\\n");
 268         for (k = 0; k < bytes; k++) {
 269                 if (k != 0)
 270                         printf ("\t| ");
 271                 else
 272                         printf ("\t");
 273
 274                 printf ("(charmap[(x) >> 8].bits%d ? charmap[(x) >> 8].bits%d[(x) & 0xff] << %d : 0)",
 275                         k, k, k * 8);
 276
 277                 if (k < bytes - 1)
 278                         printf ("\t\\\n");
 279         }
 280         printf ("\n\n");
 281
 282         return 0;
 283 }