1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
3 * Copyright (C) 2000-2012 Jeffrey Stedfast
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public License
7 * as published by the Free Software Foundation; either version 2.1
8 * of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free
17 * Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
31 #include <sys/types.h>
43 char *name; /* charset name */
44 int multibyte; /* charset type */
45 unsigned int bit; /* assigned bit */
47 /* These are the 8bit character sets (other than iso-8859-1,
48 * which is special-cased) which are supported by both other
49 * mailers and the GNOME environment. Note that the order
50 * they're listed in is the order they'll be tried in, so put
51 * the more-popular ones first.
53 { "iso-8859-2", 0, 0 }, /* Central/Eastern European */
54 { "iso-8859-4", 0, 0 }, /* Baltic */
55 { "koi8-r", 0, 0 }, /* Russian */
56 { "koi8-u", 0, 0 }, /* Ukrainian */
57 { "iso-8859-5", 0, 0 }, /* Least-popular Russian encoding */
58 { "iso-8859-6", 0, 0 }, /* Arabic */
59 { "iso-8859-7", 0, 0 }, /* Greek */
60 { "iso-8859-8", 0, 0 }, /* Hebrew; Visual */
61 { "iso-8859-9", 0, 0 }, /* Turkish */
62 { "iso-8859-13", 0, 0 }, /* Baltic again */
63 { "iso-8859-15", 0, 0 }, /* New-and-improved iso-8859-1, but most
64 * programs that support this support UTF8
66 { "windows-1251", 0, 0 }, /* Russian */
68 /* These are the multibyte character sets which are commonly
69 * supported by other mail clients. Note: order for multibyte
70 * charsets does not affect priority unlike the 8bit charsets
73 { "iso-2022-jp", 1, 0 }, /* Japanese designed for use over the Net */
74 { "Shift-JIS", 1, 0 }, /* Japanese as used by Windows and MacOS systems */
75 { "euc-jp", 1, 0 }, /* Japanese traditionally used on Unix systems */
76 { "euc-kr", 1, 0 }, /* Korean */
77 { "gb2312", 1, 0 }, /* Simplified Chinese */
78 { "Big5", 1, 0 }, /* Traditional Chinese */
83 unsigned int encoding_map[256 * 256];
85 #if G_BYTE_ORDER == G_BIG_ENDIAN
/* Hash callback for 256-byte blocks: folds each of the 256 bytes at v
 * into h using a multiply-by-31 step. This is passed as the hash
 * function when table_hash is created in main(). */
92 block_hash (gconstpointer v)
/* bytes are read as signed char, so values >= 0x80 are added as negatives */
94 const signed char *p = v;
98 for (i = 0; i < 256; i++)
/* (h << 5) - h is h * 31 */
99 h = (h << 5) - h + *p++;
/* Equality callback for 256-byte blocks: returns non-zero when the
 * first 256 bytes at v1 and v2 are identical. This is passed as the
 * key-equality function when table_hash is created in main(). */
105 block_equal (gconstpointer v1, gconstpointer v2)
107 return !memcmp (v1, v2, 256);
110 int main (int argc, char **argv)
112 unsigned char *block = NULL;
113 unsigned int bit = 0x01;
114 GHashTable *table_hash;
115 size_t inleft, outleft;
116 char *inbuf, *outbuf;
124 /* don't count the terminator */
125 bytes = ((sizeof (tables) / sizeof (tables[0])) + 7 - 1) / 8;
126 g_assert (bytes <= 4);
128 for (i = 0; i < 128; i++)
131 for (j = 0; tables[j].name && !tables[j].multibyte; j++) {
132 cd = iconv_open (UCS, tables[j].name);
134 inleft = sizeof (in);
135 outbuf = (char *) out;
136 outleft = sizeof (out);
137 while (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) == (size_t) -1) {
138 if (errno == EILSEQ) {
142 g_warning ("iconv (%s->UCS4, ..., %zu, ..., %zu): %s",
143 tables[j].name, inleft, outleft,
150 for (i = 0; i < 128 - outleft / 4; i++) {
151 encoding_map[i] |= bit;
152 encoding_map[out[i]] |= bit;
159 /* Multibyte tables */
160 for ( ; tables[j].name && tables[j].multibyte; j++) {
161 cd = iconv_open (tables[j].name, UCS);
162 if (cd == (iconv_t) -1)
165 for (c = 128, i = 0; c < 65535 && i < 65535; c++) {
169 outleft = sizeof (in);
171 if (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) != (size_t) -1) {
172 /* this is a legal character in charset tables[j].name */
173 iconv (cd, NULL, NULL, &outbuf, &outleft);
174 encoding_map[i++] |= bit;
175 encoding_map[c] |= bit;
177 /* reset the iconv descriptor */
178 iconv (cd, NULL, NULL, NULL, NULL);
188 printf ("/* This file is automatically generated: DO NOT EDIT */\n\n");
190 table_hash = g_hash_table_new_full (block_hash, block_equal, g_free, g_free);
192 for (i = 0; i < 256; i++) {
193 for (k = 0; k < bytes; k++) {
194 char name[32], *alias;
195 int has_bits = FALSE;
198 /* we reuse malloc'd blocks that are not added to the
199 * hash table to avoid unnecessary malloc/free's */
200 block = g_malloc (256);
203 for (j = 0; j < 256; j++) {
204 if ((block[j] = (encoding_map[i * 256 + j] >> (k * 8)) & 0xff))
211 sprintf (name, "m%02x%x", i, k);
213 if ((alias = g_hash_table_lookup (table_hash, block))) {
214 /* this block is identical to an earlier block, just alias it */
215 printf ("#define %s %s\n\n", name, alias);
217 /* unique block, dump it */
218 g_hash_table_insert (table_hash, block, g_strdup (name));
220 printf ("static unsigned char %s[256] = {\n\t", name);
221 for (j = 0; j < 256; j++) {
222 printf ("0x%02x, ", block[j]);
223 if (((j + 1) & 7) == 0 && j < 255)
228 /* force the next loop to malloc a new block */
234 g_hash_table_destroy (table_hash);
237 printf ("static const struct {\n");
238 for (k = 0; k < bytes; k++)
239 printf ("\tunsigned char *bits%d;\n", k);
241 printf ("} charmap[256] = {\n\t");
242 for (i = 0; i < 256; i++) {
244 for (k = 0; k < bytes; k++) {
245 for (j = 0; j < 256; j++) {
246 if ((encoding_map[i * 256 + j] & (0xff << (k * 8))) != 0)
251 printf ("m%02x%x, ", i, k);
257 if (((i + 1) & 3) == 0 && i < 255)
262 printf ("static const struct {\n\tconst char *name;\n\tunsigned int bit;\n} charinfo[] = {\n");
263 for (j = 0; tables[j].name; j++)
264 printf ("\t{ \"%s\", 0x%08x },\n", tables[j].name, tables[j].bit);
267 printf ("#define charset_mask(x) \\\n");
268 for (k = 0; k < bytes; k++) {
274 printf ("(charmap[(x) >> 8].bits%d ? charmap[(x) >> 8].bits%d[(x) & 0xff] << %d : 0)",