1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8; -*- */
4 * Michael Zucchi <notzed@ximian.com>
5 * Jeffrey Stedfast <fejj@ximian.com>
6 * Dan Winship <danw@ximian.com>
8 * Copyright 2000-2003 Ximian, Inc. (www.ximian.com)
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of version 2 of the GNU Lesser General Public
12 * License as published by the Free Software Foundation.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
36 if you want to build the charset map, compile this with something like:
37 gcc -DBUILD_MAP camel-charset-map.c `glib-config --cflags`
38 (plus any -I/-L/-l flags you need for iconv), then run it as
39 ./a.out > camel-charset-map-private.h
41 Note that the big-endian variant isn't tested...
43 The generated tables work like this:
45 An indirect array for each page of Unicode characters.
46 Each array element has an indirect pointer to one of the bytes of
47 the generated bitmask.
56 unsigned int bit; /* assigned bit */
58 /* These are the 8bit character sets (other than iso-8859-1,
59 * which is special-cased) which are supported by both other
60 * mailers and the GNOME environment. Note that the order
61 * they're listed in is the order they'll be tried in, so put
62 * the more-popular ones first.
64 { "iso-8859-2", 0 }, /* Central/Eastern European */
65 { "iso-8859-4", 0 }, /* Baltic */
66 { "koi8-r", 0 }, /* Russian */
67 { "koi8-u", 0 }, /* Ukrainian */
68 { "iso-8859-5", 0 }, /* Least-popular Russian encoding */
69 { "iso-8859-7", 0 }, /* Greek */
70 { "iso-8859-8", 0 }, /* Hebrew; Visual */
71 { "iso-8859-9", 0 }, /* Turkish */
72 { "iso-8859-13", 0 }, /* Baltic again */
73 { "iso-8859-15", 0 }, /* New-and-improved iso-8859-1, but most
74 * programs that support this support UTF8
76 { "windows-1251", 0 }, /* Russian */
80 unsigned int encoding_map[256 * 256];
82 #if G_BYTE_ORDER == G_BIG_ENDIAN
101 /* don't count the terminator */
102 bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8;
104 for (i = 0; i < 128; i++)
107 for (j = 0; tables[j].name; j++) {
108 cd = iconv_open (UCS, tables[j].name);
109 if (cd == (iconv_t)-1)
112 outptr = (char *)(out);
114 outlen = sizeof (out);
115 while (iconv (cd, &inptr, &inlen, &outptr, &outlen) == -1) {
116 if (errno == EILSEQ) {
120 printf ("%s\n", strerror (errno));
126 for (i = 0; i < 128 - outlen / 4; i++) {
127 encoding_map[i] |= bit;
128 encoding_map[out[i]] |= bit;
135 printf("/* This file is automatically generated: DO NOT EDIT */\n\n");
137 for (i=0;i<256;i++) {
138 /* first, do we need this block? */
139 for (k=0;k<bytes;k++) {
140 for (j=0;j<256;j++) {
141 if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0)
146 printf("static const unsigned char m%02x%x[256] = {\n\t", i, k);
147 for (j=0;j<256;j++) {
148 printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff );
149 if (((j+1)&7) == 0 && j<255)
157 printf("static const struct {\n");
158 for (k=0;k<bytes;k++) {
159 printf("\tconst unsigned char *bits%d;\n", k);
161 printf("} camel_charmap[256] = {\n\t");
162 for (i=0;i<256;i++) {
163 /* first, do we need this block? */
165 for (k=0;k<bytes;k++) {
166 for (j=0;j<256;j++) {
167 if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0)
171 printf("m%02x%x, ", i, k);
177 if (((i+1)&7) == 0 && i<255)
182 printf("static const struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
183 for (j=0;tables[j].name;j++) {
184 printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit);
188 printf("#define charset_mask(x) \\\n");
189 for (k=0;k<bytes;k++) {
194 printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8);
208 #include <langinfo.h>
211 #include "camel-charset-map.h"
212 #include "camel-charset-map-private.h"
213 #include "camel-utf8.h"
215 #include <libedataserver/e-iconv.h>
218 camel_charset_init (CamelCharset *c)
220 c->mask = (unsigned int) ~0;
225 camel_charset_step (CamelCharset *cc, const char *in, int len)
227 const unsigned char *inptr = (const unsigned char *) in;
228 const unsigned char *inend = inptr + len;
229 register unsigned int mask;
236 /* check what charset a given string will fit in */
237 while ((c = camel_utf8_getc_limit(&inptr, inend)) != 0xffff) {
239 mask &= charset_mask(c);
242 level = MAX(level, 1);
256 /* gets the best charset from the mask of chars in it */
258 camel_charset_best_mask(unsigned int mask)
260 const char *locale_lang, *lang;
263 locale_lang = e_iconv_locale_language ();
264 for (i = 0; i < G_N_ELEMENTS (camel_charinfo); i++) {
265 if (camel_charinfo[i].bit & mask) {
266 lang = e_iconv_charset_language (camel_charinfo[i].name);
268 if (!locale_lang || (lang && !strncmp (locale_lang, lang, 2)))
269 return camel_charinfo[i].name;
277 camel_charset_best_name (CamelCharset *charset)
279 if (charset->level == 1)
281 else if (charset->level == 2)
282 return camel_charset_best_mask (charset->mask);
287 /* finds the minimum charset for this string; NULL means US-ASCII */
289 camel_charset_best (const char *in, int len)
291 CamelCharset charset;
293 camel_charset_init (&charset);
294 camel_charset_step (&charset, in, len);
295 return camel_charset_best_name (&charset);
300 * camel_charset_iso_to_windows:
301 * @isocharset: a canonicalised ISO charset
303 * Returns the equivalent Windows charset.
306 camel_charset_iso_to_windows (const char *isocharset)
308 /* According to http://czyborra.com/charsets/codepages.html,
309 * the charset mapping is as follows:
311 * us-ascii maps to windows-cp1252
312 * iso-8859-1 maps to windows-cp1252
313 * iso-8859-2 maps to windows-cp1250
314 * iso-8859-3 maps to windows-cp????
315 * iso-8859-4 maps to windows-cp????
316 * iso-8859-5 maps to windows-cp1251
317 * iso-8859-6 maps to windows-cp1256
318 * iso-8859-7 maps to windows-cp1253
319 * iso-8859-8 maps to windows-cp1255
320 * iso-8859-9 maps to windows-cp1254
321 * iso-8859-10 maps to windows-cp????
322 * iso-8859-11 maps to windows-cp????
323 * iso-8859-12 maps to windows-cp????
324 * iso-8859-13 maps to windows-cp1257
327 * - I'm going to assume that since iso-8859-4 and
328 * iso-8859-13 are Baltic that it also maps to
332 if (!g_ascii_strcasecmp (isocharset, "iso-8859-1") || !g_ascii_strcasecmp (isocharset, "us-ascii"))
333 return "windows-cp1252";
334 else if (!g_ascii_strcasecmp (isocharset, "iso-8859-2"))
335 return "windows-cp1250";
336 else if (!g_ascii_strcasecmp (isocharset, "iso-8859-4"))
337 return "windows-cp1257";
338 else if (!g_ascii_strcasecmp (isocharset, "iso-8859-5"))
339 return "windows-cp1251";
340 else if (!g_ascii_strcasecmp (isocharset, "iso-8859-6"))
341 return "windows-cp1256";
342 else if (!g_ascii_strcasecmp (isocharset, "iso-8859-7"))
343 return "windows-cp1253";
344 else if (!g_ascii_strcasecmp (isocharset, "iso-8859-8"))
345 return "windows-cp1255";
346 else if (!g_ascii_strcasecmp (isocharset, "iso-8859-9"))
347 return "windows-cp1254";
348 else if (!g_ascii_strcasecmp (isocharset, "iso-8859-13"))
349 return "windows-cp1257";
354 #endif /* !BUILD_MAP */