2 * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * See the corresponding header file for a description of the functions
21 * that this file provides.
23 * This was first written for Ogg Vorbis but could be of general use.
25 * The only deliberate assumption about data sizes is that a short has
26 * at least 16 bits, but this code has only been tested on systems with
27 * 8-bit char, 16-bit short and 32-bit int.
30 #ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */
39 * This is like the standard strcasecmp, but it does not depend
40 * on the locale. Locale-dependent functions can be dangerous:
41 * we once had a bug involving strcasecmp("iso", "ISO") in a
44 * (I'm not really sure what the official standard says
45 * about the sign of strcasecmp("Z", "["), but usually
46 * we're only interested in whether it's zero.)
/*
 * Case-insensitive comparison of s1 and s2 that consults no locale
 * tables: only bytes in the ASCII range 'a'..'z' are singled out for
 * special handling.
 */
49 static int ascii_strcasecmp(const char *s1, const char *s2)
/* Explicit ASCII range tests rather than a locale-dependent classification. */
59 if ('a' <= c1 && c1 <= 'z')
62 if ('a' <= c2 && c2 <= 'z')
/*
 * Both bytes are read as unsigned char, so the sign of the result does
 * not depend on whether plain char is signed.
 */
67 return (unsigned char)*s1 - (unsigned char)*s2;
71 * UTF-8 equivalents of the C library's wctomb() and mbtowc().
/*
 * UTF-8 counterpart of mbtowc(): decode one sequence starting at s,
 * looking at no more than n bytes, and store the decoded value
 * through pwc.
 */
74 int utf8_mbtowc(int *pwc, const char *s, size_t n)
/*
 * Two-byte form: a second byte must be available and must be a
 * continuation byte (top two bits 10).
 */
91 if (n >= 2 && (s[1] & 0xc0) == 0x80) {
/* 5 payload bits from the lead byte, 6 from the continuation byte. */
93 *pwc = ((c & 0x1f) << 6) | (s[1] & 0x3f);
/* General k-byte form: the lead byte contributes its low (7 - k) bits. */
112 wc = *s++ & ((1 << (7 - k)) - 1);
113 for (i = 1; i < k; i++) {
/* Every byte after the lead byte must be a continuation byte. */
114 if ((*s & 0xc0) != 0x80)
/* Append the 6 payload bits of this continuation byte. */
116 wc = (wc << 6) | (*s++ & 0x3f);
/*
 * A (k - 1)-byte sequence carries 5 * k - 4 payload bits, so a value
 * below 2^(5k-4) did not need k bytes: the sequence is overlong.
 */
118 if (wc < (1 << (5 * k - 4)))
/*
 * UTF-8 counterpart of wctomb(): encode wc1 at s.
 * The sequence length is chosen from the magnitude of the value:
 * below 2^11 two bytes, below 2^16 three, below 2^21 four,
 * below 2^26 five, below 2^31 six. The buffer at s therefore needs
 * room for up to six bytes.
 * The five- and six-byte forms come from the original UTF-8 definition
 * (RFC 2279); RFC 3629 restricts UTF-8 to at most four bytes.
 */
125 int utf8_wctomb(char *s, int wc1)
/*
 * Work on an unsigned copy so the shifts below are well defined; with a
 * 32-bit int a negative wc1 becomes a value >= 2^31 and so fails every
 * range test below.
 */
127 unsigned int wc = wc1;
/* Fits in 7 bits. */
131 if (wc < (1u << 7)) {
/* Fits in 11 bits: lead byte 110xxxxx plus one continuation byte. */
135 else if (wc < (1u << 11)) {
136 *s++ = 0xc0 | (wc >> 6);
137 *s++ = 0x80 | (wc & 0x3f);
/* Fits in 16 bits: lead byte 1110xxxx plus two continuation bytes. */
140 else if (wc < (1u << 16)) {
141 *s++ = 0xe0 | (wc >> 12);
142 *s++ = 0x80 | ((wc >> 6) & 0x3f);
143 *s++ = 0x80 | (wc & 0x3f);
/* Fits in 21 bits: lead byte 11110xxx plus three continuation bytes. */
146 else if (wc < (1u << 21)) {
147 *s++ = 0xf0 | (wc >> 18);
148 *s++ = 0x80 | ((wc >> 12) & 0x3f);
149 *s++ = 0x80 | ((wc >> 6) & 0x3f);
150 *s++ = 0x80 | (wc & 0x3f);
/* Fits in 26 bits: lead byte 111110xx plus four continuation bytes. */
153 else if (wc < (1u << 26)) {
154 *s++ = 0xf8 | (wc >> 24);
155 *s++ = 0x80 | ((wc >> 18) & 0x3f);
156 *s++ = 0x80 | ((wc >> 12) & 0x3f);
157 *s++ = 0x80 | ((wc >> 6) & 0x3f);
158 *s++ = 0x80 | (wc & 0x3f);
/* Fits in 31 bits: lead byte 1111110x plus five continuation bytes. */
161 else if (wc < (1u << 31)) {
162 *s++ = 0xfc | (wc >> 30);
163 *s++ = 0x80 | ((wc >> 24) & 0x3f);
164 *s++ = 0x80 | ((wc >> 18) & 0x3f);
165 *s++ = 0x80 | ((wc >> 12) & 0x3f);
166 *s++ = 0x80 | ((wc >> 6) & 0x3f);
167 *s++ = 0x80 | (wc & 0x3f);
175 * The charset "object" and methods.
/*
 * Per-charset conversion methods. charset_mbtowc() and charset_wctomb()
 * call these with the charset's map member as the table argument.
 */
/* Decode: same arguments as utf8_mbtowc(), preceded by the table. */
180 int (*mbtowc)(void *table, int *pwc, const char *s, size_t n);
/* Encode: same arguments as utf8_wctomb(), preceded by the table. */
181 int (*wctomb)(void *table, char *s, int wc);
/*
 * Decode one character from s (at most n bytes) in the given charset by
 * dispatching to the charset's mbtowc method; the method's return value
 * is passed back unchanged.
 */
185 int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n)
/* The charset's private map is handed to the method as its first argument. */
187 return (*charset->mbtowc)(charset->map, pwc, s, n);
/*
 * Encode wc at s in the given charset by dispatching to the charset's
 * wctomb method; the method's return value is passed back unchanged.
 */
190 int charset_wctomb(struct charset *charset, char *s, int wc)
/* The charset's private map is handed to the method as its first argument. */
192 return (*charset->wctomb)(charset->map, s, wc);
/*
 * NOTE(review): presumably reports the charset's max member, the value
 * charset_convert() multiplies by the input length when sizing its
 * output buffer -- TODO confirm against the function body.
 */
195 int charset_max(struct charset *charset)
201 * Implementation of UTF-8.
/*
 * mbtowc method for UTF-8: adapts utf8_mbtowc() to the method signature
 * used by struct charset.
 */
204 static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n)
/* UTF-8 needs no table, so map is not passed on. */
207 return utf8_mbtowc(pwc, s, n);
/*
 * wctomb method for UTF-8: adapts utf8_wctomb() to the method signature
 * used by struct charset.
 */
210 static int wctomb_utf8(void *map, char *s, int wc)
/* UTF-8 needs no table, so map is not passed on. */
213 return utf8_wctomb(s, wc);
217 * Implementation of US-ASCII.
218 * Probably on most architectures this compiles to less than 256 bytes
219 * of code, so we can save space by not having a table for this one.
/* mbtowc method for US-ASCII; works without a table. */
222 static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n)
/* Read the byte as unsigned char so bytes above 0x7f are not sign-extended. */
229 wc = (unsigned char)*s;
/*
 * wctomb method for US-ASCII; same signature as the wctomb member of
 * struct charset.
 * NOTE(review): presumably rejects values outside 0..0x7f -- TODO confirm.
 */
237 static int wctomb_ascii(void *map, char *s, int wc)
249 * Implementation of ISO-8859-1.
250 * Probably on most architectures this compiles to less than 256 bytes
251 * of code, so we can save space by not having a table for this one.
/* mbtowc method for ISO-8859-1; works without a table. */
254 static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n)
/* Read the byte as unsigned char so bytes above 0x7f are not sign-extended. */
261 wc = (unsigned char)*s;
/*
 * wctomb method for ISO-8859-1; same signature as the wctomb member of
 * struct charset.
 * NOTE(review): presumably rejects values outside 0..0xff -- TODO confirm.
 */
269 static int wctomb_iso1(void *map, char *s, int wc)
281 * Implementation of any 8-bit charset.
/*
 * Forward table indexed by byte value. make_inverse_map() skips entries
 * equal to 0xffff when building the inverse map.
 */
285 const unsigned short *from;
/* Inverse (hash) map; starts out as 0 and is built by wctomb_8bit(). */
286 struct inverse_map *to;
/* mbtowc method for table-driven 8-bit charsets. */
289 static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n)
/* The opaque table argument is really a struct map. */
291 struct map *map = map1;
/* Index by the byte read as unsigned char, so the index is never negative. */
296 wc = map->from[(unsigned char)*s];
305 * For the inverse map we use a hash table, which has the advantages
306 * of small constant memory requirement and simple memory allocation,
307 * but the disadvantage of slow conversion in the worst case.
308 * If you need real-time performance while letting a potentially
309 * malicious user define their own map, then the method used in
310 * linux/drivers/char/consolemap.c would be more appropriate.
/*
 * first[h] is the byte value at the head of the chain for hash value h
 * (see HASH); wctomb_8bit() starts its lookup there.
 */
314 unsigned char first[256];
/*
 * next[i] is the byte value that follows i in its chain; wctomb_8bit()
 * stops walking when it reads 0 here.
 */
315 unsigned char next[256];
319 * The simple hash is good enough for this application.
320 * Use the alternative trivial hashes for testing.
/*
 * Hash value: the low 8 bits of the character value, giving the 256
 * possible indices into inverse_map.first[].
 */
322 #define HASH(i) ((i) & 0xff)
323 /* #define HASH(i) 0 */
324 /* #define HASH(i) 99 */
/*
 * Build the hash-based inverse of the 256-entry forward table `from`.
 * The result is allocated with malloc().
 */
326 static struct inverse_map *make_inverse_map(const unsigned short *from)
328 struct inverse_map *to;
332 to = (struct inverse_map *)malloc(sizeof(struct inverse_map));
/* Start with every chain empty and no bucket marked as used. */
335 for (i = 0; i < 256; i++)
336 to->first[i] = to->next[i] = used[i] = 0;
/*
 * Walk the byte values from 255 down to 0, skipping entries equal to
 * 0xffff.
 */
337 for (i = 255; i >= 0; i--)
338 if (from[i] != 0xffff) {
/* Link byte value i in front of the current head of bucket k. */
340 to->next[i] = to->first[k];
345 /* Point the empty buckets at an empty list. */
346 for (i = 0; i < 256; i++)
350 for (j = 0; j < 256; j++)
/*
 * wctomb method for table-driven 8-bit charsets: find the byte whose
 * forward-table entry equals the requested character, using the inverse
 * map when there is one and a linear scan of the table otherwise.
 * NOTE(review): unlike mbtowc_8bit() this is not declared static --
 * confirm whether external linkage is intended.
 */
357 int wctomb_8bit(void *map1, char *s, int wc1)
/* The opaque table argument is really a struct map. */
359 struct map *map = map1;
/* Same type as the forward-table entries it is compared against. */
360 unsigned short wc = wc1;
369 if (1) /* Change 1 to 0 to test the case where malloc fails. */
/* The inverse map is built here rather than when the charset is created. */
371 map->to = make_inverse_map(map->from);
374 /* Use the inverse map. */
/* Start at the head of the chain for this character's hash value. */
375 i = map->to->first[HASH(wc)];
377 if (map->from[i] == wc) {
/* Follow the chain; a next value of 0 ends the walk. */
381 if (!(i = map->to->next[i]))
386 /* We don't have an inverse map, so do a linear search. */
387 for (i = 0; i < 256; i++)
388 if (map->from[i] == wc) {
398 * The "constructor" charset_find().
/* Charset object that charset_find() returns for the name "UTF-8". */
401 struct charset charset_utf8 = {
/* Charset object that charset_find() returns for the name "ISO-8859-1". */
408 struct charset charset_iso1 = {
/* Charset object that charset_find() returns for the name "US-ASCII". */
415 struct charset charset_ascii = {
/*
 * Look up a charset object by name. Names are compared with
 * ascii_strcasecmp(), so the match ignores ASCII case.
 * UTF-8, US-ASCII and ISO-8859-1 map to the built-in objects; any other
 * name is looked up in maps[], where the charset object is created on
 * first request and kept in maps[i].charset for later calls.
 */
422 struct charset *charset_find(const char *code)
426 /* Find good (MIME) name. */
427 for (i = 0; names[i].bad; i++)
428 if (!ascii_strcasecmp(code, names[i].bad)) {
/* Continue the lookup under the replacement name. */
429 code = names[i].good;
433 /* Recognise some charsets for which we avoid using a table. */
434 if (!ascii_strcasecmp(code, "UTF-8"))
435 return &charset_utf8;
436 if (!ascii_strcasecmp(code, "US-ASCII"))
437 return &charset_ascii;
438 if (!ascii_strcasecmp(code, "ISO-8859-1"))
439 return &charset_iso1;
441 /* Look for a mapping for a simple 8-bit encoding. */
442 for (i = 0; maps[i].name; i++)
443 if (!ascii_strcasecmp(code, maps[i].name)) {
/* Only build the charset object if an earlier call has not already done so. */
444 if (!maps[i].charset) {
445 maps[i].charset = (struct charset *)malloc(sizeof(struct charset));
446 if (maps[i].charset) {
447 struct map *map = (struct map *)malloc(sizeof(struct map));
/* Release the charset object allocated just above. */
449 free(maps[i].charset);
/* Table-driven charsets here are single-byte: one output byte per character. */
453 maps[i].charset->max = 1;
454 maps[i].charset->mbtowc = &mbtowc_8bit;
455 maps[i].charset->wctomb = &wctomb_8bit;
456 maps[i].charset->map = map;
457 map->from = maps[i].map;
458 map->to = 0; /* inverse mapping is created when required */
462 return maps[i].charset;
469 * Function to convert a buffer from one encoding to another.
470 * Invalid bytes are replaced by '#', and characters that are
471 * not available in the target encoding are replaced by '?'.
472 * Each of TO and TOLEN may be zero, if the result is not needed.
473 * The output buffer is null-terminated, so it is all right to
474 * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
/*
 * Convert fromlen bytes at `from` from the charset named fromcode to
 * the charset named tocode. The result is built in a malloc()ed buffer
 * which is stored through `to`.
 */
477 int charset_convert(const char *fromcode, const char *tocode,
478 const char *from, size_t fromlen,
479 char **to, size_t *tolen)
482 struct charset *charset1, *charset2;
483 char *tobuf, *p, *newbuf;
/* Resolve both charset names; charset_find() accepts aliases from names[]. */
486 charset1 = charset_find(fromcode);
487 charset2 = charset_find(tocode);
488 if (!charset1 || !charset2 )
/*
 * Size the buffer as fromlen times the target charset's max member,
 * plus one byte.
 * NOTE(review): the multiplication is not checked for size_t overflow.
 */
491 tobuf = (char *)malloc(fromlen * charset2->max + 1);
/*
 * Each iteration advances the input by i bytes and the output by j
 * bytes; the loop ends when fromlen reaches zero.
 */
495 for (p = tobuf; fromlen; from += i, fromlen -= i, p += j) {
496 i = charset_mbtowc(charset1, &wc, from, fromlen);
504 j = charset_wctomb(charset2, p, wc);
/* Substitute '?' for the character. */
508 j = charset_wctomb(charset2, p, '?');
/*
 * Resize the buffer to p - tobuf bytes; if realloc() fails the
 * original buffer is still valid and is returned instead.
 */
518 newbuf = realloc(tobuf, p - tobuf);
519 *to = newbuf ? newbuf : tobuf;
527 #endif /* !HAVE_ICONV (should be USE_CHARSET_CONVERT) */