src/share/utf8/charset.c

   1 /*
   2  * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
   3  *
   4  * This program is free software; you can redistribute it and/or modify
   5  * it under the terms of the GNU General Public License as published by
   6  * the Free Software Foundation; either version 2 of the License, or
   7  * (at your option) any later version.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write to the Free Software
  16  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  17  */
  18
  19 /*
  20  * See the corresponding header file for a description of the functions
  21  * that this file provides.
  22  *
  23  * This was first written for Ogg Vorbis but could be of general use.
  24  *
  25  * The only deliberate assumption about data sizes is that a short has
  26  * at least 16 bits, but this code has only been tested on systems with
  27  * 8-bit char, 16-bit short and 32-bit int.
  28  */
  29
  30 #ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */
  31
  32 #include <stdlib.h>
  33
  34 #include "charset.h"
  35
  36 #include "charmaps.h"
  37
  /*
   * Locale-independent replacement for the standard strcasecmp().
   * Only the ASCII letters 'a'..'z' are case-folded, so the result
   * never changes with the current locale. Locale-dependent functions
   * can be dangerous: we once had a bug involving
   * strcasecmp("iso", "ISO") in a Turkish locale!
   *
   * Returns zero if the two strings are equal ignoring ASCII case.
   * Otherwise the sign is that of the difference between the first
   * pair of differing characters after both have been folded to
   * upper case, so the ordering agrees with the equality test
   * (for example "a" compares less than "B").
   */
  static int ascii_strcasecmp(const char *s1, const char *s2)
  {
    unsigned char c1, c2;

    for (;; s1++, s2++) {
      c1 = (unsigned char)*s1;
      c2 = (unsigned char)*s2;
      if ('a' <= c1 && c1 <= 'z')
        c1 = (unsigned char)(c1 - 'a' + 'A');
      if ('a' <= c2 && c2 <= 'z')
        c2 = (unsigned char)(c2 - 'a' + 'A');
      /* Stop at the first mismatch, or when both strings end together. */
      if (c1 != c2 || !c1)
        break;
    }
    return (int)c1 - (int)c2;
  }
  69
  70 /*
  71  * UTF-8 equivalents of the C library's wctomb() and mbtowc().
  72  */
  73
  /*
   * Decode one UTF-8 sequence of at most N bytes starting at S.
   *
   * On success the code point is stored through PWC (when PWC is not
   * null) and the number of bytes consumed (1..6) is returned. A NUL
   * byte is stored as code point 0 and reported as 0, as is an empty
   * or null input. Truncated, malformed and overlong sequences
   * yield -1 and leave *PWC untouched.
   */
  int utf8_mbtowc(int *pwc, const char *s, size_t n)
  {
    /* Smallest code point that really needs 2, 3, 4, 5 or 6 bytes. */
    static const int min_value[] = {
      0x80, 0x800, 0x10000, 0x200000, 0x4000000
    };
    const unsigned char *p = (const unsigned char *)s;
    int len, value, idx;

    if (!s || !n)
      return 0;

    /* Plain ASCII occupies a single byte. */
    if (p[0] < 0x80) {
      if (pwc)
        *pwc = p[0];
      return p[0] != 0;
    }

    /*
     * Work out the sequence length from the lead byte. 0x80..0xbf are
     * continuation bytes and 0xc0/0xc1 could only start an overlong
     * two-byte form, so none of them may begin a character.
     */
    if (p[0] < 0xc2)
      return -1;
    else if (p[0] < 0xe0)
      len = 2;
    else if (p[0] < 0xf0)
      len = 3;
    else if (p[0] < 0xf8)
      len = 4;
    else if (p[0] < 0xfc)
      len = 5;
    else if (p[0] < 0xfe)
      len = 6;
    else
      return -1;

    if (n < (size_t)len)
      return -1;

    /* The lead byte contributes its low (7 - len) bits. */
    value = p[0] & (0x7f >> len);
    for (idx = 1; idx < len; idx++) {
      if ((p[idx] & 0xc0) != 0x80)
        return -1;
      value = (value << 6) | (p[idx] & 0x3f);
    }

    /* Refuse encodings longer than the code point requires. */
    if (value < min_value[len - 2])
      return -1;

    if (pwc)
      *pwc = value;
    return len;
  }
 124
 /*
  * Encode the code point WC1 as UTF-8 into the buffer S, which must
  * have room for up to 6 bytes. No terminating NUL is written.
  *
  * Returns the number of bytes stored (1..6), 0 if S is null, or -1
  * if WC1 is negative and therefore cannot be represented in 31 bits.
  */
 int utf8_wctomb(char *s, int wc1)
 {
   unsigned int code = (unsigned int)wc1;
   unsigned int lead;
   int len, pos;

   if (!s)
     return 0;

   /* Seven bits or fewer: the byte is the code point itself. */
   if (code < 0x80u) {
     s[0] = (char)code;
     return 1;
   }

   /* Choose the sequence length and the marker bits of its lead byte. */
   if (code < 0x800u) {
     len = 2;
     lead = 0xc0;
   }
   else if (code < 0x10000u) {
     len = 3;
     lead = 0xe0;
   }
   else if (code < 0x200000u) {
     len = 4;
     lead = 0xf0;
   }
   else if (code < 0x4000000u) {
     len = 5;
     lead = 0xf8;
   }
   else if (code < 0x80000000u) {
     len = 6;
     lead = 0xfc;
   }
   else
     return -1;

   /* Fill the continuation bytes from the end, six bits at a time. */
   for (pos = len - 1; pos > 0; pos--) {
     s[pos] = (char)(0x80 | (code & 0x3f));
     code >>= 6;
   }
   /* Whatever is left fits beside the marker bits of the lead byte. */
   s[0] = (char)(lead | code);
   return len;
 }
 173
 174 /*
 175  * The charset "object" and methods.
 176  */
 177
 /*
  * Descriptor of one character encoding: a decoder, an encoder and the
  * private data both of them are handed as their first argument.
  */
 struct charset {
   /* Largest number of bytes a single character may occupy. */
   int max;
   /* Decode one character from S (at most N bytes) into *PWC. */
   int (*mbtowc)(void *map, int *pwc, const char *s, size_t n);
   /* Encode the character WC into the buffer S. */
   int (*wctomb)(void *map, char *s, int wc);
   /* Private data for the two callbacks; 0 when they need none. */
   void *map;
 };

 /* Decode one character by forwarding to the charset's own decoder. */
 int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n)
 {
   return charset->mbtowc(charset->map, pwc, s, n);
 }

 /* Encode one character by forwarding to the charset's own encoder. */
 int charset_wctomb(struct charset *charset, char *s, int wc)
 {
   return charset->wctomb(charset->map, s, wc);
 }

 /* Report the charset's maximum number of bytes per character. */
 int charset_max(struct charset *charset)
 {
   return charset->max;
 }
 199
 200 /*
 201  * Implementation of UTF-8.
 202  */
 203
 /* Decoder callback for UTF-8; the encoding needs no table, so MAP is unused. */
 static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n)
 {
   int consumed = utf8_mbtowc(pwc, s, n);

   (void)map;
   return consumed;
 }
 209
 /* Encoder callback for UTF-8; the encoding needs no table, so MAP is unused. */
 static int wctomb_utf8(void *map, char *s, int wc)
 {
   int written = utf8_wctomb(s, wc);

   (void)map;
   return written;
 }
 215
 216 /*
 217  * Implementation of US-ASCII.
 218  * Probably on most architectures this compiles to less than 256 bytes
 219  * of code, so we can save space by not having a table for this one.
 220  */
 221
 /*
  * Decode one US-ASCII character from S. MAP is unused.
  * Returns 1 after storing the character through PWC (if non-null),
  * 0 for a NUL byte or when there is no input, and -1 for a byte
  * with the high bit set.
  */
 static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n)
 {
   int value;

   (void)map;
   if (!s || !n)
     return 0;

   value = (unsigned char)*s;
   if (value > 0x7f)
     return -1;

   if (pwc)
     *pwc = value;
   return value != 0;
 }
 236
 /*
  * Encode WC as one US-ASCII byte in S. MAP is unused.
  * Returns 1 on success, 0 if S is null, and -1 if WC does not fit
  * in seven bits.
  */
 static int wctomb_ascii(void *map, char *s, int wc)
 {
   (void)map;
   if (!s)
     return 0;
   /* Any bit above the low seven puts WC outside US-ASCII. */
   if ((wc & 0x7f) != wc)
     return -1;
   *s = (char)wc;
   return 1;
 }
 247
 248 /*
 249  * Implementation of ISO-8859-1.
 250  * Probably on most architectures this compiles to less than 256 bytes
 251  * of code, so we can save space by not having a table for this one.
 252  */
 253
 /*
  * Decode one ISO-8859-1 character from S: the byte value is used
  * directly as the code point. MAP is unused.
  * Returns 1 after storing the character through PWC (if non-null),
  * 0 for a NUL byte or when there is no input, and -1 if the value
  * does not fit in eight bits.
  */
 static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n)
 {
   int value;

   (void)map;
   if (!s || !n)
     return 0;

   value = (unsigned char)*s;
   if (value > 0xff)
     return -1;

   if (pwc)
     *pwc = value;
   return value != 0;
 }
 268
 /*
  * Encode WC as one ISO-8859-1 byte in S. MAP is unused.
  * Returns 1 on success, 0 if S is null, and -1 if WC does not fit
  * in eight bits.
  */
 static int wctomb_iso1(void *map, char *s, int wc)
 {
   (void)map;
   if (!s)
     return 0;
   /* Any bit above the low eight puts WC outside ISO-8859-1. */
   if ((wc & 0xff) != wc)
     return -1;
   *s = (char)wc;
   return 1;
 }
 279
 280 /*
 281  * Implementation of any 8-bit charset.
 282  */
 283
 /* Private data of a table-driven 8-bit charset. */
 struct map {
   /* Code point for each of the 256 byte values; 0xffff marks a byte with no mapping. */
   const unsigned short *from;
   /* Reverse lookup used by the encoder; 0 until it has been built. */
   struct inverse_map *to;
 };

 /*
  * Decode one character of a table-driven 8-bit charset; MAP1 points
  * to the charset's struct map.
  * Returns 1 after storing the code point through PWC (if non-null),
  * 0 for a byte that maps to code point 0 or when there is no input,
  * and -1 for a byte the table marks as unmapped.
  */
 static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n)
 {
   const struct map *table = map1;
   unsigned short code;

   if (!s || !n)
     return 0;

   code = table->from[(unsigned char)*s];
   if (code == 0xffff)
     return -1;

   if (pwc)
     *pwc = code;
   return code != 0;
 }
 303
 304 /*
 305  * For the inverse map we use a hash table, which has the advantages
 306  * of small constant memory requirement and simple memory allocation,
 307  * but the disadvantage of slow conversion in the worst case.
 308  * If you need real-time performance while letting a potentially
 309  * malicious user define their own map, then the method used in
 310  * linux/drivers/char/consolemap.c would be more appropriate.
 311  */
 312
 /*
  * Chained hash table over the 256 byte values of an 8-bit charset,
  * keyed by the hash of the code point each byte decodes to.
  */
 struct inverse_map {
   /* For each hash bucket, the byte value that heads its chain. */
   unsigned char first[256];
   /* For each byte value, the next one in the same chain; 0 ends the chain. */
   unsigned char next[256];
 };

 /*
  * The simple hash is good enough for this application.
  * Use the alternative trivial hashes for testing.
  */
 #define HASH(i) ((i) & 0xff)
 /* #define HASH(i) 0 */
 /* #define HASH(i) 99 */

 /*
  * Build the reverse lookup table for the forward table FROM, which
  * holds one code point per byte value with 0xffff meaning "unmapped".
  * Returns a malloc()ed table, or 0 if the allocation fails.
  */
 static struct inverse_map *make_inverse_map(const unsigned short *from)
 {
   struct inverse_map *inv;
   char occupied[256];
   int idx, bucket, tail;

   inv = malloc(sizeof *inv);
   if (!inv)
     return 0;

   for (idx = 0; idx < 256; idx++) {
     inv->first[idx] = 0;
     inv->next[idx] = 0;
     occupied[idx] = 0;
   }

   /*
    * Push every mapped byte onto the front of its bucket, walking from
    * 255 down to 0. Byte 0 is therefore inserted last and can only
    * ever be the head of a chain, never a successor, which is what
    * lets 0 double as the end-of-chain marker in next[].
    */
   for (idx = 255; idx >= 0; idx--) {
     if (from[idx] == 0xffff)
       continue;
     bucket = HASH(from[idx]);
     inv->next[idx] = inv->first[bucket];
     inv->first[bucket] = (unsigned char)idx;
     occupied[bucket] = 1;
   }

   /*
    * Point the empty buckets at an empty list: find the first byte
    * that has no successor and make every unused bucket start there,
    * so a lookup in such a bucket inspects a single entry.
    */
   tail = 0;
   while (tail < 256 && inv->next[tail])
     tail++;
   if (tail < 256) {
     for (bucket = 0; bucket < 256; bucket++)
       if (!occupied[bucket])
         inv->first[bucket] = (unsigned char)tail;
   }

   return inv;
 }
 356
 357 int wctomb_8bit(void *map1, char *s, int wc1)
 358 {
 359   struct map *map = map1;
 360   unsigned short wc = wc1;
 361   int i;
 362
 363   if (!s)
 364     return 0;
 365
 366   if (wc1 & ~0xffff)
 367     return -1;
 368
 369   if (1) /* Change 1 to 0 to test the case where malloc fails. */
 370     if (!map->to)
 371       map->to = make_inverse_map(map->from);
 372
 373   if (map->to) {
 374     /* Use the inverse map. */
 375     i = map->to->first[HASH(wc)];
 376     for (;;) {
 377       if (map->from[i] == wc) {
 378         *s = i;
 379         return 1;
 380       }
 381       if (!(i = map->to->next[i]))
 382         break;
 383     }
 384   }
 385   else {
 386     /* We don't have an inverse map, so do a linear search. */
 387     for (i = 0; i < 256; i++)
 388       if (map->from[i] == wc) {
 389         *s = i;
 390         return 1;
 391       }
 392   }
 393
 394   return -1;
 395 }
 396
 397 /*
 398  * The "constructor" charset_find().
 399  */
 400
 401 struct charset charset_utf8 = {
 402   6,
 403   &mbtowc_utf8,
 404   &wctomb_utf8,
 405   0
 406 };
 407
 408 struct charset charset_iso1 = {
 409   1,
 410   &mbtowc_iso1,
 411   &wctomb_iso1,
 412   0
 413 };
 414
 415 struct charset charset_ascii = {
 416   1,
 417   &mbtowc_ascii,
 418   &wctomb_ascii,
 419   0
 420 };
 421
 422 struct charset *charset_find(const char *code)
 423 {
 424   int i;
 425
 426   /* Find good (MIME) name. */
 427   for (i = 0; names[i].bad; i++)
 428     if (!ascii_strcasecmp(code, names[i].bad)) {
 429       code = names[i].good;
 430       break;
 431     }
 432
 433   /* Recognise some charsets for which we avoid using a table. */
 434   if (!ascii_strcasecmp(code, "UTF-8"))
 435     return &charset_utf8;
 436   if (!ascii_strcasecmp(code, "US-ASCII"))
 437     return &charset_ascii;
 438   if (!ascii_strcasecmp(code, "ISO-8859-1"))
 439     return &charset_iso1;
 440
 441   /* Look for a mapping for a simple 8-bit encoding. */
 442   for (i = 0; maps[i].name; i++)
 443     if (!ascii_strcasecmp(code, maps[i].name)) {
 444       if (!maps[i].charset) {
 445         maps[i].charset = (struct charset *)malloc(sizeof(struct charset));
 446         if (maps[i].charset) {
 447           struct map *map = (struct map *)malloc(sizeof(struct map));
 448           if (!map) {
 449             free(maps[i].charset);
 450             maps[i].charset = 0;
 451           }
 452           else {
 453             maps[i].charset->max = 1;
 454             maps[i].charset->mbtowc = &mbtowc_8bit;
 455             maps[i].charset->wctomb = &wctomb_8bit;
 456             maps[i].charset->map = map;
 457             map->from = maps[i].map;
 458             map->to = 0; /* inverse mapping is created when required */
 459           }
 460         }
 461       }
 462       return maps[i].charset;
 463     }
 464
 465   return 0;
 466 }
 467
 468 /*
 469  * Function to convert a buffer from one encoding to another.
 470  * Invalid bytes are replaced by '#', and characters that are
 471  * not available in the target encoding are replaced by '?'.
 472  * Each of TO and TOLEN may be zero, if the result is not needed.
 473  * The output buffer is null-terminated, so it is all right to
 474  * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
 475  */
 476
 477 int charset_convert(const char *fromcode, const char *tocode,
 478                     const char *from, size_t fromlen,
 479                     char **to, size_t *tolen)
 480 {
 481   int ret = 0;
 482   struct charset *charset1, *charset2;
 483   char *tobuf, *p, *newbuf;
 484   int i, j, wc;
 485
 486   charset1 = charset_find(fromcode);
 487   charset2 = charset_find(tocode);
 488   if (!charset1 || !charset2 )
 489     return -1;
 490
 491   tobuf = (char *)malloc(fromlen * charset2->max + 1);
 492   if (!tobuf)
 493     return -2;
 494
 495   for (p = tobuf; fromlen; from += i, fromlen -= i, p += j) {
 496     i = charset_mbtowc(charset1, &wc, from, fromlen);
 497     if (!i)
 498       i = 1;
 499     else if (i == -1) {
 500       i  = 1;
 501       wc = '#';
 502       ret = 2;
 503     }
 504     j = charset_wctomb(charset2, p, wc);
 505     if (j == -1) {
 506       if (!ret)
 507         ret = 1;
 508       j = charset_wctomb(charset2, p, '?');
 509       if (j == -1)
 510         j = 0;
 511     }
 512   }
 513
 514   if (tolen)
 515     *tolen = p - tobuf;
 516   *p++ = '\0';
 517   if (to) {
 518     newbuf = realloc(tobuf, p - tobuf);
 519     *to = newbuf ? newbuf : tobuf;
 520   }
 521   else
 522     free(tobuf);
 523
 524   return ret;
 525 }
 526
 527 #endif /* USE_CHARSET_ICONV */