src/share/utf8/charset.c

   1 /*
   2  * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
   3  *
   4  * This program is free software; you can redistribute it and/or modify
   5  * it under the terms of the GNU General Public License as published by
   6  * the Free Software Foundation; either version 2 of the License, or
   7  * (at your option) any later version.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write to the Free Software
  16  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  17  */
  18
  19 /*
  20  * See the corresponding header file for a description of the functions
  21  * that this file provides.
  22  *
  23  * This was first written for Ogg Vorbis but could be of general use.
  24  *
  25  * The only deliberate assumption about data sizes is that a short has
  26  * at least 16 bits, but this code has only been tested on systems with
  27  * 8-bit char, 16-bit short and 32-bit int.
  28  */
  29
  30 #if HAVE_CONFIG_H
  31 #  include <config.h>
  32 #endif
  33
  34 #ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */
  35
  36 #include <stdlib.h>
  37
  38 #include "charset.h"
  39
  40 #include "charmaps.h"
  41
  42 /*
  43  * This is like the standard strcasecmp, but it does not depend
  44  * on the locale. Locale-dependent functions can be dangerous:
  45  * we once had a bug involving strcasecmp("iso", "ISO") in a
  46  * Turkish locale!
  47  *
  48  * (I'm not really sure what the official standard says
  49  * about the sign of strcasecmp("Z", "["), but usually
  50  * we're only interested in whether it's zero.)
  51  */
  52
  53 static int ascii_strcasecmp(const char *s1, const char *s2)
  54 {
  55   char c1, c2;
  56
  57   for (;; s1++, s2++) {
  58     if (!*s1 || !*s1)
  59       break;
  60     if (*s1 == *s2)
  61       continue;
  62     c1 = *s1;
  63     if ('a' <= c1 && c1 <= 'z')
  64       c1 += 'A' - 'a';
  65     c2 = *s2;
  66     if ('a' <= c2 && c2 <= 'z')
  67       c2 += 'A' - 'a';
  68     if (c1 != c2)
  69       break;
  70   }
  71   return (unsigned char)*s1 - (unsigned char)*s2;
  72 }
  73
  74 /*
  75  * UTF-8 equivalents of the C library's wctomb() and mbtowc().
  76  */
  77
  78 int utf8_mbtowc(int *pwc, const char *s, size_t n)
  79 {
  80   unsigned char c;
  81   int wc, i, k;
  82
  83   if (!n || !s)
  84     return 0;
  85
  86   c = *s;
  87   if (c < 0x80) {
  88     if (pwc)
  89       *pwc = c;
  90     return c ? 1 : 0;
  91   }
  92   else if (c < 0xc2)
  93     return -1;
  94   else if (c < 0xe0) {
  95     if (n >= 2 && (s[1] & 0xc0) == 0x80) {
  96       if (pwc)
  97         *pwc = ((c & 0x1f) << 6) | (s[1] & 0x3f);
  98       return 2;
  99     }
 100     else
 101       return -1;
 102   }
 103   else if (c < 0xf0)
 104     k = 3;
 105   else if (c < 0xf8)
 106     k = 4;
 107   else if (c < 0xfc)
 108     k = 5;
 109   else if (c < 0xfe)
 110     k = 6;
 111   else
 112     return -1;
 113
 114   if (n < (size_t)k)
 115     return -1;
 116   wc = *s++ & ((1 << (7 - k)) - 1);
 117   for (i = 1; i < k; i++) {
 118     if ((*s & 0xc0) != 0x80)
 119       return -1;
 120     wc = (wc << 6) | (*s++ & 0x3f);
 121   }
 122   if (wc < (1 << (5 * k - 4)))
 123     return -1;
 124   if (pwc)
 125     *pwc = wc;
 126   return k;
 127 }
 128
 129 int utf8_wctomb(char *s, int wc1)
 130 {
 131   unsigned int wc = wc1;
 132
 133   if (!s)
 134     return 0;
 135   if (wc < (1u << 7)) {
 136     *s++ = wc;
 137     return 1;
 138   }
 139   else if (wc < (1u << 11)) {
 140     *s++ = 0xc0 | (wc >> 6);
 141     *s++ = 0x80 | (wc & 0x3f);
 142     return 2;
 143   }
 144   else if (wc < (1u << 16)) {
 145     *s++ = 0xe0 | (wc >> 12);
 146     *s++ = 0x80 | ((wc >> 6) & 0x3f);
 147     *s++ = 0x80 | (wc & 0x3f);
 148     return 3;
 149   }
 150   else if (wc < (1u << 21)) {
 151     *s++ = 0xf0 | (wc >> 18);
 152     *s++ = 0x80 | ((wc >> 12) & 0x3f);
 153     *s++ = 0x80 | ((wc >> 6) & 0x3f);
 154     *s++ = 0x80 | (wc & 0x3f);
 155     return 4;
 156   }
 157   else if (wc < (1u << 26)) {
 158     *s++ = 0xf8 | (wc >> 24);
 159     *s++ = 0x80 | ((wc >> 18) & 0x3f);
 160     *s++ = 0x80 | ((wc >> 12) & 0x3f);
 161     *s++ = 0x80 | ((wc >> 6) & 0x3f);
 162     *s++ = 0x80 | (wc & 0x3f);
 163     return 5;
 164   }
 165   else if (wc < (1u << 31)) {
 166     *s++ = 0xfc | (wc >> 30);
 167     *s++ = 0x80 | ((wc >> 24) & 0x3f);
 168     *s++ = 0x80 | ((wc >> 18) & 0x3f);
 169     *s++ = 0x80 | ((wc >> 12) & 0x3f);
 170     *s++ = 0x80 | ((wc >> 6) & 0x3f);
 171     *s++ = 0x80 | (wc & 0x3f);
 172     return 6;
 173   }
 174   else
 175     return -1;
 176 }
 177
 178 /*
 179  * The charset "object" and methods.
 180  */
 181
 182 struct charset {
 183   int max;
 184   int (*mbtowc)(void *table, int *pwc, const char *s, size_t n);
 185   int (*wctomb)(void *table, char *s, int wc);
 186   void *map;
 187 };
 188
 189 int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n)
 190 {
 191   return (*charset->mbtowc)(charset->map, pwc, s, n);
 192 }
 193
 194 int charset_wctomb(struct charset *charset, char *s, int wc)
 195 {
 196   return (*charset->wctomb)(charset->map, s, wc);
 197 }
 198
 199 int charset_max(struct charset *charset)
 200 {
 201   return charset->max;
 202 }
 203
 204 /*
 205  * Implementation of UTF-8.
 206  */
 207
 208 static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n)
 209 {
 210   (void)map;
 211   return utf8_mbtowc(pwc, s, n);
 212 }
 213
 214 static int wctomb_utf8(void *map, char *s, int wc)
 215 {
 216   (void)map;
 217   return utf8_wctomb(s, wc);
 218 }
 219
 220 /*
 221  * Implementation of US-ASCII.
 222  * Probably on most architectures this compiles to less than 256 bytes
 223  * of code, so we can save space by not having a table for this one.
 224  */
 225
 226 static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n)
 227 {
 228   int wc;
 229
 230   (void)map;
 231   if (!n || !s)
 232     return 0;
 233   wc = (unsigned char)*s;
 234   if (wc & ~0x7f)
 235     return -1;
 236   if (pwc)
 237     *pwc = wc;
 238   return wc ? 1 : 0;
 239 }
 240
 241 static int wctomb_ascii(void *map, char *s, int wc)
 242 {
 243   (void)map;
 244   if (!s)
 245     return 0;
 246   if (wc & ~0x7f)
 247     return -1;
 248   *s = wc;
 249   return 1;
 250 }
 251
 252 /*
 253  * Implementation of ISO-8859-1.
 254  * Probably on most architectures this compiles to less than 256 bytes
 255  * of code, so we can save space by not having a table for this one.
 256  */
 257
 258 static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n)
 259 {
 260   int wc;
 261
 262   (void)map;
 263   if (!n || !s)
 264     return 0;
 265   wc = (unsigned char)*s;
 266   if (wc & ~0xff)
 267     return -1;
 268   if (pwc)
 269     *pwc = wc;
 270   return wc ? 1 : 0;
 271 }
 272
 273 static int wctomb_iso1(void *map, char *s, int wc)
 274 {
 275   (void)map;
 276   if (!s)
 277     return 0;
 278   if (wc & ~0xff)
 279     return -1;
 280   *s = wc;
 281   return 1;
 282 }
 283
 284 /*
 285  * Implementation of any 8-bit charset.
 286  */
 287
 288 struct map {
 289   const unsigned short *from;
 290   struct inverse_map *to;
 291 };
 292
 293 static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n)
 294 {
 295   struct map *map = map1;
 296   unsigned short wc;
 297
 298   if (!n || !s)
 299     return 0;
 300   wc = map->from[(unsigned char)*s];
 301   if (wc == 0xffff)
 302     return -1;
 303   if (pwc)
 304     *pwc = (int)wc;
 305   return wc ? 1 : 0;
 306 }
 307
 308 /*
 309  * For the inverse map we use a hash table, which has the advantages
 310  * of small constant memory requirement and simple memory allocation,
 311  * but the disadvantage of slow conversion in the worst case.
 312  * If you need real-time performance while letting a potentially
 313  * malicious user define their own map, then the method used in
 314  * linux/drivers/char/consolemap.c would be more appropriate.
 315  */
 316
 317 struct inverse_map {
 318   unsigned char first[256];
 319   unsigned char next[256];
 320 };
 321
 322 /*
 323  * The simple hash is good enough for this application.
 324  * Use the alternative trivial hashes for testing.
 325  */
 326 #define HASH(i) ((i) & 0xff)
 327 /* #define HASH(i) 0 */
 328 /* #define HASH(i) 99 */
 329
 330 static struct inverse_map *make_inverse_map(const unsigned short *from)
 331 {
 332   struct inverse_map *to;
 333   char used[256];
 334   int i, j, k;
 335
 336   to = (struct inverse_map *)malloc(sizeof(struct inverse_map));
 337   if (!to)
 338     return 0;
 339   for (i = 0; i < 256; i++)
 340     to->first[i] = to->next[i] = used[i] = 0;
 341   for (i = 255; i >= 0; i--)
 342     if (from[i] != 0xffff) {
 343       k = HASH(from[i]);
 344       to->next[i] = to->first[k];
 345       to->first[k] = i;
 346       used[k] = 1;
 347     }
 348
 349   /* Point the empty buckets at an empty list. */
 350   for (i = 0; i < 256; i++)
 351     if (!to->next[i])
 352       break;
 353   if (i < 256)
 354     for (j = 0; j < 256; j++)
 355       if (!used[j])
 356         to->first[j] = i;
 357
 358   return to;
 359 }
 360
 361 int wctomb_8bit(void *map1, char *s, int wc1)
 362 {
 363   struct map *map = map1;
 364   unsigned short wc = wc1;
 365   int i;
 366
 367   if (!s)
 368     return 0;
 369
 370   if (wc1 & ~0xffff)
 371     return -1;
 372
 373   if (1) /* Change 1 to 0 to test the case where malloc fails. */
 374     if (!map->to)
 375       map->to = make_inverse_map(map->from);
 376
 377   if (map->to) {
 378     /* Use the inverse map. */
 379     i = map->to->first[HASH(wc)];
 380     for (;;) {
 381       if (map->from[i] == wc) {
 382         *s = i;
 383         return 1;
 384       }
 385       if (!(i = map->to->next[i]))
 386         break;
 387     }
 388   }
 389   else {
 390     /* We don't have an inverse map, so do a linear search. */
 391     for (i = 0; i < 256; i++)
 392       if (map->from[i] == wc) {
 393         *s = i;
 394         return 1;
 395       }
 396   }
 397
 398   return -1;
 399 }
 400
 401 /*
 402  * The "constructor" charset_find().
 403  */
 404
 405 struct charset charset_utf8 = {
 406   6,
 407   &mbtowc_utf8,
 408   &wctomb_utf8,
 409   0
 410 };
 411
 412 struct charset charset_iso1 = {
 413   1,
 414   &mbtowc_iso1,
 415   &wctomb_iso1,
 416   0
 417 };
 418
 419 struct charset charset_ascii = {
 420   1,
 421   &mbtowc_ascii,
 422   &wctomb_ascii,
 423   0
 424 };
 425
 426 struct charset *charset_find(const char *code)
 427 {
 428   int i;
 429
 430   /* Find good (MIME) name. */
 431   for (i = 0; names[i].bad; i++)
 432     if (!ascii_strcasecmp(code, names[i].bad)) {
 433       code = names[i].good;
 434       break;
 435     }
 436
 437   /* Recognise some charsets for which we avoid using a table. */
 438   if (!ascii_strcasecmp(code, "UTF-8"))
 439     return &charset_utf8;
 440   if (!ascii_strcasecmp(code, "US-ASCII"))
 441     return &charset_ascii;
 442   if (!ascii_strcasecmp(code, "ISO-8859-1"))
 443     return &charset_iso1;
 444
 445   /* Look for a mapping for a simple 8-bit encoding. */
 446   for (i = 0; maps[i].name; i++)
 447     if (!ascii_strcasecmp(code, maps[i].name)) {
 448       if (!maps[i].charset) {
 449         maps[i].charset = (struct charset *)malloc(sizeof(struct charset));
 450         if (maps[i].charset) {
 451           struct map *map = (struct map *)malloc(sizeof(struct map));
 452           if (!map) {
 453             free(maps[i].charset);
 454             maps[i].charset = 0;
 455           }
 456           else {
 457             maps[i].charset->max = 1;
 458             maps[i].charset->mbtowc = &mbtowc_8bit;
 459             maps[i].charset->wctomb = &wctomb_8bit;
 460             maps[i].charset->map = map;
 461             map->from = maps[i].map;
 462             map->to = 0; /* inverse mapping is created when required */
 463           }
 464         }
 465       }
 466       return maps[i].charset;
 467     }
 468
 469   return 0;
 470 }
 471
 472 /*
 473  * Function to convert a buffer from one encoding to another.
 474  * Invalid bytes are replaced by '#', and characters that are
 475  * not available in the target encoding are replaced by '?'.
 476  * Each of TO and TOLEN may be zero, if the result is not needed.
 477  * The output buffer is null-terminated, so it is all right to
 478  * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
 479  */
 480
 481 int charset_convert(const char *fromcode, const char *tocode,
 482                     const char *from, size_t fromlen,
 483                     char **to, size_t *tolen)
 484 {
 485   int ret = 0;
 486   struct charset *charset1, *charset2;
 487   char *tobuf, *p, *newbuf;
 488   int i, j, wc;
 489
 490   charset1 = charset_find(fromcode);
 491   charset2 = charset_find(tocode);
 492   if (!charset1 || !charset2 )
 493     return -1;
 494
 495   tobuf = (char *)malloc(fromlen * charset2->max + 1);
 496   if (!tobuf)
 497     return -2;
 498
 499   for (p = tobuf; fromlen; from += i, fromlen -= i, p += j) {
 500     i = charset_mbtowc(charset1, &wc, from, fromlen);
 501     if (!i)
 502       i = 1;
 503     else if (i == -1) {
 504       i  = 1;
 505       wc = '#';
 506       ret = 2;
 507     }
 508     j = charset_wctomb(charset2, p, wc);
 509     if (j == -1) {
 510       if (!ret)
 511         ret = 1;
 512       j = charset_wctomb(charset2, p, '?');
 513       if (j == -1)
 514         j = 0;
 515     }
 516   }
 517
 518   if (tolen)
 519     *tolen = p - tobuf;
 520   *p++ = '\0';
 521   if (to) {
 522     newbuf = realloc(tobuf, p - tobuf);
 523     *to = newbuf ? newbuf : tobuf;
 524   }
 525   else
 526     free(tobuf);
 527
 528   return ret;
 529 }
 530
 531 #endif /* USE_CHARSET_ICONV */