src/share/utf8/charset.c

   1 /*
   2  * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
   3  *
   4  * This program is free software; you can redistribute it and/or modify
   5  * it under the terms of the GNU General Public License as published by
   6  * the Free Software Foundation; either version 2 of the License, or
   7  * (at your option) any later version.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License along
  15  * with this program; if not, write to the Free Software Foundation, Inc.,
  16  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  17  */
  18
  19 /*
  20  * See the corresponding header file for a description of the functions
  21  * that this file provides.
  22  *
  23  * This was first written for Ogg Vorbis but could be of general use.
  24  *
  25  * The only deliberate assumption about data sizes is that a short has
  26  * at least 16 bits, but this code has only been tested on systems with
  27  * 8-bit char, 16-bit short and 32-bit int.
  28  */
  29
  30 #if HAVE_CONFIG_H
  31 #  include <config.h>
  32 #endif
  33
  34 #ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */
  35
  36 #include <stdlib.h>
  37
  38 #include "share/alloc.h"
  39 #include "charset.h"
  40
  41 #include "charmaps.h"
  42
  43 /*
  44  * This is like the standard strcasecmp, but it does not depend
  45  * on the locale. Locale-dependent functions can be dangerous:
  46  * we once had a bug involving strcasecmp("iso", "ISO") in a
  47  * Turkish locale!
  48  *
  49  * (I'm not really sure what the official standard says
  50  * about the sign of strcasecmp("Z", "["), but usually
  51  * we're only interested in whether it's zero.)
  52  */
  53
  54 static int ascii_strcasecmp(const char *s1, const char *s2)
  55 {
  56   char c1, c2;
  57
  58   for (;; s1++, s2++) {
  59     if (!*s1 || !*s1)
  60       break;
  61     if (*s1 == *s2)
  62       continue;
  63     c1 = *s1;
  64     if ('a' <= c1 && c1 <= 'z')
  65       c1 += 'A' - 'a';
  66     c2 = *s2;
  67     if ('a' <= c2 && c2 <= 'z')
  68       c2 += 'A' - 'a';
  69     if (c1 != c2)
  70       break;
  71   }
  72   return (unsigned char)*s1 - (unsigned char)*s2;
  73 }
  74
  75 /*
  76  * UTF-8 equivalents of the C library's wctomb() and mbtowc().
  77  */
  78
  79 int utf8_mbtowc(int *pwc, const char *s, size_t n)
  80 {
  81   unsigned char c;
  82   int wc, i, k;
  83
  84   if (!n || !s)
  85     return 0;
  86
  87   c = *s;
  88   if (c < 0x80) {
  89     if (pwc)
  90       *pwc = c;
  91     return c ? 1 : 0;
  92   }
  93   else if (c < 0xc2)
  94     return -1;
  95   else if (c < 0xe0) {
  96     if (n >= 2 && (s[1] & 0xc0) == 0x80) {
  97       if (pwc)
  98         *pwc = ((c & 0x1f) << 6) | (s[1] & 0x3f);
  99       return 2;
 100     }
 101     else
 102       return -1;
 103   }
 104   else if (c < 0xf0)
 105     k = 3;
 106   else if (c < 0xf8)
 107     k = 4;
 108   else if (c < 0xfc)
 109     k = 5;
 110   else if (c < 0xfe)
 111     k = 6;
 112   else
 113     return -1;
 114
 115   if (n < (size_t)k)
 116     return -1;
 117   wc = *s++ & ((1 << (7 - k)) - 1);
 118   for (i = 1; i < k; i++) {
 119     if ((*s & 0xc0) != 0x80)
 120       return -1;
 121     wc = (wc << 6) | (*s++ & 0x3f);
 122   }
 123   if (wc < (1 << (5 * k - 4)))
 124     return -1;
 125   if (pwc)
 126     *pwc = wc;
 127   return k;
 128 }
 129
 130 int utf8_wctomb(char *s, int wc1)
 131 {
 132   unsigned int wc = wc1;
 133
 134   if (!s)
 135     return 0;
 136   if (wc < (1u << 7)) {
 137     *s++ = wc;
 138     return 1;
 139   }
 140   else if (wc < (1u << 11)) {
 141     *s++ = 0xc0 | (wc >> 6);
 142     *s++ = 0x80 | (wc & 0x3f);
 143     return 2;
 144   }
 145   else if (wc < (1u << 16)) {
 146     *s++ = 0xe0 | (wc >> 12);
 147     *s++ = 0x80 | ((wc >> 6) & 0x3f);
 148     *s++ = 0x80 | (wc & 0x3f);
 149     return 3;
 150   }
 151   else if (wc < (1u << 21)) {
 152     *s++ = 0xf0 | (wc >> 18);
 153     *s++ = 0x80 | ((wc >> 12) & 0x3f);
 154     *s++ = 0x80 | ((wc >> 6) & 0x3f);
 155     *s++ = 0x80 | (wc & 0x3f);
 156     return 4;
 157   }
 158   else if (wc < (1u << 26)) {
 159     *s++ = 0xf8 | (wc >> 24);
 160     *s++ = 0x80 | ((wc >> 18) & 0x3f);
 161     *s++ = 0x80 | ((wc >> 12) & 0x3f);
 162     *s++ = 0x80 | ((wc >> 6) & 0x3f);
 163     *s++ = 0x80 | (wc & 0x3f);
 164     return 5;
 165   }
 166   else if (wc < (1u << 31)) {
 167     *s++ = 0xfc | (wc >> 30);
 168     *s++ = 0x80 | ((wc >> 24) & 0x3f);
 169     *s++ = 0x80 | ((wc >> 18) & 0x3f);
 170     *s++ = 0x80 | ((wc >> 12) & 0x3f);
 171     *s++ = 0x80 | ((wc >> 6) & 0x3f);
 172     *s++ = 0x80 | (wc & 0x3f);
 173     return 6;
 174   }
 175   else
 176     return -1;
 177 }
 178
 179 /*
 180  * The charset "object" and methods.
 181  */
 182
 183 struct charset {
 184   int max;
 185   int (*mbtowc)(void *table, int *pwc, const char *s, size_t n);
 186   int (*wctomb)(void *table, char *s, int wc);
 187   void *map;
 188 };
 189
 190 int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n)
 191 {
 192   return (*charset->mbtowc)(charset->map, pwc, s, n);
 193 }
 194
 195 int charset_wctomb(struct charset *charset, char *s, int wc)
 196 {
 197   return (*charset->wctomb)(charset->map, s, wc);
 198 }
 199
 200 int charset_max(struct charset *charset)
 201 {
 202   return charset->max;
 203 }
 204
 205 /*
 206  * Implementation of UTF-8.
 207  */
 208
 209 static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n)
 210 {
 211   (void)map;
 212   return utf8_mbtowc(pwc, s, n);
 213 }
 214
 215 static int wctomb_utf8(void *map, char *s, int wc)
 216 {
 217   (void)map;
 218   return utf8_wctomb(s, wc);
 219 }
 220
 221 /*
 222  * Implementation of US-ASCII.
 223  * Probably on most architectures this compiles to less than 256 bytes
 224  * of code, so we can save space by not having a table for this one.
 225  */
 226
 227 static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n)
 228 {
 229   int wc;
 230
 231   (void)map;
 232   if (!n || !s)
 233     return 0;
 234   wc = (unsigned char)*s;
 235   if (wc & ~0x7f)
 236     return -1;
 237   if (pwc)
 238     *pwc = wc;
 239   return wc ? 1 : 0;
 240 }
 241
 242 static int wctomb_ascii(void *map, char *s, int wc)
 243 {
 244   (void)map;
 245   if (!s)
 246     return 0;
 247   if (wc & ~0x7f)
 248     return -1;
 249   *s = wc;
 250   return 1;
 251 }
 252
 253 /*
 254  * Implementation of ISO-8859-1.
 255  * Probably on most architectures this compiles to less than 256 bytes
 256  * of code, so we can save space by not having a table for this one.
 257  */
 258
 259 static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n)
 260 {
 261   int wc;
 262
 263   (void)map;
 264   if (!n || !s)
 265     return 0;
 266   wc = (unsigned char)*s;
 267   if (wc & ~0xff)
 268     return -1;
 269   if (pwc)
 270     *pwc = wc;
 271   return wc ? 1 : 0;
 272 }
 273
 274 static int wctomb_iso1(void *map, char *s, int wc)
 275 {
 276   (void)map;
 277   if (!s)
 278     return 0;
 279   if (wc & ~0xff)
 280     return -1;
 281   *s = wc;
 282   return 1;
 283 }
 284
 285 /*
 286  * Implementation of any 8-bit charset.
 287  */
 288
 289 struct map {
 290   const unsigned short *from;
 291   struct inverse_map *to;
 292 };
 293
 294 static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n)
 295 {
 296   struct map *map = map1;
 297   unsigned short wc;
 298
 299   if (!n || !s)
 300     return 0;
 301   wc = map->from[(unsigned char)*s];
 302   if (wc == 0xffff)
 303     return -1;
 304   if (pwc)
 305     *pwc = (int)wc;
 306   return wc ? 1 : 0;
 307 }
 308
 309 /*
 310  * For the inverse map we use a hash table, which has the advantages
 311  * of small constant memory requirement and simple memory allocation,
 312  * but the disadvantage of slow conversion in the worst case.
 313  * If you need real-time performance while letting a potentially
 314  * malicious user define their own map, then the method used in
 315  * linux/drivers/char/consolemap.c would be more appropriate.
 316  */
 317
 318 struct inverse_map {
 319   unsigned char first[256];
 320   unsigned char next[256];
 321 };
 322
 323 /*
 324  * The simple hash is good enough for this application.
 325  * Use the alternative trivial hashes for testing.
 326  */
 327 #define HASH(i) ((i) & 0xff)
 328 /* #define HASH(i) 0 */
 329 /* #define HASH(i) 99 */
 330
 331 static struct inverse_map *make_inverse_map(const unsigned short *from)
 332 {
 333   struct inverse_map *to;
 334   char used[256];
 335   int i, j, k;
 336
 337   to = malloc(sizeof(struct inverse_map));
 338   if (!to)
 339     return 0;
 340   for (i = 0; i < 256; i++)
 341     to->first[i] = to->next[i] = used[i] = 0;
 342   for (i = 255; i >= 0; i--)
 343     if (from[i] != 0xffff) {
 344       k = HASH(from[i]);
 345       to->next[i] = to->first[k];
 346       to->first[k] = i;
 347       used[k] = 1;
 348     }
 349
 350   /* Point the empty buckets at an empty list. */
 351   for (i = 0; i < 256; i++)
 352     if (!to->next[i])
 353       break;
 354   if (i < 256)
 355     for (j = 0; j < 256; j++)
 356       if (!used[j])
 357         to->first[j] = i;
 358
 359   return to;
 360 }
 361
 362 static int wctomb_8bit(void *map1, char *s, int wc1)
 363 {
 364   struct map *map = map1;
 365   unsigned short wc = wc1;
 366   int i;
 367
 368   if (!s)
 369     return 0;
 370
 371   if (wc1 & ~0xffff)
 372     return -1;
 373
 374   if (1) /* Change 1 to 0 to test the case where malloc fails. */
 375     if (!map->to)
 376       map->to = make_inverse_map(map->from);
 377
 378   if (map->to) {
 379     /* Use the inverse map. */
 380     i = map->to->first[HASH(wc)];
 381     for (;;) {
 382       if (map->from[i] == wc) {
 383         *s = i;
 384         return 1;
 385       }
 386       if (!(i = map->to->next[i]))
 387         break;
 388     }
 389   }
 390   else {
 391     /* We don't have an inverse map, so do a linear search. */
 392     for (i = 0; i < 256; i++)
 393       if (map->from[i] == wc) {
 394         *s = i;
 395         return 1;
 396       }
 397   }
 398
 399   return -1;
 400 }
 401
 402 /*
 403  * The "constructor" charset_find().
 404  */
 405
 406 struct charset charset_utf8 = {
 407   6,
 408   &mbtowc_utf8,
 409   &wctomb_utf8,
 410   0
 411 };
 412
 413 struct charset charset_iso1 = {
 414   1,
 415   &mbtowc_iso1,
 416   &wctomb_iso1,
 417   0
 418 };
 419
 420 struct charset charset_ascii = {
 421   1,
 422   &mbtowc_ascii,
 423   &wctomb_ascii,
 424   0
 425 };
 426
 427 struct charset *charset_find(const char *code)
 428 {
 429   int i;
 430
 431   /* Find good (MIME) name. */
 432   for (i = 0; names[i].bad; i++)
 433     if (!ascii_strcasecmp(code, names[i].bad)) {
 434       code = names[i].good;
 435       break;
 436     }
 437
 438   /* Recognise some charsets for which we avoid using a table. */
 439   if (!ascii_strcasecmp(code, "UTF-8"))
 440     return &charset_utf8;
 441   if (!ascii_strcasecmp(code, "US-ASCII"))
 442     return &charset_ascii;
 443   if (!ascii_strcasecmp(code, "ISO-8859-1"))
 444     return &charset_iso1;
 445
 446   /* Look for a mapping for a simple 8-bit encoding. */
 447   for (i = 0; maps[i].name; i++)
 448     if (!ascii_strcasecmp(code, maps[i].name)) {
 449       if (!maps[i].charset) {
 450         maps[i].charset = malloc(sizeof(struct charset));
 451         if (maps[i].charset) {
 452           struct map *map = malloc(sizeof(struct map));
 453           if (!map) {
 454             free(maps[i].charset);
 455             maps[i].charset = 0;
 456           }
 457           else {
 458             maps[i].charset->max = 1;
 459             maps[i].charset->mbtowc = &mbtowc_8bit;
 460             maps[i].charset->wctomb = &wctomb_8bit;
 461             maps[i].charset->map = map;
 462             map->from = maps[i].map;
 463             map->to = 0; /* inverse mapping is created when required */
 464           }
 465         }
 466       }
 467       return maps[i].charset;
 468     }
 469
 470   return 0;
 471 }
 472
 473 /*
 474  * Function to convert a buffer from one encoding to another.
 475  * Invalid bytes are replaced by '#', and characters that are
 476  * not available in the target encoding are replaced by '?'.
 477  * Each of TO and TOLEN may be zero, if the result is not needed.
 478  * The output buffer is null-terminated, so it is all right to
 479  * use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
 480  */
 481
 482 int charset_convert(const char *fromcode, const char *tocode,
 483                     const char *from, size_t fromlen,
 484                     char **to, size_t *tolen)
 485 {
 486   int ret = 0;
 487   struct charset *charset1, *charset2;
 488   char *tobuf, *p, *newbuf;
 489   int i, j, wc;
 490
 491   charset1 = charset_find(fromcode);
 492   charset2 = charset_find(tocode);
 493   if (!charset1 || !charset2 )
 494     return -1;
 495
 496   tobuf = safe_malloc_mul2add_(fromlen, /*times*/charset2->max, /*+*/1);
 497   if (!tobuf)
 498     return -2;
 499
 500   for (p = tobuf; fromlen; from += i, fromlen -= i, p += j) {
 501     i = charset_mbtowc(charset1, &wc, from, fromlen);
 502     if (!i)
 503       i = 1;
 504     else if (i == -1) {
 505       i  = 1;
 506       wc = '#';
 507       ret = 2;
 508     }
 509     j = charset_wctomb(charset2, p, wc);
 510     if (j == -1) {
 511       if (!ret)
 512         ret = 1;
 513       j = charset_wctomb(charset2, p, '?');
 514       if (j == -1)
 515         j = 0;
 516     }
 517   }
 518
 519   if (tolen)
 520     *tolen = p - tobuf;
 521   *p++ = '\0';
 522   if (to) {
 523     newbuf = realloc(tobuf, p - tobuf);
 524     *to = newbuf ? newbuf : tobuf;
 525   }
 526   else
 527     free(tobuf);
 528
 529   return ret;
 530 }
 531
#endif /* !HAVE_ICONV */