src/share/utf8/utf8.c

   1 /*
   2  * Copyright (C) 2001 Peter Harris <peter.harris@hummingbird.com>
   3  * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  18  */
  19
  20 /*
  21  * Convert a string between UTF-8 and the locale's charset.
  22  */
  23
  24 #if HAVE_CONFIG_H
  25 #  include <config.h>
  26 #endif
  27
  28 #include <stdlib.h>
  29 #include <string.h>
  30
  31 #include "utf8.h"
  32 #include "charset.h"
  33
  34
  35 #ifdef _WIN32
  36
  37         /* Thanks to Peter Harris <peter.harris@hummingbird.com> for this win32
  38          * code.
  39          */
  40
  41 #include <stdio.h>
  42 #include <windows.h>
  43
  44 static unsigned char *make_utf8_string(const wchar_t *unicode)
  45 {
  46     int size = 0, index = 0, out_index = 0;
  47     unsigned char *out;
  48     unsigned short c;
  49
  50     /* first calculate the size of the target string */
  51     c = unicode[index++];
  52     while(c) {
  53         if(c < 0x0080) {
  54             size += 1;
  55         } else if(c < 0x0800) {
  56             size += 2;
  57         } else {
  58             size += 3;
  59         }
  60         c = unicode[index++];
  61     }
  62
  63     out = malloc(size + 1);
  64     if (out == NULL)
  65         return NULL;
  66     index = 0;
  67
  68     c = unicode[index++];
  69     while(c)
  70     {
  71         if(c < 0x080) {
  72             out[out_index++] = (unsigned char)c;
  73         } else if(c < 0x800) {
  74             out[out_index++] = 0xc0 | (c >> 6);
  75             out[out_index++] = 0x80 | (c & 0x3f);
  76         } else {
  77             out[out_index++] = 0xe0 | (c >> 12);
  78             out[out_index++] = 0x80 | ((c >> 6) & 0x3f);
  79             out[out_index++] = 0x80 | (c & 0x3f);
  80         }
  81         c = unicode[index++];
  82     }
  83     out[out_index] = 0x00;
  84
  85     return out;
  86 }
  87
  88 static wchar_t *make_unicode_string(const unsigned char *utf8)
  89 {
  90     int size = 0, index = 0, out_index = 0;
  91     wchar_t *out;
  92     unsigned char c;
  93
  94     /* first calculate the size of the target string */
  95     c = utf8[index++];
  96     while(c) {
  97         if((c & 0x80) == 0) {
  98             index += 0;
  99         } else if((c & 0xe0) == 0xe0) {
 100             index += 2;
 101         } else {
 102             index += 1;
 103         }
 104         size += 1;
 105         c = utf8[index++];
 106     }
 107
 108     out = malloc((size + 1) * sizeof(wchar_t));
 109     if (out == NULL)
 110         return NULL;
 111     index = 0;
 112
 113     c = utf8[index++];
 114     while(c)
 115     {
 116         if((c & 0x80) == 0) {
 117             out[out_index++] = c;
 118         } else if((c & 0xe0) == 0xe0) {
 119             out[out_index] = (c & 0x1F) << 12;
 120                 c = utf8[index++];
 121             out[out_index] |= (c & 0x3F) << 6;
 122                 c = utf8[index++];
 123             out[out_index++] |= (c & 0x3F);
 124         } else {
 125             out[out_index] = (c & 0x3F) << 6;
 126                 c = utf8[index++];
 127             out[out_index++] |= (c & 0x3F);
 128         }
 129         c = utf8[index++];
 130     }
 131     out[out_index] = 0;
 132
 133     return out;
 134 }
 135
 136 int utf8_encode(const char *from, char **to)
 137 {
 138         wchar_t *unicode;
 139         int wchars, err;
 140
 141         wchars = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from,
 142                         strlen(from), NULL, 0);
 143
 144         if(wchars == 0)
 145         {
 146                 fprintf(stderr, "Unicode translation error %d\n", GetLastError());
 147                 return -1;
 148         }
 149
 150         unicode = calloc(wchars + 1, sizeof(unsigned short));
 151         if(unicode == NULL)
 152         {
 153                 fprintf(stderr, "Out of memory processing string to UTF8\n");
 154                 return -1;
 155         }
 156
 157         err = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from,
 158                         strlen(from), unicode, wchars);
 159         if(err != wchars)
 160         {
 161                 free(unicode);
 162                 fprintf(stderr, "Unicode translation error %d\n", GetLastError());
 163                 return -1;
 164         }
 165
 166         /* On NT-based windows systems, we could use WideCharToMultiByte(), but
 167          * MS doesn't actually have a consistent API across win32.
 168          */
 169         *to = make_utf8_string(unicode);
 170
 171         free(unicode);
 172         return 0;
 173 }
 174
 175 int utf8_decode(const char *from, char **to)
 176 {
 177     wchar_t *unicode;
 178     int chars, err;
 179
 180     /* On NT-based windows systems, we could use MultiByteToWideChar(CP_UTF8), but
 181      * MS doesn't actually have a consistent API across win32.
 182      */
 183     unicode = make_unicode_string(from);
 184     if(unicode == NULL)
 185     {
 186         fprintf(stderr, "Out of memory processing string from UTF8 to UNICODE16\n");
 187         return -1;
 188     }
 189
 190     chars = WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, unicode,
 191             -1, NULL, 0, NULL, NULL);
 192
 193     if(chars == 0)
 194     {
 195         fprintf(stderr, "Unicode translation error %d\n", GetLastError());
 196         free(unicode);
 197         return -1;
 198     }
 199
 200     *to = calloc(chars + 1, sizeof(unsigned char));
 201     if(*to == NULL)
 202     {
 203         fprintf(stderr, "Out of memory processing string to local charset\n");
 204         free(unicode);
 205         return -1;
 206     }
 207
 208     err = WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, unicode,
 209             -1, *to, chars, NULL, NULL);
 210     if(err != chars)
 211     {
 212         fprintf(stderr, "Unicode translation error %d\n", GetLastError());
 213         free(unicode);
 214         free(*to);
 215         *to = NULL;
 216         return -1;
 217     }
 218
 219     free(unicode);
 220     return 0;
 221 }
 222
 223 #else /* End win32. Rest is for real operating systems */
 224
 225
 226 #ifdef HAVE_LANGINFO_CODESET
 227 #include <langinfo.h>
 228 #endif
 229
 230 int iconvert(const char *fromcode, const char *tocode,
 231              const char *from, size_t fromlen,
 232              char **to, size_t *tolen);
 233
 234 static const char *current_charset(void)
 235 {
 236   const char *c = 0;
 237 #ifdef HAVE_LANGINFO_CODESET
 238   c = nl_langinfo(CODESET);
 239 #endif
 240
 241   if (!c)
 242     c = getenv("CHARSET");
 243
 244   return c? c : "US-ASCII";
 245 }
 246
 247 static int convert_buffer(const char *fromcode, const char *tocode,
 248                           const char *from, size_t fromlen,
 249                           char **to, size_t *tolen)
 250 {
 251   int ret = -1;
 252
 253 #ifdef HAVE_ICONV
 254   ret = iconvert(fromcode, tocode, from, fromlen, to, tolen);
 255   if (ret != -1)
 256     return ret;
 257 #endif
 258
 259 #ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */
 260   ret = charset_convert(fromcode, tocode, from, fromlen, to, tolen);
 261   if (ret != -1)
 262     return ret;
 263 #endif
 264
 265   return ret;
 266 }
 267
 268 static int convert_string(const char *fromcode, const char *tocode,
 269                           const char *from, char **to, char replace)
 270 {
 271   int ret;
 272   size_t fromlen;
 273   char *s;
 274
 275   fromlen = strlen(from);
 276   ret = convert_buffer(fromcode, tocode, from, fromlen, to, 0);
 277   if (ret == -2)
 278     return -1;
 279   if (ret != -1)
 280     return ret;
 281
 282   s = malloc(fromlen + 1);
 283   if (!s)
 284     return -1;
 285   strcpy(s, from);
 286   *to = s;
 287   for (; *s; s++)
 288     if (*s & ~0x7f)
 289       *s = replace;
 290   return 3;
 291 }
 292
 293 int utf8_encode(const char *from, char **to)
 294 {
 295   char *charset;
 296
 297   return convert_string(current_charset(), "UTF-8", from, to, '#');
 298 }
 299
 300 int utf8_decode(const char *from, char **to)
 301 {
 302   char *charset;
 303
 304   return convert_string("UTF-8", current_charset(), from, to, '?');
 305 }
 306
 307 #endif