src/share/utf8/utf8.c

   1 /*
   2  * Copyright (C) 2001 Peter Harris <peter.harris@hummingbird.com>
   3  * Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
   4  *
   5  * Buffer overflow checking added: Josh Coalson, 9/9/2007
   6  *
   7  * This program is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU General Public License as published by
   9  * the Free Software Foundation; either version 2 of the License, or
  10  * (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License along
  18  * with this program; if not, write to the Free Software Foundation, Inc.,
  19  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20  */
  21
  22 /*
  23  * Convert a string between UTF-8 and the locale's charset.
  24  */
  25
  26 #if HAVE_CONFIG_H
  27 #  include <config.h>
  28 #endif
  29
  30 #include <stdlib.h>
  31 #include <string.h>
  32
  33 #include "share/alloc.h"
  34 #include "utf8.h"
  35 #include "charset.h"
  36
  37
  38 #ifdef _WIN32
  39
  40         /* Thanks to Peter Harris <peter.harris@hummingbird.com> for this win32
  41          * code.
  42          */
  43
  44 #include <stdio.h>
  45 #include <windows.h>
  46
  47 static unsigned char *make_utf8_string(const wchar_t *unicode)
  48 {
  49     size_t size = 0, n;
  50     int index = 0, out_index = 0;
  51     unsigned char *out;
  52     unsigned short c;
  53
  54     /* first calculate the size of the target string */
  55     c = unicode[index++];
  56     while(c) {
  57         if(c < 0x0080) {
  58             n = 1;
  59         } else if(c < 0x0800) {
  60             n = 2;
  61         } else {
  62             n = 3;
  63         }
  64         if(size+n < size) /* overflow check */
  65             return NULL;
  66         size += n;
  67         c = unicode[index++];
  68     }
  69
  70     out = safe_malloc_add_2op_(size, /*+*/1);
  71     if (out == NULL)
  72         return NULL;
  73     index = 0;
  74
  75     c = unicode[index++];
  76     while(c)
  77     {
  78         if(c < 0x080) {
  79             out[out_index++] = (unsigned char)c;
  80         } else if(c < 0x800) {
  81             out[out_index++] = 0xc0 | (c >> 6);
  82             out[out_index++] = 0x80 | (c & 0x3f);
  83         } else {
  84             out[out_index++] = 0xe0 | (c >> 12);
  85             out[out_index++] = 0x80 | ((c >> 6) & 0x3f);
  86             out[out_index++] = 0x80 | (c & 0x3f);
  87         }
  88         c = unicode[index++];
  89     }
  90     out[out_index] = 0x00;
  91
  92     return out;
  93 }
  94
  95 static wchar_t *make_unicode_string(const unsigned char *utf8)
  96 {
  97     size_t size = 0;
  98     int index = 0, out_index = 0;
  99     wchar_t *out;
 100     unsigned char c;
 101
 102     /* first calculate the size of the target string */
 103     c = utf8[index++];
 104     while(c) {
 105         if((c & 0x80) == 0) {
 106             index += 0;
 107         } else if((c & 0xe0) == 0xe0) {
 108             index += 2;
 109         } else {
 110             index += 1;
 111         }
 112         if(size + 1 == 0) /* overflow check */
 113             return NULL;
 114         size++;
 115         c = utf8[index++];
 116     }
 117
 118     if(size + 1 == 0) /* overflow check */
 119         return NULL;
 120     out = safe_malloc_mul_2op_(size+1, /*times*/sizeof(wchar_t));
 121     if (out == NULL)
 122         return NULL;
 123     index = 0;
 124
 125     c = utf8[index++];
 126     while(c)
 127     {
 128         if((c & 0x80) == 0) {
 129             out[out_index++] = c;
 130         } else if((c & 0xe0) == 0xe0) {
 131             out[out_index] = (c & 0x1F) << 12;
 132                 c = utf8[index++];
 133             out[out_index] |= (c & 0x3F) << 6;
 134                 c = utf8[index++];
 135             out[out_index++] |= (c & 0x3F);
 136         } else {
 137             out[out_index] = (c & 0x3F) << 6;
 138                 c = utf8[index++];
 139             out[out_index++] |= (c & 0x3F);
 140         }
 141         c = utf8[index++];
 142     }
 143     out[out_index] = 0;
 144
 145     return out;
 146 }
 147
 148 int utf8_encode(const char *from, char **to)
 149 {
 150         wchar_t *unicode;
 151         int wchars, err;
 152
 153         wchars = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from,
 154                         strlen(from), NULL, 0);
 155
 156         if(wchars == 0)
 157         {
 158                 fprintf(stderr, "Unicode translation error %d\n", (int)GetLastError());
 159                 return -1;
 160         }
 161
 162         if(wchars < 0) /* underflow check */
 163                 return -1;
 164
 165         unicode = safe_calloc_((size_t)wchars + 1, sizeof(unsigned short));
 166         if(unicode == NULL)
 167         {
 168                 fprintf(stderr, "Out of memory processing string to UTF8\n");
 169                 return -1;
 170         }
 171
 172         err = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, from,
 173                         strlen(from), unicode, wchars);
 174         if(err != wchars)
 175         {
 176                 free(unicode);
 177                 fprintf(stderr, "Unicode translation error %d\n", (int)GetLastError());
 178                 return -1;
 179         }
 180
 181         /* On NT-based windows systems, we could use WideCharToMultiByte(), but
 182          * MS doesn't actually have a consistent API across win32.
 183          */
 184         *to = make_utf8_string(unicode);
 185
 186         free(unicode);
 187         return 0;
 188 }
 189
 190 int utf8_decode(const char *from, char **to)
 191 {
 192     wchar_t *unicode;
 193     int chars, err;
 194
 195     /* On NT-based windows systems, we could use MultiByteToWideChar(CP_UTF8), but
 196      * MS doesn't actually have a consistent API across win32.
 197      */
 198     unicode = make_unicode_string(from);
 199     if(unicode == NULL)
 200     {
 201         fprintf(stderr, "Out of memory processing string from UTF8 to UNICODE16\n");
 202         return -1;
 203     }
 204
 205     chars = WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, unicode,
 206             -1, NULL, 0, NULL, NULL);
 207
 208     if(chars < 0) /* underflow check */
 209         return -1;
 210
 211     if(chars == 0)
 212     {
 213         fprintf(stderr, "Unicode translation error %d\n", (int)GetLastError());
 214         free(unicode);
 215         return -1;
 216     }
 217
 218     *to = safe_calloc_((size_t)chars + 1, sizeof(unsigned char));
 219     if(*to == NULL)
 220     {
 221         fprintf(stderr, "Out of memory processing string to local charset\n");
 222         free(unicode);
 223         return -1;
 224     }
 225
 226     err = WideCharToMultiByte(GetConsoleCP(), WC_COMPOSITECHECK, unicode,
 227             -1, *to, chars, NULL, NULL);
 228     if(err != chars)
 229     {
 230         fprintf(stderr, "Unicode translation error %d\n", (int)GetLastError());
 231         free(unicode);
 232         free(*to);
 233         *to = NULL;
 234         return -1;
 235     }
 236
 237     free(unicode);
 238     return 0;
 239 }
 240
 241 #else /* End win32. Rest is for real operating systems */
 242
 243
 244 #ifdef HAVE_LANGINFO_CODESET
 245 #include <langinfo.h>
 246 #endif
 247
 248 #include "iconvert.h"
 249
 250 static const char *current_charset(void)
 251 {
 252   const char *c = 0;
 253 #ifdef HAVE_LANGINFO_CODESET
 254   c = nl_langinfo(CODESET);
 255 #endif
 256
 257   if (!c)
 258     c = getenv("CHARSET");
 259
 260   return c? c : "US-ASCII";
 261 }
 262
 263 static int convert_buffer(const char *fromcode, const char *tocode,
 264                           const char *from, size_t fromlen,
 265                           char **to, size_t *tolen)
 266 {
 267   int ret = -1;
 268
 269 #ifdef HAVE_ICONV
 270   ret = iconvert(fromcode, tocode, from, fromlen, to, tolen);
 271   if (ret != -1)
 272     return ret;
 273 #endif
 274
 275 #ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */
 276   ret = charset_convert(fromcode, tocode, from, fromlen, to, tolen);
 277   if (ret != -1)
 278     return ret;
 279 #endif
 280
 281   return ret;
 282 }
 283
 284 static int convert_string(const char *fromcode, const char *tocode,
 285                           const char *from, char **to, char replace)
 286 {
 287   int ret;
 288   size_t fromlen;
 289   char *s;
 290
 291   fromlen = strlen(from);
 292   ret = convert_buffer(fromcode, tocode, from, fromlen, to, 0);
 293   if (ret == -2)
 294     return -1;
 295   if (ret != -1)
 296     return ret;
 297
 298   s = safe_malloc_add_2op_(fromlen, /*+*/1);
 299   if (!s)
 300     return -1;
 301   strcpy(s, from);
 302   *to = s;
 303   for (; *s; s++)
 304     if (*s & ~0x7f)
 305       *s = replace;
 306   return 3;
 307 }
 308
 309 int utf8_encode(const char *from, char **to)
 310 {
 311   return convert_string(current_charset(), "UTF-8", from, to, '#');
 312 }
 313
 314 int utf8_decode(const char *from, char **to)
 315 {
 316   return convert_string("UTF-8", current_charset(), from, to, '?');
 317 }
 318
 319 #endif