camel/camel-utf8.c

   1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
   2 /*
   3  *  Authors: Michael Zucchi <notzed@ximian.com>
   4  *
   5  *  Copyright (C) 1999-2008 Novell, Inc. (www.novell.com)
   6  *
   7  *  This program is free software; you can redistribute it and/or modify
   8  *  it under the terms of the GNU Lesser General Public License as published by
   9  *  the Free Software Foundation; either version 2 of the License, or
  10  *  (at your option) any later version.
  11  *
  12  *  This program is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *  GNU Lesser General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU Lesser General Public License
  18  *  along with this program; if not, write to the Free Software
  19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  20  *
  21  */
  22
  23 #ifdef HAVE_CONFIG_H
  24 #include <config.h>
  25 #endif
  26
  27 #include <string.h>
  28 #include <sys/types.h>
  29
  30 #include "camel-utf8.h"
  31
  32 /**
  33  * camel_utf8_putc:
  34  * @ptr:
  35  * @c:
  36  *
  37  * Output a 32 bit unicode character as utf8 octets.  At most 4 octets will
  38  * be written to @ptr.  @ptr will be advanced to the next character position.
  39  **/
  40 void
  41 camel_utf8_putc (guchar **ptr,
  42                  guint32 c)
  43 {
  44         register guchar *p = *ptr;
  45
  46         if (c <= 0x7f)
  47                 *p++ = c;
  48         else if (c <= 0x7ff) {
  49                 *p++ = 0xc0 | c >> 6;
  50                 *p++ = 0x80 | (c & 0x3f);
  51         } else if (c <= 0xffff) {
  52                 *p++ = 0xe0 | c >> 12;
  53                 *p++ = 0x80 | ((c >> 6) & 0x3f);
  54                 *p++ = 0x80 | (c & 0x3f);
  55         } else {
  56                 /* see unicode standard 3.0, S 3.8, max 4 octets */
  57                 *p++ = 0xf0 | c >> 18;
  58                 *p++ = 0x80 | ((c >> 12) & 0x3f);
  59                 *p++ = 0x80 | ((c >> 6) & 0x3f);
  60                 *p++ = 0x80 | (c & 0x3f);
  61         }
  62
  63         *ptr = p;
  64 }
  65
  66 /**
  67  * camel_utf8_getc:
  68  * @ptr:
  69  *
  70  * Get a Unicode character from a utf8 stream.  @ptr will be advanced
  71  * to the next character position.  Invalid utf8 characters will be
  72  * silently skipped.  @ptr should point to a NUL terminated array.
  73  *
  74  * Returns: The next Unicode character.  @ptr will be advanced to
  75  * the next character always.
  76  **/
  77 guint32
  78 camel_utf8_getc (const guchar **ptr)
  79 {
  80         register guchar *p = (guchar *) * ptr;
  81         register guchar c, r;
  82         register guint32 v, m;
  83
  84 again:
  85         r = *p++;
  86 loop:
  87         if (r < 0x80) {
  88                 *ptr = p;
  89                 v = r;
  90         } else if (r < 0xf8) { /* valid start char? (max 4 octets) */
  91                 v = r;
  92                 m = 0x7f80;     /* used to mask out the length bits */
  93                 do {
  94                         c = *p++;
  95                         if ((c & 0xc0) != 0x80) {
  96                                 r = c;
  97                                 goto loop;
  98                         }
  99                         v = (v << 6) | (c & 0x3f);
 100                         r <<= 1;
 101                         m <<= 5;
 102                 } while (r & 0x40);
 103
 104                 *ptr = p;
 105
 106                 v &= ~m;
 107         } else {
 108                 goto again;
 109         }
 110
 111         return v;
 112 }
 113
 114 /**
 115  * camel_utf8_getc_limit:
 116  * @ptr:
 117  * @end: must not be NULL.
 118  *
 119  * Get the next utf8 gchar at @ptr, and return it, advancing @ptr to
 120  * the next character.  If @end is reached before a full utf8
 121  * character can be read, then the invalid Unicode gchar 0xffff is
 122  * returned as a sentinel (Unicode 3.1, section 2.7), and @ptr is not
 123  * advanced.
 124  *
 125  * Returns: The next utf8 char, or 0xffff.
 126  **/
 127 guint32
 128 camel_utf8_getc_limit (const guchar **ptr,
 129                        const guchar *end)
 130 {
 131         register guchar *p = (guchar *) * ptr;
 132         register guchar c, r;
 133         register guint32 v = 0xffff, m;
 134
 135 again:
 136         while (p < end) {
 137                 r = *p++;
 138 loop:
 139                 if (r < 0x80) {
 140                         *ptr = p;
 141                         return r;
 142                 } else if (r < 0xf8) { /* valid start char? (max 4 octets) */
 143                         v = r;
 144                         m = 0x7f80;     /* used to mask out the length bits */
 145                         do {
 146                                 if (p >= end)
 147                                         return 0xffff;
 148
 149                                 c = *p++;
 150                                 if ((c & 0xc0) != 0x80) {
 151                                         r = c;
 152                                         goto loop;
 153                                 }
 154                                 v = (v << 6) | (c & 0x3f);
 155                                 r <<= 1;
 156                                 m <<= 5;
 157                         } while (r & 0x40);
 158
 159                         *ptr = p;
 160
 161                         v &= ~m;
 162                         return v;
 163                 } else {
 164                         goto again;
 165                 }
 166         }
 167
 168         return 0xffff;
 169 }
 170
 171 void
 172 g_string_append_u (GString *out,
 173                    guint32 c)
 174 {
 175         guchar buffer[8];
 176         guchar *p = buffer;
 177
 178         camel_utf8_putc (&p, c);
 179         *p = 0;
 180         g_string_append (out, (const gchar *) buffer);
 181 }
 182
 183 static const gchar utf7_alphabet[] =
 184         "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
 185
 186 static const guchar utf7_rank[256] = {
 187         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
 188         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
 189         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x3e,0x3f,0xff,0xff,0xff,
 190         0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0xff,0xff,0xff,0xff,0xff,0xff,
 191         0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,
 192         0x0f,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0xff,0xff,0xff,0xff,0xff,
 193         0xff,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,
 194         0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x30,0x31,0x32,0x33,0xff,0xff,0xff,0xff,0xff,
 195         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
 196         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
 197         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
 198         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
 199         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
 200         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
 201         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
 202         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
 203 };
 204
 205 /**
 206  * camel_utf7_utf8:
 207  * @ptr:
 208  *
 209  * Convert a modified utf7 string to utf8.  If the utf7 string
 210  * contains 8 bit characters, they are treated as iso-8859-1.
 211  *
 212  * The IMAP rules [rfc2060] are used in the utf7 encoding.
 213  *
 214  * Returns: The converted string.
 215  **/
 216 gchar *
 217 camel_utf7_utf8 (const gchar *ptr)
 218 {
 219         const guchar *p = (guchar *) ptr;
 220         guint c;
 221         guint32 v = 0, x;
 222         GString *out;
 223         gint i = 0;
 224         gint state = 0;
 225         gchar *ret;
 226
 227         out = g_string_new ("");
 228         do {
 229                 c = *p++;
 230                 switch (state) {
 231                 case 0:
 232                         if (c == '&')
 233                                 state = 1;
 234                         else
 235                                 g_string_append_c (out, c);
 236                         break;
 237                 case 1:
 238                         if (c == '-') {
 239                                 g_string_append_c (out, '&');
 240                                 state = 0;
 241                         } else if (utf7_rank[c] != 0xff) {
 242                                 v = utf7_rank[c];
 243                                 i = 6;
 244                                 state = 2;
 245                         } else {
 246                                 /* invalid */
 247                                 g_string_append (out, "&-");
 248                                 state = 0;
 249                         }
 250                         break;
 251                 case 2:
 252                         if (c == '-') {
 253                                 state = 0;
 254                         } else if (utf7_rank[c] != 0xff) {
 255                                 v = (v << 6) | utf7_rank[c];
 256                                 i+=6;
 257                                 if (i >= 16) {
 258                                         x = (v >> (i - 16)) & 0xffff;
 259                                         g_string_append_u (out, x);
 260                                         i-=16;
 261                                 }
 262                         } else {
 263                                 g_string_append_u (out, c);
 264                                 state = 0;
 265                         }
 266                         break;
 267                 }
 268         } while (c);
 269
 270         ret = g_strdup (out->str);
 271         g_string_free (out, TRUE);
 272
 273         return ret;
 274 }
 275
 276 static void utf7_closeb64 (GString *out, guint32 v, guint32 i)
 277 {
 278         guint32 x;
 279
 280         if (i > 0) {
 281                 x = (v << (6 - i)) & 0x3f;
 282                 g_string_append_c (out, utf7_alphabet[x]);
 283         }
 284         g_string_append_c (out, '-');
 285 }
 286
 287 /**
 288  * camel_utf8_utf7:
 289  * @ptr:
 290  *
 291  * Convert a utf8 string to a modified utf7 format.
 292  *
 293  * The IMAP rules [rfc2060] are used in the utf7 encoding.
 294  *
 295  * Returns:
 296  **/
 297 gchar *
 298 camel_utf8_utf7 (const gchar *ptr)
 299 {
 300         const guchar *p = (guchar *) ptr;
 301         guint c;
 302         guint32 x, v = 0;
 303         gint state = 0;
 304         GString *out;
 305         gint i = 0;
 306         gchar *ret;
 307
 308         out = g_string_new ("");
 309
 310         while ((c = camel_utf8_getc (&p))) {
 311                 if (c >= 0x20 && c <= 0x7e) {
 312                         if (state == 1) {
 313                                 utf7_closeb64 (out, v, i);
 314                                 state = 0;
 315                                 i = 0;
 316                         }
 317                         if (c == '&')
 318                                 g_string_append (out, "&-");
 319                         else
 320                                 g_string_append_c (out, c);
 321                 } else {
 322                         if (state == 0) {
 323                                 g_string_append_c (out, '&');
 324                                 state = 1;
 325                         }
 326                         v = (v << 16) | c;
 327                         i += 16;
 328                         while (i >= 6) {
 329                                 x = (v >> (i - 6)) & 0x3f;
 330                                 g_string_append_c (out, utf7_alphabet[x]);
 331                                 i -= 6;
 332                         }
 333                 }
 334         }
 335
 336         if (state == 1)
 337                 utf7_closeb64 (out, v, i);
 338
 339         ret = g_strdup (out->str);
 340         g_string_free (out, TRUE);
 341
 342         return ret;
 343 }
 344
 345 /**
 346  * camel_utf8_ucs2:
 347  * @ptr:
 348  *
 349  * Convert a utf8 string into a ucs2 one.  The ucs string will be in
 350  * network byte order, and terminated with a 16 bit NULL.
 351  *
 352  * Returns:
 353  **/
 354 gchar *
 355 camel_utf8_ucs2 (const gchar *pptr)
 356 {
 357         GByteArray *work = g_byte_array_new ();
 358         guint32 c;
 359         gchar *out;
 360         const guchar *ptr = (const guchar *) pptr;
 361
 362         /* what if c is > 0xffff ? */
 363
 364         while ((c = camel_utf8_getc (&ptr))) {
 365                 guint16 s = g_htons (c);
 366
 367                 g_byte_array_append (work, (guchar *) &s, 2);
 368         }
 369
 370         g_byte_array_append (work, (guchar *) "\000\000", 2);
 371         out = g_malloc (work->len);
 372         memcpy (out, work->data, work->len);
 373         g_byte_array_free (work, TRUE);
 374
 375         return out;
 376 }
 377
 378 /**
 379  * camel_ucs2_utf8:
 380  * @ptr:
 381  *
 382  * Convert a ucs2 string into a utf8 one.  The ucs2 string is treated
 383  * as network byte ordered, and terminated with a 16 bit NUL.
 384  *
 385  * Returns:
 386  **/
 387 gchar *camel_ucs2_utf8 (const gchar *ptr)
 388 {
 389         guint16 *ucs = (guint16 *) ptr;
 390         guint32 c;
 391         GString *work = g_string_new ("");
 392         gchar *out;
 393
 394         while ((c = *ucs++))
 395                 g_string_append_u (work, g_ntohs (c));
 396
 397         out = g_strdup (work->str);
 398         g_string_free (work, TRUE);
 399
 400         return out;
 401 }
 402
 403 /**
 404  * camel_utf8_make_valid:
 405  * @text:
 406  *
 407  * Ensures the returned text will be valid UTF-8 string, with incorrect letters
 408  * changed to question marks. Returned pointer should be freed with g_free.
 409  *
 410  * Since: 2.26
 411  **/
 412 gchar *
 413 camel_utf8_make_valid (const gchar *text)
 414 {
 415         gchar *res = g_strdup (text), *p;
 416
 417         if (!res)
 418                 return res;
 419
 420         p = res;
 421         while (!g_utf8_validate (p, -1, (const gchar **) &p)) {
 422                 /* make all invalid characters appear as question marks */
 423                 *p = '?';
 424         }
 425
 426         return res;
 427 }