1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
3 * Authors: Michael Zucchi <notzed@ximian.com>
5 * Copyright (C) 1999-2008 Novell, Inc. (www.novell.com)
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
28 #include <sys/types.h>
30 #include "camel-utf8.h"
37 * Output a 32 bit unicode character as utf8 octets. At most 4 octets will
38 * be written to @ptr. @ptr will be advanced to the next character position.
/* camel_utf8_putc: writes the UTF-8 octets for one character value through
 * the cursor that @ptr points at.
 * NOTE(review): this excerpt omits several lines of the definition (return
 * type, the remaining parameter, braces, the first branch and the lead octet
 * of the two-octet branch); the comments below cover only what is visible. */
41 camel_utf8_putc (guchar **ptr,
/* Local write cursor, seeded from the caller's pointer. */
44 register guchar *p = *ptr;
/* Values up to 0x7ff: the visible line emits the trailing continuation
 * octet (10xxxxxx carrying the low six bits). */
48 else if (c <= 0x7ff) {
50 *p++ = 0x80 | (c & 0x3f);
/* Values up to 0xffff: three octets - a 1110xxxx lead octet followed by two
 * continuation octets of six bits each.  `>>` binds tighter than `|`, so the
 * lead octet is 0xe0 | (c >> 12). */
51 } else if (c <= 0xffff) {
52 *p++ = 0xe0 | c >> 12;
53 *p++ = 0x80 | ((c >> 6) & 0x3f);
54 *p++ = 0x80 | (c & 0x3f);
/* Four-octet form: a 11110xxx lead octet plus three continuation octets.
 * NOTE(review): c >> 18 is not masked, so a value above 0x1fffff would
 * disturb the 0xf0 prefix bits - confirm callers never pass one. */
56 /* see unicode standard 3.0, S 3.8, max 4 octets */
57 *p++ = 0xf0 | c >> 18;
58 *p++ = 0x80 | ((c >> 12) & 0x3f);
59 *p++ = 0x80 | ((c >> 6) & 0x3f);
60 *p++ = 0x80 | (c & 0x3f);
70 * Get a Unicode character from a utf8 stream. @ptr will be advanced
71 * to the next character position. Invalid utf8 characters will be
72 * silently skipped. @ptr should point to a NUL terminated array.
74 * Returns: The next Unicode character. @ptr will be advanced to
75 * the next character always.
/* camel_utf8_getc: decodes one character from the UTF-8 octets at *ptr.
 * NOTE(review): only part of the body is visible in this excerpt (the
 * declarations of c and r, the loop structure, the single-octet path and the
 * return are missing); the comments describe the visible lines only. */
78 camel_utf8_getc (const guchar **ptr)
/* Local read cursor; the cast drops the const qualifier of the caller's pointer. */
80 register guchar *p = (guchar *) * ptr;
/* v is built up six bits at a time below; m is the mask described further down. */
82 register guint32 v, m;
/* Accepts r values below 0xf8 as the start of a multi-octet sequence. */
90 } else if (r < 0xf8) { /* valid start char? (max 4 octets) */
92 m = 0x7f80; /* used to mask out the length bits */
/* A continuation octet must look like 10xxxxxx; this tests for one that does not. */
95 if ((c & 0xc0) != 0x80) {
/* Shift the accumulated value up and append the six payload bits of c. */
99 v = (v << 6) | (c & 0x3f);
115 * camel_utf8_getc_limit:
117 * @end: must not be NULL.
119 * Get the next utf8 character at @ptr, and return it, advancing @ptr to
120 * the next character. If @end is reached before a full utf8
121 * character can be read, then the invalid Unicode character 0xffff is
122 * returned as a sentinel (Unicode 3.1, section 2.7), and @ptr is not
125 * Returns: The next utf8 char, or 0xffff.
/* camel_utf8_getc_limit: decodes one character from the UTF-8 octets at *ptr.
 * NOTE(review): the second parameter, the bounds checks, the loop structure
 * and the return are not visible in this excerpt; the comments describe the
 * visible lines only. */
128 camel_utf8_getc_limit (const guchar **ptr,
/* Local read cursor; the cast drops the const qualifier of the caller's pointer. */
131 register guchar *p = (guchar *) * ptr;
132 register guchar c, r;
/* v starts out as 0xffff and is built up six bits at a time below; m is the
 * mask described further down. */
133 register guint32 v = 0xffff, m;
/* Accepts r values below 0xf8 as the start of a multi-octet sequence. */
142 } else if (r < 0xf8) { /* valid start char? (max 4 octets) */
144 m = 0x7f80; /* used to mask out the length bits */
/* A continuation octet must look like 10xxxxxx; this tests for one that does not. */
150 if ((c & 0xc0) != 0x80) {
/* Shift the accumulated value up and append the six payload bits of c. */
154 v = (v << 6) | (c & 0x3f);
/* g_string_append_u: appends one character value to @out as UTF-8.
 * NOTE(review): the second parameter and the declarations of buffer and p
 * are not visible in this excerpt. */
172 g_string_append_u (GString *out,
/* Encode c through the cursor p. */
178 camel_utf8_putc (&p, c);
/* g_string_append expects a NUL-terminated string; presumably a line not
 * visible here terminates buffer after the encoded octets - TODO confirm. */
180 g_string_append (out, (const gchar *) buffer);
/* The 64 output symbols used for 6-bit groups by the utf7 code in this file:
 * A-Z, a-z, 0-9, then '+' and ','.  Indexed with a value masked to 0x3f. */
183 static const gchar utf7_alphabet[] =
184 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
/* Reverse lookup for utf7_alphabet: indexed by octet value, each entry is
 * that octet's position in the alphabet (0x00-0x3f), or 0xff when the octet
 * is not part of it.  Each row below covers 16 consecutive octet values. */
186 static const guchar utf7_rank[256] = {
187 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
188 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
/* 0x20-0x2f: '+' (0x2b) -> 0x3e and ',' (0x2c) -> 0x3f */
189 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x3e,0x3f,0xff,0xff,0xff,
/* 0x30-0x3f: '0'-'9' -> 0x34-0x3d */
190 0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0xff,0xff,0xff,0xff,0xff,0xff,
/* 0x40-0x5f: 'A'-'Z' -> 0x00-0x19 */
191 0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,
192 0x0f,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0xff,0xff,0xff,0xff,0xff,
/* 0x60-0x7f: 'a'-'z' -> 0x1a-0x33 */
193 0xff,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,
194 0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x30,0x31,0x32,0x33,0xff,0xff,0xff,0xff,0xff,
/* 0x80-0xff: no octet with the high bit set is in the alphabet */
195 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
196 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
197 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
198 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
199 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
200 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
201 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
202 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
209 * Convert a modified utf7 string to utf8. If the utf7 string
210 * contains 8 bit characters, they are treated as iso-8859-1.
212 * The IMAP rules [rfc2060] are used in the utf7 encoding.
214 * Returns: The converted string.
/* camel_utf7_utf8: builds a UTF-8 copy of the modified-utf7 text at @ptr in a
 * GString and hands back a g_strdup'd copy of it.
 * NOTE(review): most of the decoding state machine (declarations, loop,
 * state transitions, several branch conditions, the return) is not visible
 * in this excerpt; the comments describe the visible lines only. */
217 camel_utf7_utf8 (const gchar *ptr)
/* Read the input as unsigned octets so they can index utf7_rank directly. */
219 const guchar *p = (guchar *) ptr;
227 out = g_string_new ("");
/* Copy c to the output unchanged. */
235 g_string_append_c (out, c);
/* Emit a literal '&'. */
239 g_string_append_c (out, '&');
/* c is one of the 64 alphabet symbols. */
241 } else if (utf7_rank[c] != 0xff) {
/* Emit the two characters "&-" as they are (presumably input that is not a
 * valid shifted sequence - TODO confirm against the missing condition). */
247 g_string_append (out, "&-");
254 } else if (utf7_rank[c] != 0xff) {
/* Append the symbol's six bits to the accumulator v. */
255 v = (v << 6) | utf7_rank[c];
/* Take a 16-bit unit out of the accumulator and append it as UTF-8; the
 * shift by (i - 16) suggests i counts the bits held in v - TODO confirm.
 * NOTE(review): no surrogate-pair handling is visible here. */
258 x = (v >> (i - 16)) & 0xffff;
259 g_string_append_u (out, x);
/* c goes through the UTF-8 encoder rather than being copied as a raw octet. */
263 g_string_append_u (out, c);
/* Copy the result out of the GString, then free the GString together with
 * its character data. */
270 ret = g_strdup (out->str);
271 g_string_free (out, TRUE);
/* utf7_closeb64: ends a shifted run in @out by writing the '-' terminator,
 * preceded by a symbol built from the bits still held in @v.
 * NOTE(review): the declaration of x and any guard around the symbol output
 * are not visible in this excerpt. */
276 static void utf7_closeb64 (GString *out, guint32 v, guint32 i)
/* Shift v left by (6 - i) and keep six bits, so the low bits of the group
 * are zero-filled; the shift suggests i is the number of leftover bits (< 6)
 * - TODO confirm. */
281 x = (v << (6 - i)) & 0x3f;
282 g_string_append_c (out, utf7_alphabet[x]);
284 g_string_append_c (out, '-');
291 * Convert a utf8 string to a modified utf7 format.
293 * The IMAP rules [rfc2060] are used in the utf7 encoding.
/* camel_utf8_utf7: builds a modified-utf7 copy of the UTF-8 text at @ptr in a
 * GString and hands back a g_strdup'd copy of it.
 * NOTE(review): declarations, several branch conditions and the return are
 * not visible in this excerpt; the comments describe the visible lines only. */
298 camel_utf8_utf7 (const gchar *ptr)
300 const guchar *p = (guchar *) ptr;
308 out = g_string_new ("");
/* Decode one character per iteration; the loop ends when the decoder yields 0. */
310 while ((c = camel_utf8_getc (&p))) {
/* Characters in the range 0x20-0x7e take this branch. */
311 if (c >= 0x20 && c <= 0x7e) {
/* Close the shifted run (the condition guarding this call is not visible). */
313 utf7_closeb64 (out, v, i);
/* Emit "&-" (presumably the escape for a literal '&' - TODO confirm). */
318 g_string_append (out, "&-");
/* Emit c itself. */
320 g_string_append_c (out, c);
/* Emit '&' (presumably opening a shifted run - TODO confirm). */
323 g_string_append_c (out, '&');
/* Take a six-bit group out of the accumulator v and emit its symbol. */
329 x = (v >> (i - 6)) & 0x3f;
330 g_string_append_c (out, utf7_alphabet[x]);
/* Close the shifted run (the condition guarding this call is not visible). */
337 utf7_closeb64 (out, v, i);
/* Copy the result out of the GString, then free the GString together with
 * its character data. */
339 ret = g_strdup (out->str);
340 g_string_free (out, TRUE);
349 * Convert a utf8 string into a ucs2 one. The ucs2 string will be in
350 * network byte order, and terminated with a 16 bit NUL.
/* camel_utf8_ucs2: decodes @pptr character by character, collects each one as
 * a 16-bit unit in network byte order, and copies the result into a g_malloc'd
 * buffer.
 * NOTE(review): the return type, some declarations and the return statement
 * are not visible in this excerpt. */
355 camel_utf8_ucs2 (const gchar *pptr)
/* Growable scratch buffer for the output bytes. */
357 GByteArray *work = g_byte_array_new ();
360 const guchar *ptr = (const guchar *) pptr;
/* NOTE(review): s below is a guint16, so a decoded value above 0xffff cannot
 * be stored in it; the visible code does not answer the question that
 * follows. */
362 /* what if c is > 0xffff ? */
/* Decode one character per iteration; the loop ends when the decoder yields 0. */
364 while ((c = camel_utf8_getc (&ptr))) {
365 guint16 s = g_htons (c);
/* Append the two bytes of s in memory order. */
367 g_byte_array_append (work, (guchar *) &s, 2);
/* Two zero bytes terminate the output. */
370 g_byte_array_append (work, (guchar *) "\000\000", 2);
/* Copy the bytes into a buffer of exactly work->len, then free the byte
 * array together with its data. */
371 out = g_malloc (work->len);
372 memcpy (out, work->data, work->len);
373 g_byte_array_free (work, TRUE);
382 * Convert a ucs2 string into a utf8 one. The ucs2 string is treated
383 * as network byte ordered, and terminated with a 16 bit NUL.
/* camel_ucs2_utf8: appends 16-bit units read from @ptr to a GString as UTF-8
 * and hands back a g_strdup'd copy of the result.
 * NOTE(review): the loop header, some declarations and the return statement
 * are not visible in this excerpt. */
387 gchar *camel_ucs2_utf8 (const gchar *ptr)
/* View the input as 16-bit units; the cast drops const.
 * NOTE(review): this assumes @ptr is suitably aligned for guint16 access -
 * confirm against callers. */
389 guint16 *ucs = (guint16 *) ptr;
391 GString *work = g_string_new ("");
/* Convert the unit from network to host byte order and append it as UTF-8. */
395 g_string_append_u (work, g_ntohs (c));
/* Copy the result out of the GString, then free the GString together with
 * its character data. */
397 out = g_strdup (work->str);
398 g_string_free (work, TRUE);
404 * camel_utf8_make_valid:
407 * Ensures the returned text will be a valid UTF-8 string, with invalid characters
408 * changed to question marks. The returned pointer should be freed with g_free().
413 camel_utf8_make_valid (const gchar *text)
415 gchar *res = g_strdup (text), *p;
421 while (!g_utf8_validate (p, -1, (const gchar **) &p)) {
422 /* make all invalid characters appear as question marks */