Fix FSF address (Tobias Mueller, #470445)
[platform/upstream/evolution-data-server.git] / camel / camel-utf8.c
1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3  *  Authors: Michael Zucchi <notzed@ximian.com>
4  *
5  *  Copyright 2003 Ximian, Inc. (www.ximian.com)
6  *
7  *  This program is free software; you can redistribute it and/or modify
8  *  it under the terms of the GNU Lesser General Public License as published by
9  *  the Free Software Foundation; either version 2 of the License, or
10  *  (at your option) any later version.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU Lesser General Public License for more details.
16  *
17  *  You should have received a copy of the GNU Lesser General Public License
18  *  along with this program; if not, write to the Free Software
19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20  *
21  */
22
23
24 #ifdef HAVE_CONFIG_H
25 #include <config.h>
26 #endif
27
28 #include <string.h>
29 #include <sys/types.h>
30
31 #include <glib.h>
32
33 #include "camel-utf8.h"
34
/**
 * camel_utf8_putc:
 * @ptr: in/out pointer to the current write position; up to 4 octets
 * are stored there, so that much space must be available.
 * @c: the Unicode character to encode.
 *
 * Output a 32 bit unicode character as utf8 octets.  At most 4 octets will
 * be written to @ptr.  @ptr will be advanced to the next character position.
 *
 * A value which does not fit into 4 octets (above 0x1fffff) is written
 * as U+FFFD (REPLACEMENT CHARACTER) rather than as a truncated, invalid
 * lead octet.
 **/
void
camel_utf8_putc(unsigned char **ptr, guint32 c)
{
        unsigned char *p = *ptr;

        /* 4 octets carry at most 21 payload bits; anything larger would
         * overflow the 0xf0 lead octet when stored into a byte */
        if (c > 0x1fffff)
                c = 0xfffd;

        if (c <= 0x7f) {
                *p++ = (unsigned char) c;
        } else if (c <= 0x7ff) {
                *p++ = (unsigned char) (0xc0 | (c >> 6));
                *p++ = (unsigned char) (0x80 | (c & 0x3f));
        } else if (c <= 0xffff) {
                *p++ = (unsigned char) (0xe0 | (c >> 12));
                *p++ = (unsigned char) (0x80 | ((c >> 6) & 0x3f));
                *p++ = (unsigned char) (0x80 | (c & 0x3f));
        } else {
                /* see unicode standard 3.0, S 3.8, max 4 octets */
                *p++ = (unsigned char) (0xf0 | (c >> 18));
                *p++ = (unsigned char) (0x80 | ((c >> 12) & 0x3f));
                *p++ = (unsigned char) (0x80 | ((c >> 6) & 0x3f));
                *p++ = (unsigned char) (0x80 | (c & 0x3f));
        }

        *ptr = p;
}
67
/**
 * camel_utf8_getc:
 * @ptr: in/out pointer to the current read position inside a NUL
 * terminated array of utf8 octets.
 *
 * Get a Unicode character from a utf8 stream.  @ptr will be advanced
 * to the next character position.  Invalid utf8 characters will be
 * silently skipped: this covers octets which cannot start a sequence
 * (stray continuation octets 0x80-0xbf and octets >= 0xf8) as well as
 * sequences which are cut short by a non-continuation octet.
 *
 * When the terminating NUL is read, 0 is returned and @ptr is left
 * pointing just past it.
 *
 * Return value: The next Unicode character.  @ptr will be advanced to
 * the next character always.
 **/
guint32
camel_utf8_getc(const unsigned char **ptr)
{
        const unsigned char *p = *ptr;
        unsigned char c, r;
        guint32 v, m;

again:
        r = *p++;
loop:
        if (r < 0x80) {
                /* 7 bit character, including the terminating NUL */
                *ptr = p;
                v = r;
        } else if (r >= 0xc0 && r < 0xf8) { /* valid start char? (max 4 octets) */
                v = r;
                m = 0x7f80;     /* used to mask out the length bits */
                do {
                        c = *p++;
                        if ((c & 0xc0) != 0x80) {
                                /* sequence cut short: drop what was read and
                                 * restart decoding at the offending octet */
                                r = c;
                                goto loop;
                        }
                        v = (v << 6) | (c & 0x3f);
                        /* each remaining length bit of the start octet moves
                         * up into bit 6; the mask follows the length bits */
                        r <<= 1;
                        m <<= 5;
                } while (r & 0x40);

                *ptr = p;

                v &= ~m;
        } else {
                /* a continuation octet with no start octet, or an octet
                 * announcing more than 4 octets: skip it */
                goto again;
        }

        return v;
}
115
/**
 * camel_utf8_getc_limit:
 * @ptr: in/out pointer to the current read position.
 * @end: must not be NULL.  Points just past the last readable octet.
 *
 * Get the next utf8 char at @ptr, and return it, advancing @ptr to
 * the next character.  If @end is reached before a full utf8
 * character can be read, then the invalid Unicode char 0xffff is
 * returned as a sentinel (Unicode 3.1, section 2.7), and @ptr is not
 * advanced.
 *
 * Octets which cannot start a sequence (stray continuation octets
 * 0x80-0xbf and octets >= 0xf8) and sequences cut short by a
 * non-continuation octet are skipped.
 *
 * Return value: The next utf8 char, or 0xffff.
 **/
guint32
camel_utf8_getc_limit(const unsigned char **ptr, const unsigned char *end)
{
        const unsigned char *p = *ptr;
        unsigned char c, r;
        guint32 v, m;

        while (p < end) {
                r = *p++;
loop:
                if (r < 0x80) {
                        *ptr = p;
                        return r;
                }

                /* not a valid start char (max 4 octets): skip it */
                if (r < 0xc0 || r >= 0xf8)
                        continue;

                v = r;
                m = 0x7f80;     /* used to mask out the length bits */
                do {
                        /* ran out of input in the middle of a character;
                         * *ptr has not been touched */
                        if (p >= end)
                                return 0xffff;

                        c = *p++;
                        if ((c & 0xc0) != 0x80) {
                                /* sequence cut short: restart decoding at
                                 * the offending octet */
                                r = c;
                                goto loop;
                        }
                        v = (v << 6) | (c & 0x3f);
                        r <<= 1;
                        m <<= 5;
                } while (r & 0x40);

                *ptr = p;

                return v & ~m;
        }

        return 0xffff;
}
171
172 void
173 g_string_append_u(GString *out, guint32 c)
174 {
175         unsigned char buffer[8];
176         unsigned char *p = buffer;
177
178         camel_utf8_putc(&p, c);
179         *p = 0;
180         g_string_append(out, (gchar *) buffer);
181 }
182
/* The 64 digits of the modified base64 used inside utf7 shift sequences,
 * indexed by their 6 bit value. */
static const char utf7_alphabet[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "0123456789"
        "+,";

/* Reverse of utf7_alphabet: maps an octet to its 6 bit value, or to 0xff
 * (XX below) when the octet is not a digit of the alphabet. */
#define XX 0xff
static const unsigned char utf7_rank[256] = {
        /* 0x00 */ XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,
        /* 0x10 */ XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,
        /* 0x20 */ XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   0x3e, 0x3f, XX,   XX,   XX,
        /* 0x30 */ 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, XX,   XX,   XX,   XX,   XX,   XX,
        /* 0x40 */ XX,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
        /* 0x50 */ 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, XX,   XX,   XX,   XX,   XX,
        /* 0x60 */ XX,   0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
        /* 0x70 */ 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, XX,   XX,   XX,   XX,   XX,
        /* 0x80 */ XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,
        /* 0x90 */ XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,
        /* 0xa0 */ XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,
        /* 0xb0 */ XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,
        /* 0xc0 */ XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,
        /* 0xd0 */ XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,
        /* 0xe0 */ XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,
        /* 0xf0 */ XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,   XX,
};
#undef XX
204
205 /**
206  * camel_utf7_utf8:
207  * @ptr: 
208  * 
209  * Convert a modified utf7 string to utf8.  If the utf7 string
210  * contains 8 bit characters, they are treated as iso-8859-1.
211  * 
212  * The IMAP rules [rfc2060] are used in the utf7 encoding.
213  *
214  * Return value: The converted string.
215  **/
216 char *
217 camel_utf7_utf8(const char *ptr)
218 {
219         const unsigned char *p = (unsigned char *)ptr;
220         unsigned int c;
221         guint32 v=0, x;
222         GString *out;
223         int i=0;
224         int state = 0;
225         char *ret;
226
227         out = g_string_new("");
228         do {
229                 c = *p++;
230                 switch(state) {
231                 case 0:
232                         if (c == '&')
233                                 state = 1;
234                         else
235                                 g_string_append_u(out, c);
236                         break;
237                 case 1:
238                         if (c == '-') {
239                                 g_string_append_c(out, '&');
240                                 state = 0;
241                         } else if (utf7_rank[c] != 0xff) {
242                                 v = utf7_rank[c];
243                                 i = 6;
244                                 state = 2;
245                         } else {
246                                 /* invalid */
247                                 g_string_append(out, "&-");
248                                 state = 0;
249                         }
250                         break;
251                 case 2:
252                         if (c == '-') {
253                                 state = 0;
254                         } else if (utf7_rank[c] != 0xff) {
255                                 v = (v<<6) | utf7_rank[c];
256                                 i+=6;
257                                 if (i >= 16) {
258                                         x = (v >> (i-16)) & 0xffff;
259                                         g_string_append_u(out, x);
260                                         i-=16;
261                                 }
262                         } else {
263                                 g_string_append_u(out, c);
264                                 state = 0;
265                         }
266                         break;
267                 }
268         } while (c);
269
270         ret = g_strdup(out->str);
271         g_string_free(out, TRUE);
272
273         return ret;
274 }
275
276 static void utf7_closeb64(GString *out, guint32 v, guint32 i)
277 {
278         guint32 x;
279
280         if (i>0) {
281                 x = (v << (6-i)) & 0x3f;
282                 g_string_append_c(out, utf7_alphabet[x]);
283         }
284         g_string_append_c(out, '-');
285 }
286
287 /**
288  * camel_utf8_utf7:
289  * @ptr: 
290  * 
291  * Convert a utf8 string to a modified utf7 format.
292  *
293  * The IMAP rules [rfc2060] are used in the utf7 encoding.
294  * 
295  * Return value: 
296  **/
297 char *
298 camel_utf8_utf7(const char *ptr)
299 {
300         const unsigned char *p = (unsigned char *)ptr;
301         unsigned int c;
302         guint32 x, v = 0;
303         int state = 0;
304         GString *out;
305         int i = 0;
306         char *ret;
307
308         out = g_string_new("");
309
310         while ( (c = camel_utf8_getc(&p)) ) {
311                 if (c >= 0x20 && c <= 0x7e) {
312                         if (state == 1) {
313                                 utf7_closeb64(out, v, i);
314                                 state = 0;
315                                 i = 0;
316                         }
317                         if (c == '&')
318                                 g_string_append(out, "&-");
319                         else
320                                 g_string_append_c(out, c);
321                 } else {
322                         if (state == 0) {
323                                 g_string_append_c(out, '&');
324                                 state = 1;
325                         }
326                         v = (v << 16) | c;
327                         i += 16;
328                         while (i >= 6) {
329                                 x = (v >> (i-6)) & 0x3f;
330                                 g_string_append_c(out, utf7_alphabet[x]);
331                                 i -= 6;
332                         }
333                 }
334         }
335
336         if (state == 1)
337                 utf7_closeb64(out, v, i);
338
339         ret = g_strdup(out->str);
340         g_string_free(out, TRUE);
341
342         return ret;
343 }
344
345 /**
346  * camel_utf8_ucs2:
347  * @ptr: 
348  * 
349  * Convert a utf8 string into a ucs2 one.  The ucs string will be in
350  * network byte order, and terminated with a 16 bit NULL.
351  * 
352  * Return value: 
353  **/
354 char *
355 camel_utf8_ucs2(const char *ptr)
356 {
357         GByteArray *work = g_byte_array_new();
358         guint32 c;
359         char *out;
360
361         /* what if c is > 0xffff ? */
362
363         while ( (c = camel_utf8_getc(&ptr)) ) {
364                 guint16 s = g_htons(c);
365
366                 g_byte_array_append(work, (unsigned char *) &s, 2);
367         }
368
369         g_byte_array_append(work, (unsigned char *) "\000\000", 2);
370         out = g_malloc(work->len);
371         memcpy(out, work->data, work->len);
372         g_byte_array_free(work, TRUE);
373
374         return out;
375 }
376
377 /**
378  * camel_ucs2_utf8:
379  * @ptr: 
380  * 
381  * Convert a ucs2 string into a utf8 one.  The ucs2 string is treated
382  * as network byte ordered, and terminated with a 16 bit NUL.
383  * 
384  * Return value: 
385  **/
386 char *camel_ucs2_utf8(const char *ptr)
387 {
388         guint16 *ucs = (guint16 *)ptr;
389         guint32 c;
390         GString *work = g_string_new("");
391         char *out;
392
393         while ( (c = *ucs++) )
394                 g_string_append_u(work, g_ntohs(c));
395
396         out = g_strdup(work->str);
397         g_string_free(work, TRUE);
398
399         return out;
400 }