Extending test-client-custom-summary to try e_book_client_get_contacts_uids()
[platform/upstream/evolution-data-server.git] / camel / camel-utf8.c
1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3  *  Authors: Michael Zucchi <notzed@ximian.com>
4  *
5  *  Copyright (C) 1999-2008 Novell, Inc. (www.novell.com)
6  *
7  *  This program is free software; you can redistribute it and/or modify
8  *  it under the terms of the GNU Lesser General Public License as published by
9  *  the Free Software Foundation; either version 2 of the License, or
10  *  (at your option) any later version.
11  *
12  *  This program is distributed in the hope that it will be useful,
13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *  GNU Lesser General Public License for more details.
16  *
17  *  You should have received a copy of the GNU Lesser General Public License
18  *  along with this program; if not, write to the Free Software
19  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20  *
21  */
22
23 #ifdef HAVE_CONFIG_H
24 #include <config.h>
25 #endif
26
27 #include <string.h>
28 #include <sys/types.h>
29
30 #include "camel-utf8.h"
31
32 /**
33  * camel_utf8_putc:
34  * @ptr:
35  * @c:
36  *
37  * Output a 32 bit unicode character as utf8 octets.  At most 4 octets will
38  * be written to @ptr.  @ptr will be advanced to the next character position.
39  **/
40 void
41 camel_utf8_putc (guchar **ptr,
42                  guint32 c)
43 {
44         register guchar *p = *ptr;
45
46         if (c <= 0x7f)
47                 *p++ = c;
48         else if (c <= 0x7ff) {
49                 *p++ = 0xc0 | c >> 6;
50                 *p++ = 0x80 | (c & 0x3f);
51         } else if (c <= 0xffff) {
52                 *p++ = 0xe0 | c >> 12;
53                 *p++ = 0x80 | ((c >> 6) & 0x3f);
54                 *p++ = 0x80 | (c & 0x3f);
55         } else {
56                 /* see unicode standard 3.0, S 3.8, max 4 octets */
57                 *p++ = 0xf0 | c >> 18;
58                 *p++ = 0x80 | ((c >> 12) & 0x3f);
59                 *p++ = 0x80 | ((c >> 6) & 0x3f);
60                 *p++ = 0x80 | (c & 0x3f);
61         }
62
63         *ptr = p;
64 }
65
66 /**
67  * camel_utf8_getc:
68  * @ptr:
69  *
70  * Get a Unicode character from a utf8 stream.  @ptr will be advanced
71  * to the next character position.  Invalid utf8 characters will be
72  * silently skipped.  @ptr should point to a NUL terminated array.
73  *
74  * Returns: The next Unicode character.  @ptr will be advanced to
75  * the next character always.
76  **/
77 guint32
78 camel_utf8_getc (const guchar **ptr)
79 {
80         register guchar *p = (guchar *) * ptr;
81         register guchar c, r;
82         register guint32 v, m;
83
84 again:
85         r = *p++;
86 loop:
87         if (r < 0x80) {
88                 *ptr = p;
89                 v = r;
90         } else if (r < 0xf8) { /* valid start char? (max 4 octets) */
91                 v = r;
92                 m = 0x7f80;     /* used to mask out the length bits */
93                 do {
94                         c = *p++;
95                         if ((c & 0xc0) != 0x80) {
96                                 r = c;
97                                 goto loop;
98                         }
99                         v = (v << 6) | (c & 0x3f);
100                         r <<= 1;
101                         m <<= 5;
102                 } while (r & 0x40);
103
104                 *ptr = p;
105
106                 v &= ~m;
107         } else {
108                 goto again;
109         }
110
111         return v;
112 }
113
114 /**
115  * camel_utf8_getc_limit:
116  * @ptr:
117  * @end: must not be NULL.
118  *
119  * Get the next utf8 gchar at @ptr, and return it, advancing @ptr to
120  * the next character.  If @end is reached before a full utf8
121  * character can be read, then the invalid Unicode gchar 0xffff is
122  * returned as a sentinel (Unicode 3.1, section 2.7), and @ptr is not
123  * advanced.
124  *
125  * Returns: The next utf8 char, or 0xffff.
126  **/
127 guint32
128 camel_utf8_getc_limit (const guchar **ptr,
129                        const guchar *end)
130 {
131         register guchar *p = (guchar *) * ptr;
132         register guchar c, r;
133         register guint32 v = 0xffff, m;
134
135 again:
136         while (p < end) {
137                 r = *p++;
138 loop:
139                 if (r < 0x80) {
140                         *ptr = p;
141                         return r;
142                 } else if (r < 0xf8) { /* valid start char? (max 4 octets) */
143                         v = r;
144                         m = 0x7f80;     /* used to mask out the length bits */
145                         do {
146                                 if (p >= end)
147                                         return 0xffff;
148
149                                 c = *p++;
150                                 if ((c & 0xc0) != 0x80) {
151                                         r = c;
152                                         goto loop;
153                                 }
154                                 v = (v << 6) | (c & 0x3f);
155                                 r <<= 1;
156                                 m <<= 5;
157                         } while (r & 0x40);
158
159                         *ptr = p;
160
161                         v &= ~m;
162                         return v;
163                 } else {
164                         goto again;
165                 }
166         }
167
168         return 0xffff;
169 }
170
171 void
172 g_string_append_u (GString *out,
173                    guint32 c)
174 {
175         guchar buffer[8];
176         guchar *p = buffer;
177
178         camel_utf8_putc (&p, c);
179         *p = 0;
180         g_string_append (out, (const gchar *) buffer);
181 }
182
183 static const gchar utf7_alphabet[] =
184         "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
185
186 static const guchar utf7_rank[256] = {
187         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
188         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
189         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x3e,0x3f,0xff,0xff,0xff,
190         0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0xff,0xff,0xff,0xff,0xff,0xff,
191         0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,
192         0x0f,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0xff,0xff,0xff,0xff,0xff,
193         0xff,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,
194         0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x30,0x31,0x32,0x33,0xff,0xff,0xff,0xff,0xff,
195         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
196         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
197         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
198         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
199         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
200         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
201         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
202         0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
203 };
204
205 /**
206  * camel_utf7_utf8:
207  * @ptr:
208  *
209  * Convert a modified utf7 string to utf8.  If the utf7 string
210  * contains 8 bit characters, they are treated as iso-8859-1.
211  *
212  * The IMAP rules [rfc2060] are used in the utf7 encoding.
213  *
214  * Returns: The converted string.
215  **/
216 gchar *
217 camel_utf7_utf8 (const gchar *ptr)
218 {
219         const guchar *p = (guchar *) ptr;
220         guint c;
221         guint32 v = 0, x;
222         GString *out;
223         gint i = 0;
224         gint state = 0;
225         gchar *ret;
226
227         out = g_string_new ("");
228         do {
229                 c = *p++;
230                 switch (state) {
231                 case 0:
232                         if (c == '&')
233                                 state = 1;
234                         else
235                                 g_string_append_c (out, c);
236                         break;
237                 case 1:
238                         if (c == '-') {
239                                 g_string_append_c (out, '&');
240                                 state = 0;
241                         } else if (utf7_rank[c] != 0xff) {
242                                 v = utf7_rank[c];
243                                 i = 6;
244                                 state = 2;
245                         } else {
246                                 /* invalid */
247                                 g_string_append (out, "&-");
248                                 state = 0;
249                         }
250                         break;
251                 case 2:
252                         if (c == '-') {
253                                 state = 0;
254                         } else if (utf7_rank[c] != 0xff) {
255                                 v = (v << 6) | utf7_rank[c];
256                                 i+=6;
257                                 if (i >= 16) {
258                                         x = (v >> (i - 16)) & 0xffff;
259                                         g_string_append_u (out, x);
260                                         i-=16;
261                                 }
262                         } else {
263                                 g_string_append_u (out, c);
264                                 state = 0;
265                         }
266                         break;
267                 }
268         } while (c);
269
270         ret = g_strdup (out->str);
271         g_string_free (out, TRUE);
272
273         return ret;
274 }
275
276 static void utf7_closeb64 (GString *out, guint32 v, guint32 i)
277 {
278         guint32 x;
279
280         if (i > 0) {
281                 x = (v << (6 - i)) & 0x3f;
282                 g_string_append_c (out, utf7_alphabet[x]);
283         }
284         g_string_append_c (out, '-');
285 }
286
287 /**
288  * camel_utf8_utf7:
289  * @ptr:
290  *
291  * Convert a utf8 string to a modified utf7 format.
292  *
293  * The IMAP rules [rfc2060] are used in the utf7 encoding.
294  *
295  * Returns:
296  **/
297 gchar *
298 camel_utf8_utf7 (const gchar *ptr)
299 {
300         const guchar *p = (guchar *) ptr;
301         guint c;
302         guint32 x, v = 0;
303         gint state = 0;
304         GString *out;
305         gint i = 0;
306         gchar *ret;
307
308         out = g_string_new ("");
309
310         while ((c = camel_utf8_getc (&p))) {
311                 if (c >= 0x20 && c <= 0x7e) {
312                         if (state == 1) {
313                                 utf7_closeb64 (out, v, i);
314                                 state = 0;
315                                 i = 0;
316                         }
317                         if (c == '&')
318                                 g_string_append (out, "&-");
319                         else
320                                 g_string_append_c (out, c);
321                 } else {
322                         if (state == 0) {
323                                 g_string_append_c (out, '&');
324                                 state = 1;
325                         }
326                         v = (v << 16) | c;
327                         i += 16;
328                         while (i >= 6) {
329                                 x = (v >> (i - 6)) & 0x3f;
330                                 g_string_append_c (out, utf7_alphabet[x]);
331                                 i -= 6;
332                         }
333                 }
334         }
335
336         if (state == 1)
337                 utf7_closeb64 (out, v, i);
338
339         ret = g_strdup (out->str);
340         g_string_free (out, TRUE);
341
342         return ret;
343 }
344
345 /**
346  * camel_utf8_ucs2:
347  * @ptr:
348  *
349  * Convert a utf8 string into a ucs2 one.  The ucs string will be in
350  * network byte order, and terminated with a 16 bit NULL.
351  *
352  * Returns:
353  **/
354 gchar *
355 camel_utf8_ucs2 (const gchar *pptr)
356 {
357         GByteArray *work = g_byte_array_new ();
358         guint32 c;
359         gchar *out;
360         const guchar *ptr = (const guchar *) pptr;
361
362         /* what if c is > 0xffff ? */
363
364         while ((c = camel_utf8_getc (&ptr))) {
365                 guint16 s = g_htons (c);
366
367                 g_byte_array_append (work, (guchar *) &s, 2);
368         }
369
370         g_byte_array_append (work, (guchar *) "\000\000", 2);
371         out = g_malloc (work->len);
372         memcpy (out, work->data, work->len);
373         g_byte_array_free (work, TRUE);
374
375         return out;
376 }
377
378 /**
379  * camel_ucs2_utf8:
380  * @ptr:
381  *
382  * Convert a ucs2 string into a utf8 one.  The ucs2 string is treated
383  * as network byte ordered, and terminated with a 16 bit NUL.
384  *
385  * Returns:
386  **/
387 gchar *camel_ucs2_utf8 (const gchar *ptr)
388 {
389         guint16 *ucs = (guint16 *) ptr;
390         guint32 c;
391         GString *work = g_string_new ("");
392         gchar *out;
393
394         while ((c = *ucs++))
395                 g_string_append_u (work, g_ntohs (c));
396
397         out = g_strdup (work->str);
398         g_string_free (work, TRUE);
399
400         return out;
401 }
402
403 /**
404  * camel_utf8_make_valid:
405  * @text:
406  *
407  * Ensures the returned text will be valid UTF-8 string, with incorrect letters
408  * changed to question marks. Returned pointer should be freed with g_free.
409  *
410  * Since: 2.26
411  **/
412 gchar *
413 camel_utf8_make_valid (const gchar *text)
414 {
415         gchar *res = g_strdup (text), *p;
416
417         if (!res)
418                 return res;
419
420         p = res;
421         while (!g_utf8_validate (p, -1, (const gchar **) &p)) {
422                 /* make all invalid characters appear as question marks */
423                 *p = '?';
424         }
425
426         return res;
427 }