1 /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* util/support/utf8.c */
4 * Copyright 2008 by the Massachusetts Institute of Technology.
7 * Export of this software from the United States of America may
8 * require a specific license from the United States Government.
9 * It is the responsibility of any person or organization contemplating
10 * export to obtain such a license before exporting.
12 * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
13 * distribute this software and its documentation for any purpose and
14 * without fee is hereby granted, provided that the above copyright
15 * notice appear in all copies and that both that copyright notice and
16 * this permission notice appear in supporting documentation, and that
17 * the name of M.I.T. not be used in advertising or publicity pertaining
18 * to distribution of the software without specific, written prior
19 * permission. Furthermore if you modify this software you must label
20 * your software as modified software and not distribute it in such a
21 * fashion that it might be confused with the original M.I.T. software.
22 * M.I.T. makes no representations about the suitability of
23 * this software for any purpose. It is provided "as is" without express
24 * or implied warranty.
27 * Copyright 1998-2008 The OpenLDAP Foundation.
28 * All rights reserved.
30 * Redistribution and use in source and binary forms, with or without
31 * modification, are permitted only as authorized by the OpenLDAP
34 * A copy of this license is available in the file LICENSE in the
35 * top-level directory of the distribution or, alternatively, at
36 * <http://www.OpenLDAP.org/license.html>.
39 /* This work is part of OpenLDAP Software <http://www.openldap.org/>. */
41 /* Basic UTF-8 routines
43 * These routines are "dumb". Though they understand UTF-8,
44 * they don't grok Unicode. That is, they can push bits,
45 * but don't have a clue what the bits represent. That's
46 * good enough for use with the KRB5 Client SDK.
48 * These routines are not optimized.
51 #include "k5-platform.h"
56 * return the number of bytes required to hold the
57 * NUL-terminated UTF-8 string NOT INCLUDING the
/*
 * Return the number of bytes in the NUL-terminated string p, not
 * counting the terminating NUL byte.
 *
 * The scan is byte-wise: multi-byte UTF-8 sequences are not decoded, so
 * the result is a byte count, not a character count.
 */
size_t krb5int_utf8_bytes(const char *p)
{
    size_t bytes = 0;

    /* Walk forward until the terminating NUL byte is reached. */
    while (p[bytes] != '\0')
        bytes++;

    return bytes;
}
/*
 * Count the characters in the NUL-terminated UTF-8 string p.  The loop
 * shown here steps p with KRB5_UTF8_INCR() until it reaches a zero byte.
 *
 * NOTE(review): the counter declaration, the loop body and the return
 * statement are not part of this listing; presumably the body adds one
 * to a counter per step -- confirm against the complete file.
 */
70 size_t krb5int_utf8_chars(const char *p)
72 /* could be optimized and could check for invalid sequences */
75 for ( ; *p ; KRB5_UTF8_INCR(p))
/*
 * Length-bounded character count: the loop steps p with KRB5_UTF8_INCR()
 * for as long as p is below p + length, so a zero byte does not stop it.
 *
 * NOTE(review): the counter declaration, the loop body and the return
 * statement are not part of this listing -- confirm against the
 * complete file.
 * NOTE(review): only the loop condition compares against end.  If
 * KRB5_UTF8_INCR() inspects the bytes that follow p, a multi-byte
 * sequence cut off at the end of the buffer could make it look past
 * end -- verify against the macro's definition.
 */
81 size_t krb5int_utf8c_chars(const char *p, size_t length)
83 /* could be optimized and could check for invalid sequences */
/* one past the last byte that may be counted */
85 const char *end = p + length;
87 for ( ; p < end; KRB5_UTF8_INCR(p))
93 /* return offset to next character */
94 int krb5int_utf8_offset(const char *p)
96 return KRB5_UTF8_NEXT(p) - p;
100 * Returns length indicated by first byte.
/*
 * Sequence length announced by a lead byte whose high bit is set.
 * The table has 128 entries and is indexed with (byte ^ 0x80), so
 * entry k describes byte value 0x80 + k.  A zero entry marks a byte
 * that may not begin a sequence.
 */
const char krb5int_utf8_lentab[] = {
    /* 0x80 - 0xBF: continuation bytes, never a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xC1: two-octet leads that could only encode overlong forms */
    0, 0,
    /* 0xC2 - 0xDF: two-octet sequences */
    2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-octet sequences */
    3, 3, 3, 3, 3, 3, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF4: four-octet sequences */
    4, 4, 4, 4, 4,
    /* 0xF5 - 0xFF: not valid as a lead byte */
    0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
/*
 * Return the sequence length announced by the first byte of p, looked
 * up in krb5int_utf8_lentab[].
 *
 * NOTE(review): the table has 128 entries and the index is the byte
 * with its top bit flipped, so the lookup shown is only in range for
 * bytes >= 0x80.  The lines between the signature and this return are
 * not part of this listing; presumably they deal with bytes < 0x80
 * before the lookup is reached -- confirm against the complete file.
 */
112 int krb5int_utf8_charlen(const char *p)
/* read the byte as unsigned, then map 0x80..0xFF onto index 0..127 */
117 return krb5int_utf8_lentab[*(const unsigned char *)p ^ 0x80];
121 * Make sure the UTF-8 char used the shortest possible encoding
122 * returns charlen if valid, 0 if not.
124 * Here are the valid UTF-8 encodings, taken from RFC 3629 page 4.
125 * The table is slightly modified from that of the RFC.
127 * UCS-4 range (hex) UTF-8 sequence (binary)
128 * 0000 0000-0000 007F 0.......
129 * 0000 0080-0000 07FF 110++++. 10......
130 * 0000 0800-0000 FFFF 1110++++ 10+..... 10......
131 * 0001 0000-0010 FFFF 11110+++ 10++.... 10...... 10......
133 * The '.' bits are "don't cares". When validating a UTF-8 sequence,
134 * at least one of the '+' bits must be set, otherwise the character
135 * should have been encoded in fewer octets. Note that in the two-octet
136 * case, only the first octet needs to be validated, and this is done
137 * in the krb5int_utf8_lentab[] above.
140 /* mask of required bits in second octet */
/*
 * Per-lead-byte mask applied to the second octet of a sequence.  The
 * table has 32 entries and krb5int_utf8_charlen2() indexes it with
 * (lead byte & 0x1f); the second octet must share at least one set bit
 * with the selected entry or the sequence is rejected.
 *
 *   entry 0x00 (0x20): the payload bits of the lead byte are all zero,
 *                      so the '+' bit of a three-octet second byte
 *                      must be set
 *   entry 0x10 (0x30): likewise for the two '+' bits of a four-octet
 *                      second byte
 *   entries of 0x80:   only the top bit of the second octet is required
 *   entries of 0x00:   no second octet can satisfy the mask
 *
 * NOTE(review): `c` is used here as a type name and as a cast; its
 * definition is not part of this listing (presumably a macro for
 * `const char`) -- confirm against the complete file.
 */
143 c krb5int_utf8_mintab[] = {
144 (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
145 (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
146 (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x00, (c)0x00, (c)0x00,
147 (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00 };
/*
 * Length of the UTF-8 character at p with a shortest-encoding check:
 * the length comes from KRB5_UTF8_CHARLEN(), and the second octet is
 * tested against the krb5int_utf8_mintab[] mask selected by the low
 * five bits of the lead byte.
 *
 * NOTE(review): the condition that decides when the mask test runs,
 * the statement executed when it fails and the return statement are
 * not part of this listing.  The comment preceding the table says the
 * result is the length if valid and 0 if not -- confirm against the
 * complete file.
 */
150 int krb5int_utf8_charlen2(const char *p)
152 int i = KRB5_UTF8_CHARLEN(p);
/* true when the second octet has none of the required bits set */
155 if (!(krb5int_utf8_mintab[*p & 0x1f] & p[1]))
163 * Convert a UTF8 character to a UCS4 character. Return 0 on success,
/*
 * Decode the UTF-8 character at p into a UCS-4 value for *out.
 *
 * The sequence length is obtained through KRB5_UTF8_CHARLEN2(); the
 * payload bits of the lead byte are extracted with mask[len], and each
 * of the remaining len - 1 bytes is required to be a continuation byte
 * (10xxxxxx).  Callers in this file compare the result against 0 and
 * -1.
 *
 * NOTE(review): the declarations of ch, len and i, the handling of a
 * zero length, the statements that fold each continuation byte into
 * ch, the store to *out and the return statements are not part of this
 * listing -- confirm against the complete file.
 * NOTE(review): mask[] is not written in any line shown here; it could
 * probably be declared `static const`.
 */
166 int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out)
/* unsigned view of the input so that bytes >= 0x80 are not negative */
168 const unsigned char *c = (const unsigned char *) p;
/* payload bits of the lead byte, indexed by sequence length (1..4) */
171 static unsigned char mask[] = {
172 0, 0x7f, 0x1f, 0x0f, 0x07 };
175 len = KRB5_UTF8_CHARLEN2(p, len);
180 ch = c[0] & mask[len];
182 for (i = 1; i < len; i++) {
/* every byte after the first must look like 10xxxxxx */
183 if ((c[i] & 0xc0) != 0x80)
/*
 * Decode the UTF-8 character at p into a UCS-2 value.  The character is
 * decoded with krb5int_utf8_to_ucs4(); a decoding failure or a value
 * above 0xFFFF takes the branch under the visible `if`, otherwise the
 * value is narrowed to krb5_ucs2 and stored in *out.
 *
 * NOTE(review): the declaration of ch and the return statements are
 * not part of this listing; presumably the failure branch returns -1
 * and success returns 0, matching krb5int_utf8_to_ucs4() -- confirm
 * against the complete file.
 */
197 int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out)
202 if (krb5int_utf8_to_ucs4(p, &ch) == -1 || ch > 0xFFFF)
204 *out = (krb5_ucs2) ch;
208 /* conv UCS-4 to UTF-8 */
/*
 * Encode the UCS-4 value c as UTF-8 into buf.
 *
 * The branches shown emit two bytes for c < 0x800, three bytes for
 * c < 0x10000 and four bytes otherwise, appending through p[len++].
 * A separate length-only path returns 1, 2 or 3 for the corresponding
 * ranges without writing anything.  None of the lines shown writes a
 * terminating NUL.
 *
 * NOTE(review): several lines are not part of this listing: the
 * declaration of len, the test under "not a valid Unicode character",
 * the condition guarding the length-only path and its last case, the
 * one-byte branch and the final return.  Presumably the length-only
 * path is taken when buf is NULL and the function returns len --
 * confirm against the complete file.
 */
209 size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf)
/* unsigned view of the output so the lead/continuation bytes are stored unchanged */
212 unsigned char *p = (unsigned char *) buf;
214 /* not a valid Unicode character */
218 /* Just return length, don't convert */
220 if (c < 0x80) return 1;
221 else if (c < 0x800) return 2;
222 else if (c < 0x10000) return 3;
/* two octets: 110xxxxx 10xxxxxx */
228 } else if (c < 0x800) {
229 p[len++] = 0xc0 | ( c >> 6 );
230 p[len++] = 0x80 | ( c & 0x3f );
/* three octets: 1110xxxx 10xxxxxx 10xxxxxx */
231 } else if (c < 0x10000) {
232 p[len++] = 0xe0 | ( c >> 12 );
233 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
234 p[len++] = 0x80 | ( c & 0x3f );
/* four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
235 } else /* if (c < 0x110000) */ {
236 p[len++] = 0xf0 | ( c >> 18 );
237 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
238 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
239 p[len++] = 0x80 | ( c & 0x3f );
245 size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf)
247 return krb5int_ucs4_to_utf8((krb5_ucs4)c, buf);
251 * Advance to the next UTF-8 character
253 * Ignores length of multibyte character, instead rely on
254 * continuation markers to find start of next character.
255 * This allows for "resyncing" when invalid characters
256 * are provided, provided the start of the next character
257 * appears within the 6 bytes examined.
/*
 * Return a pointer to the start of the character that follows the one
 * at p.
 *
 * When KRB5_UTF8_ISASCII() holds for the first byte, the answer is the
 * very next byte.  Otherwise the declared sequence length is ignored:
 * the scan skips continuation bytes (10xxxxxx) and stops at the first
 * byte that is not one, which includes a terminating NUL.  At most five
 * bytes after p are examined; if all of them are continuation bytes,
 * p + 6 is returned unexamined.
 *
 * The const qualifier of p is dropped in the returned pointer.
 */
char *krb5int_utf8_next(const char *p)
{
    const unsigned char *u = (const unsigned char *) p;
    int off;

    if (KRB5_UTF8_ISASCII(u))
        return (char *) &p[1];

    for (off = 1; off < 6; off++) {
        /* Anything other than 10xxxxxx begins a new character. */
        if ((u[off] & 0xc0) != 0x80)
            break;
    }

    return (char *) &p[off];
}
278 * Advance to the previous UTF-8 character
280 * Ignores length of multibyte character, instead rely on
281 * continuation markers to find start of previous character.
282 * This allows for "resyncing" when invalid characters
283 * are provided, provided the start of the previous character
284 * appears within the 6 bytes examined.
/*
 * Return a pointer to the start of the character that precedes p.
 *
 * Sequence lengths are ignored: the scan walks backwards over
 * continuation bytes (10xxxxxx) and stops at the first byte that is not
 * one.  At most five bytes before p are examined; if all of them are
 * continuation bytes, p - 6 is returned unexamined.
 *
 * The bytes before p must be readable; the function has no way to know
 * where the buffer begins.  The const qualifier of p is dropped in the
 * returned pointer.
 */
char *krb5int_utf8_prev(const char *p)
{
    const unsigned char *u = (const unsigned char *) p;
    int back;

    for (back = -1; back > -6; back--) {
        /* Anything other than 10xxxxxx begins a character. */
        if ((u[back] & 0xc0) != 0x80)
            break;
    }

    return (char *) &p[back];
}
301 * Copy one UTF-8 character from src to dst returning
302 * number of bytes copied.
304 * Ignores length of multibyte character, instead rely on
305 * continuation markers to find start of next character.
306 * This allows for "resyncing" when invalid characters
307 * are provided, provided the start of the next character
308 * appears within the 6 bytes examined.
/*
 * Copy one UTF-8 character from src to dst.  The lines shown test the
 * first byte with KRB5_UTF8_ISASCII() and then scan bytes 1..5 of src
 * for the first one that is not a continuation byte (10xxxxxx).
 *
 * NOTE(review): the declaration of i, the statements that store into
 * dst and every return statement are not part of this listing.  The
 * comment preceding the function says the result is the number of
 * bytes copied -- confirm against the complete file, including whether
 * dst is NUL-terminated (nothing shown here does so).
 */
310 int krb5int_utf8_copy(char* dst, const char *src)
/* unsigned view of the source so that bytes >= 0x80 are not negative */
313 const unsigned char *u = (const unsigned char *) src;
317 if (KRB5_UTF8_ISASCII(u)) {
321 for (i=1; i<6; i++) {
/* a byte that is not 10xxxxxx ends the current character */
322 if ((u[i] & 0xc0) != 0x80) {
331 #ifndef UTF8_ALPHA_CTYPE
333 * UTF-8 ctype routines
334 * Only deals with characters < 0x80 (ie: US-ASCII)
/*
 * Return the result of KRB5_ASCII() applied to the first byte of p.
 * The byte is read through unsigned char so that values >= 0x80 reach
 * the macro as 128..255 rather than as negative numbers.
 */
int krb5int_utf8_isascii(const char * p)
{
    const unsigned char *bytes = (const unsigned char *) p;
    unsigned first = bytes[0];

    return KRB5_ASCII(first);
}
/*
 * Classify the first byte of p with KRB5_DIGIT() and return the result.
 * Only that one byte is read.
 *
 * NOTE(review): the lines between the load and the return are not part
 * of this listing; given the section comment ("Only deals with
 * characters < 0x80") they presumably reject non-ASCII bytes first --
 * confirm against the complete file.
 */
344 int krb5int_utf8_isdigit(const char * p)
/* read as unsigned char so bytes >= 0x80 are not sign-extended */
346 unsigned c = * (const unsigned char *) p;
351 return KRB5_DIGIT( c );
/*
 * Classify the first byte of p as a hexadecimal digit (per the function
 * name).  Only that one byte is read.
 *
 * NOTE(review): everything after the load of c -- the classification
 * itself and the return statements -- is not part of this listing;
 * confirm the exact test against the complete file.
 */
354 int krb5int_utf8_isxdigit(const char * p)
/* read as unsigned char so bytes >= 0x80 are not sign-extended */
356 unsigned c = * (const unsigned char *) p;
/*
 * Classify the first byte of p as white space (per the function name).
 * Only that one byte is read.
 *
 * NOTE(review): everything after the load of c -- including which
 * characters count as white space -- is not part of this listing;
 * confirm against the complete file.
 */
364 int krb5int_utf8_isspace(const char * p)
/* read as unsigned char so bytes >= 0x80 are not sign-extended */
366 unsigned c = * (const unsigned char *) p;
385 * These are not needed by the C SDK and are
386 * not "good enough" for general use.
/*
 * Classify the first byte of p with KRB5_ALPHA() and return the result.
 * Only that one byte is read.
 *
 * NOTE(review): the lines between the load and the return are not part
 * of this listing; presumably they reject non-ASCII bytes first --
 * confirm against the complete file.
 */
388 int krb5int_utf8_isalpha(const char * p)
/* read as unsigned char so bytes >= 0x80 are not sign-extended */
390 unsigned c = * (const unsigned char *) p;
395 return KRB5_ALPHA(c);
/*
 * Classify the first byte of p with KRB5_ALNUM() and return the result.
 * Only that one byte is read.
 *
 * NOTE(review): the lines between the load and the return are not part
 * of this listing; presumably they reject non-ASCII bytes first --
 * confirm against the complete file.
 */
398 int krb5int_utf8_isalnum(const char * p)
/* read as unsigned char so bytes >= 0x80 are not sign-extended */
400 unsigned c = * (const unsigned char *) p;
405 return KRB5_ALNUM(c);
411 * UTF-8 string routines
/*
 * Search the NUL-terminated UTF-8 string str for the character encoded
 * at chr.  The target is decoded once with krb5int_utf8_to_ucs4(); str
 * is then stepped with KRB5_UTF8_INCR() and each position that decodes
 * successfully is compared with the target by code point.  Positions
 * that fail to decode never match.
 *
 * NOTE(review): the declarations of ch and chs and all return
 * statements are not part of this listing; presumably an undecodable
 * chr and an exhausted str both yield NULL and a match yields the
 * current position -- confirm against the complete file.
 */
415 char *krb5int_utf8_strchr(const char *str, const char *chr)
/* an undecodable target cannot be searched for */
419 if (krb5int_utf8_to_ucs4(chr, &ch) == -1)
421 for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
422 if (krb5int_utf8_to_ucs4(str, &chs) == 0 && chs == ch)
429 /* like strcspn() but returns number of bytes, not characters */
/*
 * Scan str for the first character that also occurs in set.  Both
 * strings are stepped with KRB5_UTF8_INCR(), and characters are
 * compared by decoded code point; a position in either string that
 * fails to decode never matches.
 *
 * NOTE(review): the return statements are not part of this listing;
 * the comment preceding the function says the result is a byte count,
 * presumably the offset of cstr from str -- confirm against the
 * complete file.
 * NOTE(review): the character at cstr is decoded again for every
 * element of set; the decode does not depend on cset.
 */
430 size_t krb5int_utf8_strcspn(const char *str, const char *set)
432 const char *cstr, *cset;
433 krb5_ucs4 chstr, chset;
435 for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
436 for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
437 if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
438 && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
446 /* like strspn() but returns number of bytes, not characters */
/*
 * Scan the leading part of str whose characters all occur in set.
 * Both strings are stepped with KRB5_UTF8_INCR(), and characters are
 * compared by decoded code point; a position in either string that
 * fails to decode never matches.
 *
 * NOTE(review): the inner loop has no condition of its own in the
 * lines shown, so its exit must come from statements that are not part
 * of this listing (presumably a test for the end of set, and a break
 * on a match).  The return statements are missing as well; the comment
 * preceding the function says the result is a byte count -- confirm
 * against the complete file.
 */
447 size_t krb5int_utf8_strspn(const char *str, const char *set)
449 const char *cstr, *cset;
450 krb5_ucs4 chstr, chset;
452 for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
453 for (cset = set; ; KRB5_UTF8_INCR(cset)) {
456 if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
457 && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
465 /* like strpbrk(), replaces strchr() as well */
/*
 * Search str for the first character that also occurs in set.  Both
 * strings are stepped with KRB5_UTF8_INCR(), and characters are
 * compared by decoded code point; a position in either string that
 * fails to decode never matches.
 *
 * NOTE(review): the declaration of cset and all return statements are
 * not part of this listing; presumably a match yields the current
 * position in str and exhaustion yields NULL -- confirm against the
 * complete file.
 */
466 char *krb5int_utf8_strpbrk(const char *str, const char *set)
469 krb5_ucs4 chstr, chset;
471 for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
472 for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
473 if (krb5int_utf8_to_ucs4(str, &chstr) == 0
474 && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
482 /* like strtok_r(), not strtok() */
483 char *krb5int_utf8_strtok(char *str, const char *sep, char **last)
491 begin = str ? str : *last;
493 begin += krb5int_utf8_strspn(begin, sep);
495 if (*begin == '\0') {
500 end = &begin[krb5int_utf8_strcspn(begin, sep)];
503 char *next = KRB5_UTF8_NEXT(end);