src/util/support/utf8.c

   1 /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
   2 /* util/support/utf8.c */
   3 /*
   4  * Copyright 2008 by the Massachusetts Institute of Technology.
   5  * All Rights Reserved.
   6  *
   7  * Export of this software from the United States of America may
   8  *   require a specific license from the United States Government.
   9  *   It is the responsibility of any person or organization contemplating
  10  *   export to obtain such a license before exporting.
  11  *
  12  * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
  13  * distribute this software and its documentation for any purpose and
  14  * without fee is hereby granted, provided that the above copyright
  15  * notice appear in all copies and that both that copyright notice and
  16  * this permission notice appear in supporting documentation, and that
  17  * the name of M.I.T. not be used in advertising or publicity pertaining
  18  * to distribution of the software without specific, written prior
  19  * permission.  Furthermore if you modify this software you must label
  20  * your software as modified software and not distribute it in such a
  21  * fashion that it might be confused with the original M.I.T. software.
  22  * M.I.T. makes no representations about the suitability of
  23  * this software for any purpose.  It is provided "as is" without express
  24  * or implied warranty.
  25  */
  26 /*
  27  * Copyright 1998-2008 The OpenLDAP Foundation.
  28  * All rights reserved.
  29  *
  30  * Redistribution and use in source and binary forms, with or without
  31  * modification, are permitted only as authorized by the OpenLDAP
  32  * Public License.
  33  *
  34  * A copy of this license is available in the file LICENSE in the
  35  * top-level directory of the distribution or, alternatively, at
  36  * <http://www.OpenLDAP.org/license.html>.
  37  */
  38
  39 /* This work is part of OpenLDAP Software <http://www.openldap.org/>. */
  40
  41 /* Basic UTF-8 routines
  42  *
  43  * These routines are "dumb".  Though they understand UTF-8,
  44  * they don't grok Unicode.  That is, they can push bits,
  45  * but don't have a clue what the bits represent.  That's
  46  * good enough for use with the KRB5 Client SDK.
  47  *
  48  * These routines are not optimized.
  49  */
  50
  51 #include "k5-platform.h"
  52 #include "k5-utf8.h"
  53 #include "supp-int.h"
  54
  55 /*
  56  * return the number of bytes required to hold the
  57  * NULL-terminated UTF-8 string NOT INCLUDING the
  58  * termination.
  59  */
  60 size_t krb5int_utf8_bytes(const char *p)
  61 {
  62     size_t bytes;
  63
  64     for (bytes = 0; p[bytes]; bytes++)
  65         ;
  66
  67     return bytes;
  68 }
  69
  70 size_t krb5int_utf8_chars(const char *p)
  71 {
  72     /* could be optimized and could check for invalid sequences */
  73     size_t chars = 0;
  74
  75     for ( ; *p ; KRB5_UTF8_INCR(p))
  76         chars++;
  77
  78     return chars;
  79 }
  80
  81 size_t krb5int_utf8c_chars(const char *p, size_t length)
  82 {
  83     /* could be optimized and could check for invalid sequences */
  84     size_t chars = 0;
  85     const char *end = p + length;
  86
  87     for ( ; p < end; KRB5_UTF8_INCR(p))
  88         chars++;
  89
  90     return chars;
  91 }
  92
  93 /* return offset to next character */
  94 int krb5int_utf8_offset(const char *p)
  95 {
  96     return KRB5_UTF8_NEXT(p) - p;
  97 }
  98
  99 /*
 100  * Returns length indicated by first byte.
 101  */
 102 const char krb5int_utf8_lentab[] = {
 103     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 104     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 105     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 106     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 107     0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 108     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 109     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 110     4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
 111
 112 int krb5int_utf8_charlen(const char *p)
 113 {
 114     if (!(*p & 0x80))
 115         return 1;
 116
 117     return krb5int_utf8_lentab[*(const unsigned char *)p ^ 0x80];
 118 }
 119
 120 /*
 * Make sure the UTF-8 char used the shortest possible encoding.
 * Returns charlen if valid, 0 if not.
 123  *
 124  * Here are the valid UTF-8 encodings, taken from RFC 3629 page 4.
 125  * The table is slightly modified from that of the RFC.
 126  *
 127  * UCS-4 range (hex)      UTF-8 sequence (binary)
 128  * 0000 0000-0000 007F   0.......
 129  * 0000 0080-0000 07FF   110++++. 10......
 130  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
 131  * 0001 0000-0010 FFFF   11110+++ 10++.... 10...... 10......
 132  *
 133  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
 134  * at least one of the '+' bits must be set, otherwise the character
 135  * should have been encoded in fewer octets. Note that in the two-octet
 136  * case, only the first octet needs to be validated, and this is done
 137  * in the krb5int_utf8_lentab[] above.
 138  */
 139
 140 /* mask of required bits in second octet */
 141 #undef c
 142 #define c const char
 143 c krb5int_utf8_mintab[] = {
 144     (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
 145     (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
 146     (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x00, (c)0x00, (c)0x00,
 147     (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00, (c)0x00 };
 148 #undef c
 149
 150 int krb5int_utf8_charlen2(const char *p)
 151 {
 152     int i = KRB5_UTF8_CHARLEN(p);
 153
 154     if (i > 2) {
 155         if (!(krb5int_utf8_mintab[*p & 0x1f] & p[1]))
 156             i = 0;
 157     }
 158
 159     return i;
 160 }
 161
 162 /*
 163  * Convert a UTF8 character to a UCS4 character.  Return 0 on success,
 164  * -1 on failure.
 165  */
 166 int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out)
 167 {
 168     const unsigned char *c = (const unsigned char *) p;
 169     krb5_ucs4 ch;
 170     int len, i;
 171     static unsigned char mask[] = {
 172         0, 0x7f, 0x1f, 0x0f, 0x07 };
 173
 174     *out = 0;
 175     len = KRB5_UTF8_CHARLEN2(p, len);
 176
 177     if (len == 0)
 178         return -1;
 179
 180     ch = c[0] & mask[len];
 181
 182     for (i = 1; i < len; i++) {
 183         if ((c[i] & 0xc0) != 0x80)
 184             return -1;
 185
 186         ch <<= 6;
 187         ch |= c[i] & 0x3f;
 188     }
 189
 190     if (ch > 0x10ffff)
 191         return -1;
 192
 193     *out = ch;
 194     return 0;
 195 }
 196
 197 int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out)
 198 {
 199     krb5_ucs4 ch;
 200
 201     *out = 0;
 202     if (krb5int_utf8_to_ucs4(p, &ch) == -1 || ch > 0xFFFF)
 203         return -1;
 204     *out = (krb5_ucs2) ch;
 205     return 0;
 206 }
 207
 208 /* conv UCS-4 to UTF-8 */
 209 size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf)
 210 {
 211     size_t len = 0;
 212     unsigned char *p = (unsigned char *) buf;
 213
 214     /* not a valid Unicode character */
 215     if (c > 0x10ffff)
 216         return 0;
 217
 218     /* Just return length, don't convert */
 219     if (buf == NULL) {
 220         if (c < 0x80) return 1;
 221         else if (c < 0x800) return 2;
 222         else if (c < 0x10000) return 3;
 223         else return 4;
 224     }
 225
 226     if (c < 0x80) {
 227         p[len++] = c;
 228     } else if (c < 0x800) {
 229         p[len++] = 0xc0 | ( c >> 6 );
 230         p[len++] = 0x80 | ( c & 0x3f );
 231     } else if (c < 0x10000) {
 232         p[len++] = 0xe0 | ( c >> 12 );
 233         p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 234         p[len++] = 0x80 | ( c & 0x3f );
 235     } else /* if (c < 0x110000) */ {
 236         p[len++] = 0xf0 | ( c >> 18 );
 237         p[len++] = 0x80 | ( (c >> 12) & 0x3f );
 238         p[len++] = 0x80 | ( (c >> 6) & 0x3f );
 239         p[len++] = 0x80 | ( c & 0x3f );
 240     }
 241
 242     return len;
 243 }
 244
 245 size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf)
 246 {
 247     return krb5int_ucs4_to_utf8((krb5_ucs4)c, buf);
 248 }
 249
 250 /*
 251  * Advance to the next UTF-8 character
 252  *
 253  * Ignores length of multibyte character, instead rely on
 254  * continuation markers to find start of next character.
 255  * This allows for "resyncing" of when invalid characters
 256  * are provided provided the start of the next character
 257  * is appears within the 6 bytes examined.
 258  */
 259 char *krb5int_utf8_next(const char *p)
 260 {
 261     int i;
 262     const unsigned char *u = (const unsigned char *) p;
 263
 264     if (KRB5_UTF8_ISASCII(u)) {
 265         return (char *) &p[1];
 266     }
 267
 268     for (i = 1; i < 6; i++) {
 269         if ((u[i] & 0xc0) != 0x80) {
 270             return (char *) &p[i];
 271         }
 272     }
 273
 274     return (char *) &p[i];
 275 }
 276
 277 /*
 278  * Advance to the previous UTF-8 character
 279  *
 280  * Ignores length of multibyte character, instead rely on
 281  * continuation markers to find start of next character.
 282  * This allows for "resyncing" of when invalid characters
 283  * are provided provided the start of the next character
 284  * is appears within the 6 bytes examined.
 285  */
 286 char *krb5int_utf8_prev(const char *p)
 287 {
 288     int i;
 289     const unsigned char *u = (const unsigned char *) p;
 290
 291     for (i = -1; i>-6 ; i--) {
 292         if ((u[i] & 0xc0 ) != 0x80) {
 293             return (char *) &p[i];
 294         }
 295     }
 296
 297     return (char *) &p[i];
 298 }
 299
 300 /*
 301  * Copy one UTF-8 character from src to dst returning
 302  * number of bytes copied.
 303  *
 304  * Ignores length of multibyte character, instead rely on
 305  * continuation markers to find start of next character.
 306  * This allows for "resyncing" of when invalid characters
 307  * are provided provided the start of the next character
 308  * is appears within the 6 bytes examined.
 309  */
 310 int krb5int_utf8_copy(char* dst, const char *src)
 311 {
 312     int i;
 313     const unsigned char *u = (const unsigned char *) src;
 314
 315     dst[0] = src[0];
 316
 317     if (KRB5_UTF8_ISASCII(u)) {
 318         return 1;
 319     }
 320
 321     for (i=1; i<6; i++) {
 322         if ((u[i] & 0xc0) != 0x80) {
 323             return i;
 324         }
 325         dst[i] = src[i];
 326     }
 327
 328     return i;
 329 }
 330
 331 #ifndef UTF8_ALPHA_CTYPE
 332 /*
 333  * UTF-8 ctype routines
 334  * Only deals with characters < 0x80 (ie: US-ASCII)
 335  */
 336
 337 int krb5int_utf8_isascii(const char * p)
 338 {
 339     unsigned c = * (const unsigned char *) p;
 340
 341     return KRB5_ASCII(c);
 342 }
 343
 344 int krb5int_utf8_isdigit(const char * p)
 345 {
 346     unsigned c = * (const unsigned char *) p;
 347
 348     if (!KRB5_ASCII(c))
 349         return 0;
 350
 351     return KRB5_DIGIT( c );
 352 }
 353
 354 int krb5int_utf8_isxdigit(const char * p)
 355 {
 356     unsigned c = * (const unsigned char *) p;
 357
 358     if (!KRB5_ASCII(c))
 359         return 0;
 360
 361     return KRB5_HEX(c);
 362 }
 363
 364 int krb5int_utf8_isspace(const char * p)
 365 {
 366     unsigned c = * (const unsigned char *) p;
 367
 368     if (!KRB5_ASCII(c))
 369         return 0;
 370
 371     switch(c) {
 372     case ' ':
 373     case '\t':
 374     case '\n':
 375     case '\r':
 376     case '\v':
 377     case '\f':
 378         return 1;
 379     }
 380
 381     return 0;
 382 }
 383
 384 /*
 385  * These are not needed by the C SDK and are
 386  * not "good enough" for general use.
 387  */
 388 int krb5int_utf8_isalpha(const char * p)
 389 {
 390     unsigned c = * (const unsigned char *) p;
 391
 392     if (!KRB5_ASCII(c))
 393         return 0;
 394
 395     return KRB5_ALPHA(c);
 396 }
 397
 398 int krb5int_utf8_isalnum(const char * p)
 399 {
 400     unsigned c = * (const unsigned char *) p;
 401
 402     if (!KRB5_ASCII(c))
 403         return 0;
 404
 405     return KRB5_ALNUM(c);
 406 }
 407 #endif
 408
 409
 410 /*
 411  * UTF-8 string routines
 412  */
 413
 414 /* like strchr() */
 415 char *krb5int_utf8_strchr(const char *str, const char *chr)
 416 {
 417     krb5_ucs4 chs, ch;
 418
 419     if (krb5int_utf8_to_ucs4(chr, &ch) == -1)
 420         return NULL;
 421     for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
 422         if (krb5int_utf8_to_ucs4(str, &chs) == 0 && chs == ch)
 423             return (char *)str;
 424     }
 425
 426     return NULL;
 427 }
 428
 429 /* like strcspn() but returns number of bytes, not characters */
 430 size_t krb5int_utf8_strcspn(const char *str, const char *set)
 431 {
 432     const char *cstr, *cset;
 433     krb5_ucs4 chstr, chset;
 434
 435     for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
 436         for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
 437             if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
 438                 && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
 439                 return cstr - str;
 440         }
 441     }
 442
 443     return cstr - str;
 444 }
 445
 446 /* like strspn() but returns number of bytes, not characters */
 447 size_t krb5int_utf8_strspn(const char *str, const char *set)
 448 {
 449     const char *cstr, *cset;
 450     krb5_ucs4 chstr, chset;
 451
 452     for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
 453         for (cset = set; ; KRB5_UTF8_INCR(cset)) {
 454             if (*cset == '\0')
 455                 return cstr - str;
 456             if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
 457                 && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
 458                 break;
 459         }
 460     }
 461
 462     return cstr - str;
 463 }
 464
 465 /* like strpbrk(), replaces strchr() as well */
 466 char *krb5int_utf8_strpbrk(const char *str, const char *set)
 467 {
 468     const char *cset;
 469     krb5_ucs4 chstr, chset;
 470
 471     for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
 472         for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
 473             if (krb5int_utf8_to_ucs4(str, &chstr) == 0
 474                 && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
 475                 return (char *)str;
 476         }
 477     }
 478
 479     return NULL;
 480 }
 481
 482 /* like strtok_r(), not strtok() */
 483 char *krb5int_utf8_strtok(char *str, const char *sep, char **last)
 484 {
 485     char *begin;
 486     char *end;
 487
 488     if (last == NULL)
 489         return NULL;
 490
 491     begin = str ? str : *last;
 492
 493     begin += krb5int_utf8_strspn(begin, sep);
 494
 495     if (*begin == '\0') {
 496         *last = NULL;
 497         return NULL;
 498     }
 499
 500     end = &begin[krb5int_utf8_strcspn(begin, sep)];
 501
 502     if (*end != '\0') {
 503         char *next = KRB5_UTF8_NEXT(end);
 504         *end = '\0';
 505         end = next;
 506     }
 507
 508     *last = end;
 509
 510     return begin;
 511 }