src/util/support/utf8_conv.c

   1 /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
   2 /* util/support/utf8_conv.c */
   3 /*
   4  * Copyright 2008, 2017 by the Massachusetts Institute of Technology.
   5  * All Rights Reserved.
   6  *
   7  * Export of this software from the United States of America may
   8  *   require a specific license from the United States Government.
   9  *   It is the responsibility of any person or organization contemplating
  10  *   export to obtain such a license before exporting.
  11  *
  12  * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
  13  * distribute this software and its documentation for any purpose and
  14  * without fee is hereby granted, provided that the above copyright
  15  * notice appear in all copies and that both that copyright notice and
  16  * this permission notice appear in supporting documentation, and that
  17  * the name of M.I.T. not be used in advertising or publicity pertaining
  18  * to distribution of the software without specific, written prior
  19  * permission.  Furthermore if you modify this software you must label
  20  * your software as modified software and not distribute it in such a
  21  * fashion that it might be confused with the original M.I.T. software.
  22  * M.I.T. makes no representations about the suitability of
  23  * this software for any purpose.  It is provided "as is" without express
  24  * or implied warranty.
  25  */
  26 /*
  27  * Copyright 1998-2008 The OpenLDAP Foundation.
  28  * All rights reserved.
  29  *
  30  * Redistribution and use in source and binary forms, with or without
  31  * modification, are permitted only as authorized by the OpenLDAP
  32  * Public License.
  33  *
  34  * A copy of this license is available in the file LICENSE in the
  35  * top-level directory of the distribution or, alternatively, at
  36  * <http://www.OpenLDAP.org/license.html>.
  37  */
  38 /* Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
  39  *
  40  * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
  41  * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
  42  * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
  43  * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
  44  * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
  45  * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
  46  * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
  47  * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
  48  */
  49
  50 /* This work is based on OpenLDAP Software <http://www.openldap.org/>. */
  51
  52 /*
  53  * These routines convert between UTF-16 and UTF-8.  UTF-16 encodes a Unicode
  54  * character in either two or four bytes.  Characters in the Basic Multilingual
  55  * Plane (hex 0..D7FF and E000..FFFF) are encoded as-is in two bytes.
  56  * Characters in the Supplementary Planes (10000..10FFFF) are split into a high
  57  * surrogate and a low surrogate, each containing ten bits of the character
  58  * value, and encoded in four bytes.
  59  */
  60
  61 #include "k5-platform.h"
  62 #include "k5-utf8.h"
  63 #include "k5-buf.h"
  64 #include "k5-input.h"
  65 #include "supp-int.h"
  66
  67 static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
  68
  69 /* A high surrogate is ten bits masked with 0xD800. */
  70 #define IS_HIGH_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDBFF)
  71
  72 /* A low surrogate is ten bits masked with 0xDC00. */
  73 #define IS_LOW_SURROGATE(c) ((c) >= 0xDC00 && (c) <= 0xDFFF)
  74
  75 /* A valid Unicode code point is in the range 0..10FFFF and is not a surrogate
  76  * value. */
  77 #define IS_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDFFF)
  78 #define IS_VALID_UNICODE(c) ((c) <= 0x10FFFF && !IS_SURROGATE(c))
  79
  80 /* A Basic Multilingual Plane character is in the range 0..FFFF and is not a
  81  * surrogate value. */
  82 #define IS_BMP(c) ((c) <= 0xFFFF && !IS_SURROGATE(c))
  83
  84 /* Characters in the Supplementary Planes have a base value subtracted from
  85  * their code points to form a 20-bit value; ten bits go in each surrogate. */
  86 #define BASE 0x10000
  87 #define HIGH_SURROGATE(c) (0xD800 | (((c) - BASE) >> 10))
  88 #define LOW_SURROGATE(c) (0xDC00 | (((c) - BASE) & 0x3FF))
  89 #define COMPOSE(c1, c2) (BASE + ((((c1) & 0x3FF) << 10) | ((c2) & 0x3FF)))
  90
  91 int
  92 k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out, size_t *nbytes_out)
  93 {
  94     struct k5buf buf;
  95     krb5_ucs4 ch;
  96     size_t chlen, i;
  97     uint8_t *p;
  98
  99     *utf16_out = NULL;
 100     *nbytes_out = 0;
 101
 102     /* UTF-16 conversion is used for RC4 string-to-key, so treat this data as
 103      * sensitive. */
 104     k5_buf_init_dynamic_zap(&buf);
 105
 106     /* Examine next UTF-8 character. */
 107     while (*utf8 != '\0') {
 108         /* Get UTF-8 sequence length from first byte. */
 109         chlen = KRB5_UTF8_CHARLEN2(utf8, chlen);
 110         if (chlen == 0)
 111             goto invalid;
 112
 113         /* First byte minus length tag */
 114         ch = (krb5_ucs4)(utf8[0] & mask[chlen]);
 115
 116         for (i = 1; i < chlen; i++) {
 117             /* Subsequent bytes must start with 10. */
 118             if ((utf8[i] & 0xc0) != 0x80)
 119                 goto invalid;
 120
 121             /* 6 bits of data in each subsequent byte */
 122             ch <<= 6;
 123             ch |= (krb5_ucs4)(utf8[i] & 0x3f);
 124         }
 125         if (!IS_VALID_UNICODE(ch))
 126             goto invalid;
 127
 128         /* Characters in the basic multilingual plane are encoded using two
 129          * bytes; other characters are encoded using four bytes. */
 130         p = k5_buf_get_space(&buf, IS_BMP(ch) ? 2 : 4);
 131         if (p == NULL)
 132             return ENOMEM;
 133         if (IS_BMP(ch)) {
 134             store_16_le(ch, p);
 135         } else {
 136             /* 0x10000 is subtracted from ch; then the high ten bits plus
 137              * 0xD800 and the low ten bits plus 0xDC00 are the surrogates. */
 138             store_16_le(HIGH_SURROGATE(ch), p);
 139             store_16_le(LOW_SURROGATE(ch), p + 2);
 140         }
 141
 142         /* Move to next UTF-8 character. */
 143         utf8 += chlen;
 144     }
 145
 146     *utf16_out = buf.data;
 147     *nbytes_out = buf.len;
 148     return 0;
 149
 150 invalid:
 151     k5_buf_free(&buf);
 152     return EINVAL;
 153 }
 154
 155 int
 156 k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes, char **utf8_out)
 157 {
 158     struct k5buf buf;
 159     struct k5input in;
 160     uint16_t ch1, ch2;
 161     krb5_ucs4 ch;
 162     size_t chlen;
 163     void *p;
 164
 165     *utf8_out = NULL;
 166
 167     if (nbytes % 2 != 0)
 168         return EINVAL;
 169
 170     k5_buf_init_dynamic(&buf);
 171     k5_input_init(&in, utf16bytes, nbytes);
 172     while (!in.status && in.len > 0) {
 173         /* Get the next character or high surrogate.  A low surrogate without a
 174          * preceding high surrogate is invalid. */
 175         ch1 = k5_input_get_uint16_le(&in);
 176         if (IS_LOW_SURROGATE(ch1))
 177             goto invalid;
 178         if (IS_HIGH_SURROGATE(ch1)) {
 179             /* Get the low surrogate and combine the pair. */
 180             ch2 = k5_input_get_uint16_le(&in);
 181             if (!IS_LOW_SURROGATE(ch2))
 182                 goto invalid;
 183             ch = COMPOSE(ch1, ch2);
 184         } else {
 185             ch = ch1;
 186         }
 187
 188         chlen = krb5int_ucs4_to_utf8(ch, NULL);
 189         p = k5_buf_get_space(&buf, chlen);
 190         if (p == NULL)
 191             return ENOMEM;
 192         (void)krb5int_ucs4_to_utf8(ch, p);
 193     }
 194
 195     if (in.status)
 196         goto invalid;
 197
 198     *utf8_out = buf.data;
 199     return 0;
 200
 201 invalid:
 202     k5_buf_free(&buf);
 203     return EINVAL;
 204 }