lib/crypto_backend/utf8.c

   1 /*
   2  * UTF8/16 helpers, copied and adapted from systemd project.
   3  *
   4  * Copyright (C) 2010 Lennart Poettering
   5  *
   6  * cryptsetup related changes
   7  * Copyright (C) 2021-2023 Vojtech Trefny
   8
   9  * Parts of the original systemd implementation are based on the GLIB utf8
  10  * validation functions.
  11  * gutf8.c - Operations on UTF-8 strings.
  12  *
  13  * Copyright (C) 1999 Tom Tromey
  14  * Copyright (C) 2000 Red Hat, Inc.
  15  *
  16  * This library is free software; you can redistribute it and/or
  17  * modify it under the terms of the GNU Library General Public
  18  * License as published by the Free Software Foundation; either
  19  * version 2 of the License, or (at your option) any later version.
  20  *
  21  * This library is distributed in the hope that it will be useful,
  22  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  23  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  24  * Library General Public License for more details.
  25  *
  26  * You should have received a copy of the GNU Library General Public
  27  * License along with this library; if not, write to the Free Software
  28  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  29  */
  30
  31 #include <errno.h>
  32 #include <endian.h>
  33
  34 #include "crypto_backend.h"
  35
  36 static inline bool utf16_is_surrogate(char16_t c)
  37 {
  38         return c >= 0xd800U && c <= 0xdfffU;
  39 }
  40
  41 static inline bool utf16_is_trailing_surrogate(char16_t c)
  42 {
  43         return c >= 0xdc00U && c <= 0xdfffU;
  44 }
  45
  46 static inline char32_t utf16_surrogate_pair_to_unichar(char16_t lead, char16_t trail)
  47 {
  48         return ((((char32_t) lead - 0xd800U) << 10) + ((char32_t) trail - 0xdc00U) + 0x10000U);
  49 }
  50
  51 /**
  52  * utf8_encode_unichar() - Encode single UCS-4 character as UTF-8
  53  * @out_utf8: output buffer of at least 4 bytes or NULL
  54  * @g: UCS-4 character to encode
  55  *
  56  * This encodes a single UCS-4 character as UTF-8 and writes it into @out_utf8.
  57  * The length of the character is returned. It is not zero-terminated! If the
  58  * output buffer is NULL, only the length is returned.
  59  *
  60  * Returns: The length in bytes that the UTF-8 representation does or would
  61  *          occupy.
  62  */
  63 static size_t utf8_encode_unichar(char *out_utf8, char32_t g)
  64 {
  65         if (g < (1 << 7)) {
  66                 if (out_utf8)
  67                         out_utf8[0] = g & 0x7f;
  68                 return 1;
  69         } else if (g < (1 << 11)) {
  70                 if (out_utf8) {
  71                         out_utf8[0] = 0xc0 | ((g >> 6) & 0x1f);
  72                         out_utf8[1] = 0x80 | (g & 0x3f);
  73                 }
  74                 return 2;
  75         } else if (g < (1 << 16)) {
  76                 if (out_utf8) {
  77                         out_utf8[0] = 0xe0 | ((g >> 12) & 0x0f);
  78                         out_utf8[1] = 0x80 | ((g >> 6) & 0x3f);
  79                         out_utf8[2] = 0x80 | (g & 0x3f);
  80                 }
  81                 return 3;
  82         } else if (g < (1 << 21)) {
  83                 if (out_utf8) {
  84                         out_utf8[0] = 0xf0 | ((g >> 18) & 0x07);
  85                         out_utf8[1] = 0x80 | ((g >> 12) & 0x3f);
  86                         out_utf8[2] = 0x80 | ((g >> 6) & 0x3f);
  87                         out_utf8[3] = 0x80 | (g & 0x3f);
  88                 }
  89                 return 4;
  90         }
  91
  92         return 0;
  93 }
  94
  95 /**
  96  * crypt_utf16_to_utf8()
  97  * @out: output buffer, should be 2 * @length + 1 long
  98  * @s: string to convert
  99  * @length: length of @s in bytes
 100  *
 101  * Converts a UTF16LE encoded string to a UTF8 encoded string.
 102  *
 103  * Returns: 0 on success, negative errno otherwise
 104  */
 105 int crypt_utf16_to_utf8(char **out, const char16_t *s, size_t length /* bytes! */)
 106 {
 107         const uint8_t *f;
 108         char *t;
 109
 110         assert(s);
 111         assert(out);
 112         assert(*out);
 113
 114         /* Input length is in bytes, i.e. the shortest possible character takes 2 bytes. Each unicode character may
 115          * take up to 4 bytes in UTF-8. Let's also account for a trailing NUL byte. */
 116         if (length * 2 < length)
 117                 return -EOVERFLOW; /* overflow */
 118
 119         f = (const uint8_t*) s;
 120         t = *out;
 121
 122         while (f + 1 < (const uint8_t*) s + length) {
 123                 char16_t w1, w2;
 124
 125                 /* see RFC 2781 section 2.2 */
 126
 127                 w1 = f[1] << 8 | f[0];
 128                 f += 2;
 129
 130                 if (!utf16_is_surrogate(w1)) {
 131                         t += utf8_encode_unichar(t, w1);
 132                         continue;
 133                 }
 134
 135                 if (utf16_is_trailing_surrogate(w1))
 136                         continue; /* spurious trailing surrogate, ignore */
 137
 138                 if (f + 1 >= (const uint8_t*) s + length)
 139                         break;
 140
 141                 w2 = f[1] << 8 | f[0];
 142                 f += 2;
 143
 144                 if (!utf16_is_trailing_surrogate(w2)) {
 145                         f -= 2;
 146                         continue; /* surrogate missing its trailing surrogate, ignore */
 147                 }
 148
 149                 t += utf8_encode_unichar(t, utf16_surrogate_pair_to_unichar(w1, w2));
 150         }
 151
 152         *t = 0;
 153         return 0;
 154 }
 155
 156 /* count of characters used to encode one unicode char */
 157 static size_t utf8_encoded_expected_len(uint8_t c)
 158 {
 159         if (c < 0x80)
 160                 return 1;
 161         if ((c & 0xe0) == 0xc0)
 162                 return 2;
 163         if ((c & 0xf0) == 0xe0)
 164                 return 3;
 165         if ((c & 0xf8) == 0xf0)
 166                 return 4;
 167         if ((c & 0xfc) == 0xf8)
 168                 return 5;
 169         if ((c & 0xfe) == 0xfc)
 170                 return 6;
 171
 172         return 0;
 173 }
 174
 175 /* decode one unicode char */
 176 static int utf8_encoded_to_unichar(const char *str, char32_t *ret_unichar)
 177 {
 178         char32_t unichar;
 179         size_t len, i;
 180
 181         assert(str);
 182
 183         len = utf8_encoded_expected_len(str[0]);
 184
 185         switch (len) {
 186         case 1:
 187                 *ret_unichar = (char32_t)str[0];
 188                 return 0;
 189         case 2:
 190                 unichar = str[0] & 0x1f;
 191                 break;
 192         case 3:
 193                 unichar = (char32_t)str[0] & 0x0f;
 194                 break;
 195         case 4:
 196                 unichar = (char32_t)str[0] & 0x07;
 197                 break;
 198         case 5:
 199                 unichar = (char32_t)str[0] & 0x03;
 200                 break;
 201         case 6:
 202                 unichar = (char32_t)str[0] & 0x01;
 203                 break;
 204         default:
 205                 return -EINVAL;
 206         }
 207
 208         for (i = 1; i < len; i++) {
 209                 if (((char32_t)str[i] & 0xc0) != 0x80)
 210                         return -EINVAL;
 211
 212                 unichar <<= 6;
 213                 unichar |= (char32_t)str[i] & 0x3f;
 214         }
 215
 216         *ret_unichar = unichar;
 217
 218         return 0;
 219 }
 220
 221 static size_t utf16_encode_unichar(char16_t *out, char32_t c)
 222 {
 223         /* Note that this encodes as little-endian. */
 224
 225         switch (c) {
 226
 227         case 0 ... 0xd7ffU:
 228         case 0xe000U ... 0xffffU:
 229                 out[0] = htole16(c);
 230                 return 1;
 231
 232         case 0x10000U ... 0x10ffffU:
 233                 c -= 0x10000U;
 234                 out[0] = htole16((c >> 10) + 0xd800U);
 235                 out[1] = htole16((c & 0x3ffU) + 0xdc00U);
 236                 return 2;
 237
 238         default: /* A surrogate (invalid) */
 239                 return 0;
 240         }
 241 }
 242
 243 /**
 244  * crypt_utf8_to_utf16()
 245  * @out: output buffer, should be @length + 1 long
 246  * @s: string to convert
 247  * @length: length of @s in bytes
 248  *
 249  * Converts a UTF8 encoded string to a UTF16LE encoded string.
 250  *
 251  * Returns: 0 on success, negative errno otherwise
 252  */
 253 int crypt_utf8_to_utf16(char16_t **out, const char *s, size_t length)
 254 {
 255         char16_t *p;
 256         size_t i;
 257         int r;
 258
 259         assert(s);
 260
 261         p = *out;
 262
 263         for (i = 0; i < length;) {
 264                 char32_t unichar;
 265                 size_t e;
 266
 267                 e = utf8_encoded_expected_len(s[i]);
 268                 if (e <= 1) /* Invalid and single byte characters are copied as they are */
 269                         goto copy;
 270
 271                 if (i + e > length) /* sequence longer than input buffer, then copy as-is */
 272                         goto copy;
 273
 274                 r = utf8_encoded_to_unichar(s + i, &unichar);
 275                 if (r < 0) /* sequence invalid, then copy as-is */
 276                         goto copy;
 277
 278                 p += utf16_encode_unichar(p, unichar);
 279                 i += e;
 280                 continue;
 281
 282         copy:
 283                 *(p++) = htole16(s[i++]);
 284         }
 285
 286         *p = 0;
 287         return 0;
 288 }