/*
 * UTF8/16 helpers, copied and adapted from systemd project.
 *
 * Copyright (C) 2010 Lennart Poettering
 *
 * cryptsetup related changes
 * Copyright (C) 2021-2023 Vojtech Trefny
 *
 * Parts of the original systemd implementation are based on the GLIB utf8
 * validation functions.
 * gutf8.c - Operations on UTF-8 strings.
 *
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <errno.h>

#include "crypto_backend.h"

/* Tell whether the UTF-16 code unit @c lies in the surrogate range 0xd800..0xdfff. */
static inline bool utf16_is_surrogate(char16_t c)
{
	return c >= 0xd800U && c <= 0xdfffU;
}

/* Tell whether the UTF-16 code unit @c lies in the trailing-surrogate range 0xdc00..0xdfff. */
static inline bool utf16_is_trailing_surrogate(char16_t c)
{
	return c >= 0xdc00U && c <= 0xdfffU;
}

/*
 * Combine a surrogate pair into the code point it encodes: the offset of
 * @lead from 0xd800 supplies the upper ten bits, the offset of @trail from
 * 0xdc00 the lower ten, and the result is biased by 0x10000.
 * No range validation is done here.
 */
static inline char32_t utf16_surrogate_pair_to_unichar(char16_t lead, char16_t trail)
{
	return ((((char32_t) lead - 0xd800U) << 10) + ((char32_t) trail - 0xdc00U) + 0x10000U);
}

/**
 * utf8_encode_unichar() - Encode single UCS-4 character as UTF-8
 * @out_utf8: output buffer of at least 4 bytes or NULL
 * @g: UCS-4 character to encode
 *
 * This encodes a single UCS-4 character as UTF-8 and writes it into @out_utf8.
 * The length of the character is returned. It is not zero-terminated! If the
 * output buffer is NULL, only the length is returned.
 *
 * Returns: The length in bytes that the UTF-8 representation does or would
 *          occupy, or 0 if @g needs more than 21 bits and nothing is written.
 */
static size_t utf8_encode_unichar(char *out_utf8, char32_t g)
{
	if (g < (1U << 7)) {
		/* 0xxxxxxx */
		if (out_utf8)
			out_utf8[0] = (char) (g & 0x7f);
		return 1;
	} else if (g < (1U << 11)) {
		/* 110xxxxx 10xxxxxx */
		if (out_utf8) {
			out_utf8[0] = (char) (0xc0 | ((g >> 6) & 0x1f));
			out_utf8[1] = (char) (0x80 | (g & 0x3f));
		}
		return 2;
	} else if (g < (1U << 16)) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		if (out_utf8) {
			out_utf8[0] = (char) (0xe0 | ((g >> 12) & 0x0f));
			out_utf8[1] = (char) (0x80 | ((g >> 6) & 0x3f));
			out_utf8[2] = (char) (0x80 | (g & 0x3f));
		}
		return 3;
	} else if (g < (1U << 21)) {
		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		if (out_utf8) {
			out_utf8[0] = (char) (0xf0 | ((g >> 18) & 0x07));
			out_utf8[1] = (char) (0x80 | ((g >> 12) & 0x3f));
			out_utf8[2] = (char) (0x80 | ((g >> 6) & 0x3f));
			out_utf8[3] = (char) (0x80 | (g & 0x3f));
		}
		return 4;
	}

	/* does not fit into four bytes */
	return 0;
}

/**
 * crypt_utf16_to_utf8()
 * @out: output buffer, should be 2 * @length + 1 long
 * @s: string to convert
 * @length: length of @s in bytes
 *
 * Converts a UTF16LE encoded string to a UTF8 encoded string. The result is
 * written to the caller-provided buffer *@out and is NUL-terminated. Unpaired
 * surrogates in the input are skipped; a trailing odd byte is ignored.
 *
 * Returns: 0 on success, negative errno otherwise (-EINVAL for a NULL
 *          argument, -EOVERFLOW if 2 * @length does not fit into size_t)
 */
int crypt_utf16_to_utf8(char **out, const char16_t *s, size_t length /* bytes! */)
{
	const uint8_t *f, *end;
	char *t;

	if (!out || !*out || !s)
		return -EINVAL;

	/* Input length is in bytes, i.e. the shortest possible character takes 2 bytes. Each unicode character may
	 * take up to 4 bytes in UTF-8. Let's also account for a trailing NUL byte. */
	if (length * 2 < length)
		return -EOVERFLOW; /* overflow */

	f = (const uint8_t*) s;
	end = f + length;
	t = *out;

	/* Compare remaining byte counts rather than forming f + 1, so no
	 * pointer beyond one-past-the-end is ever computed. */
	while ((size_t) (end - f) >= 2) {
		char16_t w1, w2;

		/* see RFC 2781 section 2.2 */

		w1 = (char16_t) (f[1] << 8 | f[0]);
		f += 2;

		if (!utf16_is_surrogate(w1)) {
			t += utf8_encode_unichar(t, w1);
			continue;
		}

		if (utf16_is_trailing_surrogate(w1))
			continue; /* spurious trailing surrogate, ignore */

		if ((size_t) (end - f) < 2)
			break; /* lead surrogate at the very end of the input */

		w2 = (char16_t) (f[1] << 8 | f[0]);

		/* Do not consume w2 here: it is read again as the first unit
		 * of the next iteration. */
		if (!utf16_is_trailing_surrogate(w2))
			continue; /* surrogate missing its trailing surrogate, ignore */

		f += 2;
		t += utf8_encode_unichar(t, utf16_surrogate_pair_to_unichar(w1, w2));
	}

	*t = 0;
	return 0;
}

/*
 * Count of bytes used to encode one unicode char, derived from the lead
 * byte @c alone. Returns 1 for a plain ASCII byte, 2..6 for a multi-byte
 * lead byte, and 0 when @c cannot start a sequence (a continuation byte
 * 10xxxxxx, or 0xfe/0xff).
 */
static size_t utf8_encoded_expected_len(uint8_t c)
{
	if (c < 0x80)
		return 1;		/* 0xxxxxxx */
	if ((c & 0xe0) == 0xc0)
		return 2;		/* 110xxxxx */
	if ((c & 0xf0) == 0xe0)
		return 3;		/* 1110xxxx */
	if ((c & 0xf8) == 0xf0)
		return 4;		/* 11110xxx */
	if ((c & 0xfc) == 0xf8)
		return 5;		/* 111110xx */
	if ((c & 0xfe) == 0xfc)
		return 6;		/* 1111110x */

	return 0;
}

/*
 * Decode one unicode char.
 *
 * @str must point to at least as many readable bytes as the lead byte
 * announces (see utf8_encoded_expected_len()); the caller is responsible
 * for that bound. On success the decoded value is stored in @ret_unichar
 * and 0 is returned. -EINVAL is returned, and @ret_unichar left untouched,
 * when the lead byte is invalid or a continuation byte is not 10xxxxxx.
 * Overlong forms and out-of-range values are not rejected here.
 */
static int utf8_encoded_to_unichar(const char *str, char32_t *ret_unichar)
{
	/* Work on unsigned bytes so nothing depends on the signedness of char. */
	const uint8_t *bytes = (const uint8_t *) str;
	char32_t unichar;
	size_t len, i;

	len = utf8_encoded_expected_len(bytes[0]);

	/* Extract the payload bits of the lead byte. */
	switch (len) {
	case 1:
		*ret_unichar = (char32_t) bytes[0];
		return 0;
	case 2:
		unichar = bytes[0] & 0x1f;
		break;
	case 3:
		unichar = bytes[0] & 0x0f;
		break;
	case 4:
		unichar = bytes[0] & 0x07;
		break;
	case 5:
		unichar = bytes[0] & 0x03;
		break;
	case 6:
		unichar = bytes[0] & 0x01;
		break;
	default:
		return -EINVAL;
	}

	/* Each continuation byte contributes six more bits. */
	for (i = 1; i < len; i++) {
		if ((bytes[i] & 0xc0) != 0x80)
			return -EINVAL;

		unichar <<= 6;
		unichar |= bytes[i] & 0x3f;
	}

	*ret_unichar = unichar;

	return 0;
}

/*
 * Return @v arranged so that, once stored in memory, its low byte comes
 * first, whatever the host byte order. Written in plain ISO C so it does
 * not depend on <endian.h>, which glibc only exposes under _DEFAULT_SOURCE.
 */
static inline uint16_t utf16_host_to_le(uint16_t v)
{
	const union {
		uint8_t bytes[2];
		uint16_t word;
	} le = { .bytes = { (uint8_t) (v & 0xffU), (uint8_t) (v >> 8) } };

	return le.word;
}

/*
 * Encode the code point @c as UTF-16 into @out, which must have room for
 * two code units. Returns the number of code units written: 1 for the
 * basic plane, 2 (a surrogate pair) for 0x10000..0x10ffff, and 0 with
 * nothing written when @c is a surrogate or lies above 0x10ffff.
 */
static size_t utf16_encode_unichar(char16_t *out, char32_t c)
{
	/* Note that this encodes as little-endian. */

	if (c <= 0xd7ffU || (c >= 0xe000U && c <= 0xffffU)) {
		out[0] = utf16_host_to_le((uint16_t) c);
		return 1;
	}

	if (c >= 0x10000U && c <= 0x10ffffU) {
		c -= 0x10000U;
		out[0] = utf16_host_to_le((uint16_t) ((c >> 10) + 0xd800U));
		out[1] = utf16_host_to_le((uint16_t) ((c & 0x3ffU) + 0xdc00U));
		return 2;
	}

	/* A surrogate (invalid) or a value beyond the Unicode range */
	return 0;
}

/**
 * crypt_utf8_to_utf16()
 * @out: output buffer, should be @length + 1 long (counted in char16_t units)
 * @s: string to convert
 * @length: length of @s in bytes
 *
 * Converts a UTF8 encoded string to a UTF16LE encoded string. The result is
 * written to the caller-provided buffer *@out and is terminated with a zero
 * code unit. Bytes that do not form a decodable sequence are copied through
 * one at a time, zero-extended to 16 bits.
 *
 * Returns: 0 on success, negative errno otherwise (-EINVAL for a NULL
 *          argument)
 */
int crypt_utf8_to_utf16(char16_t **out, const char *s, size_t length)
{
	char16_t *p;
	size_t i;

	if (!out || !*out || !s)
		return -EINVAL;

	p = *out;

	for (i = 0; i < length;) {
		char32_t unichar;
		size_t e;

		e = utf8_encoded_expected_len((uint8_t) s[i]);

		/* Decode only a real multi-byte sequence that fits into the
		 * remaining input; i < length, so length - i cannot wrap. */
		if (e > 1 && e <= length - i &&
		    utf8_encoded_to_unichar(s + i, &unichar) >= 0) {
			p += utf16_encode_unichar(p, unichar);
			i += e;
			continue;
		}

		/* Invalid, truncated and single byte characters are copied as
		 * they are. Go through uint8_t first: where char is signed, a
		 * byte >= 0x80 would otherwise be sign-extended (0xff would
		 * become 0xffff instead of 0x00ff). */
		*(p++) = htole16((uint8_t) s[i++]);
	}

	*p = 0;
	return 0;
}
