1 // SPDX-License-Identifier: GPL-2.0+
3 * charset conversion utils
5 * Copyright (c) 2017 Rob Clark
9 #include <capitalization.h>
/*
 * Case-mapping table walked by utf_to_lower() and utf_to_upper().
 * The initializer macro is chosen at build time:
 * UNICODE_CAPITALIZATION_TABLE when CONFIG_EFI_UNICODE_CAPITALIZATION is
 * defined, CP1250_CAPITALIZATION_TABLE when CONFIG_FAT_DEFAULT_CODEPAGE
 * equals 1250.
 * NOTE(review): the directive that selects CP437_CAPITALIZATION_TABLE and
 * the end of the conditional are not visible in this excerpt; presumably
 * CP437 is the fallback - confirm.
 */
12 static struct capitalization_table capitalization_table[] =
13 #ifdef CONFIG_EFI_UNICODE_CAPITALIZATION
14 UNICODE_CAPITALIZATION_TABLE;
15 #elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250
16 CP1250_CAPITALIZATION_TABLE;
18 CP437_CAPITALIZATION_TABLE;
/*
 * utf8_get() - decode one code point from a UTF-8 string
 *
 * NOTE(review): only part of the body is visible in this excerpt. The
 * remarks below cover the visible checks only; what each check leads to
 * (error return or otherwise) must be confirmed against the full function.
 *
 * @src:	pointer to the string pointer being read
 * Return:	an s32 that callers in this file store as the decoded code;
 *		NOTE(review): error value not visible here - confirm
 */
21 s32 utf8_get(const char **src)
36 * We do not expect a continuation byte (0x80 - 0xbf).
37 * 0x80 is coded as 0xc2 0x80, so we cannot have less then 0xc2
39 * The highest code point is 0x10ffff which is coded as
40 * 0xf4 0x8f 0xbf 0xbf. So we cannot have a byte above 0xf4.
/*
 * NOTE(review): the right-hand test reads `code`, but the text above
 * speaks of a byte above 0xf4 and the left-hand test reads `c`. This
 * looks like it was meant to be `c > 0xf4` - confirm and fix.
 */
42 if (c < 0xc2 || code > 0xf4)
/* True when c is outside 0x80 - 0xbf, i.e. not a continuation byte */
53 if (c < 0x80 || c > 0xbf)
/*
 * 0xD800 - 0xDFFF is the UTF-16 surrogate range; the remainder of this
 * condition is on a line that is not visible here.
 */
61 if ((code >= 0xD800 && code <= 0xDFFF) ||
/* Same continuation-byte range test as above */
68 if (c < 0x80 || c > 0xbf)
71 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */
/* Same continuation-byte range test as above */
75 if (c < 0x80 || c > 0xbf)
/*
 * utf8_put() - encode a code point as UTF-8 and store it through *dst
 *
 * Visible parts of the body: surrogates (0xD800 - 0xDFFF) and values of
 * 0x110000 or more are singled out first. The lead byte carries marker
 * 0xC0, 0xE0 or 0xF0; every following byte carries six payload bits with
 * marker 0x80.
 * NOTE(review): the range tests that pick the sequence length, the
 * pointer advances and the return statements are not visible here.
 *
 * @code:	code point to encode
 * @dst:	pointer to the output pointer
 * Return:	int; NOTE(review): meaning not visible in this excerpt,
 *		presumably 0 on success and negative on error - confirm
 */
84 int utf8_put(s32 code, char **dst)
/* Values that are singled out before any byte is written */
88 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
/* Lead byte of a two-byte sequence */
94 **dst = code >> 6 | 0xC0;
/* Lead byte of a three-byte sequence */
97 **dst = code >> 12 | 0xE0;
/* Lead byte of a four-byte sequence */
99 **dst = code >> 18 | 0xF0;
/* Trailing bytes: bits 17..12, bits 11..6, then bits 5..0 */
101 **dst = (code >> 12 & 0x3F) | 0x80;
104 **dst = (code >> 6 & 0x3F) | 0x80;
107 **dst = (code & 0x3F) | 0x80;
/*
 * utf8_utf16_strnlen() - size a UTF-8 string for conversion to UTF-16
 *
 * The loop decodes with utf8_get() until a zero byte is reached or count
 * iterations have run.
 * NOTE(review): the accumulator updates and the return statement are not
 * visible here; presumably the result is the number of u16 units needed,
 * with code points of 0x10000 and above counted as two - confirm.
 *
 * @src:	UTF-8 string
 * @count:	maximum number of loop iterations (one utf8_get() call each)
 * Return:	size_t length; see the note above
 */
113 size_t utf8_utf16_strnlen(const char *src, size_t count)
/* Stop at the terminating zero byte or when count is used up */
117 for (; *src && count; --count) {
118 s32 code = utf8_get(&src);
123 /* Reserve space for a replacement character */
/* Code points below 0x10000 are handled by this branch */
125 } else if (code < 0x10000) {
/*
 * utf8_utf16_strncpy() - convert a UTF-8 string to UTF-16
 *
 * Decodes with utf8_get() and writes each result with utf16_put(), for at
 * most count iterations or until a zero byte is read.
 * NOTE(review): handling of decode errors, termination of the output and
 * the return values are not visible in this excerpt.
 *
 * @dst:	pointer to the output pointer; handed to utf16_put()
 * @src:	UTF-8 string
 * @count:	maximum number of loop iterations
 * Return:	int; NOTE(review): meaning not visible here - confirm
 */
134 int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
/* Guard against NULL arguments and a NULL output buffer */
136 if (!src || !dst || !*dst)
139 for (; count && *src; --count) {
140 s32 code = utf8_get(&src);
144 utf16_put(code, dst);
/*
 * utf16_get() - decode one code point from a UTF-16 string
 *
 * NOTE(review): only the range tests are visible in this excerpt; the
 * statements they guard and the return values must be confirmed against
 * the full function.
 *
 * @src:	pointer to the string pointer being read
 * Return:	an s32 that callers in this file store as the decoded code
 */
150 s32 utf16_get(const u16 **src)
/* First unit lies in the low-surrogate range 0xDC00 - 0xDFFF */
160 if (code >= 0xDC00 && code <= 0xDFFF)
/* First unit lies in the high-surrogate range 0xD800 - 0xDBFF */
162 if (code >= 0xD800 && code <= 0xDBFF) {
/*
 * NOTE(review): this test also matches code2 == 0xDC00 and
 * code2 == 0xDFFF, although the check on `code` above treats
 * 0xDC00 - 0xDFFF inclusive as the low-surrogate range. If the intent is
 * to detect a second unit that is not a low surrogate, the bounds look
 * off by one (`<` and `>` expected) - confirm and fix.
 */
170 if (code2 <= 0xDC00 || code2 >= 0xDFFF)
/*
 * utf16_put() - encode a code point as UTF-16 and store it through *dst
 *
 * Surrogates (0xD800 - 0xDFFF) and values of 0x110000 or more are singled
 * out first. Code points below 0x10000 take one branch; otherwise two
 * units are written, marked with 0xD800 and 0xDC00.
 * NOTE(review): any adjustment of code before the shift (for example
 * subtracting 0x10000), the pointer advances and the return statements
 * are not visible in this excerpt.
 *
 * @code:	code point to encode
 * @dst:	pointer to the output pointer
 * Return:	int; NOTE(review): meaning not visible here - confirm
 */
178 int utf16_put(s32 code, u16 **dst)
182 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
184 if (code < 0x10000) {
/* First unit: upper bits with the 0xD800 marker */
188 **dst = code >> 10 | 0xD800;
/* Second unit: low ten bits with the 0xDC00 marker */
190 **dst = (code & 0x3ff) | 0xDC00;
/*
 * utf16_strnlen() - measure a UTF-16 string by decoding it
 *
 * Calls utf16_get() once per iteration until a zero unit is reached or
 * count iterations have run.
 * NOTE(review): the accumulator and the return statement are not visible
 * here; presumably the result is the number of code points - confirm.
 *
 * @src:	UTF-16 string
 * @count:	maximum number of loop iterations
 * Return:	size_t length; see the note above
 */
196 size_t utf16_strnlen(const u16 *src, size_t count)
200 for (; *src && count; --count) {
201 s32 code = utf16_get(&src);
206 * In case of an illegal sequence still reserve space for a
207 * replacement character.
/*
 * utf16_utf8_strnlen() - size a UTF-16 string for conversion to UTF-8
 *
 * Decodes with utf16_get() and classifies each code point by the
 * thresholds 0x80, 0x800 and 0x10000.
 * NOTE(review): the per-branch increments and the return statement are
 * not visible here; presumably the branches add 1, 2, 3 and 4 bytes
 * respectively - confirm.
 *
 * @src:	UTF-16 string
 * @count:	maximum number of loop iterations
 * Return:	size_t length; see the note above
 */
214 size_t utf16_utf8_strnlen(const u16 *src, size_t count)
218 for (; *src && count; --count) {
219 s32 code = utf16_get(&src);
224 /* Reserve space for a replacement character */
/* Thresholds match the UTF-8 sequence-length boundaries */
226 else if (code < 0x80)
228 else if (code < 0x800)
230 else if (code < 0x10000)
/*
 * utf16_utf8_strncpy() - convert a UTF-16 string to UTF-8
 *
 * Decodes with utf16_get() for at most count iterations or until a zero
 * unit is read.
 * NOTE(review): the output call, error handling, termination of the
 * output and the return values are not visible in this excerpt.
 *
 * @dst:	pointer to the output pointer
 * @src:	UTF-16 string
 * @count:	maximum number of loop iterations
 * Return:	int; NOTE(review): meaning not visible here - confirm
 */
238 int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
/* Guard against NULL arguments and a NULL output buffer */
240 if (!src || !dst || !*dst)
243 for (; count && *src; --count) {
244 s32 code = utf16_get(&src);
/*
 * utf_to_lower() - map a code point to lower case
 *
 * ASCII 'A' - 'Z' is tested first; other values are looked up by scanning
 * capitalization_table for an entry whose upper field equals code. The
 * scan ends at the first entry whose upper field is zero.
 * NOTE(review): the values returned on a match and on a miss are not
 * visible here; presumably the entry's lower field and the unchanged code
 * - confirm.
 *
 * @code:	code point to convert
 * Return:	s32 code point; see the note above
 */
254 s32 utf_to_lower(const s32 code)
256 struct capitalization_table *pos = capitalization_table;
/* ASCII capitals are tested before the table is consulted */
260 if (code >= 'A' && code <= 'Z')
/* An entry with upper == 0 terminates the table */
264 for (; pos->upper; ++pos) {
265 if (pos->upper == code) {
/*
 * utf_to_upper() - map a code point to upper case
 *
 * ASCII 'a' - 'z' is tested first; other values are looked up by scanning
 * capitalization_table for an entry whose lower field equals code. The
 * scan ends at the first entry whose lower field is zero.
 * NOTE(review): the values returned on a match and on a miss are not
 * visible here; presumably the entry's upper field and the unchanged code
 * - confirm.
 *
 * @code:	code point to convert
 * Return:	s32 code point; see the note above
 */
273 s32 utf_to_upper(const s32 code)
275 struct capitalization_table *pos = capitalization_table;
/* ASCII small letters are tested before the table is consulted */
279 if (code >= 'a' && code <= 'z')
/* An entry with lower == 0 terminates the table */
283 for (; pos->lower; ++pos) {
284 if (pos->lower == code) {
/*
 * u16_strlen() - count the u16 units before the terminating zero unit
 *
 * @in:		zero-terminated u16 string
 * Return:	NOTE(review): the return statement is not visible here;
 *		presumably the final value of i - confirm
 */
292 size_t u16_strlen(const u16 *in)
/* Empty loop body: i advances to the index of the first zero unit */
295 for (i = 0; in[i]; i++);
/*
 * u16_strnlen() - bounded variant of u16_strlen()
 *
 * @in:		u16 string
 * @count:	maximum number of units to examine
 * Return:	NOTE(review): the return statement is not visible here;
 *		presumably the final value of i - confirm
 */
299 size_t u16_strnlen(const u16 *in, size_t count)
/*
 * Empty loop body: stops once count units were examined or at the first
 * zero unit. count is tested before in[i], so in[i] is not read after
 * count has run out.
 */
302 for (i = 0; count-- && in[i]; i++);
306 /* Convert UTF-16 to UTF-8. */
/*
 * Bytes are written through dest with post-increment, so dest advances
 * past everything that was emitted.
 * NOTE(review): the loop header, the handling of code_high and the end
 * of the function are not visible in this excerpt.
 *
 * @dest:	output buffer for the UTF-8 bytes
 * @src:	UTF-16 input
 * @size:	NOTE(review): presumably the number of UTF-16 units to
 *		convert - confirm
 * Return:	uint8_t pointer; NOTE(review): presumably the advanced
 *		dest - confirm
 */
307 uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
/* Holds a pending high surrogate; 0 when none is pending */
309 uint32_t code_high = 0;
312 uint32_t code = *src++;
/* A low surrogate completes the pair started by code_high */
315 if (code >= 0xDC00 && code <= 0xDFFF) {
316 /* Surrogate pair. */
/* Combine both halves into a code point of 0x10000 or above */
317 code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000;
/* Four-byte UTF-8 sequence */
319 *dest++ = (code >> 18) | 0xF0;
320 *dest++ = ((code >> 12) & 0x3F) | 0x80;
321 *dest++ = ((code >> 6) & 0x3F) | 0x80;
322 *dest++ = (code & 0x3F) | 0x80;
326 /* *src may be valid. Don't eat it. */
/* Single-byte (ASCII) range */
332 if (code <= 0x007F) {
/* Two-byte UTF-8 sequence */
334 } else if (code <= 0x07FF) {
335 *dest++ = (code >> 6) | 0xC0;
336 *dest++ = (code & 0x3F) | 0x80;
/* High surrogate; the statements it guards are not visible here */
337 } else if (code >= 0xD800 && code <= 0xDBFF) {
/* Low surrogate; the statements it guards are not visible here */
340 } else if (code >= 0xDC00 && code <= 0xDFFF) {
/* Three-byte UTF-8 sequence */
343 } else if (code < 0x10000) {
344 *dest++ = (code >> 12) | 0xE0;
345 *dest++ = ((code >> 6) & 0x3F) | 0x80;
346 *dest++ = (code & 0x3F) | 0x80;
/*
 * Four-byte UTF-8 sequence.
 * NOTE(review): code is loaded from a uint16_t, so it appears that it
 * can never be 0x10000 or more at this point and these four stores look
 * unreachable - confirm.
 */
348 *dest++ = (code >> 18) | 0xF0;
349 *dest++ = ((code >> 12) & 0x3F) | 0x80;
350 *dest++ = ((code >> 6) & 0x3F) | 0x80;
351 *dest++ = (code & 0x3F) | 0x80;