1 // SPDX-License-Identifier: GPL-2.0+
3 * charset conversion utils
5 * Copyright (c) 2017 Rob Clark
10 #include <capitalization.h>
12 #include <efi_loader.h>
17 * codepage_437 - Unicode to codepage 437 translation table
19 const u16 codepage_437[128] = CP437;
21 static struct capitalization_table capitalization_table[] =
22 #ifdef CONFIG_EFI_UNICODE_CAPITALIZATION
23 UNICODE_CAPITALIZATION_TABLE;
24 #elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250
25 CP1250_CAPITALIZATION_TABLE;
27 CP437_CAPITALIZATION_TABLE;
31 * get_code() - read Unicode code point from UTF-8 stream
33 * @read_u8: - stream reader
34 * @data: - string buffer passed to stream reader, optional
35 * Return: - Unicode code point, or -1
37 static int get_code(u8 (*read_u8)(void *data), void *data)
44 if (ch >= 0xc2 && ch <= 0xf4) {
53 if (ch < 0x80 || ch > 0xbf)
61 if ((code >= 0xD800 && code <= 0xDFFF) ||
65 if (ch < 0x80 || ch > 0xbf)
68 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */
72 if (ch < 0x80 || ch > 0xbf)
76 } else if (ch >= 0x80) {
85 * read_string() - read byte from character string
87 * @data: - pointer to string
90 * The string pointer is incremented if it does not point to '\0'.
92 static u8 read_string(void *data)
95 const char **src = (const char **)data;
98 if (!src || !*src || !**src)
106 * read_console() - read byte from console
108 * @data: - not used, needed to match interface
109 * Return: - byte read or 0 on error
111 static u8 read_console(void *data)
121 int console_read_unicode(s32 *code)
127 /* No input available */
131 /* Read Unicode code */
132 c = get_code(read_console, NULL);
140 s32 utf8_get(const char **src)
142 return get_code(read_string, src);
145 int utf8_put(s32 code, char **dst)
149 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
151 if (code <= 0x007F) {
154 if (code <= 0x07FF) {
155 **dst = code >> 6 | 0xC0;
157 if (code < 0x10000) {
158 **dst = code >> 12 | 0xE0;
160 **dst = code >> 18 | 0xF0;
162 **dst = (code >> 12 & 0x3F) | 0x80;
165 **dst = (code >> 6 & 0x3F) | 0x80;
168 **dst = (code & 0x3F) | 0x80;
174 size_t utf8_utf16_strnlen(const char *src, size_t count)
178 for (; *src && count; --count) {
179 s32 code = utf8_get(&src);
184 /* Reserve space for a replacement character */
186 } else if (code < 0x10000) {
195 int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
197 if (!src || !dst || !*dst)
200 for (; count && *src; --count) {
201 s32 code = utf8_get(&src);
205 utf16_put(code, dst);
211 s32 utf16_get(const u16 **src)
221 if (code >= 0xDC00 && code <= 0xDFFF)
223 if (code >= 0xD800 && code <= 0xDBFF) {
231 if (code2 <= 0xDC00 || code2 >= 0xDFFF)
239 int utf16_put(s32 code, u16 **dst)
243 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
245 if (code < 0x10000) {
249 **dst = code >> 10 | 0xD800;
251 **dst = (code & 0x3ff) | 0xDC00;
257 size_t utf16_strnlen(const u16 *src, size_t count)
261 for (; *src && count; --count) {
262 s32 code = utf16_get(&src);
267 * In case of an illegal sequence still reserve space for a
268 * replacement character.
275 size_t utf16_utf8_strnlen(const u16 *src, size_t count)
279 for (; *src && count; --count) {
280 s32 code = utf16_get(&src);
285 /* Reserve space for a replacement character */
287 else if (code < 0x80)
289 else if (code < 0x800)
291 else if (code < 0x10000)
299 int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
301 if (!src || !dst || !*dst)
304 for (; count && *src; --count) {
305 s32 code = utf16_get(&src);
315 s32 utf_to_lower(const s32 code)
317 struct capitalization_table *pos = capitalization_table;
321 if (code >= 'A' && code <= 'Z')
325 for (; pos->upper; ++pos) {
326 if (pos->upper == code) {
334 s32 utf_to_upper(const s32 code)
336 struct capitalization_table *pos = capitalization_table;
340 if (code >= 'a' && code <= 'z')
344 for (; pos->lower; ++pos) {
345 if (pos->lower == code) {
354 * u16_strncmp() - compare two u16 strings
356 * @s1: first string to compare
357 * @s2: second string to compare
358 * @n: maximum number of u16 to compare
359 * Return: 0 if the first n u16 are the same in s1 and s2
360 * < 0 if the first different u16 in s1 is less than the
361 * corresponding u16 in s2
362 * > 0 if the first different u16 in s1 is greater than the
363 * corresponding u16 in s2
365 int u16_strncmp(const u16 *s1, const u16 *s2, size_t n)
369 for (; n; --n, ++s1, ++s2) {
378 size_t __efi_runtime u16_strnlen(const u16 *in, size_t count)
381 for (i = 0; count-- && in[i]; i++);
385 size_t u16_strsize(const void *in)
387 return (u16_strlen(in) + 1) * sizeof(u16);
390 u16 *u16_strcpy(u16 *dest, const u16 *src)
394 for (;; dest++, src++) {
403 u16 *u16_strdup(const void *src)
410 len = u16_strsize(src);
414 memcpy(new, src, len);
419 /* Convert UTF-16 to UTF-8. */
420 uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
422 uint32_t code_high = 0;
425 uint32_t code = *src++;
428 if (code >= 0xDC00 && code <= 0xDFFF) {
429 /* Surrogate pair. */
430 code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000;
432 *dest++ = (code >> 18) | 0xF0;
433 *dest++ = ((code >> 12) & 0x3F) | 0x80;
434 *dest++ = ((code >> 6) & 0x3F) | 0x80;
435 *dest++ = (code & 0x3F) | 0x80;
439 /* *src may be valid. Don't eat it. */
445 if (code <= 0x007F) {
447 } else if (code <= 0x07FF) {
448 *dest++ = (code >> 6) | 0xC0;
449 *dest++ = (code & 0x3F) | 0x80;
450 } else if (code >= 0xD800 && code <= 0xDBFF) {
453 } else if (code >= 0xDC00 && code <= 0xDFFF) {
456 } else if (code < 0x10000) {
457 *dest++ = (code >> 12) | 0xE0;
458 *dest++ = ((code >> 6) & 0x3F) | 0x80;
459 *dest++ = (code & 0x3F) | 0x80;
461 *dest++ = (code >> 18) | 0xF0;
462 *dest++ = ((code >> 12) & 0x3F) | 0x80;
463 *dest++ = ((code >> 6) & 0x3F) | 0x80;
464 *dest++ = (code & 0x3F) | 0x80;
472 int utf_to_cp(s32 *c, const u16 *codepage)
477 /* Look up codepage translation */
478 for (j = 0; j < 0x80; ++j) {
479 if (*c == codepage[j]) {
490 int utf8_to_cp437_stream(u8 c, char *buffer)
499 end = buffer + strlen(buffer);
505 ret = utf_to_cp(&s, codepage_437);
514 int utf8_to_utf32_stream(u8 c, char *buffer)
522 end = buffer + strlen(buffer);