1 // SPDX-License-Identifier: GPL-2.0+
3 * charset conversion utils
5 * Copyright (c) 2017 Rob Clark
10 #include <capitalization.h>
13 static struct capitalization_table capitalization_table[] =
14 #ifdef CONFIG_EFI_UNICODE_CAPITALIZATION
15 UNICODE_CAPITALIZATION_TABLE;
16 #elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250
17 CP1250_CAPITALIZATION_TABLE;
19 CP437_CAPITALIZATION_TABLE;
23 * get_code() - read Unicode code point from UTF-8 stream
25 * @read_u8: - stream reader
26 * @data: - string buffer passed to stream reader, optional
27 * Return: - Unicode code point
29 static int get_code(u8 (*read_u8)(void *data), void *data)
36 if (ch >= 0xc2 && ch <= 0xf4) {
45 if (ch < 0x80 || ch > 0xbf)
53 if ((code >= 0xD800 && code <= 0xDFFF) ||
57 if (ch < 0x80 || ch > 0xbf)
60 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */
64 if (ch < 0x80 || ch > 0xbf)
68 } else if (ch >= 0x80) {
77 * read_string() - read byte from character string
79 * @data: - pointer to string
82 * The string pointer is incremented if it does not point to '\0'.
84 static u8 read_string(void *data)
87 const char **src = (const char **)data;
90 if (!src || !*src || !**src)
98 * read_console() - read byte from console
100 * @data: - not used, needed to match interface
101 * Return: - byte read or 0 on error
103 static u8 read_console(void *data)
113 int console_read_unicode(s32 *code)
116 /* No input available */
120 /* Read Unicode code */
121 *code = get_code(read_console, NULL);
125 s32 utf8_get(const char **src)
127 return get_code(read_string, src);
130 int utf8_put(s32 code, char **dst)
134 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
136 if (code <= 0x007F) {
139 if (code <= 0x07FF) {
140 **dst = code >> 6 | 0xC0;
142 if (code < 0x10000) {
143 **dst = code >> 12 | 0xE0;
145 **dst = code >> 18 | 0xF0;
147 **dst = (code >> 12 & 0x3F) | 0x80;
150 **dst = (code >> 6 & 0x3F) | 0x80;
153 **dst = (code & 0x3F) | 0x80;
159 size_t utf8_utf16_strnlen(const char *src, size_t count)
163 for (; *src && count; --count) {
164 s32 code = utf8_get(&src);
169 /* Reserve space for a replacement character */
171 } else if (code < 0x10000) {
180 int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
182 if (!src || !dst || !*dst)
185 for (; count && *src; --count) {
186 s32 code = utf8_get(&src);
190 utf16_put(code, dst);
196 s32 utf16_get(const u16 **src)
206 if (code >= 0xDC00 && code <= 0xDFFF)
208 if (code >= 0xD800 && code <= 0xDBFF) {
216 if (code2 < 0xDC00 || code2 > 0xDFFF) /* low surrogates are 0xDC00..0xDFFF inclusive; reject only values outside that range */
224 int utf16_put(s32 code, u16 **dst)
228 if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
230 if (code < 0x10000) {
234 **dst = code >> 10 | 0xD800;
236 **dst = (code & 0x3ff) | 0xDC00;
242 size_t utf16_strnlen(const u16 *src, size_t count)
246 for (; *src && count; --count) {
247 s32 code = utf16_get(&src);
252 * In case of an illegal sequence still reserve space for a
253 * replacement character.
260 size_t utf16_utf8_strnlen(const u16 *src, size_t count)
264 for (; *src && count; --count) {
265 s32 code = utf16_get(&src);
270 /* Reserve space for a replacement character */
272 else if (code < 0x80)
274 else if (code < 0x800)
276 else if (code < 0x10000)
284 int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
286 if (!src || !dst || !*dst)
289 for (; count && *src; --count) {
290 s32 code = utf16_get(&src);
300 s32 utf_to_lower(const s32 code)
302 struct capitalization_table *pos = capitalization_table;
306 if (code >= 'A' && code <= 'Z')
310 for (; pos->upper; ++pos) {
311 if (pos->upper == code) {
319 s32 utf_to_upper(const s32 code)
321 struct capitalization_table *pos = capitalization_table;
325 if (code >= 'a' && code <= 'z')
329 for (; pos->lower; ++pos) {
330 if (pos->lower == code) {
338 size_t u16_strlen(const u16 *in)
341 for (i = 0; in[i]; i++);
345 size_t u16_strnlen(const u16 *in, size_t count)
348 for (i = 0; count-- && in[i]; i++);
352 u16 *u16_strcpy(u16 *dest, const u16 *src)
356 for (;; dest++, src++) {
365 u16 *u16_strdup(const u16 *src)
372 new = malloc((u16_strlen(src) + 1) * sizeof(u16));
376 u16_strcpy(new, src);
381 /* Convert UTF-16 to UTF-8. */
382 uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
384 uint32_t code_high = 0;
387 uint32_t code = *src++;
390 if (code >= 0xDC00 && code <= 0xDFFF) {
391 /* Surrogate pair. */
392 code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000;
394 *dest++ = (code >> 18) | 0xF0;
395 *dest++ = ((code >> 12) & 0x3F) | 0x80;
396 *dest++ = ((code >> 6) & 0x3F) | 0x80;
397 *dest++ = (code & 0x3F) | 0x80;
401 /* *src may be valid. Don't eat it. */
407 if (code <= 0x007F) {
409 } else if (code <= 0x07FF) {
410 *dest++ = (code >> 6) | 0xC0;
411 *dest++ = (code & 0x3F) | 0x80;
412 } else if (code >= 0xD800 && code <= 0xDBFF) {
415 } else if (code >= 0xDC00 && code <= 0xDFFF) {
418 } else if (code < 0x10000) {
419 *dest++ = (code >> 12) | 0xE0;
420 *dest++ = ((code >> 6) & 0x3F) | 0x80;
421 *dest++ = (code & 0x3F) | 0x80;
423 *dest++ = (code >> 18) | 0xF0;
424 *dest++ = ((code >> 12) & 0x3F) | 0x80;
425 *dest++ = ((code >> 6) & 0x3F) | 0x80;
426 *dest++ = (code & 0x3F) | 0x80;