lib/charset.c

   1 // SPDX-License-Identifier: GPL-2.0+
   2 /*
   3  *  charset conversion utils
   4  *
   5  *  Copyright (c) 2017 Rob Clark
   6  */
   7
   8 #include <common.h>
   9 #include <charset.h>
  10 #include <capitalization.h>
  11 #include <cp437.h>
  12 #include <efi_loader.h>
  13 #include <malloc.h>
  14
/**
 * codepage_437 - Unicode to codepage 437 translation table
 *
 * The 128 u16 entries are initialised from the CP437 macro.
 * NOTE(review): presumably entry i holds the Unicode code point that
 * corresponds to codepage 437 byte 0x80 + i - confirm against the
 * definition of CP437.
 */
const u16 codepage_437[128] = CP437;
  19
/*
 * Upper/lower case pairs searched by utf_to_lower() and utf_to_upper().
 *
 * The initializer is selected at build time: the Unicode table if
 * CONFIG_EFI_UNICODE_CAPITALIZATION is defined, else the codepage 1250 table
 * if CONFIG_FAT_DEFAULT_CODEPAGE equals 1250, else the codepage 437 table.
 *
 * NOTE(review): the lookup loops stop at an entry whose upper (respectively
 * lower) member is zero, so each table macro is expected to end with a
 * zeroed entry - confirm against the macro definitions.
 */
static struct capitalization_table capitalization_table[] =
#ifdef CONFIG_EFI_UNICODE_CAPITALIZATION
        UNICODE_CAPITALIZATION_TABLE;
#elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250
        CP1250_CAPITALIZATION_TABLE;
#else
        CP437_CAPITALIZATION_TABLE;
#endif
  28
  29 /**
  30  * get_code() - read Unicode code point from UTF-8 stream
  31  *
  32  * @read_u8:    - stream reader
  33  * @src:        - string buffer passed to stream reader, optional
  34  * Return:      - Unicode code point
  35  */
  36 static int get_code(u8 (*read_u8)(void *data), void *data)
  37 {
  38         s32 ch = 0;
  39
  40         ch = read_u8(data);
  41         if (!ch)
  42                 return 0;
  43         if (ch >= 0xc2 && ch <= 0xf4) {
  44                 int code = 0;
  45
  46                 if (ch >= 0xe0) {
  47                         if (ch >= 0xf0) {
  48                                 /* 0xf0 - 0xf4 */
  49                                 ch &= 0x07;
  50                                 code = ch << 18;
  51                                 ch = read_u8(data);
  52                                 if (ch < 0x80 || ch > 0xbf)
  53                                         goto error;
  54                                 ch &= 0x3f;
  55                         } else {
  56                                 /* 0xe0 - 0xef */
  57                                 ch &= 0x0f;
  58                         }
  59                         code += ch << 12;
  60                         if ((code >= 0xD800 && code <= 0xDFFF) ||
  61                             code >= 0x110000)
  62                                 goto error;
  63                         ch = read_u8(data);
  64                         if (ch < 0x80 || ch > 0xbf)
  65                                 goto error;
  66                 }
  67                 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */
  68                 ch &= 0x3f;
  69                 code += ch << 6;
  70                 ch = read_u8(data);
  71                 if (ch < 0x80 || ch > 0xbf)
  72                         goto error;
  73                 ch &= 0x3f;
  74                 ch += code;
  75         } else if (ch >= 0x80) {
  76                 goto error;
  77         }
  78         return ch;
  79 error:
  80         return '?';
  81 }
  82
  83 /**
  84  * read_string() - read byte from character string
  85  *
  86  * @data:       - pointer to string
  87  * Return:      - byte read
  88  *
  89  * The string pointer is incremented if it does not point to '\0'.
  90  */
  91 static u8 read_string(void *data)
  92
  93 {
  94         const char **src = (const char **)data;
  95         u8 c;
  96
  97         if (!src || !*src || !**src)
  98                 return 0;
  99         c = **src;
 100         ++*src;
 101         return c;
 102 }
 103
 104 /**
 105  * read_console() - read byte from console
 106  *
 107  * @data        - not used, needed to match interface
 108  * Return:      - byte read or 0 on error
 109  */
 110 static u8 read_console(void *data)
 111 {
 112         int ch;
 113
 114         ch = getchar();
 115         if (ch < 0)
 116                 ch = 0;
 117         return ch;
 118 }
 119
 120 int console_read_unicode(s32 *code)
 121 {
 122         if (!tstc()) {
 123                 /* No input available */
 124                 return 1;
 125         }
 126
 127         /* Read Unicode code */
 128         *code = get_code(read_console, NULL);
 129         return 0;
 130 }
 131
 132 s32 utf8_get(const char **src)
 133 {
 134         return get_code(read_string, src);
 135 }
 136
 137 int utf8_put(s32 code, char **dst)
 138 {
 139         if (!dst || !*dst)
 140                 return -1;
 141         if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
 142                 return -1;
 143         if (code <= 0x007F) {
 144                 **dst = code;
 145         } else {
 146                 if (code <= 0x07FF) {
 147                         **dst = code >> 6 | 0xC0;
 148                 } else {
 149                         if (code < 0x10000) {
 150                                 **dst = code >> 12 | 0xE0;
 151                         } else {
 152                                 **dst = code >> 18 | 0xF0;
 153                                 ++*dst;
 154                                 **dst = (code >> 12 & 0x3F) | 0x80;
 155                         }
 156                         ++*dst;
 157                         **dst = (code >> 6 & 0x3F) | 0x80;
 158                 }
 159                 ++*dst;
 160                 **dst = (code & 0x3F) | 0x80;
 161         }
 162         ++*dst;
 163         return 0;
 164 }
 165
 166 size_t utf8_utf16_strnlen(const char *src, size_t count)
 167 {
 168         size_t len = 0;
 169
 170         for (; *src && count; --count)  {
 171                 s32 code = utf8_get(&src);
 172
 173                 if (!code)
 174                         break;
 175                 if (code < 0) {
 176                         /* Reserve space for a replacement character */
 177                         len += 1;
 178                 } else if (code < 0x10000) {
 179                         len += 1;
 180                 } else {
 181                         len += 2;
 182                 }
 183         }
 184         return len;
 185 }
 186
 187 int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
 188 {
 189         if (!src || !dst || !*dst)
 190                 return -1;
 191
 192         for (; count && *src; --count) {
 193                 s32 code = utf8_get(&src);
 194
 195                 if (code < 0)
 196                         code = '?';
 197                 utf16_put(code, dst);
 198         }
 199         **dst = 0;
 200         return 0;
 201 }
 202
 203 s32 utf16_get(const u16 **src)
 204 {
 205         s32 code, code2;
 206
 207         if (!src || !*src)
 208                 return -1;
 209         if (!**src)
 210                 return 0;
 211         code = **src;
 212         ++*src;
 213         if (code >= 0xDC00 && code <= 0xDFFF)
 214                 return -1;
 215         if (code >= 0xD800 && code <= 0xDBFF) {
 216                 if (!**src)
 217                         return -1;
 218                 code &= 0x3ff;
 219                 code <<= 10;
 220                 code += 0x10000;
 221                 code2 = **src;
 222                 ++*src;
 223                 if (code2 <= 0xDC00 || code2 >= 0xDFFF)
 224                         return -1;
 225                 code2 &= 0x3ff;
 226                 code += code2;
 227         }
 228         return code;
 229 }
 230
 231 int utf16_put(s32 code, u16 **dst)
 232 {
 233         if (!dst || !*dst)
 234                 return -1;
 235         if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
 236                 return -1;
 237         if (code < 0x10000) {
 238                 **dst = code;
 239         } else {
 240                 code -= 0x10000;
 241                 **dst = code >> 10 | 0xD800;
 242                 ++*dst;
 243                 **dst = (code & 0x3ff) | 0xDC00;
 244         }
 245         ++*dst;
 246         return 0;
 247 }
 248
 249 size_t utf16_strnlen(const u16 *src, size_t count)
 250 {
 251         size_t len = 0;
 252
 253         for (; *src && count; --count)  {
 254                 s32 code = utf16_get(&src);
 255
 256                 if (!code)
 257                         break;
 258                 /*
 259                  * In case of an illegal sequence still reserve space for a
 260                  * replacement character.
 261                  */
 262                 ++len;
 263         }
 264         return len;
 265 }
 266
 267 size_t utf16_utf8_strnlen(const u16 *src, size_t count)
 268 {
 269         size_t len = 0;
 270
 271         for (; *src && count; --count)  {
 272                 s32 code = utf16_get(&src);
 273
 274                 if (!code)
 275                         break;
 276                 if (code < 0)
 277                         /* Reserve space for a replacement character */
 278                         len += 1;
 279                 else if (code < 0x80)
 280                         len += 1;
 281                 else if (code < 0x800)
 282                         len += 2;
 283                 else if (code < 0x10000)
 284                         len += 3;
 285                 else
 286                         len += 4;
 287         }
 288         return len;
 289 }
 290
 291 int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
 292 {
 293         if (!src || !dst || !*dst)
 294                 return -1;
 295
 296         for (; count && *src; --count) {
 297                 s32 code = utf16_get(&src);
 298
 299                 if (code < 0)
 300                         code = '?';
 301                 utf8_put(code, dst);
 302         }
 303         **dst = 0;
 304         return 0;
 305 }
 306
 307 s32 utf_to_lower(const s32 code)
 308 {
 309         struct capitalization_table *pos = capitalization_table;
 310         s32 ret = code;
 311
 312         if (code <= 0x7f) {
 313                 if (code >= 'A' && code <= 'Z')
 314                         ret += 0x20;
 315                 return ret;
 316         }
 317         for (; pos->upper; ++pos) {
 318                 if (pos->upper == code) {
 319                         ret = pos->lower;
 320                         break;
 321                 }
 322         }
 323         return ret;
 324 }
 325
 326 s32 utf_to_upper(const s32 code)
 327 {
 328         struct capitalization_table *pos = capitalization_table;
 329         s32 ret = code;
 330
 331         if (code <= 0x7f) {
 332                 if (code >= 'a' && code <= 'z')
 333                         ret -= 0x20;
 334                 return ret;
 335         }
 336         for (; pos->lower; ++pos) {
 337                 if (pos->lower == code) {
 338                         ret = pos->upper;
 339                         break;
 340                 }
 341         }
 342         return ret;
 343 }
 344
 345 /*
 346  * u16_strncmp() - compare two u16 string
 347  *
 348  * @s1:         first string to compare
 349  * @s2:         second string to compare
 350  * @n:          maximum number of u16 to compare
 351  * Return:      0  if the first n u16 are the same in s1 and s2
 352  *              < 0 if the first different u16 in s1 is less than the
 353  *              corresponding u16 in s2
 354  *              > 0 if the first different u16 in s1 is greater than the
 355  *              corresponding u16 in s2
 356  */
 357 int u16_strncmp(const u16 *s1, const u16 *s2, size_t n)
 358 {
 359         int ret = 0;
 360
 361         for (; n; --n, ++s1, ++s2) {
 362                 ret = *s1 - *s2;
 363                 if (ret || !*s1)
 364                         break;
 365         }
 366
 367         return ret;
 368 }
 369
 370 size_t u16_strlen(const void *in)
 371 {
 372         const char *pos = in;
 373         size_t ret;
 374
 375         for (; pos[0] || pos[1]; pos += 2)
 376                 ;
 377         ret = pos - (char *)in;
 378         ret >>= 1;
 379         return ret;
 380 }
 381
 382 size_t __efi_runtime u16_strnlen(const u16 *in, size_t count)
 383 {
 384         size_t i;
 385         for (i = 0; count-- && in[i]; i++);
 386         return i;
 387 }
 388
 389 size_t u16_strsize(const void *in)
 390 {
 391         return (u16_strlen(in) + 1) * sizeof(u16);
 392 }
 393
 394 u16 *u16_strcpy(u16 *dest, const u16 *src)
 395 {
 396         u16 *tmp = dest;
 397
 398         for (;; dest++, src++) {
 399                 *dest = *src;
 400                 if (!*src)
 401                         break;
 402         }
 403
 404         return tmp;
 405 }
 406
 407 u16 *u16_strdup(const void *src)
 408 {
 409         u16 *new;
 410         size_t len;
 411
 412         if (!src)
 413                 return NULL;
 414         len = (u16_strlen(src) + 1) * sizeof(u16);
 415         new = malloc(len);
 416         if (!new)
 417                 return NULL;
 418         memcpy(new, src, len);
 419
 420         return new;
 421 }
 422
 423 /* Convert UTF-16 to UTF-8.  */
 424 uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
 425 {
 426         uint32_t code_high = 0;
 427
 428         while (size--) {
 429                 uint32_t code = *src++;
 430
 431                 if (code_high) {
 432                         if (code >= 0xDC00 && code <= 0xDFFF) {
 433                                 /* Surrogate pair.  */
 434                                 code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000;
 435
 436                                 *dest++ = (code >> 18) | 0xF0;
 437                                 *dest++ = ((code >> 12) & 0x3F) | 0x80;
 438                                 *dest++ = ((code >> 6) & 0x3F) | 0x80;
 439                                 *dest++ = (code & 0x3F) | 0x80;
 440                         } else {
 441                                 /* Error...  */
 442                                 *dest++ = '?';
 443                                 /* *src may be valid. Don't eat it.  */
 444                                 src--;
 445                         }
 446
 447                         code_high = 0;
 448                 } else {
 449                         if (code <= 0x007F) {
 450                                 *dest++ = code;
 451                         } else if (code <= 0x07FF) {
 452                                 *dest++ = (code >> 6) | 0xC0;
 453                                 *dest++ = (code & 0x3F) | 0x80;
 454                         } else if (code >= 0xD800 && code <= 0xDBFF) {
 455                                 code_high = code;
 456                                 continue;
 457                         } else if (code >= 0xDC00 && code <= 0xDFFF) {
 458                                 /* Error... */
 459                                 *dest++ = '?';
 460                         } else if (code < 0x10000) {
 461                                 *dest++ = (code >> 12) | 0xE0;
 462                                 *dest++ = ((code >> 6) & 0x3F) | 0x80;
 463                                 *dest++ = (code & 0x3F) | 0x80;
 464                         } else {
 465                                 *dest++ = (code >> 18) | 0xF0;
 466                                 *dest++ = ((code >> 12) & 0x3F) | 0x80;
 467                                 *dest++ = ((code >> 6) & 0x3F) | 0x80;
 468                                 *dest++ = (code & 0x3F) | 0x80;
 469                         }
 470                 }
 471         }
 472
 473         return dest;
 474 }