lib/charset.c

   1 // SPDX-License-Identifier: GPL-2.0+
   2 /*
   3  *  charset conversion utils
   4  *
   5  *  Copyright (c) 2017 Rob Clark
   6  */
   7
   8 #include <common.h>
   9 #include <charset.h>
  10 #include <capitalization.h>
  11 #include <malloc.h>
  12
  13 static struct capitalization_table capitalization_table[] =
  14 #ifdef CONFIG_EFI_UNICODE_CAPITALIZATION
  15         UNICODE_CAPITALIZATION_TABLE;
  16 #elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250
  17         CP1250_CAPITALIZATION_TABLE;
  18 #else
  19         CP437_CAPITALIZATION_TABLE;
  20 #endif
  21
  22 /**
  23  * get_code() - read Unicode code point from UTF-8 stream
  24  *
  25  * @read_u8:    - stream reader
  26  * @src:        - string buffer passed to stream reader, optional
  27  * Return:      - Unicode code point
  28  */
  29 static int get_code(u8 (*read_u8)(void *data), void *data)
  30 {
  31         s32 ch = 0;
  32
  33         ch = read_u8(data);
  34         if (!ch)
  35                 return 0;
  36         if (ch >= 0xc2 && ch <= 0xf4) {
  37                 int code = 0;
  38
  39                 if (ch >= 0xe0) {
  40                         if (ch >= 0xf0) {
  41                                 /* 0xf0 - 0xf4 */
  42                                 ch &= 0x07;
  43                                 code = ch << 18;
  44                                 ch = read_u8(data);
  45                                 if (ch < 0x80 || ch > 0xbf)
  46                                         goto error;
  47                                 ch &= 0x3f;
  48                         } else {
  49                                 /* 0xe0 - 0xef */
  50                                 ch &= 0x0f;
  51                         }
  52                         code += ch << 12;
  53                         if ((code >= 0xD800 && code <= 0xDFFF) ||
  54                             code >= 0x110000)
  55                                 goto error;
  56                         ch = read_u8(data);
  57                         if (ch < 0x80 || ch > 0xbf)
  58                                 goto error;
  59                 }
  60                 /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */
  61                 ch &= 0x3f;
  62                 code += ch << 6;
  63                 ch = read_u8(data);
  64                 if (ch < 0x80 || ch > 0xbf)
  65                         goto error;
  66                 ch &= 0x3f;
  67                 ch += code;
  68         } else if (ch >= 0x80) {
  69                 goto error;
  70         }
  71         return ch;
  72 error:
  73         return '?';
  74 }
  75
  76 /**
  77  * read_string() - read byte from character string
  78  *
  79  * @data:       - pointer to string
  80  * Return:      - byte read
  81  *
  82  * The string pointer is incremented if it does not point to '\0'.
  83  */
  84 static u8 read_string(void *data)
  85
  86 {
  87         const char **src = (const char **)data;
  88         u8 c;
  89
  90         if (!src || !*src || !**src)
  91                 return 0;
  92         c = **src;
  93         ++*src;
  94         return c;
  95 }
  96
  97 /**
  98  * read_console() - read byte from console
  99  *
 100  * @data        - not used, needed to match interface
 101  * Return:      - byte read or 0 on error
 102  */
 103 static u8 read_console(void *data)
 104 {
 105         int ch;
 106
 107         ch = getc();
 108         if (ch < 0)
 109                 ch = 0;
 110         return ch;
 111 }
 112
 113 int console_read_unicode(s32 *code)
 114 {
 115         if (!tstc()) {
 116                 /* No input available */
 117                 return 1;
 118         }
 119
 120         /* Read Unicode code */
 121         *code = get_code(read_console, NULL);
 122         return 0;
 123 }
 124
 125 s32 utf8_get(const char **src)
 126 {
 127         return get_code(read_string, src);
 128 }
 129
 130 int utf8_put(s32 code, char **dst)
 131 {
 132         if (!dst || !*dst)
 133                 return -1;
 134         if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
 135                 return -1;
 136         if (code <= 0x007F) {
 137                 **dst = code;
 138         } else {
 139                 if (code <= 0x07FF) {
 140                         **dst = code >> 6 | 0xC0;
 141                 } else {
 142                         if (code < 0x10000) {
 143                                 **dst = code >> 12 | 0xE0;
 144                         } else {
 145                                 **dst = code >> 18 | 0xF0;
 146                                 ++*dst;
 147                                 **dst = (code >> 12 & 0x3F) | 0x80;
 148                         }
 149                         ++*dst;
 150                         **dst = (code >> 6 & 0x3F) | 0x80;
 151                 }
 152                 ++*dst;
 153                 **dst = (code & 0x3F) | 0x80;
 154         }
 155         ++*dst;
 156         return 0;
 157 }
 158
 159 size_t utf8_utf16_strnlen(const char *src, size_t count)
 160 {
 161         size_t len = 0;
 162
 163         for (; *src && count; --count)  {
 164                 s32 code = utf8_get(&src);
 165
 166                 if (!code)
 167                         break;
 168                 if (code < 0) {
 169                         /* Reserve space for a replacement character */
 170                         len += 1;
 171                 } else if (code < 0x10000) {
 172                         len += 1;
 173                 } else {
 174                         len += 2;
 175                 }
 176         }
 177         return len;
 178 }
 179
 180 int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
 181 {
 182         if (!src || !dst || !*dst)
 183                 return -1;
 184
 185         for (; count && *src; --count) {
 186                 s32 code = utf8_get(&src);
 187
 188                 if (code < 0)
 189                         code = '?';
 190                 utf16_put(code, dst);
 191         }
 192         **dst = 0;
 193         return 0;
 194 }
 195
 196 s32 utf16_get(const u16 **src)
 197 {
 198         s32 code, code2;
 199
 200         if (!src || !*src)
 201                 return -1;
 202         if (!**src)
 203                 return 0;
 204         code = **src;
 205         ++*src;
 206         if (code >= 0xDC00 && code <= 0xDFFF)
 207                 return -1;
 208         if (code >= 0xD800 && code <= 0xDBFF) {
 209                 if (!**src)
 210                         return -1;
 211                 code &= 0x3ff;
 212                 code <<= 10;
 213                 code += 0x10000;
 214                 code2 = **src;
 215                 ++*src;
 216                 if (code2 <= 0xDC00 || code2 >= 0xDFFF)
 217                         return -1;
 218                 code2 &= 0x3ff;
 219                 code += code2;
 220         }
 221         return code;
 222 }
 223
 224 int utf16_put(s32 code, u16 **dst)
 225 {
 226         if (!dst || !*dst)
 227                 return -1;
 228         if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
 229                 return -1;
 230         if (code < 0x10000) {
 231                 **dst = code;
 232         } else {
 233                 code -= 0x10000;
 234                 **dst = code >> 10 | 0xD800;
 235                 ++*dst;
 236                 **dst = (code & 0x3ff) | 0xDC00;
 237         }
 238         ++*dst;
 239         return 0;
 240 }
 241
 242 size_t utf16_strnlen(const u16 *src, size_t count)
 243 {
 244         size_t len = 0;
 245
 246         for (; *src && count; --count)  {
 247                 s32 code = utf16_get(&src);
 248
 249                 if (!code)
 250                         break;
 251                 /*
 252                  * In case of an illegal sequence still reserve space for a
 253                  * replacement character.
 254                  */
 255                 ++len;
 256         }
 257         return len;
 258 }
 259
 260 size_t utf16_utf8_strnlen(const u16 *src, size_t count)
 261 {
 262         size_t len = 0;
 263
 264         for (; *src && count; --count)  {
 265                 s32 code = utf16_get(&src);
 266
 267                 if (!code)
 268                         break;
 269                 if (code < 0)
 270                         /* Reserve space for a replacement character */
 271                         len += 1;
 272                 else if (code < 0x80)
 273                         len += 1;
 274                 else if (code < 0x800)
 275                         len += 2;
 276                 else if (code < 0x10000)
 277                         len += 3;
 278                 else
 279                         len += 4;
 280         }
 281         return len;
 282 }
 283
 284 int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
 285 {
 286         if (!src || !dst || !*dst)
 287                 return -1;
 288
 289         for (; count && *src; --count) {
 290                 s32 code = utf16_get(&src);
 291
 292                 if (code < 0)
 293                         code = '?';
 294                 utf8_put(code, dst);
 295         }
 296         **dst = 0;
 297         return 0;
 298 }
 299
 300 s32 utf_to_lower(const s32 code)
 301 {
 302         struct capitalization_table *pos = capitalization_table;
 303         s32 ret = code;
 304
 305         if (code <= 0x7f) {
 306                 if (code >= 'A' && code <= 'Z')
 307                         ret += 0x20;
 308                 return ret;
 309         }
 310         for (; pos->upper; ++pos) {
 311                 if (pos->upper == code) {
 312                         ret = pos->lower;
 313                         break;
 314                 }
 315         }
 316         return ret;
 317 }
 318
 319 s32 utf_to_upper(const s32 code)
 320 {
 321         struct capitalization_table *pos = capitalization_table;
 322         s32 ret = code;
 323
 324         if (code <= 0x7f) {
 325                 if (code >= 'a' && code <= 'z')
 326                         ret -= 0x20;
 327                 return ret;
 328         }
 329         for (; pos->lower; ++pos) {
 330                 if (pos->lower == code) {
 331                         ret = pos->upper;
 332                         break;
 333                 }
 334         }
 335         return ret;
 336 }
 337
 338 /*
 339  * u16_strncmp() - compare two u16 string
 340  *
 341  * @s1:         first string to compare
 342  * @s2:         second string to compare
 343  * @n:          maximum number of u16 to compare
 344  * Return:      0  if the first n u16 are the same in s1 and s2
 345  *              < 0 if the first different u16 in s1 is less than the
 346  *              corresponding u16 in s2
 347  *              > 0 if the first different u16 in s1 is greater than the
 348  *              corresponding u16 in s2
 349  */
 350 int u16_strncmp(const u16 *s1, const u16 *s2, size_t n)
 351 {
 352         int ret = 0;
 353
 354         for (; n; --n, ++s1, ++s2) {
 355                 ret = *s1 - *s2;
 356                 if (ret || !*s1)
 357                         break;
 358         }
 359
 360         return ret;
 361 }
 362
 363 size_t u16_strlen(const void *in)
 364 {
 365         const char *pos = in;
 366         size_t ret;
 367
 368         for (; pos[0] || pos[1]; pos += 2)
 369                 ;
 370         ret = pos - (char *)in;
 371         ret >>= 1;
 372         return ret;
 373 }
 374
 375 size_t u16_strnlen(const u16 *in, size_t count)
 376 {
 377         size_t i;
 378         for (i = 0; count-- && in[i]; i++);
 379         return i;
 380 }
 381
 382 u16 *u16_strcpy(u16 *dest, const u16 *src)
 383 {
 384         u16 *tmp = dest;
 385
 386         for (;; dest++, src++) {
 387                 *dest = *src;
 388                 if (!*src)
 389                         break;
 390         }
 391
 392         return tmp;
 393 }
 394
 395 u16 *u16_strdup(const void *src)
 396 {
 397         u16 *new;
 398         size_t len;
 399
 400         if (!src)
 401                 return NULL;
 402         len = (u16_strlen(src) + 1) * sizeof(u16);
 403         new = malloc(len);
 404         if (!new)
 405                 return NULL;
 406         memcpy(new, src, len);
 407
 408         return new;
 409 }
 410
 411 /* Convert UTF-16 to UTF-8.  */
 412 uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
 413 {
 414         uint32_t code_high = 0;
 415
 416         while (size--) {
 417                 uint32_t code = *src++;
 418
 419                 if (code_high) {
 420                         if (code >= 0xDC00 && code <= 0xDFFF) {
 421                                 /* Surrogate pair.  */
 422                                 code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000;
 423
 424                                 *dest++ = (code >> 18) | 0xF0;
 425                                 *dest++ = ((code >> 12) & 0x3F) | 0x80;
 426                                 *dest++ = ((code >> 6) & 0x3F) | 0x80;
 427                                 *dest++ = (code & 0x3F) | 0x80;
 428                         } else {
 429                                 /* Error...  */
 430                                 *dest++ = '?';
 431                                 /* *src may be valid. Don't eat it.  */
 432                                 src--;
 433                         }
 434
 435                         code_high = 0;
 436                 } else {
 437                         if (code <= 0x007F) {
 438                                 *dest++ = code;
 439                         } else if (code <= 0x07FF) {
 440                                 *dest++ = (code >> 6) | 0xC0;
 441                                 *dest++ = (code & 0x3F) | 0x80;
 442                         } else if (code >= 0xD800 && code <= 0xDBFF) {
 443                                 code_high = code;
 444                                 continue;
 445                         } else if (code >= 0xDC00 && code <= 0xDFFF) {
 446                                 /* Error... */
 447                                 *dest++ = '?';
 448                         } else if (code < 0x10000) {
 449                                 *dest++ = (code >> 12) | 0xE0;
 450                                 *dest++ = ((code >> 6) & 0x3F) | 0x80;
 451                                 *dest++ = (code & 0x3F) | 0x80;
 452                         } else {
 453                                 *dest++ = (code >> 18) | 0xF0;
 454                                 *dest++ = ((code >> 12) & 0x3F) | 0x80;
 455                                 *dest++ = ((code >> 6) & 0x3F) | 0x80;
 456                                 *dest++ = (code & 0x3F) | 0x80;
 457                         }
 458                 }
 459         }
 460
 461         return dest;
 462 }