src/tsm_unicode.c

   1 /*
   2  * TSM - Unicode Handling
   3  *
   4  * Copyright (c) 2011 David Herrmann <dh.herrmann@googlemail.com>
   5  * Copyright (c) 2011-2012 University of Tuebingen
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining
   8  * a copy of this software and associated documentation files
   9  * (the "Software"), to deal in the Software without restriction, including
  10  * without limitation the rights to use, copy, modify, merge, publish,
  11  * distribute, sublicense, and/or sell copies of the Software, and to
  12  * permit persons to whom the Software is furnished to do so, subject to
  13  * the following conditions:
  14  *
  15  * The above copyright notice and this permission notice shall be included
  16  * in all copies or substantial portions of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  22  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  */
  26
  27 /*
  28  * The tsm-utf8-state-machine is based on the wayland-compositor demos:
  29  *
  30  * Copyright © 2008 Kristian Høgsberg
  31  *
  32  * Permission to use, copy, modify, distribute, and sell this software and
  33  * its documentation for any purpose is hereby granted without fee, provided
  34  * that the above copyright notice appear in all copies and that both that
  35  * copyright notice and this permission notice appear in supporting
  36  * documentation, and that the name of the copyright holders not be used in
  37  * advertising or publicity pertaining to distribution of the software
  38  * without specific, written prior permission.  The copyright holders make
  39  * no representations about the suitability of this software for any
  40  * purpose.  It is provided "as is" without express or implied warranty.
  41  *
  42  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
  43  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  44  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
  45  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
  46  * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
  47  * CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
  48  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  49  */
  50
  51 /*
  52  * Unicode Helpers
  53  * This implements several helpers for Unicode/UTF8/UCS4 input and output. See
  54  * below for comments on each helper.
  55  */
  56
  57 #include <errno.h>
  58 #include <inttypes.h>
  59 #include <stdlib.h>
  60 #include <string.h>
  61 #include "external/wcwidth.h"
  62 #include "shl_array.h"
  63 #include "shl_hashtable.h"
  64 #include "shl_misc.h"
  65 #include "tsm_unicode.h"
  66
  67 /*
  68  * Unicode Symbol Handling
  69  * The main goal of the tsm_symbol_* functions is to provide a datatype which
  70  * can contain the representation of any printable character. This includes all
  71  * basic Unicode characters but also combined characters.
  72  * To avoid all the memory management we still represent a character as a single
  73  * integer value (tsm_symbol_t) but internally we allocate a string which is
  74  * represented by this value.
  75  *
  76  * A tsm_symbol_t is an integer which represents a single character point.
  77  * For most Unicode characters this is simply the UCS4 representation. In fact,
   78  * every UCS4 character is a valid tsm_symbol_t object.
   79  * However, Unicode standard allows combining marks. Therefore, some characters
   80  * consist of more than one Unicode character.
  81  * A global symbol-table provides all those combined characters as single
  82  * integers. You simply create a valid base character and append your combining
  83  * marks and the table will return a new valid tsm_symbol_t. It is no longer
  84  * a valid UCS4 value, though. But no memory management is needed as all
  85  * tsm_symbol_t objects are simple integers.
  86  *
  87  * The symbol table contains two-way
  88  * references. The Hash Table contains all the symbols with the symbol ucs4
  89  * string as key and the symbol ID as value.
  90  * The index array contains the symbol ID as key and a pointer to the ucs4
  91  * string as value. But the hash table owns the ucs4 string.
  92  * This allows fast implementations of *_get() and *_append() without long
  93  * search intervals.
  94  *
  95  * When creating a new symbol, we simply return the UCS4 value as new symbol. We
  96  * do not add it to our symbol table as it is only one character. However, if a
  97  * character is appended to an existing symbol, we create a new ucs4 string and
  98  * push the new symbol into the symbol table.
  99  */
 100
 101 SHL_EXPORT
 102 const tsm_symbol_t tsm_symbol_default = 0;
 103
 104 struct tsm_symbol_table {
 105         unsigned long ref;
 106         uint32_t next_id;
 107         struct shl_array *index;
 108         struct shl_hashtable *symbols;
 109 };
 110
 111 /* TODO: remove the default context */
 112 static struct tsm_symbol_table *tsm_symbol_table_default;
 113
 114 static unsigned int hash_ucs4(const void *key)
 115 {
 116         unsigned int val = 5381;
 117         size_t i;
 118         const uint32_t *ucs4 = key;
 119
 120         i = 0;
 121         while (ucs4[i] <= TSM_UCS4_MAX) {
 122                 val = val * 33 + ucs4[i];
 123                 ++i;
 124         }
 125
 126         return val;
 127 }
 128
 129 static bool cmp_ucs4(const void *a, const void *b)
 130 {
 131         size_t i;
 132         const uint32_t *v1, *v2;
 133
 134         v1 = a;
 135         v2 = b;
 136         i = 0;
 137
 138         while (1) {
 139                 if (v1[i] > TSM_UCS4_MAX && v2[i] > TSM_UCS4_MAX)
 140                         return true;
 141                 if (v1[i] > TSM_UCS4_MAX && v2[i] <= TSM_UCS4_MAX)
 142                         return false;
 143                 if (v1[i] <= TSM_UCS4_MAX && v2[i] > TSM_UCS4_MAX)
 144                         return false;
 145                 if (v1[i] != v2[i])
 146                         return false;
 147
 148                 ++i;
 149         }
 150 }
 151
 152 SHL_EXPORT
 153 int tsm_symbol_table_new(struct tsm_symbol_table **out)
 154 {
 155         struct tsm_symbol_table *tbl;
 156         int ret;
 157         static const uint32_t *val = NULL; /* we need a valid lvalue */
 158
 159         if (!out)
 160                 return -EINVAL;
 161
 162         tbl = malloc(sizeof(*tbl));
 163         if (!tbl)
 164                 return -ENOMEM;
 165         memset(tbl, 0, sizeof(*tbl));
 166         tbl->ref = 1;
 167         tbl->next_id = TSM_UCS4_MAX + 2;
 168
 169         ret = shl_array_new(&tbl->index, sizeof(uint32_t*), 4);
 170         if (ret)
 171                 goto err_free;
 172
 173         /* first entry is not used so add dummy */
 174         shl_array_push(tbl->index, &val);
 175
 176         ret = shl_hashtable_new(&tbl->symbols, hash_ucs4, cmp_ucs4,
 177                                 free, NULL);
 178         if (ret)
 179                 goto err_array;
 180
 181         *out = tbl;
 182         return 0;
 183
 184 err_array:
 185         shl_array_free(tbl->index);
 186 err_free:
 187         free(tbl);
 188         return ret;
 189 }
 190
 191 SHL_EXPORT
 192 void tsm_symbol_table_ref(struct tsm_symbol_table *tbl)
 193 {
 194         if (!tbl || !tbl->ref)
 195                 return;
 196
 197         ++tbl->ref;
 198 }
 199
 200 SHL_EXPORT
 201 void tsm_symbol_table_unref(struct tsm_symbol_table *tbl)
 202 {
 203         if (!tbl || !tbl->ref || --tbl->ref)
 204                 return;
 205
 206         shl_hashtable_free(tbl->symbols);
 207         shl_array_free(tbl->index);
 208         free(tbl);
 209 }
 210
 211 SHL_EXPORT
 212 tsm_symbol_t tsm_symbol_make(uint32_t ucs4)
 213 {
 214         if (ucs4 > TSM_UCS4_MAX)
 215                 return 0;
 216         else
 217                 return ucs4;
 218 }
 219
 220 /*
 221  * This decomposes a symbol into a ucs4 string and a size value. If \sym is a
 222  * valid UCS4 character, this returns a pointer to \sym and writes 1 into \size.
 223  * Therefore, the returned value may get destroyed if your \sym argument gets
 224  * destroyed.
 225  * If \sym is a composed ucs4 string, then the returned value points into the
 226  * hash table of the symbol table and lives as long as the symbol table does.
 227  *
 228  * This always returns a valid value. If an error happens, the default character
 229  * is returned. If \size is NULL, then the size value is omitted.
 230  */
 231 SHL_EXPORT
 232 const uint32_t *tsm_symbol_get(struct tsm_symbol_table *tbl,
 233                                tsm_symbol_t *sym, size_t *size)
 234 {
 235         uint32_t *ucs4, idx;
 236         int ret;
 237
 238         if (*sym <= TSM_UCS4_MAX) {
 239                 if (size)
 240                         *size = 1;
 241                 return sym;
 242         }
 243
 244         if (!tbl)
 245                 tbl = tsm_symbol_table_default;
 246
 247         if (!tbl) {
 248                 ret = tsm_symbol_table_new(&tbl);
 249                 if (ret) {
 250                         if (size)
 251                                 *size = 1;
 252                         return &tsm_symbol_default;
 253                 }
 254                 tsm_symbol_table_default = tbl;
 255         }
 256
 257         idx = *sym - (TSM_UCS4_MAX + 1);
 258         if (idx >= shl_array_get_length(tbl->index))
 259                 ucs4 = NULL;
 260         else
 261                 ucs4 = *SHL_ARRAY_AT(tbl->index, uint32_t*, idx);
 262
 263         if (!ucs4) {
 264                 if (size)
 265                         *size = 1;
 266                 return &tsm_symbol_default;
 267         }
 268
 269         if (size) {
 270                 *size = 0;
 271                 while (ucs4[*size] <= TSM_UCS4_MAX)
 272                         ++*size;
 273         }
 274
 275         return ucs4;
 276 }
 277
 278 SHL_EXPORT
 279 tsm_symbol_t tsm_symbol_append(struct tsm_symbol_table *tbl,
 280                                tsm_symbol_t sym, uint32_t ucs4)
 281 {
 282         uint32_t buf[TSM_UCS4_MAXLEN + 1], nsym, *nval;
 283         const uint32_t *ptr;
 284         size_t s;
 285         void *tmp;
 286         bool res;
 287         int ret;
 288
 289         if (!tbl)
 290                 tbl = tsm_symbol_table_default;
 291
 292         if (!tbl) {
 293                 ret = tsm_symbol_table_new(&tbl);
 294                 if (ret)
 295                         return sym;
 296                 tsm_symbol_table_default = tbl;
 297         }
 298
 299         if (ucs4 > TSM_UCS4_MAX)
 300                 return sym;
 301
 302         ptr = tsm_symbol_get(tbl, &sym, &s);
 303         if (s >= TSM_UCS4_MAXLEN)
 304                 return sym;
 305
 306         memcpy(buf, ptr, s * sizeof(uint32_t));
 307         buf[s++] = ucs4;
 308         buf[s++] = TSM_UCS4_MAX + 1;
 309
 310         res = shl_hashtable_find(tbl->symbols, &tmp, buf);
 311         if (res)
 312                 return (uint32_t)(long)tmp;
 313
 314         nval = malloc(sizeof(uint32_t) * s);
 315         if (!nval)
 316                 return sym;
 317
 318         memcpy(nval, buf, s * sizeof(uint32_t));
 319         nsym = tbl->next_id + 1;
 320         /* Out of IDs; we actually have 2 Billion IDs so this seems
 321          * very unlikely but lets be safe here */
 322         if (nsym <= tbl->next_id++)
 323                 goto err_id;
 324
 325         ret = shl_hashtable_insert(tbl->symbols, nval, (void*)(long)nsym);
 326         if (ret)
 327                 goto err_id;
 328
 329         ret = shl_array_push(tbl->index, &nval);
 330         if (ret)
 331                 goto err_symbol;
 332
 333         return nsym;
 334
 335 err_symbol:
 336         shl_hashtable_remove(tbl->symbols, nval);
 337 err_id:
 338         --tbl->next_id;
 339         free(nval);
 340         return sym;
 341 }
 342
 343 SHL_EXPORT
 344 unsigned int tsm_symbol_get_width(struct tsm_symbol_table *tbl,
 345                                   tsm_symbol_t sym)
 346 {
 347         int ret;
 348         const uint32_t *ch;
 349         size_t len;
 350
 351         if (!tbl)
 352                 tbl = tsm_symbol_table_default;
 353
 354         if (!tbl) {
 355                 ret = tsm_symbol_table_new(&tbl);
 356                 if (ret)
 357                         return sym;
 358                 tsm_symbol_table_default = tbl;
 359         }
 360
 361         ch = tsm_symbol_get(tbl, &sym, &len);
 362         if (len == 0)
 363                 return 0;
 364
 365         return tsm_ucs4_get_width(*ch);
 366 }
 367
 368 /*
  369  * tsm_ucs4_to_utf8(): Convert UCS4 character to UTF-8. This creates one of:
 370  *   0xxxxxxx
 371  *   110xxxxx 10xxxxxx
 372  *   1110xxxx 10xxxxxx 10xxxxxx
 373  *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 374  * This is based on the same function from "terminology" from the Enlightenment
 375  * project. See COPYING for more information.
 376  *
 377  * @txt must point to a 4 byte-buffer. A number between 0 and 4 is returned and
 378  * indicates how long the written UTF8 string is.
 379  *
 380  * Please note @g is a real UCS4 code and not a tsm_symbol_t object!
 381  *
 382  * Unicode symbols between 0xD800 and 0xDFFF are not assigned and reserved for
 383  * UTF16 compatibility. It is an error to encode them. Same applies to numbers
 384  * greater than 0x10FFFF, the range 0xFDD0-0xFDEF and codepoints ending with
 385  * 0xFFFF or 0xFFFE.
 386  */
 387
 388 SHL_EXPORT
 389 unsigned int tsm_ucs4_get_width(uint32_t ucs4)
 390 {
 391         int ret;
 392
 393         ret = mk_wcwidth(ucs4);
 394         if (ret <= 0)
 395                 return 0;
 396
 397         return ret;
 398 }
 399
 400 SHL_EXPORT
 401 size_t tsm_ucs4_to_utf8(uint32_t g, char *txt)
 402 {
 403         if (g >= 0xd800 && g <= 0xdfff)
 404                 return 0;
 405         if (g > 0x10ffff || (g & 0xffff) == 0xffff || (g & 0xffff) == 0xfffe)
 406                 return 0;
 407         if (g >= 0xfdd0 && g <= 0xfdef)
 408                 return 0;
 409
 410         if (g < (1 << 7)) {
 411                 txt[0] = g & 0x7f;
 412                 return 1;
 413         } else if (g < (1 << (5 + 6))) {
 414                 txt[0] = 0xc0 | ((g >> 6) & 0x1f);
 415                 txt[1] = 0x80 | ((g     ) & 0x3f);
 416                 return 2;
 417         } else if (g < (1 << (4 + 6 + 6))) {
 418                 txt[0] = 0xe0 | ((g >> 12) & 0x0f);
 419                 txt[1] = 0x80 | ((g >>  6) & 0x3f);
 420                 txt[2] = 0x80 | ((g      ) & 0x3f);
 421                 return 3;
 422         } else if (g < (1 << (3 + 6 + 6 + 6))) {
 423                 txt[0] = 0xf0 | ((g >> 18) & 0x07);
 424                 txt[1] = 0x80 | ((g >> 12) & 0x3f);
 425                 txt[2] = 0x80 | ((g >>  6) & 0x3f);
 426                 txt[3] = 0x80 | ((g      ) & 0x3f);
 427                 return 4;
 428         } else {
 429                 return 0;
 430         }
 431 }
 432
 433 SHL_EXPORT
 434 char *tsm_ucs4_to_utf8_alloc(const uint32_t *ucs4, size_t len, size_t *len_out)
 435 {
 436         char *val;
 437         size_t i, pos;
 438
 439         val = malloc(4 * len);
 440         if (!val)
 441                 return NULL;
 442
 443         pos = 0;
 444         for (i = 0; i < len; ++i)
 445                 pos += tsm_ucs4_to_utf8(ucs4[i], &val[pos]);
 446
 447         if (!pos) {
 448                 free(val);
 449                 return NULL;
 450         }
 451
 452         if (len_out)
 453                 *len_out = pos;
 454         return val;
 455 }
 456
 457 /*
 458  * UTF8 State Machine
 459  * This state machine parses UTF8 and converts it into a stream of Unicode
 460  * characters (UCS4 values). A state-machine is represented by a
 461  * "struct tsm_utf8_mach" object. It has no global state and all functions are
 462  * re-entrant if called with different state-machine objects.
 463  *
 464  * tsm_utf8_mach_new(): This creates a new state-machine and resets it to its
 465  * initial state. Returns 0 on success.
 466  *
  467  * tsm_utf8_mach_free(): This destroys a state-machine and frees all internally
 468  * allocated memory.
 469  *
 470  * tsm_utf8_mach_reset(): Reset a given state-machine to its initial state. This
 471  * is the same state the machine is in after it got created.
 472  *
  473  * tsm_utf8_mach_feed(): Feed one byte of the UTF8 input stream into the
 474  * state-machine. This function returns the new state of the state-machine after
 475  * this character has been parsed. If it is TSM_UTF8_ACCEPT or TSM_UTF8_REJECT,
 476  * then there is a pending UCS4 character that you should retrieve via
 477  * tsm_utf8_mach_get(). If it is TSM_UTF8_ACCEPT, then a character was
 478  * successfully parsed. If it is TSM_UTF8_REJECT, the input was invalid UTF8 and
  479  * some error recovery was tried or a replacement character was chosen. All
 480  * other states mean that the machine needs more input to parse the stream.
 481  *
 482  * tsm_utf8_mach_get(): Returns the last parsed character. It has no effect on
 483  * the state machine so you can call it multiple times.
 484  *
 485  * Internally, we use TSM_UTF8_START whenever the state-machine is reset. This
 486  * can be used to ignore the last read input or to simply reset the machine.
 487  * TSM_UTF8_EXPECT* is used to remember how many bytes are still to be read to
 488  * get a full UTF8 sequence.
 489  * If an error occurs during reading, we go to state TSM_UTF8_REJECT and the
 490  * user will read a replacement character. If further errors occur, we go to
 491  * state TSM_UTF8_START to avoid printing multiple replacement characters for a
 492  * single misinterpreted UTF8 sequence. However, under some circumstances it may
 493  * happen that we stay in TSM_UTF8_REJECT and a next replacement character is
 494  * returned.
 495  * It is difficult to decide how to interpret wrong input but this machine seems
 496  * to be quite good at deciding what to do. Generally, we prefer discarding or
 497  * replacing input instead of trying to decipher ASCII values from the invalid
 498  * data. This guarantees that we do not send wrong values to the terminal
 499  * emulator. Some might argue that an ASCII fallback would be better. However,
 500  * this means that we might send very weird escape-sequences to the VTE layer.
 501  * Especially with C1 codes applications can really break many terminal features
 502  * so we avoid any non-ASCII+non-UTF8 input to prevent this.
 503  */
 504
 505 struct tsm_utf8_mach {
 506         int state;
 507         uint32_t ch;
 508 };
 509
 510 SHL_EXPORT
 511 int tsm_utf8_mach_new(struct tsm_utf8_mach **out)
 512 {
 513         struct tsm_utf8_mach *mach;
 514
 515         if (!out)
 516                 return -EINVAL;
 517
 518         mach = malloc(sizeof(*mach));
 519         if (!mach)
 520                 return -ENOMEM;
 521
 522         memset(mach, 0, sizeof(*mach));
 523         mach->state = TSM_UTF8_START;
 524
 525         *out = mach;
 526         return 0;
 527 }
 528
 529 SHL_EXPORT
 530 void tsm_utf8_mach_free(struct tsm_utf8_mach *mach)
 531 {
 532         if (!mach)
 533                 return;
 534
 535         free(mach);
 536 }
 537
 538 SHL_EXPORT
 539 int tsm_utf8_mach_feed(struct tsm_utf8_mach *mach, char ci)
 540 {
 541         uint32_t c;
 542
 543         if (!mach)
 544                 return TSM_UTF8_START;
 545
 546         c = ci;
 547
 548         switch (mach->state) {
 549         case TSM_UTF8_START:
 550         case TSM_UTF8_ACCEPT:
 551         case TSM_UTF8_REJECT:
 552                 if (c == 0xC0 || c == 0xC1) {
 553                         /* overlong encoding for ASCII, reject */
 554                         mach->state = TSM_UTF8_REJECT;
 555                 } else if ((c & 0x80) == 0) {
 556                         /* single byte, accept */
 557                         mach->ch = c;
 558                         mach->state = TSM_UTF8_ACCEPT;
 559                 } else if ((c & 0xC0) == 0x80) {
 560                         /* parser out of sync, ignore byte */
 561                         mach->state = TSM_UTF8_START;
 562                 } else if ((c & 0xE0) == 0xC0) {
 563                         /* start of two byte sequence */
 564                         mach->ch = (c & 0x1F) << 6;
 565                         mach->state = TSM_UTF8_EXPECT1;
 566                 } else if ((c & 0xF0) == 0xE0) {
 567                         /* start of three byte sequence */
 568                         mach->ch = (c & 0x0F) << 12;
 569                         mach->state = TSM_UTF8_EXPECT2;
 570                 } else if ((c & 0xF8) == 0xF0) {
 571                         /* start of four byte sequence */
 572                         mach->ch = (c & 0x07) << 18;
 573                         mach->state = TSM_UTF8_EXPECT3;
 574                 } else {
 575                         /* overlong encoding, reject */
 576                         mach->state = TSM_UTF8_REJECT;
 577                 }
 578                 break;
 579         case TSM_UTF8_EXPECT3:
 580                 mach->ch |= (c & 0x3F) << 12;
 581                 if ((c & 0xC0) == 0x80)
 582                         mach->state = TSM_UTF8_EXPECT2;
 583                 else
 584                         mach->state = TSM_UTF8_REJECT;
 585                 break;
 586         case TSM_UTF8_EXPECT2:
 587                 mach->ch |= (c & 0x3F) << 6;
 588                 if ((c & 0xC0) == 0x80)
 589                         mach->state = TSM_UTF8_EXPECT1;
 590                 else
 591                         mach->state = TSM_UTF8_REJECT;
 592                 break;
 593         case TSM_UTF8_EXPECT1:
 594                 mach->ch |= c & 0x3F;
 595                 if ((c & 0xC0) == 0x80)
 596                         mach->state = TSM_UTF8_ACCEPT;
 597                 else
 598                         mach->state = TSM_UTF8_REJECT;
 599                 break;
 600         default:
 601                 mach->state = TSM_UTF8_REJECT;
 602                 break;
 603         }
 604
 605         return mach->state;
 606 }
 607
 608 SHL_EXPORT
 609 uint32_t tsm_utf8_mach_get(struct tsm_utf8_mach *mach)
 610 {
 611         if (!mach || mach->state != TSM_UTF8_ACCEPT)
 612                 return TSM_UCS4_REPLACEMENT;
 613
 614         return mach->ch;
 615 }
 616
 617 SHL_EXPORT
 618 void tsm_utf8_mach_reset(struct tsm_utf8_mach *mach)
 619 {
 620         if (!mach)
 621                 return;
 622
 623         mach->state = TSM_UTF8_START;
 624 }