2 * TSM - Unicode Handling
4 * Copyright (c) 2011 David Herrmann <dh.herrmann@googlemail.com>
5 * Copyright (c) 2011-2012 University of Tuebingen
7 * Permission is hereby granted, free of charge, to any person obtaining
8 * a copy of this software and associated documentation files
9 * (the "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sublicense, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
15 * The above copyright notice and this permission notice shall be included
16 * in all copies or substantial portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
22 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 * The tsm-utf8-state-machine is based on the wayland-compositor demos:
30 * Copyright © 2008 Kristian Høgsberg
32 * Permission to use, copy, modify, distribute, and sell this software and
33 * its documentation for any purpose is hereby granted without fee, provided
34 * that the above copyright notice appear in all copies and that both that
35 * copyright notice and this permission notice appear in supporting
36 * documentation, and that the name of the copyright holders not be used in
37 * advertising or publicity pertaining to distribution of the software
38 * without specific, written prior permission. The copyright holders make
39 * no representations about the suitability of this software for any
40 * purpose. It is provided "as is" without express or implied warranty.
42 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
43 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
44 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
45 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
46 * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
47 * CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
48 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * This implements several helpers for Unicode/UTF8/UCS4 input and output. See
54 * below for comments on each helper.
61 #include "external/wcwidth.h"
62 #include "shl_array.h"
63 #include "shl_hashtable.h"
65 #include "tsm_unicode.h"
68 * Unicode Symbol Handling
69 * The main goal of the tsm_symbol_* functions is to provide a datatype which
70 * can contain the representation of any printable character. This includes all
71 * basic Unicode characters but also combined characters.
72 * To avoid all the memory management we still represent a character as a single
73 * integer value (tsm_symbol_t) but internally we allocate a string which is
74 * represented by this value.
76 * A tsm_symbol_t is an integer which represents a single character point.
77 * For most Unicode characters this is simply the UCS4 representation. In fact,
78 * every UCS4 characters is a valid tsm_symbol_t object.
79 * However, Unicode standard allows combining marks. Therefore, some characters
80 * consists of more than one Unicode character.
81 * A global symbol-table provides all those combined characters as single
82 * integers. You simply create a valid base character and append your combining
83 * marks and the table will return a new valid tsm_symbol_t. It is no longer
84 * a valid UCS4 value, though. But no memory management is needed as all
85 * tsm_symbol_t objects are simple integers.
87 * The symbol table contains two-way
88 * references. The Hash Table contains all the symbols with the symbol ucs4
89 * string as key and the symbol ID as value.
90 * The index array contains the symbol ID as key and a pointer to the ucs4
91 * string as value. But the hash table owns the ucs4 string.
92 * This allows fast implementations of *_get() and *_append() without long
95 * When creating a new symbol, we simply return the UCS4 value as new symbol. We
96 * do not add it to our symbol table as it is only one character. However, if a
97 * character is appended to an existing symbol, we create a new ucs4 string and
98 * push the new symbol into the symbol table.
102 const tsm_symbol_t tsm_symbol_default = 0;
104 struct tsm_symbol_table {
107 struct shl_array *index;
108 struct shl_hashtable *symbols;
111 /* TODO: remove the default context */
112 static struct tsm_symbol_table *tsm_symbol_table_default;
114 static unsigned int hash_ucs4(const void *key)
116 unsigned int val = 5381;
118 const uint32_t *ucs4 = key;
121 while (ucs4[i] <= TSM_UCS4_MAX) {
122 val = val * 33 + ucs4[i];
129 static bool cmp_ucs4(const void *a, const void *b)
132 const uint32_t *v1, *v2;
139 if (v1[i] > TSM_UCS4_MAX && v2[i] > TSM_UCS4_MAX)
141 if (v1[i] > TSM_UCS4_MAX && v2[i] <= TSM_UCS4_MAX)
143 if (v1[i] <= TSM_UCS4_MAX && v2[i] > TSM_UCS4_MAX)
153 int tsm_symbol_table_new(struct tsm_symbol_table **out)
155 struct tsm_symbol_table *tbl;
157 static const uint32_t *val = NULL; /* we need a valid lvalue */
162 tbl = malloc(sizeof(*tbl));
165 memset(tbl, 0, sizeof(*tbl));
167 tbl->next_id = TSM_UCS4_MAX + 2;
169 ret = shl_array_new(&tbl->index, sizeof(uint32_t*), 4);
173 /* first entry is not used so add dummy */
174 shl_array_push(tbl->index, &val);
176 ret = shl_hashtable_new(&tbl->symbols, hash_ucs4, cmp_ucs4,
185 shl_array_free(tbl->index);
192 void tsm_symbol_table_ref(struct tsm_symbol_table *tbl)
194 if (!tbl || !tbl->ref)
201 void tsm_symbol_table_unref(struct tsm_symbol_table *tbl)
203 if (!tbl || !tbl->ref || --tbl->ref)
206 shl_hashtable_free(tbl->symbols);
207 shl_array_free(tbl->index);
212 tsm_symbol_t tsm_symbol_make(uint32_t ucs4)
214 if (ucs4 > TSM_UCS4_MAX)
221 * This decomposes a symbol into a ucs4 string and a size value. If \sym is a
222 * valid UCS4 character, this returns a pointer to \sym and writes 1 into \size.
223 * Therefore, the returned value may get destroyed if your \sym argument gets
225 * If \sym is a composed ucs4 string, then the returned value points into the
226 * hash table of the symbol table and lives as long as the symbol table does.
228 * This always returns a valid value. If an error happens, the default character
229 * is returned. If \size is NULL, then the size value is omitted.
232 const uint32_t *tsm_symbol_get(struct tsm_symbol_table *tbl,
233 tsm_symbol_t *sym, size_t *size)
238 if (*sym <= TSM_UCS4_MAX) {
245 tbl = tsm_symbol_table_default;
248 ret = tsm_symbol_table_new(&tbl);
252 return &tsm_symbol_default;
254 tsm_symbol_table_default = tbl;
257 idx = *sym - (TSM_UCS4_MAX + 1);
258 if (idx >= shl_array_get_length(tbl->index))
261 ucs4 = *SHL_ARRAY_AT(tbl->index, uint32_t*, idx);
266 return &tsm_symbol_default;
271 while (ucs4[*size] <= TSM_UCS4_MAX)
279 tsm_symbol_t tsm_symbol_append(struct tsm_symbol_table *tbl,
280 tsm_symbol_t sym, uint32_t ucs4)
282 uint32_t buf[TSM_UCS4_MAXLEN + 1], nsym, *nval;
290 tbl = tsm_symbol_table_default;
293 ret = tsm_symbol_table_new(&tbl);
296 tsm_symbol_table_default = tbl;
299 if (ucs4 > TSM_UCS4_MAX)
302 ptr = tsm_symbol_get(tbl, &sym, &s);
303 if (s >= TSM_UCS4_MAXLEN)
306 memcpy(buf, ptr, s * sizeof(uint32_t));
308 buf[s++] = TSM_UCS4_MAX + 1;
310 res = shl_hashtable_find(tbl->symbols, &tmp, buf);
312 return (uint32_t)(long)tmp;
314 nval = malloc(sizeof(uint32_t) * s);
318 memcpy(nval, buf, s * sizeof(uint32_t));
319 nsym = tbl->next_id + 1;
320 /* Out of IDs; we actually have 2 Billion IDs so this seems
321 * very unlikely but lets be safe here */
322 if (nsym <= tbl->next_id++)
325 ret = shl_hashtable_insert(tbl->symbols, nval, (void*)(long)nsym);
329 ret = shl_array_push(tbl->index, &nval);
336 shl_hashtable_remove(tbl->symbols, nval);
344 unsigned int tsm_symbol_get_width(struct tsm_symbol_table *tbl,
352 tbl = tsm_symbol_table_default;
355 ret = tsm_symbol_table_new(&tbl);
358 tsm_symbol_table_default = tbl;
361 ch = tsm_symbol_get(tbl, &sym, &len);
365 return tsm_ucs4_get_width(*ch);
369 * Convert UCS4 character to UTF-8. This creates one of:
372 * 1110xxxx 10xxxxxx 10xxxxxx
373 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
374 * This is based on the same function from "terminology" from the Enlightenment
375 * project. See COPYING for more information.
377 * @txt must point to a 4 byte-buffer. A number between 0 and 4 is returned and
378 * indicates how long the written UTF8 string is.
380 * Please note @g is a real UCS4 code and not a tsm_symbol_t object!
382 * Unicode symbols between 0xD800 and 0xDFFF are not assigned and reserved for
383 * UTF16 compatibility. It is an error to encode them. Same applies to numbers
384 * greater than 0x10FFFF, the range 0xFDD0-0xFDEF and codepoints ending with
389 unsigned int tsm_ucs4_get_width(uint32_t ucs4)
393 ret = mk_wcwidth(ucs4);
401 size_t tsm_ucs4_to_utf8(uint32_t g, char *txt)
403 if (g >= 0xd800 && g <= 0xdfff)
405 if (g > 0x10ffff || (g & 0xffff) == 0xffff || (g & 0xffff) == 0xfffe)
407 if (g >= 0xfdd0 && g <= 0xfdef)
413 } else if (g < (1 << (5 + 6))) {
414 txt[0] = 0xc0 | ((g >> 6) & 0x1f);
415 txt[1] = 0x80 | ((g ) & 0x3f);
417 } else if (g < (1 << (4 + 6 + 6))) {
418 txt[0] = 0xe0 | ((g >> 12) & 0x0f);
419 txt[1] = 0x80 | ((g >> 6) & 0x3f);
420 txt[2] = 0x80 | ((g ) & 0x3f);
422 } else if (g < (1 << (3 + 6 + 6 + 6))) {
423 txt[0] = 0xf0 | ((g >> 18) & 0x07);
424 txt[1] = 0x80 | ((g >> 12) & 0x3f);
425 txt[2] = 0x80 | ((g >> 6) & 0x3f);
426 txt[3] = 0x80 | ((g ) & 0x3f);
434 char *tsm_ucs4_to_utf8_alloc(const uint32_t *ucs4, size_t len, size_t *len_out)
439 val = malloc(4 * len);
444 for (i = 0; i < len; ++i)
445 pos += tsm_ucs4_to_utf8(ucs4[i], &val[pos]);
459 * This state machine parses UTF8 and converts it into a stream of Unicode
460 * characters (UCS4 values). A state-machine is represented by a
461 * "struct tsm_utf8_mach" object. It has no global state and all functions are
462 * re-entrant if called with different state-machine objects.
464 * tsm_utf8_mach_new(): This creates a new state-machine and resets it to its
465 * initial state. Returns 0 on success.
467 * tsm_uft8_mach_free(): This destroys a state-machine and frees all internally
470 * tsm_utf8_mach_reset(): Reset a given state-machine to its initial state. This
471 * is the same state the machine is in after it got created.
473 * tsm_uft8_mach_feed(): Feed one byte of the UTF8 input stream into the
474 * state-machine. This function returns the new state of the state-machine after
475 * this character has been parsed. If it is TSM_UTF8_ACCEPT or TSM_UTF8_REJECT,
476 * then there is a pending UCS4 character that you should retrieve via
477 * tsm_utf8_mach_get(). If it is TSM_UTF8_ACCEPT, then a character was
478 * successfully parsed. If it is TSM_UTF8_REJECT, the input was invalid UTF8 and
479 * some error recovery was tried or a replacement character was choosen. All
480 * other states mean that the machine needs more input to parse the stream.
482 * tsm_utf8_mach_get(): Returns the last parsed character. It has no effect on
483 * the state machine so you can call it multiple times.
485 * Internally, we use TSM_UTF8_START whenever the state-machine is reset. This
486 * can be used to ignore the last read input or to simply reset the machine.
487 * TSM_UTF8_EXPECT* is used to remember how many bytes are still to be read to
488 * get a full UTF8 sequence.
489 * If an error occurs during reading, we go to state TSM_UTF8_REJECT and the
490 * user will read a replacement character. If further errors occur, we go to
491 * state TSM_UTF8_START to avoid printing multiple replacement characters for a
492 * single misinterpreted UTF8 sequence. However, under some circumstances it may
493 * happen that we stay in TSM_UTF8_REJECT and a next replacement character is
495 * It is difficult to decide how to interpret wrong input but this machine seems
496 * to be quite good at deciding what to do. Generally, we prefer discarding or
497 * replacing input instead of trying to decipher ASCII values from the invalid
498 * data. This guarantees that we do not send wrong values to the terminal
499 * emulator. Some might argue that an ASCII fallback would be better. However,
500 * this means that we might send very weird escape-sequences to the VTE layer.
501 * Especially with C1 codes applications can really break many terminal features
502 * so we avoid any non-ASCII+non-UTF8 input to prevent this.
505 struct tsm_utf8_mach {
511 int tsm_utf8_mach_new(struct tsm_utf8_mach **out)
513 struct tsm_utf8_mach *mach;
518 mach = malloc(sizeof(*mach));
522 memset(mach, 0, sizeof(*mach));
523 mach->state = TSM_UTF8_START;
530 void tsm_utf8_mach_free(struct tsm_utf8_mach *mach)
539 int tsm_utf8_mach_feed(struct tsm_utf8_mach *mach, char ci)
544 return TSM_UTF8_START;
548 switch (mach->state) {
550 case TSM_UTF8_ACCEPT:
551 case TSM_UTF8_REJECT:
552 if (c == 0xC0 || c == 0xC1) {
553 /* overlong encoding for ASCII, reject */
554 mach->state = TSM_UTF8_REJECT;
555 } else if ((c & 0x80) == 0) {
556 /* single byte, accept */
558 mach->state = TSM_UTF8_ACCEPT;
559 } else if ((c & 0xC0) == 0x80) {
560 /* parser out of sync, ignore byte */
561 mach->state = TSM_UTF8_START;
562 } else if ((c & 0xE0) == 0xC0) {
563 /* start of two byte sequence */
564 mach->ch = (c & 0x1F) << 6;
565 mach->state = TSM_UTF8_EXPECT1;
566 } else if ((c & 0xF0) == 0xE0) {
567 /* start of three byte sequence */
568 mach->ch = (c & 0x0F) << 12;
569 mach->state = TSM_UTF8_EXPECT2;
570 } else if ((c & 0xF8) == 0xF0) {
571 /* start of four byte sequence */
572 mach->ch = (c & 0x07) << 18;
573 mach->state = TSM_UTF8_EXPECT3;
575 /* overlong encoding, reject */
576 mach->state = TSM_UTF8_REJECT;
579 case TSM_UTF8_EXPECT3:
580 mach->ch |= (c & 0x3F) << 12;
581 if ((c & 0xC0) == 0x80)
582 mach->state = TSM_UTF8_EXPECT2;
584 mach->state = TSM_UTF8_REJECT;
586 case TSM_UTF8_EXPECT2:
587 mach->ch |= (c & 0x3F) << 6;
588 if ((c & 0xC0) == 0x80)
589 mach->state = TSM_UTF8_EXPECT1;
591 mach->state = TSM_UTF8_REJECT;
593 case TSM_UTF8_EXPECT1:
594 mach->ch |= c & 0x3F;
595 if ((c & 0xC0) == 0x80)
596 mach->state = TSM_UTF8_ACCEPT;
598 mach->state = TSM_UTF8_REJECT;
601 mach->state = TSM_UTF8_REJECT;
609 uint32_t tsm_utf8_mach_get(struct tsm_utf8_mach *mach)
611 if (!mach || mach->state != TSM_UTF8_ACCEPT)
612 return TSM_UCS4_REPLACEMENT;
618 void tsm_utf8_mach_reset(struct tsm_utf8_mach *mach)
623 mach->state = TSM_UTF8_START;