From 2bbaf7c7d1f2fecbdc3b2ed6cf34c63d117824bd Mon Sep 17 00:00:00 2001 From: Ran Benita Date: Sun, 9 Feb 2014 13:50:21 +0200 Subject: [PATCH] Add utf8.{c,h} for common UTF-8 util functions We need to validate some UTF-8, so this adds an is_valid_utf8() function, which is probably pretty slow but should work correctly. Signed-off-by: Ran Benita --- Makefile.am | 6 ++- src/keysym-utf.c | 42 +-------------- src/utf8.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++++ src/utf8.h | 36 +++++++++++++ test/.gitignore | 1 + test/utf8.c | 157 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 342 insertions(+), 42 deletions(-) create mode 100644 src/utf8.c create mode 100644 src/utf8.h create mode 100644 test/utf8.c diff --git a/Makefile.am b/Makefile.am index 38df147..ca5c3ff 100644 --- a/Makefile.am +++ b/Makefile.am @@ -76,6 +76,8 @@ libxkbcommon_la_SOURCES = \ src/state.c \ src/text.c \ src/text.h \ + src/utf8.c \ + src/utf8.h \ src/utils.c \ src/utils.h @@ -166,7 +168,8 @@ TESTS = \ test/stringcomp \ test/buffercomp \ test/log \ - test/atom + test/atom \ + test/utf8 check_PROGRAMS = \ test/rmlvo-to-kccgst \ test/print-compiled-keymap \ @@ -183,6 +186,7 @@ test_stringcomp_LDADD = $(TESTS_LDADD) test_buffercomp_LDADD = $(TESTS_LDADD) test_log_LDADD = $(TESTS_LDADD) test_atom_LDADD = $(TESTS_LDADD) +test_utf8_LDADD = $(TESTS_LDADD) test_rmlvo_to_kccgst_LDADD = $(TESTS_LDADD) test_print_compiled_keymap_LDADD = $(TESTS_LDADD) test_bench_key_proc_LDADD = $(TESTS_LDADD) -lrt diff --git a/src/keysym-utf.c b/src/keysym-utf.c index 9b6fbd7..ffe2cea 100644 --- a/src/keysym-utf.c +++ b/src/keysym-utf.c @@ -37,6 +37,7 @@ #include "xkbcommon/xkbcommon.h" #include "utils.h" +#include "utf8.h" /* We don't use the uint32_t types here, to save some space. 
*/ struct codepair { @@ -912,47 +913,6 @@ xkb_keysym_to_utf32(xkb_keysym_t keysym) * Author: Rob Bradford */ -static int -utf32_to_utf8(uint32_t unichar, char *buffer) -{ - int count, shift, length; - uint8_t head; - - if (unichar <= 0x007f) { - buffer[0] = unichar; - buffer[1] = '\0'; - return 2; - } - else if (unichar <= 0x07FF) { - length = 2; - head = 0xc0; - } - else if (unichar <= 0xffff) { - length = 3; - head = 0xe0; - } - else if (unichar <= 0x1fffff) { - length = 4; - head = 0xf0; - } - else if (unichar <= 0x3ffffff) { - length = 5; - head = 0xf8; - } - else { - length = 6; - head = 0xfc; - } - - for (count = length - 1, shift = 0; count > 0; count--, shift += 6) - buffer[count] = 0x80 | ((unichar >> shift) & 0x3f); - - buffer[0] = head | ((unichar >> shift) & 0x3f); - buffer[length] = '\0'; - - return length + 1; -} - XKB_EXPORT int xkb_keysym_to_utf8(xkb_keysym_t keysym, char *buffer, size_t size) { diff --git a/src/utf8.c b/src/utf8.c new file mode 100644 index 0000000..11382c8 --- /dev/null +++ b/src/utf8.c @@ -0,0 +1,142 @@ +/* + * Copyright © 2012 Intel Corporation + * Copyright © 2014 Ran Benita + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/**
 * Encode a Unicode code point as a NUL-terminated UTF-8 sequence.
 *
 * Only Unicode scalar values are encoded: surrogates (U+D800..U+DFFF) and
 * values above U+10FFFF have no well-formed UTF-8 representation, and
 * encoding them would produce exactly the byte sequences is_valid_utf8()
 * rejects.
 *
 * @param unichar Code point to encode.
 * @param buffer  Destination buffer; at most 5 bytes are written
 *                (4 encoded bytes plus the terminating NUL).
 *
 * @returns The number of bytes written to @p buffer including the
 * terminating NUL, or 0 if @p unichar cannot be encoded, in which case
 * @p buffer is set to the empty string.
 */
int
utf32_to_utf8(uint32_t unichar, char *buffer)
{
    int count, shift, length;
    uint8_t head;

    /* Not a scalar value: refuse rather than emit ill-formed UTF-8. */
    if (unichar > 0x10ffff || (unichar >= 0xd800 && unichar <= 0xdfff)) {
        buffer[0] = '\0';
        return 0;
    }

    if (unichar <= 0x007f) {
        buffer[0] = (char) unichar;
        buffer[1] = '\0';
        return 2;
    }
    else if (unichar <= 0x07ff) {
        length = 2;
        head = 0xc0;
    }
    else if (unichar <= 0xffff) {
        length = 3;
        head = 0xe0;
    }
    else {
        length = 4;
        head = 0xf0;
    }

    /* Fill the continuation bytes back to front, 6 payload bits each. */
    for (count = length - 1, shift = 0; count > 0; count--, shift += 6) {
        buffer[count] = (char) (0x80 | ((unichar >> shift) & 0x3f));
    }

    /* Whatever bits remain go into the lead byte, below the length marker. */
    buffer[0] = (char) (head | ((unichar >> shift) & 0x3f));
    buffer[length] = '\0';

    return length + 1;
}

/**
 * Check whether a byte string is well-formed UTF-8.
 *
 * Implements Table 3-7 ("Well-Formed UTF-8 Byte Sequences") of the Unicode
 * Standard: overlong encodings, surrogates, code points above U+10FFFF and
 * truncated sequences are all rejected.
 *
 * @param ss  Bytes to check; need not be NUL-terminated. Embedded NUL bytes
 *            are treated as ordinary ASCII.
 * @param len Number of bytes in @p ss. Nothing is read when @p len is 0.
 *
 * @returns true if all @p len bytes form well-formed UTF-8 (the empty
 * string included), false otherwise.
 */
bool
is_valid_utf8(const char *ss, size_t len)
{
    const uint8_t *s = (const uint8_t *) ss;
    size_t i = 0;

    while (i < len) {
        const uint8_t lead = s[i++];
        /*
         * Allowed range of the first continuation byte. It is narrowed for
         * the four lead bytes whose full range would admit overlong forms,
         * surrogates or code points above U+10FFFF.
         */
        uint8_t lo = 0x80;
        uint8_t hi = 0xBF;
        size_t tail_bytes;
        size_t k;

        if (lead <= 0x7F) {
            continue;
        }

        if (lead >= 0xC2 && lead <= 0xDF) {
            tail_bytes = 1;
        }
        else if (lead >= 0xE0 && lead <= 0xEF) {
            tail_bytes = 2;
            if (lead == 0xE0) {
                lo = 0xA0;      /* below this would be overlong */
            }
            else if (lead == 0xED) {
                hi = 0x9F;      /* above this would be a surrogate */
            }
        }
        else if (lead >= 0xF0 && lead <= 0xF4) {
            tail_bytes = 3;
            if (lead == 0xF0) {
                lo = 0x90;      /* below this would be overlong */
            }
            else if (lead == 0xF4) {
                hi = 0x8F;      /* above this would exceed U+10FFFF */
            }
        }
        else {
            /* Stray continuation byte, 0xC0/0xC1, or 0xF5..0xFF. */
            return false;
        }

        /* The sequence must not run past the end of the input. */
        if (len - i < tail_bytes) {
            return false;
        }

        if (s[i] < lo || s[i] > hi) {
            return false;
        }

        for (k = 1; k < tail_bytes; k++) {
            if (s[i + k] < 0x80 || s[i + k] > 0xBF) {
                return false;
            }
        }

        i += tail_bytes;
    }

    return true;
}
#ifndef XKBCOMMON_UTF8_H
#define XKBCOMMON_UTF8_H

/*
 * Pulled in here so the header compiles on its own: the prototypes below
 * use bool, size_t and uint32_t.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/**
 * Encode the code point @p unichar as NUL-terminated UTF-8 into @p buffer.
 *
 * Callers should provide a buffer of at least 7 bytes.
 *
 * @returns For a code point that is encoded, the number of bytes written,
 * counting the terminating NUL.
 */
int
utf32_to_utf8(uint32_t unichar, char *buffer);

/**
 * Check whether the @p len bytes starting at @p ss are well-formed UTF-8.
 *
 * The input does not need to be NUL-terminated. Overlong encodings,
 * surrogates, 5- and 6-byte forms and truncated sequences are rejected.
 *
 * @returns true if the bytes are well-formed UTF-8, false otherwise.
 */
bool
is_valid_utf8(const char *ss, size_t len);

#endif
/*
 * Assert that a string literal is (or is not) accepted by is_valid_utf8().
 * sizeof(lit) - 1 is used instead of strlen() so that literals containing
 * embedded NUL bytes are checked over their full length.
 */
#define VALID(lit) assert(is_valid_utf8(lit, sizeof(lit)-1))
#define INVALID(lit) assert(!is_valid_utf8(lit, sizeof(lit)-1))

/*
 * Exercise is_valid_utf8() with well-formed input, boundary code points,
 * stray and missing continuation bytes, overlong forms and surrogates.
 */
static void
test_is_valid_utf8(void)
{
    /*
     * Mostly taken from:
     * http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
     */

    VALID("ascii");
    VALID("\xCE\xBA\xE1\xBD\xB9\xCF\x83\xCE\xBC\xCE\xB5");

    VALID("");
    VALID("\x00");
    VALID("\x00\x00");

    /* First possible sequence of each length. */
    VALID("\x50");
    VALID("\xC2\x80");
    VALID("\xE0\xA0\x80");
    VALID("\xF0\x90\x80\x80");

    /* 5/6-byte continuations aren't allowed (unlike UTF-8-test). */
    INVALID("\xF8\x88\x80\x80\x80");
    INVALID("\xFC\x84\x80\x80\x80\x80");

    /* Last possible sequence of each length. */
    VALID("\x7F");
    VALID("\xDF\xBF");
    VALID("\xEF\xBF\xBF");
    /* U+1FFFFF is beyond U+10FFFF, so it is rejected (unlike UTF-8-test). */
    INVALID("\xF7\xBF\xBF\xBF");

    /* 5/6-byte continuations aren't allowed (unlike UTF-8-test). */
    INVALID("\xFB\xBF\xBF\xBF\xBF");
    INVALID("\xFD\xBF\xBF\xBF\xBF\xBF");

    /* Other boundary conditions. */
    VALID("\xED\x9F\xBF");
    VALID("\xEE\x80\x80");
    VALID("\xEF\xBF\xBD");
    VALID("\xF4\x8F\xBF\xBF");
    /* U+110000 is beyond U+10FFFF, so it is rejected (unlike UTF-8-test). */
    INVALID("\xF4\x90\x80\x80");

    /* Unexpected continuation bytes. */
    INVALID("\x80");
    INVALID("\xBF");
    INVALID("\x80\xBF");
    INVALID("\x80\xBF\x80");
    INVALID("\x80\xBF\x80\xBF");
    INVALID("\x80\xBF\x80\xBF\x80");
    INVALID("\x80\xBF\x80\xBF\x80\xBF");
    INVALID("\x80\xBF\x80\xBF\x80\xBF\x80");
    INVALID("\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F"
            "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F"
            "\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"
            "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF");

    /* Lonely lead bytes, each followed by a space. */
    INVALID("\xC0 \xC1 \xC2 \xC3 \xC4 \xC5 \xC6 \xC7 \xC8 \xC9 \xCA \xCB \xCC "
            "\xCD \xCE \xCF "
            "\xD0 \xD1 \xD2 \xD3 \xD4 \xD5 \xD6 \xD7 \xD8 \xD9 \xDA \xDB \xDC "
            "\xDD \xDE \xDF ");
    INVALID("\xE0 \xE1 \xE2 \xE3 \xE4 \xE5 \xE6 \xE7 \xE8 \xE9 \xEA \xEB \xEC "
            "\xED \xEE \xEF ");
    INVALID("\xF0 \xF1 \xF2 \xF3 \xF4 \xF5 \xF6 \xF7 ");
    INVALID("\xF8 \xF9 \xFA \xFB ");
    INVALID("\xFC \xFD ");

    /* Sequences with the last continuation byte missing. */
    INVALID("\xC0");
    INVALID("\xE0\x80");
    INVALID("\xF0\x80\x80");
    INVALID("\xF8\x80\x80\x80");
    INVALID("\xFC\x80\x80\x80\x80");
    INVALID("\xDF");
    INVALID("\xEF\xBF");
    INVALID("\xF7\xBF\xBF");
    INVALID("\xFB\xBF\xBF\xBF");
    INVALID("\xFD\xBF\xBF\xBF\xBF");

    /* All of the incomplete sequences above, concatenated. */
    INVALID("\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80"
            "\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF");

    /* Bytes that can never appear in UTF-8. */
    INVALID("\xFE");
    INVALID("\xFF");
    INVALID("\xFE\xFE\xFF\xFF");

    /* Overlong encodings of '/'. */
    INVALID("\xC0\xAF");
    INVALID("\xE0\x80\xAF");
    INVALID("\xF0\x80\x80\xAF");
    INVALID("\xF8\x80\x80\x80\xAF");
    INVALID("\xFC\x80\x80\x80\x80\xAF");

    /* Maximum overlong sequences. */
    INVALID("\xC1\xBF");
    INVALID("\xE0\x9F\xBF");
    INVALID("\xF0\x8F\xBF\xBF");
    INVALID("\xF8\x87\xBF\xBF\xBF");
    INVALID("\xFC\x83\xBF\xBF\xBF\xBF");

    /* Overlong encodings of NUL. */
    INVALID("\xC0\x80");
    INVALID("\xE0\x80\x80");
    INVALID("\xF0\x80\x80\x80");
    INVALID("\xF8\x80\x80\x80\x80");
    INVALID("\xFC\x80\x80\x80\x80\x80");

    /* Single UTF-16 surrogates. */
    INVALID("\xED\xA0\x80");
    INVALID("\xED\xAD\xBF");
    INVALID("\xED\xAE\x80");
    INVALID("\xED\xAF\xBF");
    INVALID("\xED\xB0\x80");
    INVALID("\xED\xBE\x80");
    INVALID("\xED\xBF\xBF");

    /* Paired UTF-16 surrogates. */
    INVALID("\xED\xA0\x80\xED\xB0\x80");
    INVALID("\xED\xA0\x80\xED\xBF\xBF");
    INVALID("\xED\xAD\xBF\xED\xB0\x80");
    INVALID("\xED\xAD\xBF\xED\xBF\xBF");
    INVALID("\xED\xAE\x80\xED\xB0\x80");
    INVALID("\xED\xAE\x80\xED\xBF\xBF");
    INVALID("\xED\xAF\xBF\xED\xB0\x80");
    INVALID("\xED\xAF\xBF\xED\xBF\xBF");

    /*
     * The noncharacters U+FFFE and U+FFFF are flagged as problematic by
     * UTF-8-test, but their encodings are well-formed UTF-8, so
     * is_valid_utf8() accepts them.
     */
    VALID("\xEF\xBF\xBE");
    VALID("\xEF\xBF\xBF");
}

int
main(void)
{
    test_is_valid_utf8();

    return 0;
}