keysym-utf: reject out-of-range Unicode codepoints in xkb_keysym_to_utf{8,32}
author Ran Benita <ran234@gmail.com>
Sat, 23 Jun 2018 19:00:19 +0000 (22:00 +0300)
committer Ran Benita <ran234@gmail.com>
Sat, 23 Jun 2018 19:53:42 +0000 (22:53 +0300)
UTF-8 used to be defined for inputs > 0x10FFFF, but nowadays that is the
maximum, and a codepoint is encoded in up to 4 bytes, not 6.

Fixes: https://github.com/xkbcommon/libxkbcommon/issues/58
Fixes: https://github.com/xkbcommon/libxkbcommon/issues/59
Reported-by: @andrecbarros
Signed-off-by: Ran Benita <ran234@gmail.com>
src/keysym-utf.c
src/utf8.c
test/keysym.c
test/utf8.c

index 1ccfc0e..c0e76f5 100644 (file)
@@ -881,9 +881,15 @@ xkb_keysym_to_utf32(xkb_keysym_t keysym)
         keysym == XKB_KEY_KP_Enter || keysym == XKB_KEY_KP_Equal)
         return keysym & 0x7f;
 
-    /* also check for directly encoded 24-bit UCS characters */
-    if ((keysym & 0xff000000) == 0x01000000)
-        return keysym & 0x00ffffff;
+    /* also check for directly encoded Unicode codepoints */
+    /*
+     * In theory, this is supposed to start from 0x01000100, such that the
+     * Latin-1 range, which is already covered by 0x00-0xff, can't be encoded
+     * in two ways. However, changing this after a couple of decades probably
+     * won't go well, so it stays as it is.
+     */
+    if (0x01000000 <= keysym && keysym <= 0x0110ffff)
+        return keysym - 0x01000000;
 
     /* search main table */
     return bin_search(keysymtab, ARRAY_SIZE(keysymtab) - 1, keysym);
index a7fa82e..a76b001 100644 (file)
@@ -49,17 +49,13 @@ utf32_to_utf8(uint32_t unichar, char *buffer)
         length = 3;
         head = 0xe0;
     }
-    else if (unichar <= 0x1fffff) {
+    else if (unichar <= 0x10ffff) {
         length = 4;
         head = 0xf0;
     }
-    else if (unichar <= 0x3ffffff) {
-        length = 5;
-        head = 0xf8;
-    }
     else {
-        length = 6;
-        head = 0xfc;
+        buffer[0] = '\0';
+        return 0;
     }
 
     for (count = length - 1, shift = 0; count > 0; count--, shift += 6)
index 4414523..e5347dd 100644 (file)
@@ -79,6 +79,7 @@ test_utf8(xkb_keysym_t keysym, const char *expected)
     fprintf(stderr, "Received keysym %#x -> %s (%u bytes)\n\n", keysym, s,
             (unsigned) strlen(s));
 
+    assert(expected != NULL);
     return streq(s, expected);
 }
 
@@ -175,6 +176,10 @@ main(void)
     assert(test_utf8(XKB_KEY_KP_Multiply, "*"));
     assert(test_utf8(XKB_KEY_KP_Subtract, "-"));
 
+    assert(test_utf8(0x10005d0, "א"));
+    assert(test_utf8(0x110ffff, "\xf4\x8f\xbf\xbf"));
+    assert(test_utf8(0x1110000, NULL) == 0);
+
     assert(xkb_keysym_is_lower(XKB_KEY_a));
     assert(xkb_keysym_is_lower(XKB_KEY_Greek_lambda));
     assert(xkb_keysym_is_lower(xkb_keysym_from_name("U03b1", 0))); /* GREEK SMALL LETTER ALPHA */
index 60673c1..1d1c073 100644 (file)
 #include <inttypes.h>
 #include <stdbool.h>
 #include <stddef.h>
+#include <string.h>
 
 #include "utf8.h"
+#include "utils.h"
 
 #define VALID(lit) assert(is_valid_utf8(lit, sizeof(lit)-1))
 #define INVALID(lit) assert(!is_valid_utf8(lit, sizeof(lit)-1))
@@ -148,10 +150,34 @@ test_is_valid_utf8(void)
     /* INVALID("\xEF\xBF\xBF"); */
 }
 
+/* Encode unichar; assert on the returned length and on the resulting string. */
+static void
+check_utf32_to_utf8(uint32_t unichar, int expected_length, const char *expected)
+{
+    char encoded[7];
+    const int written = utf32_to_utf8(unichar, encoded);
+
+    assert(written == expected_length);
+    assert(streq(encoded, expected));
+}
+
+/*
+ * Exercise utf32_to_utf8() with one input per expected return value (2..5
+ * for accepted codepoints, 0 for rejected ones), including both sides of
+ * the 0x10ffff upper bound.
+ */
+static void
+test_utf32_to_utf8(void)
+{
+    check_utf32_to_utf8(0x0, 2, "");
+    check_utf32_to_utf8(0x40, 2, "\x40");
+    check_utf32_to_utf8(0xA1, 3, "\xc2\xa1");
+    check_utf32_to_utf8(0x2701, 4, "\xe2\x9c\x81");
+    check_utf32_to_utf8(0x1f004, 5, "\xf0\x9f\x80\x84");
+    /* Last accepted codepoint, immediately followed by the first rejected one. */
+    check_utf32_to_utf8(0x10ffff, 5, "\xf4\x8f\xbf\xbf");
+    check_utf32_to_utf8(0x110000, 0, "");
+    check_utf32_to_utf8(0xffffffff, 0, "");
+}
+
+/* Test entry point: runs each test group in turn; checks are made via assert(). */
int
main(void)
{
     test_is_valid_utf8();
+    test_utf32_to_utf8();
 
     return 0;
}