keysym-utf: reject out-of-range Unicode codepoints in xkb_keysym_to_utf{8,32}

author Ran Benita <ran234@gmail.com>

Sat, 23 Jun 2018 19:00:19 +0000 (22:00 +0300)

committer Ran Benita <ran234@gmail.com>

Sat, 23 Jun 2018 19:53:42 +0000 (22:53 +0300)
author Ran Benita <ran234@gmail.com>
Sat, 23 Jun 2018 19:00:19 +0000 (22:00 +0300)
committer Ran Benita <ran234@gmail.com>
Sat, 23 Jun 2018 19:53:42 +0000 (22:53 +0300)
diff --git a/src/keysym-utf.c b/src/keysym-utf.c

index 1ccfc0e..c0e76f5 100644 (file)
--- a/src/keysym-utf.c
+++ b/src/keysym-utf.c
@@ -881,9 +881,15 @@ xkb_keysym_to_utf32(xkb_keysym_t keysym)
          keysym == XKB_KEY_KP_Enter || keysym == XKB_KEY_KP_Equal)
          return keysym & 0x7f;
  
-    /* also check for directly encoded 24-bit UCS characters */
-    if ((keysym & 0xff000000) == 0x01000000)
-        return keysym & 0x00ffffff;
+    /* also check for directly encoded Unicode codepoints */
+    /*
+     * In theory, this is supposed to start from 0x01000100, such that the
+     * Latin-1 range, which is already covered by 0x00-0xff, can't be encoded
+     * in two ways. However, changing this after a couple of decades probably
+     * won't go well, so it stays as it is.
+     */
+    if (0x01000000 <= keysym && keysym <= 0x0110ffff)
+        return keysym - 0x01000000;
  
      /* search main table */
      return bin_search(keysymtab, ARRAY_SIZE(keysymtab) - 1, keysym);
diff --git a/src/utf8.c b/src/utf8.c

index a7fa82e..a76b001 100644 (file)
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -49,17 +49,13 @@ utf32_to_utf8(uint32_t unichar, char *buffer)
          length = 3;
          head = 0xe0;
      }
-    else if (unichar <= 0x1fffff) {
+    else if (unichar <= 0x10ffff) {
          length = 4;
          head = 0xf0;
      }
-    else if (unichar <= 0x3ffffff) {
-        length = 5;
-        head = 0xf8;
-    }
      else {
-        length = 6;
-        head = 0xfc;
+        buffer[0] = '\0';
+        return 0;
      }
  
      for (count = length - 1, shift = 0; count > 0; count--, shift += 6)
diff --git a/test/keysym.c b/test/keysym.c

index 4414523..e5347dd 100644 (file)
--- a/test/keysym.c
+++ b/test/keysym.c
@@ -79,6 +79,7 @@ test_utf8(xkb_keysym_t keysym, const char *expected)
      fprintf(stderr, "Received keysym %#x -> %s (%u bytes)\n\n", keysym, s,
              (unsigned) strlen(s));
  
+    assert(expected != NULL);
      return streq(s, expected);
  }
  
@@ -175,6 +176,10 @@ main(void)
      assert(test_utf8(XKB_KEY_KP_Multiply, "*"));
      assert(test_utf8(XKB_KEY_KP_Subtract, "-"));
  
+    assert(test_utf8(0x10005d0, "א"));
+    assert(test_utf8(0x110ffff, "\xf4\x8f\xbf\xbf"));
+    assert(test_utf8(0x1110000, NULL) == 0);
+
      assert(xkb_keysym_is_lower(XKB_KEY_a));
      assert(xkb_keysym_is_lower(XKB_KEY_Greek_lambda));
      assert(xkb_keysym_is_lower(xkb_keysym_from_name("U03b1", 0))); /* GREEK SMALL LETTER ALPHA */
diff --git a/test/utf8.c b/test/utf8.c

index 60673c1..1d1c073 100644 (file)
--- a/test/utf8.c
+++ b/test/utf8.c
@@ -25,8 +25,10 @@
  #include <inttypes.h>
  #include <stdbool.h>
  #include <stddef.h>
+#include <string.h>
  
  #include "utf8.h"
+#include "utils.h"
  
  #define VALID(lit) assert(is_valid_utf8(lit, sizeof(lit)-1))
  #define INVALID(lit) assert(!is_valid_utf8(lit, sizeof(lit)-1))
@@ -148,10 +150,34 @@ test_is_valid_utf8(void)
      /* INVALID("\xEF\xBF\xBF"); */
  }
  
+static void /* test helper: encode one code point and assert on both the returned length and the bytes written */
+check_utf32_to_utf8(uint32_t unichar, int expected_length, const char *expected) {
+    char buffer[7]; /* NOTE(review): looks sized for the pre-patch 6-byte worst case, presumably plus NUL; longest form is now 4 bytes */
+    int length;
+
+    length = utf32_to_utf8(unichar, buffer);
+
+    assert(length == expected_length); /* checks the encoder's return value */
+    assert(streq(buffer, expected)); /* streq comes from utils.h; presumably a C-string equality check */
+}
+
+static void /* one vector per encoded length, plus two values above the 0x10ffff limit */
+test_utf32_to_utf8(void)
+{
+    check_utf32_to_utf8(0x0, 2, ""); /* U+0000: buffer reads back as an empty string, yet the expected length is 2 */
+    check_utf32_to_utf8(0x40, 2, "\x40"); /* 1 encoded byte; expected lengths in this table are byte count + 1 */
+    check_utf32_to_utf8(0xA1, 3, "\xc2\xa1"); /* 2-byte sequence */
+    check_utf32_to_utf8(0x2701, 4, "\xe2\x9c\x81"); /* 3-byte sequence */
+    check_utf32_to_utf8(0x1f004, 5, "\xf0\x9f\x80\x84"); /* 4-byte sequence */
+    check_utf32_to_utf8(0x110000, 0, ""); /* first value past 0x10ffff: expect return 0 and an empty buffer */
+    check_utf32_to_utf8(0xffffffff, 0, ""); /* largest uint32_t value: expect the same rejection */
+}
+
int
main(void)
{
    test_is_valid_utf8();
+    test_utf32_to_utf8(); /* added by this patch: also run the utf32_to_utf8() encoder checks */
  
    return 0;
}
author	Ran Benita <ran234@gmail.com>
	Sat, 23 Jun 2018 19:00:19 +0000 (22:00 +0300)
committer	Ran Benita <ran234@gmail.com>
	Sat, 23 Jun 2018 19:53:42 +0000 (22:53 +0300)
src/keysym-utf.c		patch | blob | history
src/utf8.c		patch | blob | history
test/keysym.c		patch | blob | history
test/utf8.c		patch | blob | history