Removed build dependency on xproto.

[platform/upstream/libxkbcommon.git] / src / keysym-utf.c
diff --git a/src/keysym-utf.c b/src/keysym-utf.c

index 99cbfcd..0bb9a4f 100644 (file)
--- a/src/keysym-utf.c
+++ b/src/keysym-utf.c
@@ -34,14 +34,22 @@
   * This software is in the public domain. Share and enjoy!
   *
   */
-#include <stdint.h>
+
+#include "config.h"
+
  #include "xkbcommon/xkbcommon.h"
  #include "utils.h"
+#include "utf8.h"
+
+#define NO_KEYSYM_UNICODE_CONVERSION 0
  
+/* We don't use the uint32_t types here, to save some space. */
  struct codepair {
-    xkb_keysym_t keysym;
+    uint16_t keysym;
      uint16_t ucs;
-} keysymtab[] = {
+};
+
+static const struct codepair keysymtab[] = {
      { 0x01a1, 0x0104 }, /*                     Aogonek Ą LATIN CAPITAL LETTER A WITH OGONEK */
      { 0x01a2, 0x02d8 }, /*                       breve ˘ BREVE */
      { 0x01a3, 0x0141 }, /*                     Lstroke Ł LATIN CAPITAL LETTER L WITH STROKE */
@@ -517,7 +525,7 @@ struct codepair {
      { 0x0aa8, 0x200a }, /*                   hairspace   HAIR SPACE */
      { 0x0aa9, 0x2014 }, /*                      emdash — EM DASH */
      { 0x0aaa, 0x2013 }, /*                      endash – EN DASH */
-    /*  0x0aac                               signifblank ? ??? */
+    { 0x0aac, 0x2423 }, /*                 signifblank ␣ OPEN BOX */
      { 0x0aae, 0x2026 }, /*                    ellipsis … HORIZONTAL ELLIPSIS */
      { 0x0aaf, 0x2025 }, /*             doubbaselinedot ‥ TWO DOT LEADER */
      { 0x0ab0, 0x2153 }, /*                    onethird ⅓ VULGAR FRACTION ONE THIRD */
@@ -530,9 +538,9 @@ struct codepair {
      { 0x0ab7, 0x215a }, /*                  fivesixths ⅚ VULGAR FRACTION FIVE SIXTHS */
      { 0x0ab8, 0x2105 }, /*                      careof ℅ CARE OF */
      { 0x0abb, 0x2012 }, /*                     figdash ‒ FIGURE DASH */
-    { 0x0abc, 0x2329 }, /*            leftanglebracket 〈 LEFT-POINTING ANGLE BRACKET */
+    { 0x0abc, 0x27e8 }, /*            leftanglebracket ⟨ MATHEMATICAL LEFT ANGLE BRACKET */
      { 0x0abd, 0x002e }, /*                decimalpoint . FULL STOP */
-    { 0x0abe, 0x232a }, /*           rightanglebracket 〉 RIGHT-POINTING ANGLE BRACKET */
+    { 0x0abe, 0x27e9 }, /*           rightanglebracket ⟩ MATHEMATICAL RIGHT ANGLE BRACKET */
      /*  0x0abf                                  marker ? ??? */
      { 0x0ac3, 0x215b }, /*                   oneeighth ⅛ VULGAR FRACTION ONE EIGHTH */
      { 0x0ac4, 0x215c }, /*                threeeighths ⅜ VULGAR FRACTION THREE EIGHTHS */
@@ -550,6 +558,7 @@ struct codepair {
      { 0x0ad2, 0x201c }, /*         leftdoublequotemark “ LEFT DOUBLE QUOTATION MARK */
      { 0x0ad3, 0x201d }, /*        rightdoublequotemark ” RIGHT DOUBLE QUOTATION MARK */
      { 0x0ad4, 0x211e }, /*                prescription ℞ PRESCRIPTION TAKE */
+    { 0x0ad5, 0x2030 }, /*                    permille ‰ PER MILLE SIGN */
      { 0x0ad6, 0x2032 }, /*                     minutes ′ PRIME */
      { 0x0ad7, 0x2033 }, /*                     seconds ″ DOUBLE PRIME */
      { 0x0ad9, 0x271d }, /*                  latincross ✝ LATIN CROSS */
@@ -607,8 +616,8 @@ struct codepair {
      { 0x0bd6, 0x222a }, /*                    downshoe ∪ UNION */
      { 0x0bd8, 0x2283 }, /*                   rightshoe ⊃ SUPERSET OF */
      { 0x0bda, 0x2282 }, /*                    leftshoe ⊂ SUBSET OF */
-    { 0x0bdc, 0x22a2 }, /*                    lefttack ⊢ RIGHT TACK */
-    { 0x0bfc, 0x22a3 }, /*                   righttack ⊣ LEFT TACK */
+    { 0x0bdc, 0x22a3 }, /*                    lefttack ⊣ LEFT TACK */
+    { 0x0bfc, 0x22a2 }, /*                   righttack ⊢ RIGHT TACK */
      { 0x0cdf, 0x2017 }, /*        hebrew_doublelowline ‗ DOUBLE LOW LINE */
      { 0x0ce0, 0x05d0 }, /*                hebrew_aleph א HEBREW LETTER ALEF */
      { 0x0ce1, 0x05d1 }, /*                  hebrew_bet ב HEBREW LETTER BET */
@@ -803,7 +812,7 @@ struct codepair {
      { 0x0ef0, 0x3171 }, /*    Hangul_SunkyeongeumMieum ㅱ HANGUL LETTER KAPYEOUNMIEUM */
      { 0x0ef1, 0x3178 }, /*    Hangul_SunkyeongeumPieub ㅸ HANGUL LETTER KAPYEOUNPIEUP */
      { 0x0ef2, 0x317f }, /*              Hangul_PanSios ㅿ HANGUL LETTER PANSIOS */
-/*  0x0ef3                  Hangul_KkogjiDalrinIeung ? ??? */
+    { 0x0ef3, 0x3181 }, /*    Hangul_KkogjiDalrinIeung ㆁ HANGUL LETTER YESIEUNG */
      { 0x0ef4, 0x3184 }, /*   Hangul_SunkyeongeumPhieuf ㆄ HANGUL LETTER KAPYEOUNPHIEUPH */
      { 0x0ef5, 0x3186 }, /*          Hangul_YeorinHieuh ㆆ HANGUL LETTER YEORINHIEUH */
      { 0x0ef6, 0x318d }, /*                Hangul_AraeA ㆍ HANGUL LETTER ARAEA */
@@ -812,69 +821,106 @@ struct codepair {
      { 0x0ef9, 0x11f0 }, /*  Hangul_J_KkogjiDalrinIeung ᇰ HANGUL JONGSEONG YESIEUNG */
      { 0x0efa, 0x11f9 }, /*        Hangul_J_YeorinHieuh ᇹ HANGUL JONGSEONG YEORINHIEUH */
      { 0x0eff, 0x20a9 }, /*                  Korean_Won ₩ WON SIGN */
-    { 0x13a4, 0x20ac }, /*                        Euro € EURO SIGN */
      { 0x13bc, 0x0152 }, /*                          OE Œ LATIN CAPITAL LIGATURE OE */
      { 0x13bd, 0x0153 }, /*                          oe œ LATIN SMALL LIGATURE OE */
      { 0x13be, 0x0178 }, /*                  Ydiaeresis Ÿ LATIN CAPITAL LETTER Y WITH DIAERESIS */
-    { 0x20a0, 0x20a0 }, /*                     EcuSign ₠ EURO-CURRENCY SIGN */
-    { 0x20a1, 0x20a1 }, /*                   ColonSign ₡ COLON SIGN */
-    { 0x20a2, 0x20a2 }, /*                CruzeiroSign ₢ CRUZEIRO SIGN */
-    { 0x20a3, 0x20a3 }, /*                  FFrancSign ₣ FRENCH FRANC SIGN */
-    { 0x20a4, 0x20a4 }, /*                    LiraSign ₤ LIRA SIGN */
-    { 0x20a5, 0x20a5 }, /*                    MillSign ₥ MILL SIGN */
-    { 0x20a6, 0x20a6 }, /*                   NairaSign ₦ NAIRA SIGN */
-    { 0x20a7, 0x20a7 }, /*                  PesetaSign ₧ PESETA SIGN */
-    { 0x20a8, 0x20a8 }, /*                   RupeeSign ₨ RUPEE SIGN */
-    { 0x20a9, 0x20a9 }, /*                     WonSign ₩ WON SIGN */
-    { 0x20aa, 0x20aa }, /*               NewSheqelSign ₪ NEW SHEQEL SIGN */
-    { 0x20ab, 0x20ab }, /*                    DongSign ₫ DONG SIGN */
      { 0x20ac, 0x20ac }, /*                    EuroSign € EURO SIGN */
-
-    { 0xff80, 0x0020 }, /*                    KP_Space   SPACE */
-    { 0xffaa, 0x002a }, /*                 KP_Multiply * ASTERISK */
-    { 0xffab, 0x002b }, /*                     KP_Plus + PLUS SIGN */
-    /* XXX: It's debatable what KP_Separator and KP_Decimal should represent,
-     *      as well as locale-specific.  So just enforce English colonial
-     *      hegemony on the world for the time being. */
-    { 0xffac, 0x002e }, /*                KP_Separator . FULL STOP */
-    { 0xffad, 0x002d }, /*                 KP_Subtract - HYPHEN-MINUS */
-    { 0xffae, 0x002e }, /*                  KP_Decimal . FULL STOP */
-    { 0xffaf, 0x002f }, /*                   KP_Divide / SOLIDUS */
-    { 0xffbd, 0x003d }, /*                    KP_Equal = EQUAL SIGN */
  };
  
-_X_EXPORT uint32_t
-xkb_keysym_to_utf32(xkb_keysym_t keysym)
+/* Binary search of table[0..length] for keysym; length is the index of the LAST entry (inclusive) and table must be sorted by ascending keysym. Returns the entry's ucs, or 0 (no conversion) if absent. */
+static uint32_t
+bin_search(const struct codepair *table, size_t length, xkb_keysym_t keysym)
  {
-    int min = 0;
-    int max = sizeof(keysymtab) / sizeof(struct codepair) - 1;
-    int mid;
+    size_t first = 0;
+    size_t last = length;
+
+    if (keysym < table[0].keysym  || keysym > table[length].keysym)
+        return 0;
  
+    /* binary search in table; the range check above guarantees keysym >= table[0].keysym, so "mid - 1" is never computed for mid == 0 (no size_t wrap-around) */
+    while (last >= first) {
+        size_t mid = (first + last) / 2;
+        if (table[mid].keysym < keysym)
+            first = mid + 1;
+        else if (table[mid].keysym > keysym)
+            last = mid - 1;
+        else /* found it */
+            return table[mid].ucs;
+    }
+
+    /* no matching Unicode value found in table */
+    return NO_KEYSYM_UNICODE_CONVERSION;
+}
+
+XKB_EXPORT uint32_t
+xkb_keysym_to_utf32(xkb_keysym_t keysym)
+{
      /* first check for Latin-1 characters (1:1 mapping) */
      if ((keysym >= 0x0020 && keysym <= 0x007e) ||
          (keysym >= 0x00a0 && keysym <= 0x00ff))
          return keysym;
  
-    if (keysym >= 0xffb0 && keysym <= 0xffb9)
-        return keysym - (0xffb0 - 0x0030);
+    /* KP_Space (0xff80) would mask to 0 under the "& 0x7f" rule used below, so map it to space explicitly */
+    if (keysym == XKB_KEY_KP_Space)
+        return XKB_KEY_space & 0x7f;
  
-    /* also check for directly encoded 24-bit UCS characters */
-    if ((keysym & 0xff000000) == 0x01000000)
-        return keysym & 0x00ffffff;
+    /* special keysyms: control keys and keypad symbols/digits, converted by keeping the low 7 bits */
+    if ((keysym >= XKB_KEY_BackSpace && keysym <= XKB_KEY_Clear) ||
+        (keysym >= XKB_KEY_KP_Multiply && keysym <= XKB_KEY_KP_9) ||
+        keysym == XKB_KEY_Return || keysym == XKB_KEY_Escape ||
+        keysym == XKB_KEY_Delete || keysym == XKB_KEY_KP_Tab ||
+        keysym == XKB_KEY_KP_Enter || keysym == XKB_KEY_KP_Equal)
+        return keysym & 0x7f;
  
-    /* binary search in table */
-    while (max >= min) {
-        mid = (min + max) / 2;
-        if (keysymtab[mid].keysym < keysym)
-            min = mid + 1;
-        else if (keysymtab[mid].keysym > keysym)
-            max = mid - 1;
-        else /* found it */
-            return keysymtab[mid].ucs;
-    }
+    /* also check for directly encoded Unicode codepoints */
+
+    /* Exclude surrogates: they are invalid in UTF-32.
+     * See https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G28875
+     * for further details.
+    */
+    if (0x0100d800 <= keysym && keysym <= 0x0100dfff)
+        return NO_KEYSYM_UNICODE_CONVERSION;
+    /*
+     * In theory, this is supposed to start from 0x01000100, such that the
+     * Latin-1 range, which is already covered by 0x00-0xff, can't be encoded
+     * in two ways. However, changing this after a couple of decades probably
+     * won't go well, so it stays as it is.
+     */
+    if (0x01000000 <= keysym && keysym <= 0x0110ffff)
+        return keysym - 0x01000000;
  
-    /* no matching Unicode value found */
-    return 0;
+    /* search main table; bin_search takes the index of the last entry, hence the "- 1" */
+    return bin_search(keysymtab, ARRAY_SIZE(keysymtab) - 1, keysym);
+}
+
+XKB_EXPORT xkb_keysym_t
+xkb_utf32_to_keysym(uint32_t ucs)
+{
+    /* first check for Latin-1 characters (1:1 mapping): these code points are their own keysyms */
+    if ((ucs >= 0x0020 && ucs <= 0x007e) ||
+        (ucs >= 0x00a0 && ucs <= 0x00ff))
+        return ucs;
+
+    /* special keysyms: control codes in BackSpace..Clear, Return and Escape map back by setting the 0xff00 bits */
+    if ((ucs >= (XKB_KEY_BackSpace & 0x7f) && ucs <= (XKB_KEY_Clear & 0x7f)) ||
+        ucs == (XKB_KEY_Return & 0x7f) || ucs == (XKB_KEY_Escape & 0x7f))
+        return ucs | 0xff00;
+    if (ucs == (XKB_KEY_Delete & 0x7f))
+        return XKB_KEY_Delete;
+
+    /* Surrogates, Unicode noncharacters and code points beyond U+10FFFF have no keysym */
+    if ((ucs >= 0xd800 && ucs <= 0xdfff) ||
+        (ucs >= 0xfdd0 && ucs <= 0xfdef) ||
+        ucs > 0x10ffff || (ucs & 0xfffe) == 0xfffe)
+        return XKB_KEY_NoSymbol;
+
+    /* search main table: keysymtab is ordered by keysym, not by ucs, so scan linearly; the first match wins */
+    for (size_t i = 0; i < ARRAY_SIZE(keysymtab); i++)
+        if (keysymtab[i].ucs == ucs)
+            return keysymtab[i].keysym;
+
+    /* Use direct encoding if everything else fails */
+    return ucs | 0x01000000;
  }
  
  /*
@@ -902,48 +948,7 @@ xkb_keysym_to_utf32(xkb_keysym_t keysym)
   * Author: Rob Bradford <rob@linux.intel.com>
   */
  
-static int
-utf32_to_utf8(uint32_t unichar, char *buffer)
-{
-    int count, shift, length;
-    uint8_t head;
-
-    if (unichar <= 0x007f) {
-        buffer[0] = unichar;
-        buffer[1] = '\0';
-        return 2;
-    }
-    else if (unichar <= 0x07FF) {
-        length = 2;
-        head = 0xc0;
-    }
-    else if (unichar <= 0xffff) {
-        length = 3;
-        head = 0xe0;
-    }
-    else if (unichar <= 0x1fffff) {
-        length = 4;
-        head = 0xf0;
-    }
-    else if (unichar <= 0x3ffffff) {
-        length = 5;
-        head = 0xf8;
-    }
-    else {
-        length = 6;
-        head = 0xfc;
-    }
-
-    for (count = length - 1, shift = 0; count > 0; count--, shift += 6)
-        buffer[count] = 0x80 | ((unichar >> shift) & 0x3f);
-
-    buffer[0] = head | ((unichar >> shift) & 0x3f);
-    buffer[length] = '\0';
-
-    return length + 1;
-}
-
-_X_EXPORT int
+XKB_EXPORT int
  xkb_keysym_to_utf8(xkb_keysym_t keysym, char *buffer, size_t size)
  {
      uint32_t codepoint;
@@ -953,7 +958,7 @@ xkb_keysym_to_utf8(xkb_keysym_t keysym, char *buffer, size_t size)
  
      codepoint = xkb_keysym_to_utf32(keysym);
  
-    if (codepoint == 0)
+    if (codepoint == NO_KEYSYM_UNICODE_CONVERSION)
          return 0;
  
      return utf32_to_utf8(codepoint, buffer);