* This software is in the public domain. Share and enjoy!
*
*/
-#include <stdint.h>
+
+#include "config.h"
+
#include "xkbcommon/xkbcommon.h"
#include "utils.h"
+#include "utf8.h"
+
+#define NO_KEYSYM_UNICODE_CONVERSION 0
+/* We don't use the uint32_t types here, to save some space. */
struct codepair {
- xkb_keysym_t keysym;
+ uint16_t keysym;
uint16_t ucs;
-} keysymtab[] = {
+};
+
+static const struct codepair keysymtab[] = {
{ 0x01a1, 0x0104 }, /* Aogonek Ą LATIN CAPITAL LETTER A WITH OGONEK */
{ 0x01a2, 0x02d8 }, /* breve ˘ BREVE */
{ 0x01a3, 0x0141 }, /* Lstroke Ł LATIN CAPITAL LETTER L WITH STROKE */
{ 0x0aa8, 0x200a }, /* hairspace HAIR SPACE */
{ 0x0aa9, 0x2014 }, /* emdash — EM DASH */
{ 0x0aaa, 0x2013 }, /* endash – EN DASH */
- /* 0x0aac signifblank ? ??? */
+ { 0x0aac, 0x2423 }, /* signifblank ␣ OPEN BOX */
{ 0x0aae, 0x2026 }, /* ellipsis … HORIZONTAL ELLIPSIS */
{ 0x0aaf, 0x2025 }, /* doubbaselinedot ‥ TWO DOT LEADER */
{ 0x0ab0, 0x2153 }, /* onethird ⅓ VULGAR FRACTION ONE THIRD */
{ 0x0ab7, 0x215a }, /* fivesixths ⅚ VULGAR FRACTION FIVE SIXTHS */
{ 0x0ab8, 0x2105 }, /* careof ℅ CARE OF */
{ 0x0abb, 0x2012 }, /* figdash ‒ FIGURE DASH */
- { 0x0abc, 0x2329 }, /* leftanglebracket 〈 LEFT-POINTING ANGLE BRACKET */
+ { 0x0abc, 0x27e8 }, /* leftanglebracket ⟨ MATHEMATICAL LEFT ANGLE BRACKET */
{ 0x0abd, 0x002e }, /* decimalpoint . FULL STOP */
- { 0x0abe, 0x232a }, /* rightanglebracket 〉 RIGHT-POINTING ANGLE BRACKET */
+ { 0x0abe, 0x27e9 }, /* rightanglebracket ⟩ MATHEMATICAL RIGHT ANGLE BRACKET */
/* 0x0abf marker ? ??? */
{ 0x0ac3, 0x215b }, /* oneeighth ⅛ VULGAR FRACTION ONE EIGHTH */
{ 0x0ac4, 0x215c }, /* threeeighths ⅜ VULGAR FRACTION THREE EIGHTHS */
{ 0x0ad2, 0x201c }, /* leftdoublequotemark “ LEFT DOUBLE QUOTATION MARK */
{ 0x0ad3, 0x201d }, /* rightdoublequotemark ” RIGHT DOUBLE QUOTATION MARK */
{ 0x0ad4, 0x211e }, /* prescription ℞ PRESCRIPTION TAKE */
+ { 0x0ad5, 0x2030 }, /* permille ‰ PER MILLE SIGN */
{ 0x0ad6, 0x2032 }, /* minutes ′ PRIME */
{ 0x0ad7, 0x2033 }, /* seconds ″ DOUBLE PRIME */
{ 0x0ad9, 0x271d }, /* latincross ✝ LATIN CROSS */
{ 0x0bd6, 0x222a }, /* downshoe ∪ UNION */
{ 0x0bd8, 0x2283 }, /* rightshoe ⊃ SUPERSET OF */
{ 0x0bda, 0x2282 }, /* leftshoe ⊂ SUBSET OF */
- { 0x0bdc, 0x22a2 }, /* lefttack ⊢ RIGHT TACK */
- { 0x0bfc, 0x22a3 }, /* righttack ⊣ LEFT TACK */
+ { 0x0bdc, 0x22a3 }, /* lefttack ⊣ LEFT TACK */
+ { 0x0bfc, 0x22a2 }, /* righttack ⊢ RIGHT TACK */
{ 0x0cdf, 0x2017 }, /* hebrew_doublelowline ‗ DOUBLE LOW LINE */
{ 0x0ce0, 0x05d0 }, /* hebrew_aleph א HEBREW LETTER ALEF */
{ 0x0ce1, 0x05d1 }, /* hebrew_bet ב HEBREW LETTER BET */
{ 0x0ef0, 0x3171 }, /* Hangul_SunkyeongeumMieum ㅱ HANGUL LETTER KAPYEOUNMIEUM */
{ 0x0ef1, 0x3178 }, /* Hangul_SunkyeongeumPieub ㅸ HANGUL LETTER KAPYEOUNPIEUP */
{ 0x0ef2, 0x317f }, /* Hangul_PanSios ㅿ HANGUL LETTER PANSIOS */
-/* 0x0ef3 Hangul_KkogjiDalrinIeung ? ??? */
+ { 0x0ef3, 0x3181 }, /* Hangul_KkogjiDalrinIeung ㆁ HANGUL LETTER YESIEUNG */
{ 0x0ef4, 0x3184 }, /* Hangul_SunkyeongeumPhieuf ㆄ HANGUL LETTER KAPYEOUNPHIEUPH */
{ 0x0ef5, 0x3186 }, /* Hangul_YeorinHieuh ㆆ HANGUL LETTER YEORINHIEUH */
{ 0x0ef6, 0x318d }, /* Hangul_AraeA ㆍ HANGUL LETTER ARAEA */
{ 0x0ef9, 0x11f0 }, /* Hangul_J_KkogjiDalrinIeung ᇰ HANGUL JONGSEONG YESIEUNG */
{ 0x0efa, 0x11f9 }, /* Hangul_J_YeorinHieuh ᇹ HANGUL JONGSEONG YEORINHIEUH */
{ 0x0eff, 0x20a9 }, /* Korean_Won ₩ WON SIGN */
- { 0x13a4, 0x20ac }, /* Euro € EURO SIGN */
{ 0x13bc, 0x0152 }, /* OE Œ LATIN CAPITAL LIGATURE OE */
{ 0x13bd, 0x0153 }, /* oe œ LATIN SMALL LIGATURE OE */
{ 0x13be, 0x0178 }, /* Ydiaeresis Ÿ LATIN CAPITAL LETTER Y WITH DIAERESIS */
- { 0x20a0, 0x20a0 }, /* EcuSign ₠ EURO-CURRENCY SIGN */
- { 0x20a1, 0x20a1 }, /* ColonSign ₡ COLON SIGN */
- { 0x20a2, 0x20a2 }, /* CruzeiroSign ₢ CRUZEIRO SIGN */
- { 0x20a3, 0x20a3 }, /* FFrancSign ₣ FRENCH FRANC SIGN */
- { 0x20a4, 0x20a4 }, /* LiraSign ₤ LIRA SIGN */
- { 0x20a5, 0x20a5 }, /* MillSign ₥ MILL SIGN */
- { 0x20a6, 0x20a6 }, /* NairaSign ₦ NAIRA SIGN */
- { 0x20a7, 0x20a7 }, /* PesetaSign ₧ PESETA SIGN */
- { 0x20a8, 0x20a8 }, /* RupeeSign ₨ RUPEE SIGN */
- { 0x20a9, 0x20a9 }, /* WonSign ₩ WON SIGN */
- { 0x20aa, 0x20aa }, /* NewSheqelSign ₪ NEW SHEQEL SIGN */
- { 0x20ab, 0x20ab }, /* DongSign ₫ DONG SIGN */
{ 0x20ac, 0x20ac }, /* EuroSign € EURO SIGN */
-
- { 0xff80, 0x0020 }, /* KP_Space SPACE */
- { 0xffaa, 0x002a }, /* KP_Multiply * ASTERISK */
- { 0xffab, 0x002b }, /* KP_Plus + PLUS SIGN */
- /* XXX: It's debatable what KP_Separator and KP_Decimal should represent,
- * as well as locale-specific. So just enforce English colonial
- * hegemony on the world for the time being. */
- { 0xffac, 0x002e }, /* KP_Separator . FULL STOP */
- { 0xffad, 0x002d }, /* KP_Subtract - HYPHEN-MINUS */
- { 0xffae, 0x002e }, /* KP_Decimal . FULL STOP */
- { 0xffaf, 0x002f }, /* KP_Divide / SOLIDUS */
- { 0xffbd, 0x003d }, /* KP_Equal = EQUAL SIGN */
};
-_X_EXPORT uint32_t
-xkb_keysym_to_utf32(xkb_keysym_t keysym)
+/* binary search with range check */
+static uint32_t
+bin_search(const struct codepair *table, size_t length, xkb_keysym_t keysym)
{
- int min = 0;
- int max = sizeof(keysymtab) / sizeof(struct codepair) - 1;
- int mid;
+ size_t first = 0;
+ size_t last = length;
+
+ if (keysym < table[0].keysym || keysym > table[length].keysym)
+ return 0;
+ /* binary search in table */
+ while (last >= first) {
+ size_t mid = (first + last) / 2;
+ if (table[mid].keysym < keysym)
+ first = mid + 1;
+ else if (table[mid].keysym > keysym)
+ last = mid - 1;
+ else /* found it */
+ return table[mid].ucs;
+ }
+
+ /* no matching Unicode value found in table */
+ return NO_KEYSYM_UNICODE_CONVERSION;
+}
+
+XKB_EXPORT uint32_t
+xkb_keysym_to_utf32(xkb_keysym_t keysym)
+{
/* first check for Latin-1 characters (1:1 mapping) */
if ((keysym >= 0x0020 && keysym <= 0x007e) ||
(keysym >= 0x00a0 && keysym <= 0x00ff))
return keysym;
- if (keysym >= 0xffb0 && keysym <= 0xffb9)
- return keysym - (0xffb0 - 0x0030);
+ /* patch encoding botch */
+ if (keysym == XKB_KEY_KP_Space)
+ return XKB_KEY_space & 0x7f;
- /* also check for directly encoded 24-bit UCS characters */
- if ((keysym & 0xff000000) == 0x01000000)
- return keysym & 0x00ffffff;
+ /* special keysyms */
+ if ((keysym >= XKB_KEY_BackSpace && keysym <= XKB_KEY_Clear) ||
+ (keysym >= XKB_KEY_KP_Multiply && keysym <= XKB_KEY_KP_9) ||
+ keysym == XKB_KEY_Return || keysym == XKB_KEY_Escape ||
+ keysym == XKB_KEY_Delete || keysym == XKB_KEY_KP_Tab ||
+ keysym == XKB_KEY_KP_Enter || keysym == XKB_KEY_KP_Equal)
+ return keysym & 0x7f;
- /* binary search in table */
- while (max >= min) {
- mid = (min + max) / 2;
- if (keysymtab[mid].keysym < keysym)
- min = mid + 1;
- else if (keysymtab[mid].keysym > keysym)
- max = mid - 1;
- else /* found it */
- return keysymtab[mid].ucs;
- }
+ /* also check for directly encoded Unicode codepoints */
+
+ /* Exclude surrogates: they are invalid in UTF-32.
+ * See https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G28875
+ * for further details.
+ */
+ if (0x0100d800 <= keysym && keysym <= 0x0100dfff)
+ return NO_KEYSYM_UNICODE_CONVERSION;
+ /*
+ * In theory, this is supposed to start from 0x100100, such that the ASCII
+ * range, which is already covered by 0x00-0xff, can't be encoded in two
+ * ways. However, changing this after a couple of decades probably won't
+ * go well, so it stays as it is.
+ */
+ if (0x01000000 <= keysym && keysym <= 0x0110ffff)
+ return keysym - 0x01000000;
- /* no matching Unicode value found */
- return 0;
+ /* search main table */
+ return bin_search(keysymtab, ARRAY_SIZE(keysymtab) - 1, keysym);
+}
+
+XKB_EXPORT xkb_keysym_t
+xkb_utf32_to_keysym(uint32_t ucs)
+{
+ /* first check for Latin-1 characters (1:1 mapping) */
+ if ((ucs >= 0x0020 && ucs <= 0x007e) ||
+ (ucs >= 0x00a0 && ucs <= 0x00ff))
+ return ucs;
+
+ /* special keysyms */
+ if ((ucs >= (XKB_KEY_BackSpace & 0x7f) && ucs <= (XKB_KEY_Clear & 0x7f)) ||
+ ucs == (XKB_KEY_Return & 0x7f) || ucs == (XKB_KEY_Escape & 0x7f))
+ return ucs | 0xff00;
+ if (ucs == (XKB_KEY_Delete & 0x7f))
+ return XKB_KEY_Delete;
+
+ /* Unicode non-symbols and code points outside Unicode planes */
+ if ((ucs >= 0xd800 && ucs <= 0xdfff) ||
+ (ucs >= 0xfdd0 && ucs <= 0xfdef) ||
+ ucs > 0x10ffff || (ucs & 0xfffe) == 0xfffe)
+ return XKB_KEY_NoSymbol;
+
+ /* search main table */
+ for (size_t i = 0; i < ARRAY_SIZE(keysymtab); i++)
+ if (keysymtab[i].ucs == ucs)
+ return keysymtab[i].keysym;
+
+ /* Use direct encoding if everything else fails */
+ return ucs | 0x01000000;
}
/*
* Author: Rob Bradford <rob@linux.intel.com>
*/
-static int
-utf32_to_utf8(uint32_t unichar, char *buffer)
-{
- int count, shift, length;
- uint8_t head;
-
- if (unichar <= 0x007f) {
- buffer[0] = unichar;
- buffer[1] = '\0';
- return 2;
- }
- else if (unichar <= 0x07FF) {
- length = 2;
- head = 0xc0;
- }
- else if (unichar <= 0xffff) {
- length = 3;
- head = 0xe0;
- }
- else if (unichar <= 0x1fffff) {
- length = 4;
- head = 0xf0;
- }
- else if (unichar <= 0x3ffffff) {
- length = 5;
- head = 0xf8;
- }
- else {
- length = 6;
- head = 0xfc;
- }
-
- for (count = length - 1, shift = 0; count > 0; count--, shift += 6)
- buffer[count] = 0x80 | ((unichar >> shift) & 0x3f);
-
- buffer[0] = head | ((unichar >> shift) & 0x3f);
- buffer[length] = '\0';
-
- return length + 1;
-}
-
-_X_EXPORT int
+XKB_EXPORT int
xkb_keysym_to_utf8(xkb_keysym_t keysym, char *buffer, size_t size)
{
uint32_t codepoint;
codepoint = xkb_keysym_to_utf32(keysym);
- if (codepoint == 0)
+ if (codepoint == NO_KEYSYM_UNICODE_CONVERSION)
return 0;
return utf32_to_utf8(codepoint, buffer);