Compose: add iterator API
[platform/upstream/libxkbcommon.git] / src / compose / parser.c
index a8458ac..e1b81de 100644 (file)
@@ -63,9 +63,6 @@ OR PERFORMANCE OF THIS SOFTWARE.
 #include "utf8.h"
 #include "parser.h"
 
-#define MAX_LHS_LEN 10
-#define MAX_INCLUDE_DEPTH 5
-
 /*
  * Grammar adapted from libX11/modules/im/ximcp/imLcPrs.c.
  * See also the XCompose(5) manpage.
@@ -126,18 +123,18 @@ lex(struct scanner *s, union lvalue *val)
 {
 skip_more_whitespace_and_comments:
     /* Skip spaces. */
-    while (is_space(peek(s)))
-        if (next(s) == '\n')
+    while (is_space(scanner_peek(s)))
+        if (scanner_next(s) == '\n')
             return TOK_END_OF_LINE;
 
     /* Skip comments. */
-    if (chr(s, '#')) {
-        skip_to_eol(s);
+    if (scanner_chr(s, '#')) {
+        scanner_skip_to_eol(s);
         goto skip_more_whitespace_and_comments;
     }
 
     /* See if we're done. */
-    if (eof(s)) return TOK_END_OF_FILE;
+    if (scanner_eof(s)) return TOK_END_OF_FILE;
 
     /* New token. */
     s->token_line = s->line;
@@ -145,14 +142,14 @@ skip_more_whitespace_and_comments:
     s->buf_pos = 0;
 
     /* LHS Keysym. */
-    if (chr(s, '<')) {
-        while (peek(s) != '>' && !eol(s) && !eof(s))
-            buf_append(s, next(s));
-        if (!chr(s, '>')) {
+    if (scanner_chr(s, '<')) {
+        while (scanner_peek(s) != '>' && !scanner_eol(s) && !scanner_eof(s))
+            scanner_buf_append(s, scanner_next(s));
+        if (!scanner_chr(s, '>')) {
             scanner_err(s, "unterminated keysym literal");
             return TOK_ERROR;
         }
-        if (!buf_append(s, '\0')) {
+        if (!scanner_buf_append(s, '\0')) {
             scanner_err(s, "keysym literal is too long");
             return TOK_ERROR;
         }
@@ -162,46 +159,46 @@ skip_more_whitespace_and_comments:
     }
 
     /* Colon. */
-    if (chr(s, ':'))
+    if (scanner_chr(s, ':'))
         return TOK_COLON;
-    if (chr(s, '!'))
+    if (scanner_chr(s, '!'))
         return TOK_BANG;
-    if (chr(s, '~'))
+    if (scanner_chr(s, '~'))
         return TOK_TILDE;
 
     /* String literal. */
-    if (chr(s, '\"')) {
-        while (!eof(s) && !eol(s) && peek(s) != '\"') {
-            if (chr(s, '\\')) {
+    if (scanner_chr(s, '\"')) {
+        while (!scanner_eof(s) && !scanner_eol(s) && scanner_peek(s) != '\"') {
+            if (scanner_chr(s, '\\')) {
                 uint8_t o;
-                if (chr(s, '\\')) {
-                    buf_append(s, '\\');
+                if (scanner_chr(s, '\\')) {
+                    scanner_buf_append(s, '\\');
                 }
-                else if (chr(s, '"')) {
-                    buf_append(s, '"');
+                else if (scanner_chr(s, '"')) {
+                    scanner_buf_append(s, '"');
                 }
-                else if (chr(s, 'x') || chr(s, 'X')) {
-                    if (hex(s, &o))
-                        buf_append(s, (char) o);
+                else if (scanner_chr(s, 'x') || scanner_chr(s, 'X')) {
+                    if (scanner_hex(s, &o))
+                        scanner_buf_append(s, (char) o);
                     else
                         scanner_warn(s, "illegal hexadecimal escape sequence in string literal");
                 }
-                else if (oct(s, &o)) {
-                    buf_append(s, (char) o);
+                else if (scanner_oct(s, &o)) {
+                    scanner_buf_append(s, (char) o);
                 }
                 else {
-                    scanner_warn(s, "unknown escape sequence (%c) in string literal", peek(s));
+                    scanner_warn(s, "unknown escape sequence (%c) in string literal", scanner_peek(s));
                     /* Ignore. */
                 }
             } else {
-                buf_append(s, next(s));
+                scanner_buf_append(s, scanner_next(s));
             }
         }
-        if (!chr(s, '\"')) {
+        if (!scanner_chr(s, '\"')) {
             scanner_err(s, "unterminated string literal");
             return TOK_ERROR;
         }
-        if (!buf_append(s, '\0')) {
+        if (!scanner_buf_append(s, '\0')) {
             scanner_err(s, "string literal is too long");
             return TOK_ERROR;
         }
@@ -215,11 +212,11 @@ skip_more_whitespace_and_comments:
     }
 
     /* Identifier or include. */
-    if (is_alpha(peek(s)) || peek(s) == '_') {
+    if (is_alpha(scanner_peek(s)) || scanner_peek(s) == '_') {
         s->buf_pos = 0;
-        while (is_alnum(peek(s)) || peek(s) == '_')
-            buf_append(s, next(s));
-        if (!buf_append(s, '\0')) {
+        while (is_alnum(scanner_peek(s)) || scanner_peek(s) == '_')
+            scanner_buf_append(s, scanner_next(s));
+        if (!scanner_buf_append(s, '\0')) {
             scanner_err(s, "identifier is too long");
             return TOK_ERROR;
         }
@@ -233,7 +230,7 @@ skip_more_whitespace_and_comments:
     }
 
     /* Discard rest of line. */
-    skip_to_eol(s);
+    scanner_skip_to_eol(s);
 
     scanner_err(s, "unrecognized token");
     return TOK_ERROR;
@@ -243,68 +240,68 @@ static enum rules_token
 lex_include_string(struct scanner *s, struct xkb_compose_table *table,
                    union lvalue *val_out)
 {
-    while (is_space(peek(s)))
-        if (next(s) == '\n')
+    while (is_space(scanner_peek(s)))
+        if (scanner_next(s) == '\n')
             return TOK_END_OF_LINE;
 
     s->token_line = s->line;
     s->token_column = s->column;
     s->buf_pos = 0;
 
-    if (!chr(s, '\"')) {
+    if (!scanner_chr(s, '\"')) {
         scanner_err(s, "include statement must be followed by a path");
         return TOK_ERROR;
     }
 
-    while (!eof(s) && !eol(s) && peek(s) != '\"') {
-        if (chr(s, '%')) {
-            if (chr(s, '%')) {
-                buf_append(s, '%');
+    while (!scanner_eof(s) && !scanner_eol(s) && scanner_peek(s) != '\"') {
+        if (scanner_chr(s, '%')) {
+            if (scanner_chr(s, '%')) {
+                scanner_buf_append(s, '%');
             }
-            else if (chr(s, 'H')) {
-                const char *home = secure_getenv("HOME");
+            else if (scanner_chr(s, 'H')) {
+                const char *home = xkb_context_getenv(table->ctx, "HOME");
                 if (!home) {
                     scanner_err(s, "%%H was used in an include statement, but the HOME environment variable is not set");
                     return TOK_ERROR;
                 }
-                if (!buf_appends(s, home)) {
+                if (!scanner_buf_appends(s, home)) {
                     scanner_err(s, "include path after expanding %%H is too long");
                     return TOK_ERROR;
                 }
             }
-            else if (chr(s, 'L')) {
-                char *path = get_locale_compose_file_path(table->locale);
+            else if (scanner_chr(s, 'L')) {
+                char *path = get_locale_compose_file_path(table->ctx, table->locale);
                 if (!path) {
                     scanner_err(s, "failed to expand %%L to the locale Compose file");
                     return TOK_ERROR;
                 }
-                if (!buf_appends(s, path)) {
+                if (!scanner_buf_appends(s, path)) {
                     free(path);
                     scanner_err(s, "include path after expanding %%L is too long");
                     return TOK_ERROR;
                 }
                 free(path);
             }
-            else if (chr(s, 'S')) {
-                const char *xlocaledir = get_xlocaledir_path();
-                if (!buf_appends(s, xlocaledir)) {
+            else if (scanner_chr(s, 'S')) {
+                const char *xlocaledir = get_xlocaledir_path(table->ctx);
+                if (!scanner_buf_appends(s, xlocaledir)) {
                     scanner_err(s, "include path after expanding %%S is too long");
                     return TOK_ERROR;
                 }
             }
             else {
-                scanner_err(s, "unknown %% format (%c) in include statement", peek(s));
+                scanner_err(s, "unknown %% format (%c) in include statement", scanner_peek(s));
                 return TOK_ERROR;
             }
         } else {
-            buf_append(s, next(s));
+            scanner_buf_append(s, scanner_next(s));
         }
     }
-    if (!chr(s, '\"')) {
+    if (!scanner_chr(s, '\"')) {
         scanner_err(s, "unterminated include statement");
         return TOK_ERROR;
     }
-    if (!buf_append(s, '\0')) {
+    if (!scanner_buf_append(s, '\0')) {
         scanner_err(s, "include path is too long");
         return TOK_ERROR;
     }
@@ -327,114 +324,108 @@ struct production {
     xkb_mod_mask_t mods;
 };
 
-static uint16_t
-add_node(struct xkb_compose_table *table, xkb_keysym_t keysym)
-{
-    struct compose_node new = {
-        .keysym = keysym,
-        .next = 0,
-        .is_leaf = true,
-    };
-    darray_append(table->nodes, new);
-    return darray_size(table->nodes) - 1;
-}
-
 static void
 add_production(struct xkb_compose_table *table, struct scanner *s,
                const struct production *production)
 {
-    unsigned lhs_pos;
-    uint16_t curr;
-    struct compose_node *node;
+    unsigned lhs_pos = 0;
+    uint32_t curr = darray_size(table->nodes) == 1 ? 0 : 1;
+    uint32_t *pptr = NULL;
+    struct compose_node *node = NULL;
 
-    if (darray_size(table->nodes) + 1 == MAX_COMPOSE_NODES)
+    /* Warn before potentially going over the limit, discard silently after. */
+    if (darray_size(table->nodes) + production->len + MAX_LHS_LEN > MAX_COMPOSE_NODES)
         scanner_warn(s, "too many sequences for one Compose file; will ignore further lines");
-    if (darray_size(table->nodes) >= MAX_COMPOSE_NODES)
+    if (darray_size(table->nodes) + production->len >= MAX_COMPOSE_NODES)
         return;
 
-    curr = 0;
-    node = &darray_item(table->nodes, curr);
-
     /*
-     * Insert the sequence to the trie, creating new nodes as needed.
+     * Insert the sequence to the ternary search tree, creating new nodes as
+     * needed.
      *
-     * TODO: This can be sped up a bit by first trying the path that the
-     * previous production took, and only then doing the linear search
-     * through the trie levels.  This will work because sequences in the
-     * Compose files are often clustered by a common prefix; especially
-     * in the 1st and 2nd keysyms, which is where the largest variation
-     * (thus, longest search) is.
+     * TODO: We insert in the order given, this means some inputs can create
+     * long O(n) chains, which results in total O(n^2) parsing time. We should
+     * ensure the tree is reasonably balanced somehow.
      */
-    for (lhs_pos = 0; lhs_pos < production->len; lhs_pos++) {
-        while (production->lhs[lhs_pos] != node->keysym) {
-            if (node->next == 0) {
-                uint16_t next = add_node(table, production->lhs[lhs_pos]);
-                /* Refetch since add_node could have realloc()ed. */
-                node = &darray_item(table->nodes, curr);
-                node->next = next;
+    while (true) {
+        const xkb_keysym_t keysym = production->lhs[lhs_pos];
+        const bool last = lhs_pos + 1 == production->len;
+
+        if (curr == 0) {
+            /*
+             * Create a new node and update the parent pointer to it.
+             * Update the pointer first because the append invalidates it.
+             */
+            struct compose_node new = {
+                .keysym = keysym,
+                .lokid = 0,
+                .hikid = 0,
+                .internal = {
+                    .eqkid = 0,
+                    .is_leaf = false,
+                },
+            };
+            curr = darray_size(table->nodes);
+            if (pptr != NULL) {
+                *pptr = curr;
+                pptr = NULL;
             }
-
-            curr = node->next;
-            node = &darray_item(table->nodes, curr);
+            darray_append(table->nodes, new);
         }
 
-        if (lhs_pos + 1 == production->len)
-            break;
+        node = &darray_item(table->nodes, curr);
 
-        if (node->is_leaf) {
-            if (node->leaf.utf8 != 0 ||
-                node->leaf.keysym != XKB_KEY_NoSymbol) {
+        if (keysym < node->keysym) {
+            pptr = &node->lokid;
+            curr = node->lokid;
+        } else if (keysym > node->keysym) {
+            pptr = &node->hikid;
+            curr = node->hikid;
+        } else if (!last) {
+            if (node->is_leaf) {
                 scanner_warn(s, "a sequence already exists which is a prefix of this sequence; overriding");
-                node->leaf.utf8 = 0;
-                node->leaf.keysym = XKB_KEY_NoSymbol;
+                node->internal.eqkid = 0;
+                node->internal.is_leaf = false;
             }
-
-            {
-                uint16_t successor = add_node(table, production->lhs[lhs_pos + 1]);
-                /* Refetch since add_node could have realloc()ed. */
-                node = &darray_item(table->nodes, curr);
-                node->is_leaf = false;
-                node->internal.successor = successor;
+            lhs_pos++;
+            pptr = &node->internal.eqkid;
+            curr = node->internal.eqkid;
+        } else {
+            if (node->is_leaf) {
+                bool same_string =
+                    (node->leaf.utf8 == 0 && !production->has_string) ||
+                    (
+                        node->leaf.utf8 != 0 && production->has_string &&
+                        streq(&darray_item(table->utf8, node->leaf.utf8),
+                              production->string)
+                    );
+                bool same_keysym =
+                    (node->leaf.keysym == XKB_KEY_NoSymbol && !production->has_keysym) ||
+                    (
+                        node->leaf.keysym != XKB_KEY_NoSymbol && production->has_keysym &&
+                        node->leaf.keysym == production->keysym
+                    );
+                if (same_string && same_keysym) {
+                    scanner_warn(s, "this compose sequence is a duplicate of another; skipping line");
+                    return;
+                } else {
+                    scanner_warn(s, "this compose sequence already exists; overriding");
+                }
+            } else if (node->internal.eqkid != 0) {
+                scanner_warn(s, "this compose sequence is a prefix of another; skipping line");
+                return;
+            }
+            node->is_leaf = true;
+            if (production->has_string) {
+                node->leaf.utf8 = darray_size(table->utf8);
+                darray_append_items(table->utf8, production->string,
+                                    strlen(production->string) + 1);
+            }
+            if (production->has_keysym) {
+                node->leaf.keysym = production->keysym;
             }
-        }
-
-        curr = node->internal.successor;
-        node = &darray_item(table->nodes, curr);
-    }
-
-    if (!node->is_leaf) {
-        scanner_warn(s, "this compose sequence is a prefix of another; skipping line");
-        return;
-    }
-
-    if (node->leaf.utf8 != 0 || node->leaf.keysym != XKB_KEY_NoSymbol) {
-        bool same_string =
-            (node->leaf.utf8 == 0 && !production->has_string) ||
-            (
-                node->leaf.utf8 != 0 && production->has_string &&
-                streq(&darray_item(table->utf8, node->leaf.utf8),
-                      production->string)
-            );
-        bool same_keysym =
-            (node->leaf.keysym == XKB_KEY_NoSymbol && !production->has_keysym) ||
-            (
-                node->leaf.keysym != XKB_KEY_NoSymbol && production->has_keysym &&
-                node->leaf.keysym == production->keysym
-            );
-        if (same_string && same_keysym) {
-            scanner_warn(s, "this compose sequence is a duplicate of another; skipping line");
             return;
         }
-        scanner_warn(s, "this compose sequence already exists; overriding");
-    }
-
-    if (production->has_string) {
-        node->leaf.utf8 = darray_size(table->utf8);
-        darray_append_items(table->utf8, production->string,
-                            strlen(production->string) + 1);
-    }
-    if (production->has_keysym) {
-        node->leaf.keysym = production->keysym;
     }
 }
 
@@ -733,7 +724,9 @@ parse_file(struct xkb_compose_table *table, FILE *file, const char *file_name)
 
     ok = map_file(file, &string, &size);
     if (!ok) {
-        log_err(table->ctx, "Couldn't read Compose file %s: %s\n",
+        log_err(table->ctx,
+                XKB_LOG_MESSAGE_NO_ID,
+                "Couldn't read Compose file %s: %s\n",
                 file_name, strerror(errno));
         return false;
     }