2 * jmt.c: Earley parser for lenses based on Jim/Mandelbaum transducers
4 * Copyright (C) 2009-2016 David Lutterkort
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * Author: David Lutterkort <lutter@redhat.com>
30 /* This is an implementation of the Earley parser described in the paper
31 * "Efficient Earley Parsing with Regular Right-hand Sides" by Trever Jim
32 * and Yitzhak Mandelbaum.
34 * Since we only deal in lenses, they are both terminals and nonterminals:
35 * recursive lenses (those with lens->recursive set) are nonterminals, and
36 * non-recursive lenses are terminals, with the exception of non-recursive
37 * lenses that match the empty word. Lenses are also the semantic actions
38 * attached to a SCAN or COMPLETE during recosntruction of the parse
39 * tree. Our SCAN makes sure that we only ever scan nonempty words - that
40 * means that for nonrecursive lenses which match the empty word (e.g., del
41 * [ \t]* "") we need to make sure we get a COMPLETE when the lens matched
44 * That is achieved by treating such a lens t as the construct (t|T) with
49 * Data structures for the Jim/Mandelbaum parser
52 #define IND_MAX UINT32_MAX
61 #define array_elem(arr, ind, typ) \
62 (((typ *) (arr).data) + ind)
64 #define array_for_each(i, arr) \
65 for(ind_t i = 0; i < (arr).used; i++)
67 #define array_each_elem(elt, arr, typ) \
68 for(typ *elt = (typ *) (arr).data + 0; \
69 elt - (typ *) (arr).data < (arr).used; \
72 static void array_init(struct array *arr, size_t elem_size) {
74 arr->elem_size = elem_size;
77 ATTRIBUTE_RETURN_CHECK
78 static int array_add(struct array *arr, ind_t *ind) {
79 if (arr->used >= arr->size) {
81 ind_t expand = arr->size;
84 r = mem_realloc_n(&(arr->data), arr->elem_size, arr->size + expand);
87 memset((char *) arr->data + arr->elem_size*arr->size, 0,
88 arr->elem_size * expand);
96 /* Insert a new entry into the array at index IND. Shift all entries from
97 * IND on back by one. */
98 ATTRIBUTE_RETURN_CHECK
99 static int array_insert(struct array *arr, ind_t ind) {
102 if (array_add(arr, &last) < 0)
108 memmove((char *) arr->data + arr->elem_size*(ind+1),
109 (char *) arr->data + arr->elem_size*ind,
110 arr->elem_size * (arr->used - ind - 1));
111 memset((char *) arr->data + arr->elem_size*ind, 0, arr->elem_size);
115 static void array_release(struct array *arr) {
118 arr->used = arr->size = 0;
122 static void array_remove(struct array *arr, ind_t ind) {
123 char *data = arr->data;
124 memmove(data + ind * arr->elem_size,
125 data + (ind + 1) * arr->elem_size,
126 (arr->used - ind - 1) * arr->elem_size);
130 ATTRIBUTE_RETURN_CHECK
131 static int array_join(struct array *dst, struct array *src) {
134 if (dst->elem_size != src->elem_size)
137 r = mem_realloc_n(&(dst->data), dst->elem_size,
138 dst->used + src->used);
142 memcpy(((char *) dst->data) + dst->used * dst->elem_size,
144 src->used * src->elem_size);
145 dst->used += src->used;
146 dst->size = dst->used;
150 /* Special lens indices - these don't reference lenses, but
151 * some pseudo actions. We stash them at the top of the range
152 * of IND_T, with LENS_MAX giving us the maximal lens index for
167 struct state *next; /* Linked list for memory management */
168 struct array trans; /* Array of struct trans */
169 ind_t nret; /* Number of returned lenses */
170 ind_t *ret; /* The returned lenses */
171 ind_t num; /* Counter for num of new states */
172 unsigned int reachable : 1;
173 unsigned int live : 1;
176 /* Sets of states used in determinizing the NFA to a DFA */
178 struct state *state; /* The new state in the DFA */
179 struct array set; /* Set of (struct state *) in the NFA, sorted */
182 /* For recursive lenses (nonterminals), the mapping of nonterminal to
183 * state. We also store nonrecursive lenses here; in that case, the state
190 /* A Jim/Mandelbaum transducer */
193 struct array lenses; /* Array of struct jmt_lens */
195 ind_t lens; /* The start symbol of the grammar */
201 R_COMPLETE = R_ROOT << 1,
202 R_PREDICT = R_COMPLETE << 1,
203 R_SCAN = R_PREDICT << 1
206 /* The reason an item was added for parse reconstruction; can be R_SCAN,
207 * R_COMPLETE, R_PREDICT or R_COMPLETE|R_PREDICT */
209 enum item_reason reason;
210 ind_t lens; /* R_COMPLETE, R_SCAN */
211 ind_t from_set; /* R_COMPLETE, R_PREDICT, R_SCAN */
212 ind_t from_item; /* R_COMPLETE, R_PREDICT, R_SCAN */
213 ind_t to_item; /* R_COMPLETE */
214 ind_t caller; /* number of a state */
218 /* The 'classical' Earley item (state, parent) */
221 /* Backlinks to why item was added */
235 struct item_set **sets;
238 #define for_each_item(it, set) \
239 array_each_elem(it, (set)->items, struct item)
241 #define for_each_trans(t, s) \
242 array_each_elem(t, (s)->trans, struct trans)
244 #define parse_state(parse, ind) \
245 array_elem(parse->states, ind, struct state)
247 static struct item *set_item(struct jmt_parse *parse, ind_t set,
249 ensure(parse->sets[set] != NULL, parse);
250 ensure(item < parse->sets[set]->items.used, parse);
251 return array_elem(parse->sets[set]->items, item, struct item);
256 static struct state *item_state(struct jmt_parse *parse, ind_t set,
258 return set_item(parse, set, item)->state;
261 static struct trans *state_trans(struct state *state, ind_t t) {
262 return array_elem(state->trans, t, struct trans);
265 static ind_t item_parent(struct jmt_parse *parse, ind_t set, ind_t item) {
266 return set_item(parse, set, item)->parent;
269 static bool is_return(const struct state *s) {
273 static struct lens *lens_of_parse(struct jmt_parse *parse, ind_t lens) {
274 return array_elem(parse->jmt->lenses, lens, struct jmt_lens)->lens;
282 * Manipulate the Earley graph. We denote edges in the graph as
283 * [j, (s,i)] -> [k, item_k] => [l, item_l]
284 * to indicate that we are adding item (s,i) to E_j and record that its
285 * child is the item with index item_k in E_k and its sibling is item
289 /* Add item (s, k) to E_j. Note that the item was caused by action reason
290 * using lens starting at from_item in E_{from_set}
292 * [j, (s,k)] -> [from_set, from_item] => [j, to_item]
294 static ind_t parse_add_item(struct jmt_parse *parse, ind_t j,
295 struct state *s, ind_t k,
296 enum item_reason reason, ind_t lens,
298 ind_t from_item, ind_t to_item,
302 struct item_set *set = parse->sets[j];
303 struct item *item = NULL;
304 ind_t result = IND_MAX;
306 ensure(from_item == EPS || from_item < parse->sets[from_set]->items.used,
308 ensure(to_item == EPS || to_item < parse->sets[j]->items.used,
312 r = ALLOC(parse->sets[j]);
313 ERR_NOMEM(r < 0, parse);
314 array_init(&parse->sets[j]->items, sizeof(struct item));
315 set = parse->sets[j];
318 for (ind_t i=0; i < set->items.used; i++) {
319 if (item_state(parse, j, i) == s
320 && item_parent(parse, j, i) == k) {
322 item = set_item(parse, j, i);
327 if (result == IND_MAX) {
328 r = array_add(&set->items, &result);
329 ERR_NOMEM(r < 0, parse);
331 item = set_item(parse, j, result);
336 for (ind_t i = 0; i < item->nlinks; i++) {
337 struct link *lnk = item->links + i;
338 if (lnk->reason == reason && lnk->lens == lens
339 && lnk->from_set == from_set && lnk->from_item == from_item
340 && lnk->to_item == to_item && lnk->caller == caller)
344 r = REALLOC_N(item->links, item->nlinks + 1);
345 ERR_NOMEM(r < 0, parse);
347 struct link *lnk = item->links+ item->nlinks;
350 lnk->reason = reason;
352 lnk->from_set = from_set;
353 lnk->from_item = from_item;
354 lnk->to_item = to_item;
355 lnk->caller = caller;
360 /* Add item (s, i) to set E_j and record that it was added because
361 * a parse of nonterminal lens, starting at itemk in E_k, was completed
362 * by item item in E_j
364 * [j, (s,i)] -> [j, item] => [k, itemk]
366 static void parse_add_complete(struct jmt_parse *parse, ind_t j,
367 struct state *s, ind_t i,
368 ind_t k, ind_t itemk,
371 parse_add_item(parse, j, s, i, R_COMPLETE, lens, k, itemk, item, IND_MAX);
374 /* Same as parse_add_complete, but mark the item also as a predict
376 * [j, (s,i)] -> [j, item] => [k, itemk]
378 static void parse_add_predict_complete(struct jmt_parse *parse, ind_t j,
379 struct state *s, ind_t i,
380 ind_t k, ind_t itemk,
382 ind_t item, ind_t caller) {
383 parse_add_item(parse, j, s, i, R_COMPLETE|R_PREDICT,
384 lens, k, itemk, item, caller);
387 /* Add item (s, j) to E_j and record that it was added because of a
388 * prediction from item from in E_j
390 static ind_t parse_add_predict(struct jmt_parse *parse, ind_t j,
391 struct state *s, ind_t from) {
393 ensure(from < parse->sets[j]->items.used, parse);
394 struct state *t = item_state(parse, j, from);
396 return parse_add_item(parse, j, s, j, R_PREDICT, EPS, j, from, EPS,
402 /* Add item (s,i) to E_j and record that it was added because of scanning
403 * with lens starting from item item in E_k.
405 * [j, (s,i)] -> [k, item]
407 static void parse_add_scan(struct jmt_parse *parse, ind_t j,
408 struct state *s, ind_t i,
409 ind_t lens, ind_t k, ind_t item) {
410 ensure(item < parse->sets[k]->items.used, parse);
412 parse_add_item(parse, j, s, i, R_SCAN, lens, k, item, EPS, IND_MAX);
418 static bool is_complete(const struct link *lnk) {
419 return lnk->reason & R_COMPLETE;
423 static bool is_predict(const struct link *lnk) {
424 return lnk->reason & R_PREDICT;
428 static bool is_scan(const struct link *lnk) {
429 return lnk->reason & R_SCAN;
433 static bool is_last_sibling(const struct link *lnk) {
434 if (is_complete(lnk))
436 return lnk->reason & (R_PREDICT|R_ROOT);
440 static bool returns(const struct state *s, ind_t l) {
441 for (ind_t i = 0; i < s->nret; i++)
447 static void state_add_return(struct jmt *jmt, struct state *s, ind_t l) {
450 if (s == NULL || returns(s, l))
453 r = REALLOC_N(s->ret, s->nret + 1);
454 ERR_NOMEM(r < 0, jmt);
461 static void state_merge_returns(struct jmt *jmt, struct state *dst,
462 const struct state *src) {
463 for (ind_t l = 0; l < src->nret; l++)
464 state_add_return(jmt, dst, src->ret[l]);
467 static void nncomplete(struct jmt_parse *parse, ind_t j,
468 struct state *t, ind_t k, ind_t item) {
470 for (ind_t itemk = 0; itemk < parse->sets[k]->items.used; itemk++) {
471 struct state *u = item_state(parse, k, itemk);
472 for_each_trans(y, u) {
473 if (returns(t, y->lens)) {
474 ind_t parent = item_parent(parse, k, itemk);
475 parse_add_complete(parse, j,
477 k, itemk, y->lens, item);
483 /* NCALLER for (t, i) in E_j, which has index item in E_j, and t -> s a
484 * call in the transducer and s a return. The item (s,j) has index pred in
487 static void ncaller(struct jmt_parse *parse, ind_t j, ind_t item,
488 struct state *t, ind_t i, struct state *s, ind_t pred) {
489 for_each_trans(u, t) {
490 if (returns(s, u->lens)) {
491 /* [j, (u->to, i)] -> [j, pred] => [j, item] */
492 parse_add_predict_complete(parse, j,
500 /* NCALLEE for (t, parent) in E_j, which has index item in E_j, and t -> s
501 * a call in the transducer and s a return. The item (s,j) has index pred
504 static void ncallee(struct jmt_parse *parse, ind_t j, ATTRIBUTE_UNUSED ind_t item,
505 ATTRIBUTE_UNUSED struct state *t, ATTRIBUTE_UNUSED ind_t parent, struct state *s, ind_t pred) {
506 for_each_trans(u, s) {
507 if (returns(s, u->lens)) {
508 /* [j, (u->to, j)] -> [j, item] => [j, item] */
509 parse_add_predict_complete(parse, j,
517 static struct jmt_parse *parse_init(struct jmt *jmt,
518 const char *text, size_t text_len) {
520 struct jmt_parse *parse;
523 ERR_NOMEM(r < 0, jmt);
526 parse->error = jmt->error;
528 parse->nsets = text_len + 1;
529 r = ALLOC_N(parse->sets, parse->nsets);
530 ERR_NOMEM(r < 0, jmt);
539 void jmt_free_parse(struct jmt_parse *parse) {
542 for (int i=0; i < parse->nsets; i++) {
543 struct item_set *set = parse->sets[i];
545 array_each_elem(x, set->items, struct item)
547 array_release(&set->items);
555 static struct state *lens_state(struct jmt *jmt, ind_t l);
557 static void flens(FILE *fp, ind_t l) {
559 fprintf(fp, "%c", 'S');
560 else if (l < 'S' - 'A')
561 fprintf(fp, "%c", 'A' + l - 1);
562 else if (l <= 'Z' - 'A')
563 fprintf(fp, "%c", 'A' + l);
565 fprintf(fp, "%u", l);
568 static void parse_dot_link(FILE *fp, struct jmt_parse *parse,
569 ind_t k, struct item *x, struct link *lnk) {
570 char *lens_label = NULL;
571 if (is_complete(lnk) || is_scan(lnk)) {
572 struct state *sA = lens_state(parse->jmt, lnk->lens);
576 r = xasprintf(&lens_label, "<%d>", lnk->lens);
578 r = xasprintf(&lens_label, "%d", lnk->lens);
580 fprintf(fp, "// Internal error generating lens_label\n");
584 fprintf(fp, " n%d_%d_%d [ label = \"(%d, %d)\"];\n",
585 k, x->state->num, x->parent, x->state->num, x->parent);
586 if (is_complete(lnk)) {
587 struct item *y = set_item(parse, k, lnk->to_item);
588 const char *pred = is_predict(lnk) ? "p" : "";
589 fprintf(fp, " n%d_%d_%d -> n%s%d_%d_%d [ style = dashed ];\n",
590 k, x->state->num, x->parent,
591 pred, k, y->state->num, y->parent);
592 if (is_predict(lnk)) {
593 fprintf(fp, " n%s%d_%d_%d [ label = \"\" ];\n",
594 pred, k, y->state->num, y->parent);
595 fprintf(fp, " n%s%d_%d_%d -> n%d_%d_%d [ style = bold ];\n",
596 pred, k, y->state->num, y->parent,
597 k, y->state->num, y->parent);
599 y = set_item(parse, lnk->from_set, lnk->from_item);
601 " n%d_%d_%d -> n%d_%d_%d [ style = dashed, label = \"",
602 k, x->state->num, x->parent,
603 lnk->from_set, y->state->num, y->parent);
604 flens(fp, lnk->lens);
605 fprintf(fp, "\" ];\n");
606 } else if (is_scan(lnk)) {
608 set_item(parse, lnk->from_set, lnk->from_item);
610 " n%d_%d_%d -> n%d_%d_%d [ label = \"",
611 k, x->state->num, x->parent,
612 lnk->from_set, y->state->num, y->parent);
613 for (ind_t i=lnk->from_set; i < k; i++)
614 fprintf(fp, "%c", parse->text[i]);
615 fprintf(fp, "\" ];\n");
617 } else if (is_predict(lnk)) {
619 set_item(parse, lnk->from_set, lnk->from_item);
621 " n%d_%d_%d -> n%d_%d_%d [ style = bold ];\n",
622 k, x->state->num, x->parent,
623 lnk->from_set, y->state->num, y->parent);
629 static void parse_dot(struct jmt_parse *parse, const char *fname) {
630 FILE *fp = debug_fopen("%s", fname);
634 fprintf(fp, "digraph \"jmt_parse\" {\n");
635 fprintf(fp, " rankdir = RL;\n");
636 for (int k=0; k < parse->nsets; k++) {
637 struct item_set *set = parse->sets[k];
642 fprintf(fp, " subgraph \"cluster_E_%d\" {\n", k);
643 fprintf(fp, " rankdir=RL;\n rank=same;\n");
644 fprintf(fp, " title%d [ label=\"E%d\", shape=plaintext ]\n", k, k);
645 for_each_item(x, set) {
646 for (int i=0; i < x->nlinks; i++) {
647 struct link *lnk = x->links + i;
648 parse_dot_link(fp, parse, k, x, lnk);
658 jmt_parse(struct jmt *jmt, const char *text, size_t text_len)
660 struct jmt_parse *parse = NULL;
662 parse = parse_init(jmt, text, text_len);
666 parse_add_item(parse, 0, jmt->start, 0, R_ROOT, EPS, EPS, EPS, EPS,
669 if (is_return(jmt->start)) {
670 for_each_trans(x, jmt->start) {
671 if (returns(jmt->start, x->lens))
672 parse_add_predict_complete(parse, 0, x->to, 0,
673 0, 0, x->lens, 0, 0);
677 for (int j=0; j <= text_len; j++) {
678 struct item_set *set = parse->sets[j];
682 for (int item=0; item < set->items.used; item++) {
683 struct state *t = item_state(parse, j, item);
684 ind_t i = item_parent(parse, j, item);
686 if (is_return(t) && i != j) {
688 nncomplete(parse, j, t, i, item);
691 for_each_trans(x, t) {
692 if (x->lens == CALL) {
694 ind_t pred = parse_add_predict(parse, j, x->to, item);
696 if (is_return(x->to)) {
698 ncaller(parse, j, item, t, i, x->to, pred);
700 ncallee(parse, j, item, t, i, x->to, pred);
704 struct lens *lens = lens_of_parse(parse, x->lens);
705 struct state *sA = lens_state(parse->jmt, x->lens);
706 if (! lens->recursive && sA == NULL) {
708 // FIXME: We really need to find every k so that
709 // text[j..k] matches lens->ctype, not just one
710 count = regexp_match(lens->ctype, text, text_len, j, NULL);
712 parse_add_scan(parse, j+count,
721 if (debugging("cf.jmt.parse"))
722 parse_dot(parse, "jmt_parse.dot");
725 jmt_free_parse(parse);
730 * Reconstruction of the parse tree
734 build_nullable(struct jmt_parse *parse, ind_t pos,
735 struct jmt_visitor *visitor, struct lens *lens, int lvl) {
736 if (! lens->recursive) {
737 if (visitor->terminal != NULL) {
738 (*visitor->terminal)(lens, pos, pos, visitor->data);
742 if (visitor->enter != NULL) {
743 (*visitor->enter)(lens, pos, pos, visitor->data);
749 build_nullable(parse, pos, visitor, lens->body, lvl+1);
752 for (int i=0; i < lens->nchildren; i++)
753 build_nullable(parse, pos, visitor, lens->children[i], lvl+1);
756 for (int i=0; i < lens->nchildren; i++)
757 if (lens->children[i]->ctype_nullable)
758 build_nullable(parse, pos, visitor,
759 lens->children[i], lvl+1);
763 build_nullable(parse, pos, visitor, lens->child, lvl+1);
769 BUG_ON(true, parse, "Unexpected lens tag %d", lens->tag);
772 if (visitor->exit != NULL) {
773 (*visitor->exit)(lens, pos, pos, visitor->data);
781 static void build_trace(const char *msg, ind_t start, ind_t end,
782 struct item *x, int lvl) {
783 for (int i=0; i < lvl; i++) putc(' ', stderr);
785 printf("%s %d..%d: (%d, %d) %d %s%s%s\n", msg,
786 start, end, x->state->num,
787 x->parent, x->links->lens,
788 is_complete(x->links) ? "c" : "",
789 is_predict(x->links) ? "p" : "",
790 is_scan(x->links) ? "s" : "");
792 printf("%s %d..%d\n", msg, start, end);
796 static int add_sibling(struct array *siblings, ind_t lnk) {
800 r = array_add(siblings, &ind);
803 *array_elem(*siblings, ind, ind_t) = lnk;
807 /* Return true if CALLER is a possible caller for the link LNK which starts
810 * FIXME: We can get rid of the caller field on a link if we distinguish
811 * between NCALLER and NCALLEE in the Earley graph, rather than collapse
812 * them both into links with reason PREDICT|COMPLETE
814 static bool is_caller(struct item *x, struct link *lnk, ind_t caller) {
815 if (lnk->reason & R_ROOT)
816 return caller == lnk->caller;
818 if (! is_predict(lnk))
821 if (is_complete(lnk)) {
822 /* NCALLER: caller == t
823 * NCALLEE: caller == s */
824 return caller == lnk->caller;
826 /* PREDICT: caller == t || caller == s */
827 return caller == lnk->caller || caller == x->state->num;
830 /* Traverse the siblings of x and check that the callee's set of callers
831 * contains CALLER. When a path ending with a call from CALLER exists,
832 * record the number of the corresponding link for each item in
833 * SIBLINGS. The links are recorded in left-to-right order, i.e. the number
834 * of the first link to follow is in the last entry in SIBLINGS.
836 * Returns 0 if there is a path ending in a callee of CALLER. Return -1 if
837 * there is none. Return -2 if there are multiple such paths. Return -3 for
840 static int filter_siblings(struct jmt_visitor *visitor, struct lens *lens,
841 ind_t k, ind_t item, ind_t caller,
842 struct array *siblings) {
843 struct jmt_parse *parse = visitor->parse;
844 struct item *x = set_item(parse, k, item);
848 for (ind_t lnk = 0; lnk < x->nlinks; lnk++)
849 if (is_last_sibling(x->links + lnk))
852 if (nlast > 0 && nlast < x->nlinks)
855 if (nlast == x->nlinks) {
856 for (ind_t lnk = 0; lnk < x->nlinks; lnk++) {
857 if (is_caller(x, x->links + lnk, caller)) {
859 r = add_sibling(siblings, lnk);
861 ERR_REPORT(parse, AUG_ENOMEM, NULL);
870 ind_t found = IND_MAX;
871 for (ind_t lnk = 0; lnk < x->nlinks; lnk++) {
872 struct link *l = x->links + lnk;
873 r = filter_siblings(visitor, lens,
874 l->from_set, l->from_item, caller,
879 if (found != IND_MAX)
887 if (found == IND_MAX) {
890 r = add_sibling(siblings, found);
892 ERR_REPORT(parse, AUG_ENOMEM, NULL);
899 (*visitor->error)(lens, visitor->data, k,
900 "Ambiguous parse: %d links in state (%d, %d) in E_%d",
901 x->nlinks, x->state->num, x->parent, k);
905 static void visit_enter(struct jmt_visitor *visitor, struct lens *lens,
906 size_t start, size_t end,
907 struct item *x, int lvl) {
908 if (debugging("cf.jmt.visit"))
909 build_trace("{", start, end, x, lvl);
910 if (visitor->enter != NULL)
911 (*visitor->enter)(lens, start, end, visitor->data);
914 static void visit_exit(struct jmt_visitor *visitor, struct lens *lens,
915 size_t start, size_t end,
916 struct item *x, int lvl) {
917 if (debugging("cf.jmt.visit"))
918 build_trace("}", start, end, x, lvl);
919 if (visitor->exit != NULL)
920 (*visitor->exit)(lens, start, end, visitor->data);
924 build_children(struct jmt_parse *parse, ind_t k, ind_t item,
925 struct jmt_visitor *visitor, int lvl, ind_t caller);
928 build_tree(struct jmt_parse *parse, ind_t k, ind_t item, struct lens *lens,
929 struct jmt_visitor *visitor, int lvl) {
930 struct item *x = set_item(parse, k, item);
931 ind_t start = x->links->from_set;
933 struct item *old_x = x;
936 /* This completion corresponds to a nullable nonterminal
937 * that match epsilon. Reconstruct the full parse tree
938 * for matching epsilon */
939 if (debugging("cf.jmt.visit"))
940 build_trace("N", x->links->from_set, k, x, lvl);
941 build_nullable(parse, start, visitor, lens, lvl);
945 ensure(is_complete(x->links), parse);
947 visit_enter(visitor, lens, start, end, x, lvl);
950 /* x is a completion item. (k, x->to_item) is its first child in the
952 if (! is_predict(x->links)) {
953 struct link *lnk = x->links;
954 struct item *sib = set_item(parse, lnk->from_set, lnk->from_item);
955 ind_t caller = sib->state->num;
958 x = set_item(parse, k, item);
959 build_children(parse, k, item, visitor, lvl, caller);
963 visit_exit(visitor, lens, start, end, old_x, lvl);
970 build_children(struct jmt_parse *parse, ind_t k, ind_t item,
971 struct jmt_visitor *visitor, int lvl, ind_t caller) {
972 struct item *x = set_item(parse, k, item);
973 struct lens *lens = lens_of_parse(parse, x->links->lens);
974 struct array siblings;
978 array_init(&siblings, sizeof(ind_t));
979 r = filter_siblings(visitor, lens, k, item, caller, &siblings);
983 /* x the first item in a list of siblings; visit items (x->from_set,
984 * x->from_item) in order, which will visit x and its siblings in the
985 * parse tree from right to left */
986 for (ind_t i = siblings.used - 1; i > 0; i--) {
987 ind_t lnk = *array_elem(siblings, i, ind_t);
988 struct lens *sub = lens_of_parse(parse, x->links[lnk].lens);
989 if (sub->recursive) {
990 build_tree(parse, k, item, sub, visitor, lvl+1);
993 if (debugging("cf.jmt.visit"))
994 build_trace("T", x->links->from_set, k, x, lvl+1);
995 if (visitor->terminal != NULL) {
996 (*visitor->terminal)(sub,
997 x->links->from_set, k, visitor->data);
1001 k = x->links[lnk].from_set;
1002 item = x->links[lnk].from_item;
1003 x = set_item(parse, k, item);
1006 array_release(&siblings);
1010 int jmt_visit(struct jmt_visitor *visitor, size_t *len) {
1011 struct jmt_parse *parse = visitor->parse;
1012 ind_t k = parse->nsets - 1; /* Current Earley set */
1014 struct item_set *set = parse->sets[k];
1019 for (item = 0; item < set->items.used; item++) {
1020 struct item *x = set_item(parse, k, item);
1021 if (x->parent == 0 && returns(x->state, parse->jmt->lens)) {
1022 for (ind_t i = 0; i < x->nlinks; i++) {
1023 if (is_complete(x->links + i) || is_scan(x->links + i)) {
1024 if (debugging("cf.jmt.visit"))
1025 printf("visit: found (%d, %d) in E_%d\n",
1026 x->state->num, x->parent, k);
1033 if (item >= parse->sets[k]->items.used)
1035 struct lens *lens = lens_of_parse(parse, parse->jmt->lens);
1037 visit_enter(visitor, lens, 0, k, NULL, 0);
1040 *len = build_children(parse, k, item, visitor, 0,
1041 parse->jmt->start->num);
1044 visit_exit(visitor, lens, 0, k, NULL, 0);
1051 if (parse->sets[k] != NULL) break;
1057 * Build the automaton
1061 static struct state *make_state(struct jmt *jmt) {
1066 ERR_NOMEM(r < 0, jmt);
1067 s->num = jmt->state_count++;
1068 array_init(&s->trans, sizeof(struct trans));
1069 if (jmt->start != NULL)
1070 list_cons(jmt->start->next, s);
1078 static ind_t add_lens(struct jmt *jmt, struct lens *lens) {
1081 struct state *sA = NULL;
1084 r = array_add(&jmt->lenses, &l);
1085 ERR_NOMEM(r < 0, jmt);
1086 ERR_NOMEM(l == IND_MAX, jmt);
1088 if (! lens->recursive)
1089 nullable = regexp_matches_empty(lens->ctype);
1091 array_elem(jmt->lenses, l, struct jmt_lens)->lens = lens;
1092 /* A nonrecursive lens that matches epsilon is both a terminal
1093 * and a nonterminal */
1094 if (lens->recursive || nullable) {
1095 sA = make_state(jmt);
1096 ERR_NOMEM(sA == NULL, jmt);
1097 array_elem(jmt->lenses, l, struct jmt_lens)->state = sA;
1098 if (! lens->recursive) {
1099 /* Add lens again, so that l refers to the nonterminal T
1100 * for the lens, and l+1 refers to the terminal t for it */
1102 r = array_add(&jmt->lenses, &m);
1103 ERR_NOMEM(r < 0, jmt);
1104 ERR_NOMEM(m == IND_MAX, jmt);
1106 array_elem(jmt->lenses, m, struct jmt_lens)->lens = lens;
1110 if (debugging("cf.jmt")) {
1112 printf("add_lens: ");
1113 print_regexp(stdout, lens->ctype);
1114 printf(" %s\n", format_lens(lens));
1116 printf("add_lens: ");
1118 printf(" %u %s\n", sA->num, format_lens(lens));
1120 printf("add_lens: // %s\n", format_lens(lens));
1130 static struct trans *
1131 add_new_trans(struct jmt *jmt,
1132 struct state *from, struct state *to, ind_t lens) {
1137 if (from == NULL || to == NULL)
1140 r = array_add(&from->trans, &i);
1141 ERR_NOMEM(r < 0, jmt);
1142 t = array_elem(from->trans, i, struct trans);
1150 static struct trans *
1151 add_eps_trans(struct jmt *jmt, struct state *from, struct state *to) {
1152 return add_new_trans(jmt, from, to, EPS);
1155 static struct lens *lens_of_jmt(struct jmt *jmt, ind_t l) {
1156 return array_elem(jmt->lenses, l, struct jmt_lens)->lens;
1159 static ind_t lens_index(struct jmt *jmt, struct lens *lens) {
1160 array_for_each(i, jmt->lenses)
1161 if (lens_of_jmt(jmt, i) == lens)
1166 static struct state *lens_state(struct jmt *jmt, ind_t l) {
1167 return array_elem(jmt->lenses, l, struct jmt_lens)->state;
1170 static void print_lens_symbol(FILE *fp, struct jmt *jmt, struct lens *lens) {
1171 ind_t l = lens_index(jmt, lens);
1172 struct state *sA = lens_state(jmt, l);
1175 print_regexp(fp, lens->ctype);
1180 static void print_grammar(struct jmt *jmt, struct lens *lens) {
1181 ind_t l = lens_index(jmt, lens);
1182 struct state *sA = lens_state(jmt, l);
1184 if (sA == NULL || (lens->tag == L_REC && lens->rec_internal))
1188 print_lens_symbol(stdout, jmt, lens);
1191 if (! lens->recursive) {
1192 /* Nullable regexps */
1193 print_regexp(stdout, lens->ctype);
1198 switch (lens->tag) {
1200 print_lens_symbol(stdout, jmt, lens->children[0]);
1201 for (int i=1; i < lens->nchildren; i++) {
1203 print_lens_symbol(stdout, jmt, lens->children[i]);
1206 for (int i=0; i < lens->nchildren; i++)
1207 print_grammar(jmt, lens->children[i]);
1210 print_lens_symbol(stdout, jmt, lens->children[0]);
1211 for (int i=1; i < lens->nchildren; i++) {
1213 print_lens_symbol(stdout, jmt, lens->children[i]);
1216 for (int i=0; i < lens->nchildren; i++)
1217 print_grammar(jmt, lens->children[i]);
1220 print_lens_symbol(stdout, jmt, lens->child);
1222 print_grammar(jmt, lens->child);
1225 print_lens_symbol(stdout, jmt, lens->child);
1227 print_grammar(jmt, lens->child);
1230 print_lens_symbol(stdout, jmt, lens->child);
1232 print_grammar(jmt, lens->child);
1235 print_lens_symbol(stdout, jmt, lens->body);
1237 print_grammar(jmt, lens->body);
1240 print_lens_symbol(stdout, jmt, lens->child);
1242 print_grammar(jmt, lens->child);
1245 BUG_ON(true, jmt, "Unexpected lens tag %d", lens->tag);
1252 static void print_grammar_top(struct jmt *jmt, struct lens *lens) {
1253 printf("Grammar:\n");
1254 print_grammar(jmt, lens);
1255 if (lens->tag == L_REC) {
1257 print_lens_symbol(stdout, jmt, lens->alias);
1259 print_lens_symbol(stdout, jmt, lens->alias->body);
1264 static void index_lenses(struct jmt *jmt, struct lens *lens) {
1267 l = lens_index(jmt, lens);
1269 l = add_lens(jmt, lens);
1273 if (! lens->recursive)
1276 switch (lens->tag) {
1279 for (int i=0; i < lens->nchildren; i++)
1280 index_lenses(jmt, lens->children[i]);
1286 index_lenses(jmt, lens->child);
1289 if (! lens->rec_internal)
1290 index_lenses(jmt, lens->body);
1293 BUG_ON(true, jmt, "Unexpected lens tag %d", lens->tag);
1300 static void thompson(struct jmt *jmt, struct lens *lens,
1301 struct state **s, struct state **f) {
1302 ind_t l = lens_index(jmt, lens);
1303 struct state *sA = lens_state(jmt, l);
1304 ensure(l < jmt->lenses.used, jmt);
1306 *s = make_state(jmt);
1307 *f = make_state(jmt);
1310 if (lens->recursive) {
1312 add_new_trans(jmt, *s, *f, l);
1313 add_new_trans(jmt, *s, sA, CALL);
1314 } else if (sA == NULL) {
1315 /* A terminal that never matches epsilon */
1316 add_new_trans(jmt, *s, *f, l);
1318 /* A terminal that matches epsilon */
1319 add_new_trans(jmt, *s, *f, l);
1320 add_new_trans(jmt, *s, sA, CALL);
1321 add_new_trans(jmt, *s, *f, l+1);
1327 static void conv(struct jmt *jmt, struct lens *lens,
1328 struct state **s, struct state **e,
1330 ind_t l = lens_index(jmt, lens);
1331 ensure(l < jmt->lenses.used, jmt);
1332 struct state *sA = lens_state(jmt, l);
1338 if (lens->recursive) {
1340 *s = make_state(jmt);
1341 *f = make_state(jmt);
1343 add_new_trans(jmt, *s, *f, l);
1345 ensure(sA != NULL, jmt);
1346 add_new_trans(jmt, *s, sA, EPS);
1348 } else if (sA == NULL) {
1349 /* A terminal that never matches epsilon */
1350 *s = make_state(jmt);
1351 *f = make_state(jmt);
1353 add_new_trans(jmt, *s, *f, l);
1356 /* A terminal that matches epsilon */
1357 *s = make_state(jmt);
1358 *f = make_state(jmt);
1360 add_new_trans(jmt, *s, *f, l);
1361 add_new_trans(jmt, *s, *f, l+1);
1362 add_new_trans(jmt, *s, sA, EPS);
1369 static void conv_concat(struct jmt *jmt, struct lens *lens,
1370 struct state **s, struct state **e,
1372 struct state *s2, *f2, *e2;
1374 conv(jmt, lens->children[0], &s2, &e2, &f2);
1375 *s = make_state(jmt);
1376 add_new_trans(jmt, *s, s2, EPS);
1378 for (int i=1; i < lens->nchildren; i++) {
1379 struct state *s3, *e3, *f3, *scall, *fcall;
1380 conv(jmt, lens->children[i], &s3, &e3, &f3);
1381 thompson(jmt, lens->children[i], &scall, &fcall);
1383 add_eps_trans(jmt, f2, scall);
1384 add_eps_trans(jmt, e2, s3);
1385 *f = make_state(jmt);
1386 add_eps_trans(jmt, f3, *f);
1387 add_eps_trans(jmt, fcall, *f);
1388 *e = make_state(jmt);
1389 add_eps_trans(jmt, e3, *e);
1397 static void conv_union(struct jmt *jmt, struct lens *lens,
1398 struct state **s, struct state **e,
1401 *s = make_state(jmt);
1402 *e = make_state(jmt);
1403 *f = make_state(jmt);
1406 for (int i = 0; i < lens->nchildren; i++) {
1407 struct state *s2, *e2, *f2;
1409 conv(jmt, lens->children[i], &s2, &e2, &f2);
1412 add_eps_trans(jmt, *s, s2);
1413 add_eps_trans(jmt, e2, *e);
1414 add_eps_trans(jmt, f2, *f);
1421 static void conv_star(struct jmt *jmt, struct lens *lens,
1422 struct state **s, struct state **e,
1425 *s = make_state(jmt);
1426 *e = make_state(jmt);
1427 *f = make_state(jmt);
1430 struct state *si, *ei, *fi, *scall, *fcall;
1431 conv(jmt, lens->child, &si, &ei, &fi);
1432 thompson(jmt, lens->child, &scall, &fcall);
1435 add_eps_trans(jmt, *s, si);
1436 add_eps_trans(jmt, ei, si);
1437 add_eps_trans(jmt, *s, *e);
1438 add_eps_trans(jmt, ei, *e);
1439 add_eps_trans(jmt, fi, scall);
1440 add_eps_trans(jmt, fcall, scall);
1441 add_eps_trans(jmt, fi, *f);
1442 add_eps_trans(jmt, fcall, *f);
1449 static void conv_rhs(struct jmt *jmt, ind_t l) {
1450 struct lens *lens = lens_of_jmt(jmt, l);
1451 struct state *s = NULL, *e = NULL, *f = NULL;
1452 struct state *sA = lens_state(jmt, l);
1454 if (! lens->recursive) {
1455 /* Nothing to do for terminals that do not match epsilon */
1457 state_add_return(jmt, sA, l);
1461 /* All other nonterminals/recursive lenses */
1464 if (lens->ctype_nullable)
1465 state_add_return(jmt, sA, l);
1467 switch (lens->tag) {
1469 conv(jmt, lens->body, &s, &e, &f);
1472 conv_concat(jmt, lens, &s, &e, &f);
1475 conv_union(jmt, lens, &s, &e, &f);
1478 conv(jmt, lens->child, &s, &e, &f);
1481 conv_star(jmt, lens, &s, &e, &f);
1484 conv(jmt, lens->child, &s, &e, &f);
1485 add_new_trans(jmt, s, e, EPS);
1488 conv(jmt, lens->child, &s, &e, &f);
1491 BUG_ON(true, jmt, "Unexpected lens tag %d", lens->tag);
1494 ensure(sA != NULL, jmt);
1496 add_eps_trans(jmt, sA, s);
1497 state_add_return(jmt, e, l);
1498 state_add_return(jmt, f, l);
1504 ATTRIBUTE_RETURN_CHECK
1505 static int push_state(struct array *worklist, struct state *s) {
1509 r = array_add(worklist, &ind);
1512 *array_elem(*worklist, ind, struct state *) = s;
1516 static struct state *pop_state(struct array *worklist) {
1517 if (worklist->used > 0) {
1518 worklist->used -= 1;
1519 return *array_elem(*worklist, worklist->used, struct state *);
1525 static void free_state(struct state *s) {
1529 array_release(&s->trans);
1533 static void collect(struct jmt *jmt) {
1534 struct array worklist;
1535 size_t count, removed;
1539 list_for_each(s, jmt->start) {
1545 array_init(&worklist, sizeof(struct state *));
1546 jmt->start->reachable = 1;
1547 for (struct state *s = jmt->start;
1549 s = pop_state(&worklist)) {
1550 for_each_trans(t, s) {
1551 if (! t->to->reachable) {
1552 t->to->reachable = 1;
1553 r = push_state(&worklist, t->to);
1554 ERR_NOMEM(r < 0, jmt);
1559 list_for_each(s, jmt->start)
1560 if (s->reachable && is_return(s))
1566 list_for_each(s, jmt->start) {
1567 if (! s->live && s->reachable) {
1568 for_each_trans(t, s) {
1569 if (t->lens != CALL && t->to->live) {
1579 list_for_each(s, jmt->start) {
1580 if (s->live && s->reachable) {
1581 for (ind_t i = 0; i < s->trans.used; ) {
1582 struct trans *t = state_trans(s, i);
1583 if (! (t->to->live && t->to->reachable))
1584 array_remove(&s->trans, i);
1592 for (struct state *s = jmt->start;
1593 s->next != NULL; ) {
1594 struct state *p = s->next;
1595 if (p->live && p->reachable) {
1605 array_release(&worklist);
1609 static void dedup(struct state *s) {
1610 array_for_each(i, s->trans) {
1611 struct trans *t = state_trans(s, i);
1612 for (ind_t j = i+1; j < s->trans.used;) {
1613 struct trans *u = state_trans(s, j);
1614 if (t->to == u->to && t->lens == u->lens)
1615 array_remove(&s->trans, j);
1622 static void unepsilon(struct jmt *jmt) {
1625 if (debugging("cf.jmt.build"))
1626 jmt_dot(jmt, "jmt_10_raw.dot");
1629 /* Get rid of epsilon transitions */
1633 list_for_each(s, jmt->start) {
1634 array_for_each(i , s->trans) {
1635 struct trans *t = state_trans(s, i);
1636 if (t->lens == EPS) {
1637 struct state *to = t->to;
1638 array_remove(&s->trans, i);
1639 r = array_join(&s->trans, &to->trans);
1640 ERR_NOMEM(r < 0, jmt);
1641 state_merge_returns(jmt, s, to);
1650 if (debugging("cf.jmt.build"))
1651 jmt_dot(jmt, "jmt_20_uneps.dot");
1656 static bool is_deterministic(struct jmt *jmt) {
1657 list_for_each(s, jmt->start) {
1658 array_for_each(i, s->trans) {
1659 struct trans *t = state_trans(s, i);
1660 for (ind_t j = i+1; j < s->trans.used; j++) {
1661 struct trans *u = state_trans(s, j);
1662 if (t->lens == u->lens)
1670 static void free_nfa_state(struct nfa_state *s) {
1673 array_release(&s->set);
1677 static struct nfa_state *make_nfa_state(struct jmt *jmt) {
1678 struct nfa_state *result = NULL;
1682 ERR_NOMEM(r < 0, jmt);
1684 array_init(&result->set, sizeof(struct state *));
1692 static ind_t nfa_state_add(struct jmt *jmt, struct nfa_state *nfa,
1697 array_for_each(j, nfa->set) {
1698 struct state *q = *array_elem(nfa->set, j, struct state *);
1703 /* Keep the list of states sorted */
1705 for (int j=0; j + 1 < nfa->set.used; j++) {
1706 if (s < *array_elem(nfa->set, j, struct state *)) {
1711 r = array_insert(&nfa->set, i);
1712 ERR_NOMEM(r < 0, jmt);
1714 *array_elem(nfa->set, i, struct state *) = s;
1720 static bool nfa_same_set(struct nfa_state *s1, struct nfa_state *s2) {
1721 if (s1->set.used != s2->set.used)
1724 array_for_each(i, s1->set) {
1725 struct state *q1 = *array_elem(s1->set, i, struct state *);
1726 struct state *q2 = *array_elem(s2->set, i, struct state *);
1734 static struct nfa_state *nfa_uniq(struct jmt *jmt, struct array *newstates,
1735 struct nfa_state *s) {
1739 array_each_elem(q, *newstates, struct nfa_state *) {
1740 if (nfa_same_set(s, *q)) {
1748 r = array_add(newstates, &ind);
1749 ERR_NOMEM(r < 0, jmt);
1750 *array_elem(*newstates, ind, struct nfa_state *) = s;
1752 if (s->state == NULL) {
1753 s->state = make_state(jmt);
1757 /* This makes looking at pictures easier */
1758 if (s->set.used == 1)
1759 s->state->num = (*array_elem(s->set, 0, struct state *))->num;
1767 static void det_target(struct jmt *jmt, struct array *newstates,
1768 struct nfa_state *nfas, ind_t l) {
1769 struct nfa_state *to = NULL;
1771 array_each_elem(s, nfas->set, struct state *) {
1772 for_each_trans(t, *s) {
1775 to = make_nfa_state(jmt);
1778 nfa_state_add(jmt, to, t->to);
1785 to = nfa_uniq(jmt, newstates, to);
1787 add_new_trans(jmt, nfas->state, to->state, l);
1791 static void determinize(struct jmt *jmt) {
1792 struct nfa_state *ini = NULL;
1793 struct array *newstates = NULL;
1797 if (is_deterministic(jmt))
1800 r = ALLOC(newstates);
1801 ERR_NOMEM(r < 0, jmt);
1802 array_init(newstates, sizeof(struct nfa_state *));
1804 nlenses = jmt->lenses.used;
1806 /* The initial state consists of just the start state */
1807 ini = make_nfa_state(jmt);
1809 nfa_state_add(jmt, ini, jmt->start);
1812 /* Make a new initial state */
1813 ini->state = make_state(jmt);
1814 ini->state->num = jmt->start->num; /* Makes lokking at pictures easier */
1816 jmt->start->next = ini->state->next;
1817 ini->state->next = jmt->start;
1818 jmt->start = ini->state;
1820 r = array_add(newstates, &ind);
1821 ERR_NOMEM(r < 0, jmt);
1823 *array_elem(*newstates, ind, struct nfa_state *) = ini;
1826 for (ind_t i = 0; i < newstates->used; i++) {
1827 struct nfa_state *nfas = *array_elem(*newstates, i, struct nfa_state *);
1829 for (int j = 0; j < nfas->set.used; j++) {
1830 struct state *s = *array_elem(nfas->set, j, struct state *);
1831 state_merge_returns(jmt, nfas->state, s);
1834 for (ind_t l = 0; l < nlenses; l++) {
1835 det_target(jmt, newstates, nfas, l);
1838 det_target(jmt, newstates, nfas, CALL);
1846 array_each_elem(s, *newstates, struct nfa_state *) {
1849 array_release(newstates);
1852 free_nfa_state(ini);
1858 struct jmt *jmt_build(struct lens *lens) {
1859 struct jmt *jmt = NULL;
1863 ERR_NOMEM(r < 0, lens->info);
1865 jmt->error = lens->info->error;
1866 array_init(&jmt->lenses, sizeof(struct jmt_lens));
1868 index_lenses(jmt, lens);
1870 if (debugging("cf.jmt"))
1871 print_grammar_top(jmt, lens);
1873 for (ind_t i=0; i < jmt->lenses.used; i++) {
1884 if (debugging("cf.jmt.build"))
1885 jmt_dot(jmt, "jmt_30_dfa.dot");
1893 void jmt_free(struct jmt *jmt) {
1896 array_release(&jmt->lenses);
1897 struct state *s = jmt->start;
1899 struct state *del = s;
1906 void jmt_dot(struct jmt *jmt, const char *fname) {
1907 FILE *fp = debug_fopen("%s", fname);
1911 fprintf(fp, "digraph \"jmt\" {\n");
1912 fprintf(fp, " rankdir = LR;\n");
1913 list_for_each(s, jmt->start) {
1915 fprintf(fp, " %u [ shape = doublecircle, label = \"%u (",
1917 flens(fp, s->ret[0]);
1918 for (ind_t i = 1; i < s->nret; i++) {
1920 flens(fp, s->ret[i]);
1922 fprintf(fp, ")\" ];\n");
1924 for_each_trans(t, s) {
1925 fprintf(fp, " %u -> %u ", s->num, t->to->num);
1928 else if (t->lens == CALL)
1929 fprintf(fp, "[ label = \"call\" ];\n");
1930 else if (lens_state(jmt, t->lens) == NULL) {
1931 struct lens *lens = lens_of_jmt(jmt, t->lens);
1932 fprintf(fp, "[ label = \"");
1933 print_regexp(fp, lens->ctype);
1934 fprintf(fp, "\" ];\n");
1936 fprintf(fp, "[ label = \"");
1938 fprintf(fp, "\" ];\n");
1948 * indent-tabs-mode: nil