2 * fa.c: finite automata
4 * Copyright (C) 2007-2016 David Lutterkort
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * Author: David Lutterkort <dlutter@redhat.com>
24 * This implementation follows closely the Java dk.brics.automaton package
25 * by Anders Moeller. The project's website is
26 * http://www.brics.dk/automaton/.
28 * It is by no means a complete reimplementation of that package; only a
29 * subset of what Automaton provides is implemented here.
43 #define UCHAR_NUM (UCHAR_MAX+1)
45 typedef unsigned char uchar;
47 #define E(cond) if (cond) goto error
48 #define F(expr) if ((expr) < 0) goto error
50 /* Which algorithm to use in FA_MINIMIZE */
51 int fa_minimization_algorithm = FA_MIN_HOPCROFT;
53 /* A finite automaton. INITIAL is both the initial state and the head of
54 * the list of all states. Any state that is allocated for this automaton
55 * is put on this list. Dead/unreachable states are cleared from the list
56 * at opportune times (e.g., during minimization) It's poor man's garbage
59 * Normally, transitions are on a character range [min..max]; in
60 * fa_as_regexp, we store regexps on transitions in the re field of each
61 * transition. TRANS_RE indicates that we do that, and is used by fa_dot to
62 * produce proper graphs of an automaton transitioning on regexps.
64 * For case-insensitive regexps (nocase == 1), the FA never has transitions
65 * on uppercase letters [A-Z], effectively removing these letters from the
69 struct state *initial;
70 int deterministic : 1;
72 unsigned int nocase : 1;
76 /* A state in a finite automaton. Transitions are never shared between
77 states so that we can free the list when we need to free the state */
81 unsigned int accept : 1;
82 unsigned int live : 1;
83 unsigned int reachable : 1;
84 unsigned int visited : 1; /* Used in various places to track progress */
85 /* Array of transitions. The TUSED first entries are used, the array
86 has allocated room for TSIZE */
92 /* A transition. If the input has a character in the inclusive
93 * range [MIN, MAX], move to TO
109 #define UINT_BIT (sizeof(unsigned int) * CHAR_BIT)
111 typedef unsigned int bitset;
113 static bitset *bitset_init(size_t nbits) {
115 if (ALLOC_N(bs, (nbits + UINT_BIT) / UINT_BIT) == -1)
120 static inline void bitset_clr(bitset *bs, unsigned int bit) {
121 bs[bit/UINT_BIT] &= ~(1 << (bit % UINT_BIT));
124 static inline void bitset_set(bitset *bs, unsigned int bit) {
125 bs[bit/UINT_BIT] |= 1 << (bit % UINT_BIT);
129 static inline int bitset_get(const bitset * const bs, unsigned int bit) {
130 return (bs[bit/UINT_BIT] >> bit % UINT_BIT) & 1;
134 static inline bool bitset_disjoint(const bitset *const bs1,
135 const bitset *const bs2,
137 for (int i=0; i < (nbits + UINT_BIT) / UINT_BIT; i++) {
144 static void bitset_free(bitset *bs) {
148 static void bitset_negate(bitset *bs, size_t nbits) {
149 for (int i=0; i < (nbits + UINT_BIT) / UINT_BIT; i++)
154 * Representation of a parsed regular expression. The regular expression is
155 * parsed according to the following grammar by PARSE_REGEXP:
157 * regexp: concat_exp ('|' regexp)?
158 * concat_exp: repeated_exp concat_exp?
159 * repeated_exp: simple_exp
163 * | simple_exp '{' INT (',' INT '}')?
164 * simple_exp: char_class
168 * char_class: '[' char_exp ']'
169 * | '[' '^' char_exp ']'
170 * char_exp: CHAR '-' CHAR
183 #define re_unref(r) unref(r, re)
189 struct { /* UNION, CONCAT */
196 /* Whether we can use character ranges when converting back
198 unsigned int no_ranges:1;
211 /* Used to keep state of the regex parse; RX may contain NUL's */
213 const char *rx; /* Current position in regex */
214 const char *rend; /* Last char of rx+ 1 */
215 int error; /* error code */
216 /* Whether new CSET's should have the no_ranges flag set */
217 unsigned int no_ranges:1;
220 /* String with explicit length, used when converting re to string */
226 static struct re *parse_regexp(struct re_parse *parse);
228 /* A map from a set of states to a state. */
229 typedef hash_t state_set_hash;
231 static hash_val_t ptr_hash(const void *p);
233 static const int array_initial_size = 4;
234 static const int array_max_expansion = 128;
236 enum state_set_init_flags {
245 unsigned int sorted : 1;
246 unsigned int with_data : 1;
247 struct state **states;
251 struct state_set_list {
252 struct state_set_list *next;
253 struct state_set *set;
256 /* Clean up FA by removing dead transitions and states and reducing
257 * transitions. Unreachable states are freed. The return value is the same
258 * as FA; returning it is merely a convenience.
260 * Only automata in this state should be returned to the user
262 ATTRIBUTE_RETURN_CHECK
263 static int collect(struct fa *fa);
265 ATTRIBUTE_RETURN_CHECK
266 static int totalize(struct fa *fa);
268 /* Print an FA into a (fairly) fixed file if the environment variable
269 * FA_DOT_DIR is set. This code is only used for debugging
271 #define FA_DOT_DIR "FA_DOT_DIR"
274 static void fa_dot_debug(struct fa *fa, const char *tag) {
276 static int count = 0;
281 if ((dot_dir = getenv(FA_DOT_DIR)) == NULL)
284 r = asprintf(&fname, "%s/fa_%02d_%s.dot", dot_dir, count++, tag);
288 fp = fopen(fname, "w");
294 static void print_char_set(struct re *set) {
301 for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
302 while (bitset_get(set->cset, from) == set->negate)
304 if (from > UCHAR_MAX)
307 to < UCHAR_MAX && (bitset_get(set->cset, to+1) == !set->negate);
312 printf("%c-%c", from, to);
319 static void print_re(struct re *re) {
340 printf("){%d,%d}", re->min, re->max);
354 static void release_re_str(struct re_str *str) {
361 static void free_re_str(struct re_str *str) {
368 static struct re_str *make_re_str(const char *s) {
375 str->len = strlen(s);
376 if (str->rx == NULL) {
384 static int re_str_alloc(struct re_str *str) {
385 return ALLOC_N(str->rx, str->len + 1);
392 static void free_trans(struct state *s) {
395 s->tused = s->tsize = 0;
398 static void gut(struct fa *fa) {
399 list_for_each(s, fa->initial) {
402 list_free(fa->initial);
406 void fa_free(struct fa *fa) {
413 static struct state *make_state(void) {
417 s->hash = ptr_hash(s);
421 static struct state *add_state(struct fa *fa, int accept) {
422 struct state *s = make_state();
425 if (fa->initial == NULL) {
428 list_cons(fa->initial->next, s);
434 #define last_trans(s) ((s)->trans + (s)->tused - 1)
436 #define for_each_trans(t, s) \
437 for (struct trans *t = (s)->trans; \
438 (t - (s)->trans) < (s)->tused; \
441 ATTRIBUTE_RETURN_CHECK
442 static int add_new_trans(struct state *from, struct state *to,
443 uchar min, uchar max) {
446 if (from->tused == from->tsize) {
447 size_t tsize = from->tsize;
449 tsize = array_initial_size;
450 else if (from->tsize > array_max_expansion)
451 tsize += array_max_expansion;
454 if (REALLOC_N(from->trans, tsize) == -1)
458 from->trans[from->tused].to = to;
459 from->trans[from->tused].min = min;
460 from->trans[from->tused].max = max;
465 ATTRIBUTE_RETURN_CHECK
466 static int add_epsilon_trans(struct state *from,
469 from->accept |= to->accept;
470 for_each_trans(t, to) {
471 r = add_new_trans(from, t->to, t->min, t->max);
478 static void set_initial(struct fa *fa, struct state *s) {
479 list_remove(s, fa->initial);
480 list_cons(fa->initial, s);
483 /* Merge automaton FA2 into FA1. This simply adds FA2's states to FA1
484 and then frees FA2. It has no influence on the language accepted by FA1
486 static void fa_merge(struct fa *fa1, struct fa **fa2) {
487 list_append(fa1->initial, (*fa2)->initial);
493 * Operations on STATE_SET
495 static void state_set_free(struct state_set *set) {
503 static int state_set_init_data(struct state_set *set) {
505 if (set->data == NULL)
506 return ALLOC_N(set->data, set->size);
511 /* Create a new STATE_SET with an initial size of SIZE. If SIZE is -1, use
512 the default size ARRAY_INITIAL_SIZE. FLAGS is a bitmask indicating
514 - S_SORTED: keep the states in the set sorted by their address, and use
515 binary search for lookups. If it is not set, entries are kept in the
516 order in which they are added and lookups scan linearly through the
518 - S_DATA: allocate the DATA array in the set, and keep its size in sync
519 with the size of the STATES array.
521 static struct state_set *state_set_init(int size, int flags) {
522 struct state_set *set = NULL;
526 set->sorted = (flags & S_SORTED) ? 1 : 0;
527 set->with_data = (flags & S_DATA) ? 1 : 0;
530 F(ALLOC_N(set->states, set->size));
532 F(state_set_init_data(set));
540 ATTRIBUTE_RETURN_CHECK
541 static int state_set_expand(struct state_set *set) {
543 set->size = array_initial_size;
544 else if (set->size > array_max_expansion)
545 set->size += array_max_expansion;
548 if (REALLOC_N(set->states, set->size) < 0)
551 if (REALLOC_N(set->data, set->size) < 0)
555 /* We do this to provoke a SEGV as early as possible */
561 /* Return the index where S belongs in SET->STATES to keep it sorted. S
562 may not be in SET->STATES. The returned index is in the interval [0
563 .. SET->USED], with the latter indicating that S is larger than all
564 values in SET->STATES
566 static int state_set_pos(const struct state_set *set, const struct state *s) {
567 int l = 0, h = set->used;
570 if (set->states[m] > s)
572 else if (set->states[m] < s)
580 ATTRIBUTE_RETURN_CHECK
581 static int state_set_push(struct state_set *set, struct state *s) {
582 if (set->size == set->used)
583 if (state_set_expand(set) < 0)
586 int p = state_set_pos(set, s);
587 if (set->size == set->used)
588 if (state_set_expand(set) < 0)
590 while (p < set->used && set->states[p] <= s)
593 memmove(set->states + p + 1, set->states + p,
594 sizeof(*set->states) * (set->used - p));
595 if (set->data != NULL)
596 memmove(set->data + p + 1, set->data + p,
597 sizeof(*set->data) * (set->used - p));
603 set->states[set->used++] = s;
604 return set->used - 1;
608 ATTRIBUTE_RETURN_CHECK
609 static int state_set_push_data(struct state_set *set, struct state *s,
611 int i = state_set_push(set, s);
618 static int state_set_index(const struct state_set *set,
619 const struct state *s) {
621 int p = state_set_pos(set, s);
622 return (p < set->used && set->states[p] == s) ? p : -1;
624 for (int i=0; i < set->used; i++) {
625 if (set->states[i] == s)
632 static void state_set_remove(struct state_set *set,
633 const struct state *s) {
635 int p = state_set_index(set, s);
637 memmove(set->states + p, set->states + p + 1,
638 sizeof(*set->states) * (set->used - p - 1));
639 if (set->data != NULL)
640 memmove(set->data + p, set->data + p + 1,
641 sizeof(*set->data) * (set->used - p - 1));
643 int p = state_set_index(set, s);
645 set->states[p] = set->states[--set->used];
650 /* Only add S if it's not in SET yet. Return 1 if S was added, 0 if it was
651 already in the set and -1 on error. */
652 ATTRIBUTE_RETURN_CHECK
653 static int state_set_add(struct state_set *set, struct state *s) {
655 int p = state_set_pos(set, s);
656 if (p < set->used && set->states[p] == s)
658 if (set->size == set->used)
659 if (state_set_expand(set) < 0)
661 while (p < set->used && set->states[p] <= s)
664 memmove(set->states + p + 1, set->states + p,
665 sizeof(*set->states) * (set->used - p));
666 if (set->data != NULL)
667 memmove(set->data + p + 1, set->data + p,
668 sizeof(*set->data) * (set->used - p));
673 if (state_set_index(set, s) >= 0)
675 if (state_set_push(set, s) < 0)
680 /* We do this to provoke a SEGV as early as possible */
686 static struct state *state_set_pop(struct state_set *set) {
687 struct state *s = NULL;
689 s = set->states[--set->used];
693 static struct state *state_set_pop_data(struct state_set *set, void **d) {
694 struct state *s = NULL;
695 s = state_set_pop(set);
696 *d = set->data[set->used];
700 static void *state_set_find_data(struct state_set *set, struct state *s) {
701 int i = state_set_index(set, s);
708 static int state_set_equal(const struct state_set *set1,
709 const struct state_set *set2) {
710 if (set1->used != set2->used)
712 if (set1->sorted && set2->sorted) {
713 for (int i = 0; i < set1->used; i++)
714 if (set1->states[i] != set2->states[i])
718 for (int i=0; i < set1->used; i++)
719 if (state_set_index(set2, set1->states[i]) == -1)
726 static void state_set_compact(struct state_set *set) {
727 while (set->used > 0 && set->states[set->used] == NULL)
729 for (int i=0; i < set->used; i++) {
730 if (set->states[i] == NULL) {
732 set->states[i] = set->states[set->used];
734 set->data[i] = set->data[set->used];
736 while (set->used > 0 && set->states[set->used] == NULL)
742 /* Add an entry (FST, SND) to SET. FST is stored in SET->STATES, and SND is
743 stored in SET->DATA at the same index.
745 ATTRIBUTE_RETURN_CHECK
746 static int state_pair_push(struct state_set **set,
747 struct state *fst, struct state *snd) {
749 *set = state_set_init(-1, S_DATA);
752 int i = state_set_push(*set, fst);
755 (*set)->data[i] = snd;
760 /* Return the index of the pair (FST, SND) in SET, or -1 if SET contains no
763 static int state_pair_find(struct state_set *set, struct state *fst,
765 for (int i=0; i < set->used; i++)
766 if (set->states[i] == fst && set->data[i] == snd)
771 /* Jenkins' hash for void* */
772 static hash_val_t ptr_hash(const void *p) {
774 char *c = (char *) &p;
775 for (int i=0; i < sizeof(p); i++) {
777 hash += (hash << 10);
781 hash ^= (hash >> 11);
782 hash += (hash << 15);
786 typedef hash_t state_triple_hash;
788 static hash_val_t pair_hash(const void *key) {
789 register struct state *const *pair = key;
790 return pair[0]->hash + pair[1]->hash;
793 static int pair_cmp(const void *key1, const void *key2) {
794 return memcmp(key1, key2, 2*sizeof(struct state *));
797 static void state_triple_node_free(hnode_t *node, ATTRIBUTE_UNUSED void *ctx) {
798 free((void *) hnode_getkey(node));
802 static state_triple_hash *state_triple_init(void) {
803 state_triple_hash *hash;
805 hash = hash_create(HASHCOUNT_T_MAX, pair_cmp, pair_hash);
808 hash_set_allocator(hash, NULL, state_triple_node_free, NULL);
812 ATTRIBUTE_RETURN_CHECK
813 static int state_triple_push(state_triple_hash *hash,
818 if (ALLOC_N(pair, 2) < 0)
822 return hash_alloc_insert(hash, pair, s3);
825 static struct state * state_triple_thd(state_triple_hash *hash,
828 struct state *pair[2];
832 node = hash_lookup(hash, pair);
833 return (node == NULL) ? NULL : (struct state *) hnode_get(node);
836 static void state_triple_free(state_triple_hash *hash) {
838 hash_free_nodes(hash);
846 ATTRIBUTE_RETURN_CHECK
847 static int mark_reachable(struct fa *fa) {
848 struct state_set *worklist = state_set_init(-1, S_NONE);
853 list_for_each(s, fa->initial) {
856 fa->initial->reachable = 1;
858 for (struct state *s = fa->initial;
860 s = state_set_pop(worklist)) {
861 for_each_trans(t, s) {
862 if (! t->to->reachable) {
863 t->to->reachable = 1;
864 F(state_set_push(worklist, t->to));
871 state_set_free(worklist);
875 /* Return all reachable states. As a sideeffect, all states have their
876 REACHABLE flag set appropriately.
878 static struct state_set *fa_states(struct fa *fa) {
879 struct state_set *visited = state_set_init(-1, S_NONE);
882 r = mark_reachable(fa);
883 E(visited == NULL || r < 0);
885 list_for_each(s, fa->initial) {
887 F(state_set_push(visited, s));
891 state_set_free(visited);
895 /* Return all reachable accepting states. As a sideeffect, all states have
896 their REACHABLE flag set appropriately.
898 static struct state_set *fa_accept_states(struct fa *fa) {
899 struct state_set *accept = state_set_init(-1, S_NONE);
904 r = mark_reachable(fa);
907 list_for_each(s, fa->initial) {
908 if (s->reachable && s->accept)
909 F(state_set_push(accept, s));
913 state_set_free(accept);
917 /* Mark all live states, i.e. states from which an accepting state can be
918 reached. All states have their REACHABLE and LIVE flags set
921 ATTRIBUTE_RETURN_CHECK
922 static int mark_live(struct fa *fa) {
925 F(mark_reachable(fa));
927 list_for_each(s, fa->initial) {
928 s->live = s->reachable && s->accept;
933 list_for_each(s, fa->initial) {
934 if (! s->live && s->reachable) {
935 for_each_trans(t, s) {
951 * Reverse an automaton in place. Change FA so that it accepts the
952 * language that is the reverse of the input automaton.
954 * Returns a list of the new initial states of the automaton. The list must
955 * be freed by the caller.
957 static struct state_set *fa_reverse(struct fa *fa) {
958 struct state_set *all = NULL;
959 struct state_set *accept = NULL;
964 accept = fa_accept_states(fa);
967 F(state_set_init_data(all));
969 /* Reverse all transitions */
971 F(ALLOC_N(tused, all->used));
972 for (int i=0; i < all->used; i++) {
973 all->data[i] = all->states[i]->trans;
974 tused[i] = all->states[i]->tused;
975 all->states[i]->trans = NULL;
976 all->states[i]->tsize = 0;
977 all->states[i]->tused = 0;
979 for (int i=0; i < all->used; i++) {
980 struct state *s = all->states[i];
981 struct trans *t = all->data[i];
983 for (int j=0; j < tused[i]; j++) {
984 r = add_new_trans(t[j].to, s, t[j].min, t[j].max);
992 /* Make new initial and final states */
993 struct state *s = add_state(fa, 0);
996 fa->initial->accept = 1;
998 for (int i=0; i < accept->used; i++) {
999 r = add_epsilon_trans(s, accept->states[i]);
1004 fa->deterministic = 0;
1006 state_set_free(all);
1009 state_set_free(all);
1010 state_set_free(accept);
1015 * Return a sorted array of all interval start points in FA. The returned
1016 * array is a string (null terminated)
1018 static uchar* start_points(struct fa *fa, int *npoints) {
1019 char pointset[UCHAR_NUM];
1020 uchar *points = NULL;
1022 F(mark_reachable(fa));
1023 MEMZERO(pointset, UCHAR_NUM);
1024 list_for_each(s, fa->initial) {
1028 for_each_trans(t, s) {
1029 pointset[t->min] = 1;
1030 if (t->max < UCHAR_MAX)
1031 pointset[t->max+1] = 1;
1036 for(int i=0; i < UCHAR_NUM; *npoints += pointset[i], i++);
1038 F(ALLOC_N(points, *npoints+1));
1039 for (int i=0, n=0; i < UCHAR_NUM; i++) {
1041 points[n++] = (uchar) i;
1051 * Operations on STATE_SET_HASH
1053 static int state_set_hash_contains(state_set_hash *smap,
1054 struct state_set *set) {
1055 return hash_lookup(smap, set) != NULL;
1059 * Find the set in SMAP that has the same states as SET. If the two are
1060 * different, i.e. they point to different memory locations, free SET and
1061 * return the set found in SMAP
1063 static struct state_set *state_set_hash_uniq(state_set_hash *smap,
1064 struct state_set *set) {
1065 hnode_t *node = hash_lookup(smap, set);
1066 const struct state_set *orig_set = hnode_getkey(node);
1067 if (orig_set != set) {
1068 state_set_free(set);
1070 return (struct state_set *) orig_set;
1073 static struct state *state_set_hash_get_state(state_set_hash *smap,
1074 struct state_set *set) {
1075 hnode_t *node = hash_lookup(smap, set);
1076 return (struct state *) hnode_get(node);
1079 static hash_val_t set_hash(const void *key) {
1080 hash_val_t hash = 0;
1081 const struct state_set *set = key;
1083 for (int i = 0; i < set->used; i++) {
1084 hash += set->states[i]->hash;
1089 static int set_cmp(const void *key1, const void *key2) {
1090 const struct state_set *set1 = key1;
1091 const struct state_set *set2 = key2;
1093 return state_set_equal(set1, set2) ? 0 : 1;
1096 static void set_destroy(hnode_t *node, ATTRIBUTE_UNUSED void *ctx) {
1097 struct state_set *set = (struct state_set *) hnode_getkey(node);
1098 state_set_free(set);
1102 ATTRIBUTE_RETURN_CHECK
1103 static int state_set_hash_add(state_set_hash **smap,
1104 struct state_set *set, struct fa *fa) {
1105 if (*smap == NULL) {
1106 *smap = hash_create(HASHCOUNT_T_MAX, set_cmp, set_hash);
1108 hash_set_allocator(*smap, NULL, set_destroy, NULL);
1110 struct state *s = add_state(fa, 0);
1112 F(hash_alloc_insert(*smap, set, s));
1118 static void state_set_hash_free(state_set_hash *smap,
1119 struct state_set *protect) {
1120 if (protect != NULL) {
1121 hnode_t *node = hash_lookup(smap, protect);
1122 hash_delete(smap, node);
1123 hnode_getkey(node) = NULL;
1124 set_destroy(node, NULL);
1126 hash_free_nodes(smap);
1130 static int state_set_list_add(struct state_set_list **list,
1131 struct state_set *set) {
1132 struct state_set_list *elt;
1136 list_cons(*list, elt);
1140 static struct state_set *state_set_list_pop(struct state_set_list **list) {
1141 struct state_set_list *elt = *list;
1142 struct state_set *set = elt->set;
1149 /* Compare transitions lexicographically by (to, min, reverse max) */
1150 static int trans_to_cmp(const void *v1, const void *v2) {
1151 const struct trans *t1 = v1;
1152 const struct trans *t2 = v2;
1154 if (t1->to != t2->to) {
1155 return (t1->to < t2->to) ? -1 : 1;
1157 if (t1->min < t2->min)
1159 if (t1->min > t2->min)
1161 if (t1->max > t2->max)
1163 return (t1->max < t2->max) ? 1 : 0;
1166 /* Compare transitions lexicographically by (min, reverse max, to) */
1167 static int trans_intv_cmp(const void *v1, const void *v2) {
1168 const struct trans *t1 = v1;
1169 const struct trans *t2 = v2;
1171 if (t1->min < t2->min)
1173 if (t1->min > t2->min)
1175 if (t1->max > t2->max)
1177 if (t1->max < t2->max)
1179 if (t1->to != t2->to) {
1180 return (t1->to < t2->to) ? -1 : 1;
1186 * Reduce an automaton by combining overlapping and adjacent edge intervals
1187 * with the same destination.
1189 static void reduce(struct fa *fa) {
1190 list_for_each(s, fa->initial) {
1194 qsort(s->trans, s->tused, sizeof(*s->trans), trans_to_cmp);
1196 struct trans *t = s->trans;
1197 while (j < s->tused) {
1198 if (t[i].to == t[j].to && t[j].min <= t[i].max + 1) {
1199 if (t[j].max > t[i].max)
1200 t[i].max = t[j].max;
1205 memmove(s->trans + i, s->trans + j,
1206 sizeof(*s->trans) * (s->tused - j));
1212 /* Shrink if we use less than half the allocated size */
1213 if (s->tsize > array_initial_size && 2*s->tused < s->tsize) {
1215 r = REALLOC_N(s->trans, s->tsize);
1217 s->tsize = s->tused;
1223 * Remove dead transitions from an FA; a transition is dead is it does not
1224 * lead to a live state. This also removes any states that are not
1225 * reachable any longer from FA->INITIAL.
1227 * Returns the same FA as a convenience
1230 static void collect_trans(struct fa *fa) {
1231 list_for_each(s, fa->initial) {
1236 while (i < s->tused) {
1237 if (! s->trans[i].to->live) {
1239 memmove(s->trans + i, s->trans + s->tused,
1249 static void collect_dead_states(struct fa *fa) {
1250 /* Remove all dead states and free their storage */
1251 for (struct state *s = fa->initial; s->next != NULL; ) {
1252 if (! s->next->live) {
1253 struct state *del = s->next;
1254 s->next = del->next;
1263 static int collect(struct fa *fa) {
1266 if (! fa->initial->live) {
1267 /* This automaton accepts nothing, make it the canonical
1270 list_for_each(s, fa->initial) {
1273 list_free(fa->initial->next);
1274 fa->deterministic = 1;
1277 collect_dead_states(fa);
1285 static void swap_initial(struct fa *fa) {
1286 struct state *s = fa->initial;
1287 if (s->next != NULL) {
1288 fa->initial = s->next;
1289 s->next = fa->initial->next;
1290 fa->initial->next = s;
1295 * Make a finite automaton deterministic using the given set of initial
1296 * states with the subset construction. This also eliminates dead states
1297 * and transitions and reduces and orders the transitions for each state
1299 static int determinize(struct fa *fa, struct state_set *ini) {
1301 int make_ini = (ini == NULL);
1302 const uchar *points = NULL;
1303 state_set_hash *newstate = NULL;
1304 struct state_set_list *worklist = NULL;
1307 if (fa->deterministic)
1310 points = start_points(fa, &npoints);
1313 ini = state_set_init(-1, S_NONE);
1314 if (ini == NULL || state_set_push(ini, fa->initial) < 0) {
1315 state_set_free(ini);
1320 F(state_set_list_add(&worklist, ini));
1321 F(state_set_hash_add(&newstate, ini, fa));
1322 // Make the new state the initial state
1324 while (worklist != NULL) {
1325 struct state_set *sset = state_set_list_pop(&worklist);
1326 struct state *r = state_set_hash_get_state(newstate, sset);
1327 for (int q=0; q < sset->used; q++) {
1328 r->accept |= sset->states[q]->accept;
1330 for (int n=0; n < npoints; n++) {
1331 struct state_set *pset = state_set_init(-1, S_SORTED);
1333 for(int q=0 ; q < sset->used; q++) {
1334 for_each_trans(t, sset->states[q]) {
1335 if (t->min <= points[n] && points[n] <= t->max) {
1336 F(state_set_add(pset, t->to));
1340 if (!state_set_hash_contains(newstate, pset)) {
1341 F(state_set_list_add(&worklist, pset));
1342 F(state_set_hash_add(&newstate, pset, fa));
1344 pset = state_set_hash_uniq(newstate, pset);
1346 struct state *q = state_set_hash_get_state(newstate, pset);
1347 uchar min = points[n];
1348 uchar max = UCHAR_MAX;
1350 max = points[n+1] - 1;
1351 if (add_new_trans(r, q, min, max) < 0)
1355 fa->deterministic = 1;
1359 state_set_hash_free(newstate, make_ini ? NULL : ini);
1360 free((void *) points);
1361 if (collect(fa) < 0)
1370 * Minimization. As a sideeffect of minimization, the transitions are
1371 * reduced and ordered.
1374 static struct state *step(struct state *s, uchar c) {
1375 for_each_trans(t, s) {
1376 if (t->min <= c && c <= t->max)
1383 struct state_list_node *first;
1384 struct state_list_node *last;
1388 struct state_list_node {
1389 struct state_list *sl;
1390 struct state_list_node *next;
1391 struct state_list_node *prev;
1392 struct state *state;
1395 static struct state_list_node *state_list_add(struct state_list *sl,
1397 struct state_list_node *n;
1405 if (sl->size++ == 0) {
1416 static void state_list_remove(struct state_list_node *n) {
1417 struct state_list *sl = n->sl;
1420 sl->first = n->next;
1422 n->prev->next = n->next;
1426 n->next->prev = n->prev;
1431 static void state_list_free(struct state_list *sl) {
1433 list_free(sl->first);
1437 /* The linear index of element (q,c) in an NSTATES * NSIGMA matrix */
1438 #define INDEX(q, c) (q * nsigma + c)
1440 static int minimize_hopcroft(struct fa *fa) {
1441 struct state_set *states = NULL;
1442 uchar *sigma = NULL;
1443 struct state_set **reverse = NULL;
1444 bitset *reverse_nonempty = NULL;
1445 struct state_set **partition = NULL;
1446 unsigned int *block = NULL;
1447 struct state_list **active = NULL;
1448 struct state_list_node **active2 = NULL;
1449 int *pending = NULL;
1450 bitset *pending2 = NULL;
1451 struct state_set *split = NULL;
1452 bitset *split2 = NULL;
1454 bitset *refine2 = NULL;
1455 struct state_set **splitblock = NULL;
1456 struct state_set *newstates = NULL;
1460 unsigned int nstates = 0;
1463 F(determinize(fa, NULL));
1465 /* Total automaton, nothing to do */
1466 if (fa->initial->tused == 1
1467 && fa->initial->trans[0].to == fa->initial
1468 && fa->initial->trans[0].min == UCHAR_MIN
1469 && fa->initial->trans[0].max == UCHAR_MAX)
1474 /* make arrays for numbered states and effective alphabet */
1475 states = state_set_init(-1, S_NONE);
1478 list_for_each(s, fa->initial) {
1479 F(state_set_push(states, s));
1481 nstates = states->used;
1483 sigma = start_points(fa, &nsigma);
1486 /* initialize data structures */
1488 /* An ss->used x nsigma matrix of lists of states */
1489 F(ALLOC_N(reverse, nstates * nsigma));
1490 reverse_nonempty = bitset_init(nstates * nsigma);
1491 E(reverse_nonempty == NULL);
1492 F(ALLOC_N(partition, nstates));
1493 F(ALLOC_N(block, nstates));
1495 F(ALLOC_N(active, nstates * nsigma));
1496 F(ALLOC_N(active2, nstates * nsigma));
1498 /* PENDING is an array of pairs of ints. The i'th pair is stored in
1499 * PENDING[2*i] and PENDING[2*i + 1]. There are NPENDING pairs in
1500 * PENDING at any time. SPENDING is the maximum number of pairs
1501 * allocated for PENDING.
1503 size_t npending = 0, spending = 0;
1504 pending2 = bitset_init(nstates * nsigma);
1505 E(pending2 == NULL);
1507 split = state_set_init(-1, S_NONE);
1508 split2 = bitset_init(nstates);
1509 E(split == NULL || split2 == NULL);
1511 F(ALLOC_N(refine, nstates));
1512 refine2 = bitset_init(nstates);
1515 F(ALLOC_N(splitblock, nstates));
1517 for (int q = 0; q < nstates; q++) {
1518 splitblock[q] = state_set_init(-1, S_NONE);
1519 partition[q] = state_set_init(-1, S_NONE);
1520 E(splitblock[q] == NULL || partition[q] == NULL);
1521 for (int x = 0; x < nsigma; x++) {
1522 reverse[INDEX(q, x)] = state_set_init(-1, S_NONE);
1523 E(reverse[INDEX(q, x)] == NULL);
1524 F(ALLOC_N(active[INDEX(q, x)], 1));
1528 /* find initial partition and reverse edges */
1529 for (int q = 0; q < nstates; q++) {
1530 struct state *qq = states->states[q];
1536 F(state_set_push(partition[j], qq));
1538 for (int x = 0; x < nsigma; x++) {
1540 struct state *p = step(qq, y);
1542 int pn = state_set_index(states, p);
1544 F(state_set_push(reverse[INDEX(pn, x)], qq));
1545 bitset_set(reverse_nonempty, INDEX(pn, x));
1549 /* initialize active sets */
1550 for (int j = 0; j <= 1; j++)
1551 for (int x = 0; x < nsigma; x++)
1552 for (int q = 0; q < partition[j]->used; q++) {
1553 struct state *qq = partition[j]->states[q];
1554 int qn = state_set_index(states, qq);
1555 if (bitset_get(reverse_nonempty, INDEX(qn, x))) {
1556 active2[INDEX(qn, x)] =
1557 state_list_add(active[INDEX(j, x)], qq);
1558 E(active2[INDEX(qn, x)] == NULL);
1562 /* initialize pending */
1563 F(ALLOC_N(pending, 2*nsigma));
1566 for (int x = 0; x < nsigma; x++) {
1567 int a0 = active[INDEX(0,x)]->size;
1568 int a1 = active[INDEX(1,x)]->size;
1576 bitset_set(pending2, INDEX(j, x));
1579 /* process pending until fixed point */
1581 while (npending-- > 0) {
1582 int p = pending[2*npending];
1583 int x = pending[2*npending+1];
1584 bitset_clr(pending2, INDEX(p, x));
1586 /* find states that need to be split off their blocks */
1587 struct state_list *sh = active[INDEX(p,x)];
1588 for (struct state_list_node *m = sh->first; m != NULL; m = m->next) {
1589 int q = state_set_index(states, m->state);
1590 struct state_set *rev = reverse[INDEX(q, x)];
1591 for (int r =0; r < rev->used; r++) {
1592 struct state *rs = rev->states[r];
1593 int s = state_set_index(states, rs);
1594 if (! bitset_get(split2, s)) {
1595 bitset_set(split2, s);
1596 F(state_set_push(split, rs));
1598 F(state_set_push(splitblock[j], rs));
1599 if (!bitset_get(refine2, j)) {
1600 bitset_set(refine2, j);
1607 for(int rr=0; rr < ref; rr++) {
1609 struct state_set *sp = splitblock[j];
1610 if (sp->used < partition[j]->used) {
1611 struct state_set *b1 = partition[j];
1612 struct state_set *b2 = partition[k];
1613 for (int s = 0; s < sp->used; s++) {
1614 state_set_remove(b1, sp->states[s]);
1615 F(state_set_push(b2, sp->states[s]));
1616 int snum = state_set_index(states, sp->states[s]);
1618 for (int c = 0; c < nsigma; c++) {
1619 struct state_list_node *sn = active2[INDEX(snum, c)];
1620 if (sn != NULL && sn->sl == active[INDEX(j,c)]) {
1621 state_list_remove(sn);
1622 active2[INDEX(snum, c)] =
1623 state_list_add(active[INDEX(k, c)],
1625 E(active2[INDEX(snum, c)] == NULL);
1630 for (int c = 0; c < nsigma; c++) {
1631 int aj = active[INDEX(j, c)]->size;
1632 int ak = active[INDEX(k, c)]->size;
1633 if (npending + 1 > spending) {
1635 F(REALLOC_N(pending, 2 * spending));
1637 pending[2*npending + 1] = c;
1638 if (!bitset_get(pending2, INDEX(j, c))
1639 && 0 < aj && aj <= ak) {
1640 bitset_set(pending2, INDEX(j, c));
1641 pending[2*npending] = j;
1643 bitset_set(pending2, INDEX(k, c));
1644 pending[2*npending] = k;
1650 for (int s = 0; s < sp->used; s++) {
1651 int snum = state_set_index(states, sp->states[s]);
1652 bitset_clr(split2, snum);
1654 bitset_clr(refine2, j);
1660 /* make a new state for each equivalence class, set initial state */
1661 newstates = state_set_init(k, S_NONE);
1662 E(newstates == NULL);
1663 F(ALLOC_N(nsnum, k));
1664 F(ALLOC_N(nsind, nstates));
1666 for (int n = 0; n < k; n++) {
1667 struct state *s = make_state();
1669 newstates->states[n] = s;
1670 struct state_set *partn = partition[n];
1671 for (int q=0; q < partn->used; q++) {
1672 struct state *qs = partn->states[q];
1673 int qnum = state_set_index(states, qs);
1674 if (qs == fa->initial)
1675 s->live = 1; /* Abuse live to flag the new intial state */
1676 nsnum[n] = qnum; /* select representative */
1677 nsind[qnum] = n; /* and point from partition to new state */
1681 /* build transitions and set acceptance */
1682 for (int n = 0; n < k; n++) {
1683 struct state *s = newstates->states[n];
1684 s->accept = states->states[nsnum[n]]->accept;
1685 for_each_trans(t, states->states[nsnum[n]]) {
1686 int toind = state_set_index(states, t->to);
1687 struct state *nto = newstates->states[nsind[toind]];
1688 F(add_new_trans(s, nto, t->min, t->max));
1692 /* Get rid of old states and transitions and turn NEWTSTATES into
1695 for (int n=0; n < k; n++)
1696 if (newstates->states[n]->live) {
1697 struct state *ini = newstates->states[n];
1698 newstates->states[n] = newstates->states[0];
1699 newstates->states[0] = ini;
1701 for (int n=0; n < k-1; n++)
1702 newstates->states[n]->next = newstates->states[n+1];
1703 fa->initial = newstates->states[0];
1711 state_set_free(states);
1713 bitset_free(reverse_nonempty);
1715 for (int i=0; i < nstates*nsigma; i++) {
1717 state_set_free(reverse[i]);
1719 state_list_free(active[i]);
1725 bitset_free(pending2);
1726 state_set_free(split);
1727 bitset_free(split2);
1729 bitset_free(refine2);
1730 for (int q=0; q < nstates; q++) {
1732 state_set_free(splitblock[q]);
1734 state_set_free(partition[q]);
1738 state_set_free(newstates);
1740 if (collect(fa) < 0)
1748 static int minimize_brzozowski(struct fa *fa) {
1749 struct state_set *set;
1751 /* Minimize using Brzozowski's algorithm */
1752 set = fa_reverse(fa);
1754 F(determinize(fa, set));
1755 state_set_free(set);
1757 set = fa_reverse(fa);
1759 F(determinize(fa, set));
1760 state_set_free(set);
1766 int fa_minimize(struct fa *fa) {
1774 if (fa_minimization_algorithm == FA_MIN_BRZOZOWSKI) {
1775 r = minimize_brzozowski(fa);
1777 r = minimize_hopcroft(fa);
1786 * Construction of fa
1789 static struct fa *fa_make_empty(void) {
1794 if (add_state(fa, 0) == NULL) {
1798 /* Even though, technically, this fa is both minimal and deterministic,
1799 * this function is also used to allocate new fa's which are then modified
1800 * further. Rather than risk erroneously marking such an fa as minimal
1801 * and deterministic, we do not do that here and take the minor hit if
1802 * that should ever need to be determined for an actual empty fa
1807 static struct fa *fa_make_epsilon(void) {
1808 struct fa *fa = fa_make_empty();
1810 fa->initial->accept = 1;
1811 fa->deterministic= 1;
1817 static struct fa *fa_make_char(uchar c) {
1818 struct fa *fa = fa_make_empty();
1822 struct state *s = fa->initial;
1823 struct state *t = add_state(fa, 1);
1829 r = add_new_trans(s, t, c, c);
1832 fa->deterministic = 1;
1840 struct fa *fa_make_basic(unsigned int basic) {
1843 if (basic == FA_EMPTY) {
1844 return fa_make_empty();
1845 } else if (basic == FA_EPSILON) {
1846 return fa_make_epsilon();
1847 } else if (basic == FA_TOTAL) {
1848 struct fa *fa = fa_make_epsilon();
1849 r = add_new_trans(fa->initial, fa->initial, UCHAR_MIN, UCHAR_MAX);
1859 int fa_is_basic(struct fa *fa, unsigned int basic) {
1860 if (basic == FA_EMPTY) {
1861 return ! fa->initial->accept && fa->initial->tused == 0;
1862 } else if (basic == FA_EPSILON) {
1863 return fa->initial->accept && fa->initial->tused == 0;
1864 } else if (basic == FA_TOTAL) {
1865 if (! fa->initial->accept)
1868 if (fa->initial->tused != 2)
1870 struct trans *t1 = fa->initial->trans;
1871 struct trans *t2 = fa->initial->trans + 1;
1872 if (t1->to != fa->initial || t2->to != fa->initial)
1874 if (t2->max != UCHAR_MAX) {
1876 t2 = fa->initial->trans;
1878 return (t1->min == UCHAR_MIN && t1->max == 'A' - 1 &&
1879 t2->min == 'Z' + 1 && t2->max == UCHAR_MAX);
1881 struct trans *t = fa->initial->trans;
1882 return fa->initial->tused == 1 &&
1883 t->to == fa->initial &&
1884 t->min == UCHAR_MIN && t->max == UCHAR_MAX;
1890 static struct fa *fa_clone(struct fa *fa) {
1891 struct fa *result = NULL;
1892 struct state_set *set = state_set_init(-1, S_DATA|S_SORTED);
1895 if (fa == NULL || set == NULL || ALLOC(result) < 0)
1898 result->deterministic = fa->deterministic;
1899 result->minimal = fa->minimal;
1900 result->nocase = fa->nocase;
1901 list_for_each(s, fa->initial) {
1902 int i = state_set_push(set, s);
1905 struct state *q = add_state(result, s->accept);
1910 q->reachable = s->reachable;
1912 for (int i=0; i < set->used; i++) {
1913 struct state *s = set->states[i];
1914 struct state *sc = set->data[i];
1915 for_each_trans(t, s) {
1916 int to = state_set_index(set, t->to);
1918 struct state *toc = set->data[to];
1919 r = add_new_trans(sc, toc, t->min, t->max);
1924 state_set_free(set);
1927 state_set_free(set);
1932 static int case_expand(struct fa *fa);
1934 /* Compute FA1|FA2 and set FA1 to that automaton. FA2 is freed */
1935 ATTRIBUTE_RETURN_CHECK
1936 static int union_in_place(struct fa *fa1, struct fa **fa2) {
1940 if (fa1->nocase != (*fa2)->nocase) {
1941 if (case_expand(fa1) < 0)
1943 if (case_expand(*fa2) < 0)
1947 s = add_state(fa1, 0);
1950 r = add_epsilon_trans(s, fa1->initial);
1953 r = add_epsilon_trans(s, (*fa2)->initial);
1957 fa1->deterministic = 0;
1961 set_initial(fa1, s);
1966 struct fa *fa_union(struct fa *fa1, struct fa *fa2) {
1967 fa1 = fa_clone(fa1);
1968 fa2 = fa_clone(fa2);
1969 if (fa1 == NULL || fa2 == NULL)
1972 F(union_in_place(fa1, &fa2));
1981 /* Concat FA2 onto FA1; frees FA2 and changes FA1 to FA1.FA2 */
1982 ATTRIBUTE_RETURN_CHECK
1983 static int concat_in_place(struct fa *fa1, struct fa **fa2) {
1986 if (fa1->nocase != (*fa2)->nocase) {
1987 if (case_expand(fa1) < 0)
1989 if (case_expand(*fa2) < 0)
1993 list_for_each(s, fa1->initial) {
1996 r = add_epsilon_trans(s, (*fa2)->initial);
2002 fa1->deterministic = 0;
2009 struct fa *fa_concat(struct fa *fa1, struct fa *fa2) {
2010 fa1 = fa_clone(fa1);
2011 fa2 = fa_clone(fa2);
2013 if (fa1 == NULL || fa2 == NULL)
2016 F(concat_in_place(fa1, &fa2));
2028 static struct fa *fa_make_char_set(bitset *cset, int negate) {
2029 struct fa *fa = fa_make_empty();
2033 struct state *s = fa->initial;
2034 struct state *t = add_state(fa, 1);
2041 while (from <= UCHAR_MAX) {
2042 while (from <= UCHAR_MAX && bitset_get(cset, from) == negate)
2044 if (from > UCHAR_MAX)
2047 while (to < UCHAR_MAX && (bitset_get(cset, to + 1) == !negate))
2049 r = add_new_trans(s, t, from, to);
2055 fa->deterministic = 1;
2064 static struct fa *fa_star(struct fa *fa) {
2072 s = add_state(fa, 1);
2076 r = add_epsilon_trans(s, fa->initial);
2081 list_for_each(p, fa->initial->next) {
2083 r = add_epsilon_trans(p, s);
2088 fa->deterministic = 0;
2098 /* Form the automaton (FA){N}; FA is not modified */
2099 static struct fa *repeat(struct fa *fa, int n) {
2101 return fa_make_epsilon();
2102 } else if (n == 1) {
2103 return fa_clone(fa);
2105 struct fa *cfa = fa_clone(fa);
2109 struct fa *tfa = fa_clone(fa);
2114 if (concat_in_place(cfa, &tfa) < 0) {
2125 struct fa *fa_iter(struct fa *fa, int min, int max) {
2131 if (min > max && max != -1) {
2132 return fa_make_empty();
2135 struct fa *sfa = fa_star(fa);
2140 struct fa *cfa = repeat(fa, min);
2145 if (concat_in_place(cfa, &sfa) < 0) {
2152 struct fa *cfa = NULL;
2155 cfa = repeat(fa, min);
2159 struct fa *cfa2 = fa_clone(fa);
2165 struct fa *cfa3 = fa_clone(fa);
2171 list_for_each(s, cfa3->initial) {
2173 r = add_epsilon_trans(s, cfa2->initial);
2182 fa_merge(cfa3, &cfa2);
2186 list_for_each(s, cfa->initial) {
2188 r = add_epsilon_trans(s, cfa2->initial);
2196 fa_merge(cfa, &cfa2);
2197 cfa->deterministic = 0;
2200 if (collect(cfa) < 0) {
2208 static void sort_transition_intervals(struct fa *fa) {
2209 list_for_each(s, fa->initial) {
2210 qsort(s->trans, s->tused, sizeof(*s->trans), trans_intv_cmp);
2214 struct fa *fa_intersect(struct fa *fa1, struct fa *fa2) {
2216 struct fa *fa = NULL;
2217 struct state_set *worklist = NULL;
2218 state_triple_hash *newstates = NULL;
2221 return fa_clone(fa1);
2223 if (fa_is_basic(fa1, FA_EMPTY) || fa_is_basic(fa2, FA_EMPTY))
2224 return fa_make_empty();
2226 if (fa1->nocase != fa2->nocase) {
2227 F(case_expand(fa1));
2228 F(case_expand(fa2));
2231 fa = fa_make_empty();
2232 worklist = state_set_init(-1, S_NONE);
2233 newstates = state_triple_init();
2234 if (fa == NULL || worklist == NULL || newstates == NULL)
2237 sort_transition_intervals(fa1);
2238 sort_transition_intervals(fa2);
2240 F(state_set_push(worklist, fa1->initial));
2241 F(state_set_push(worklist, fa2->initial));
2242 F(state_set_push(worklist, fa->initial));
2243 F(state_triple_push(newstates,
2244 fa1->initial, fa2->initial, fa->initial));
2245 while (worklist->used) {
2246 struct state *s = state_set_pop(worklist);
2247 struct state *p2 = state_set_pop(worklist);
2248 struct state *p1 = state_set_pop(worklist);
2249 s->accept = p1->accept && p2->accept;
2251 struct trans *t1 = p1->trans;
2252 struct trans *t2 = p2->trans;
2253 for (int n1 = 0, b2 = 0; n1 < p1->tused; n1++) {
2254 while (b2 < p2->tused && t2[b2].max < t1[n1].min)
2257 n2 < p2->tused && t1[n1].max >= t2[n2].min;
2259 if (t2[n2].max >= t1[n1].min) {
2260 struct state *r = state_triple_thd(newstates,
2261 t1[n1].to, t2[n2].to);
2263 r = add_state(fa, 0);
2265 F(state_set_push(worklist, t1[n1].to));
2266 F(state_set_push(worklist, t2[n2].to));
2267 F(state_set_push(worklist, r));
2268 F(state_triple_push(newstates,
2269 t1[n1].to, t2[n2].to, r));
2271 char min = t1[n1].min > t2[n2].min
2272 ? t1[n1].min : t2[n2].min;
2273 char max = t1[n1].max < t2[n2].max
2274 ? t1[n1].max : t2[n2].max;
2275 ret = add_new_trans(s, r, min, max);
2282 fa->deterministic = fa1->deterministic && fa2->deterministic;
2283 fa->nocase = fa1->nocase && fa2->nocase;
2285 state_set_free(worklist);
2286 state_triple_free(newstates);
2288 if (collect(fa) < 0) {
2301 int fa_contains(struct fa *fa1, struct fa *fa2) {
2303 struct state_set *worklist = NULL; /* List of pairs of states */
2304 struct state_set *visited = NULL; /* List of pairs of states */
2306 if (fa1 == NULL || fa2 == NULL)
2312 F(determinize(fa2, NULL));
2313 sort_transition_intervals(fa1);
2314 sort_transition_intervals(fa2);
2316 F(state_pair_push(&worklist, fa1->initial, fa2->initial));
2317 F(state_pair_push(&visited, fa1->initial, fa2->initial));
2318 while (worklist->used) {
2319 struct state *p1, *p2;
2321 p1 = state_set_pop_data(worklist, &v2);
2324 if (p1->accept && !p2->accept)
2327 struct trans *t1 = p1->trans;
2328 struct trans *t2 = p2->trans;
2329 for(int n1 = 0, b2 = 0; n1 < p1->tused; n1++) {
2330 while (b2 < p2->tused && t2[b2].max < t1[n1].min)
2332 int min1 = t1[n1].min, max1 = t1[n1].max;
2334 n2 < p2->tused && t1[n1].max >= t2[n2].min;
2336 if (t2[n2].min > min1)
2338 if (t2[n2].max < CHAR_MAX)
2339 min1 = t2[n2].max + 1;
2344 if (state_pair_find(visited, t1[n1].to, t2[n2].to) == -1) {
2345 F(state_pair_push(&worklist, t1[n1].to, t2[n2].to));
2346 F(state_pair_push(&visited, t1[n1].to, t2[n2].to));
2356 state_set_free(worklist);
2357 state_set_free(visited);
2364 static int add_crash_trans(struct fa *fa, struct state *s, struct state *crash,
2369 /* Never transition on anything in [A-Z] */
2370 if (min > 'Z' || max < 'A') {
2371 result = add_new_trans(s, crash, min, max);
2372 } else if (min >= 'A' && max <= 'Z') {
2374 } else if (max <= 'Z') {
2376 result = add_new_trans(s, crash, min, 'A' - 1);
2377 } else if (min >= 'A') {
2379 result = add_new_trans(s, crash, 'Z' + 1, max);
2381 /* min < 'A' && max > 'Z' */
2382 result = add_new_trans(s, crash, min, 'A' - 1);
2384 result = add_new_trans(s, crash, 'Z' + 1, max);
2387 result = add_new_trans(s, crash, min, max);
2392 static int totalize(struct fa *fa) {
2394 struct state *crash = add_state(fa, 0);
2397 F(mark_reachable(fa));
2398 sort_transition_intervals(fa);
2400 r = add_crash_trans(fa, crash, crash, UCHAR_MIN, UCHAR_MAX);
2404 list_for_each(s, fa->initial) {
2405 int next = UCHAR_MIN;
2406 int tused = s->tused;
2407 for (int i=0; i < tused; i++) {
2408 uchar min = s->trans[i].min, max = s->trans[i].max;
2410 r = add_crash_trans(fa, s, crash, next, min - 1);
2417 if (next <= UCHAR_MAX) {
2418 r = add_crash_trans(fa, s, crash, next, UCHAR_MAX);
2428 struct fa *fa_complement(struct fa *fa) {
2431 F(determinize(fa, NULL));
2433 list_for_each(s, fa->initial)
2434 s->accept = ! s->accept;
2443 struct fa *fa_minus(struct fa *fa1, struct fa *fa2) {
2444 if (fa1 == NULL || fa2 == NULL)
2447 if (fa_is_basic(fa1, FA_EMPTY) || fa1 == fa2)
2448 return fa_make_empty();
2449 if (fa_is_basic(fa2, FA_EMPTY))
2450 return fa_clone(fa1);
2452 struct fa *cfa2 = fa_complement(fa2);
2456 struct fa *result = fa_intersect(fa1, cfa2);
2461 static int accept_to_accept(struct fa *fa) {
2463 struct state *s = add_state(fa, 0);
2467 F(mark_reachable(fa));
2468 list_for_each(a, fa->initial) {
2469 if (a->accept && a->reachable) {
2470 r = add_epsilon_trans(s, a);
2477 fa->deterministic = fa->minimal = 0;
2483 struct fa *fa_overlap(struct fa *fa1, struct fa *fa2) {
2484 struct fa *fa = NULL, *eps = NULL, *result = NULL;
2485 struct state_set *map = NULL;
2487 if (fa1 == NULL || fa2 == NULL)
2490 fa1 = fa_clone(fa1);
2491 fa2 = fa_clone(fa2);
2492 if (fa1 == NULL || fa2 == NULL)
2495 if (determinize(fa1, NULL) < 0)
2497 if (accept_to_accept(fa1) < 0)
2500 map = fa_reverse(fa2);
2501 state_set_free(map);
2502 if (determinize(fa2, NULL) < 0)
2504 if (accept_to_accept(fa2) < 0)
2506 map = fa_reverse(fa2);
2507 state_set_free(map);
2508 if (determinize(fa2, NULL) < 0)
2511 fa = fa_intersect(fa1, fa2);
2515 eps = fa_make_epsilon();
2519 result = fa_minus(fa, eps);
2531 int fa_equals(struct fa *fa1, struct fa *fa2) {
2532 if (fa1 == NULL || fa2 == NULL)
2535 /* fa_contains(fa1, fa2) && fa_contains(fa2, fa1) with error checking */
2536 int c1 = fa_contains(fa1, fa2);
2541 return fa_contains(fa2, fa1);
2544 static unsigned int chr_score(char c) {
2547 } else if (isalnum(c)) {
2549 } else if (isprint(c)) {
2551 } else if (c == '\0') {
2558 static unsigned int str_score(const struct re_str *str) {
2559 unsigned int score = 0;
2560 for (int i=0; i < str->len; i++) {
2561 score += chr_score(str->rx[i]);
2566 /* See if we get a better string for DST by appending C to SRC. If DST is
2567 * NULL or empty, always use SRC + C
2569 static struct re_str *string_extend(struct re_str *dst,
2570 const struct re_str *src,
2574 || str_score(src) + chr_score(c) < str_score(dst)) {
2575 int slen = src->len;
2577 dst = make_re_str(NULL);
2580 if (REALLOC_N(dst->rx, slen+2) < 0) {
2584 memcpy(dst->rx, src->rx, slen);
2586 dst->rx[slen + 1] = '\0';
2587 dst->len = slen + 1;
2592 static char pick_char(struct trans *t) {
2593 for (int c = t->min; c <= t->max; c++)
2594 if (isalpha(c)) return c;
2595 for (int c = t->min; c <= t->max; c++)
2596 if (isalnum(c)) return c;
2597 for (int c = t->min; c <= t->max; c++)
2598 if (isprint(c)) return c;
2602 /* Generate an example string for FA. Traverse all transitions and record
2603 * at each turn the "best" word found for that state.
2605 int fa_example(struct fa *fa, char **example, size_t *example_len) {
2606 struct re_str *word = NULL;
2607 struct state_set *path = NULL, *worklist = NULL;
2608 struct re_str *str = NULL;
2613 /* Sort to avoid any ambiguity because of reordering of transitions */
2614 sort_transition_intervals(fa);
2616 /* Map from state to string */
2617 path = state_set_init(-1, S_DATA|S_SORTED);
2618 str = make_re_str("");
2619 if (path == NULL || str == NULL)
2621 F(state_set_push_data(path, fa->initial, str));
2624 /* List of states still to visit */
2625 worklist = state_set_init(-1, S_NONE);
2626 if (worklist == NULL)
2628 F(state_set_push(worklist, fa->initial));
2630 while (worklist->used) {
2631 struct state *s = state_set_pop(worklist);
2632 struct re_str *ps = state_set_find_data(path, s);
2633 for_each_trans(t, s) {
2634 char c = pick_char(t);
2635 int toind = state_set_index(path, t->to);
2637 struct re_str *w = string_extend(NULL, ps, c);
2639 F(state_set_push(worklist, t->to));
2640 F(state_set_push_data(path, t->to, w));
2642 path->data[toind] = string_extend(path->data[toind], ps, c);
2647 for (int i=0; i < path->used; i++) {
2648 struct state *p = path->states[i];
2649 struct re_str *ps = path->data[i];
2651 (word == NULL || word->len == 0
2652 || (ps->len > 0 && str_score(word) > str_score(ps)))) {
2659 state_set_free(path);
2660 state_set_free(worklist);
2662 *example_len = word->len;
2663 *example = word->rx;
2668 state_set_free(path);
2669 state_set_free(worklist);
2683 static int fa_enumerate_intl(struct state *s, struct enum_intl *ei, int pos) {
2686 if (ei->bsize <= pos + 1) {
2688 F(REALLOC_N(ei->buf, ei->bsize));
2691 ei->buf[pos] = '\0';
2692 for_each_trans(t, s) {
2696 for (int i=t->min; i <= t->max; i++) {
2698 if (t->to->accept) {
2699 if (ei->nwords >= ei->limit)
2701 ei->words[ei->nwords] = strdup(ei->buf);
2702 E(ei->words[ei->nwords] == NULL);
2705 result = fa_enumerate_intl(t->to, ei, pos+1);
2710 ei->buf[pos] = '\0';
2716 int fa_enumerate(struct fa *fa, int limit, char ***words) {
2717 struct enum_intl ei;
2722 ei.bsize = 8; /* Arbitrary initial size */
2724 F(ALLOC_N(ei.words, limit));
2725 F(ALLOC_N(ei.buf, ei.bsize));
2727 /* We use the visited bit to track which states we already visited
2728 * during the construction of a word to detect loops */
2729 list_for_each(s, fa->initial)
2731 fa->initial->visited = 1;
2732 if (fa->initial->accept) {
2733 if (ei.nwords >= limit)
2735 ei.words[0] = strdup("");
2736 E(ei.words[0] == NULL);
2739 result = fa_enumerate_intl(fa->initial, &ei, 0);
2750 for (int i=0; i < ei.nwords; i++)
2756 /* Expand the automaton FA by replacing every transition s(c) -> p from
2757 * state s to p on character c by two transitions s(X) -> r, r(c) -> p via
2759 * If ADD_MARKER is true, also add for each original state s a new a loop
2760 * s(Y) -> q and q(X) -> s through a new state q.
2762 * The end result is that an automaton accepting "a|ab" is turned into one
2763 * accepting "Xa|XaXb" if add_marker is false and "(YX)*Xa|(YX)*Xa(YX)*Xb"
2764 * when add_marker is true.
2766 * The returned automaton is a copy of FA, FA is not modified.
2768 static struct fa *expand_alphabet(struct fa *fa, int add_marker,
2776 F(mark_reachable(fa));
2777 list_for_each(p, fa->initial) {
2781 struct state *r = add_state(fa, 0);
2784 r->trans = p->trans;
2785 r->tused = p->tused;
2786 r->tsize = p->tsize;
2788 p->tused = p->tsize = 0;
2789 ret = add_new_trans(p, r, X, X);
2793 struct state *q = add_state(fa, 0);
2794 ret = add_new_trans(p, q, Y, Y);
2797 ret = add_new_trans(q, p, X, X);
2808 static bitset *alphabet(struct fa *fa) {
2809 bitset *bs = bitset_init(UCHAR_NUM);
2814 list_for_each(s, fa->initial) {
2815 for (int i=0; i < s->tused; i++) {
2816 for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2823 static bitset *last_chars(struct fa *fa) {
2824 bitset *bs = bitset_init(UCHAR_NUM);
2829 list_for_each(s, fa->initial) {
2830 for (int i=0; i < s->tused; i++) {
2831 if (s->trans[i].to->accept) {
2832 for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2840 static bitset *first_chars(struct fa *fa) {
2841 bitset *bs = bitset_init(UCHAR_NUM);
2842 struct state *s = fa->initial;
2847 for (int i=0; i < s->tused; i++) {
2848 for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2854 /* Return 1 if F1 and F2 are known to be unambiguously concatenable
2855 * according to simple heuristics. Return 0 if they need to be checked
2856 * further to decide ambiguity
2857 * Return -1 if an allocation fails
2859 static int is_splittable(struct fa *fa1, struct fa *fa2) {
2860 bitset *alpha1 = NULL;
2861 bitset *alpha2 = NULL;
2862 bitset *last1 = NULL;
2863 bitset *first2 = NULL;
2866 alpha2 = alphabet(fa2);
2867 last1 = last_chars(fa1);
2868 if (alpha2 == NULL || last1 == NULL)
2870 if (bitset_disjoint(last1, alpha2, UCHAR_NUM)) {
2875 alpha1 = alphabet(fa1);
2876 first2 = first_chars(fa2);
2877 if (alpha1 == NULL || first2 == NULL)
2879 if (bitset_disjoint(first2, alpha1, UCHAR_NUM)) {
2885 bitset_free(alpha1);
2886 bitset_free(alpha2);
2888 bitset_free(first2);
2892 /* This algorithm is due to Anders Moeller, and can be found in class
2893 * AutomatonOperations in dk.brics.grammar
2895 int fa_ambig_example(struct fa *fa1, struct fa *fa2,
2896 char **upv, size_t *upv_len,
2897 char **pv, char **v) {
2898 static const char X = '\001';
2899 static const char Y = '\002';
2900 char *result = NULL, *s = NULL;
2901 size_t result_len = 0;
2903 struct fa *mp = NULL, *ms = NULL, *sp = NULL, *ss = NULL, *amb = NULL;
2904 struct fa *a1f = NULL, *a1t = NULL, *a2f = NULL, *a2t = NULL;
2905 struct fa *b1 = NULL, *b2 = NULL;
2914 r = is_splittable(fa1, fa2);
2922 #define MPs Ys Xs "(" Xs "(.|\n))+"
2923 #define MSs Ys Xs "(" Xs "(.|\n))*"
2924 #define SPs "(" Xs "(.|\n))+" Ys Xs
2925 #define SSs "(" Xs "(.|\n))*" Ys Xs
2926 /* These could become static constants */
2927 r = fa_compile( MPs, strlen(MPs), &mp);
2928 if (r != REG_NOERROR)
2930 r = fa_compile( MSs, strlen(MSs), &ms);
2931 if (r != REG_NOERROR)
2933 r = fa_compile( SPs, strlen(SPs), &sp);
2934 if (r != REG_NOERROR)
2936 r = fa_compile( SSs, strlen(SSs), &ss);
2937 if (r != REG_NOERROR)
2946 a1f = expand_alphabet(fa1, 0, X, Y);
2947 a1t = expand_alphabet(fa1, 1, X, Y);
2948 a2f = expand_alphabet(fa2, 0, X, Y);
2949 a2t = expand_alphabet(fa2, 1, X, Y);
2950 if (a1f == NULL || a1t == NULL || a2f == NULL || a2t == NULL)
2953 /* Compute b1 = ((a1f . mp) & a1t) . ms */
2954 if (concat_in_place(a1f, &mp) < 0)
2956 b1 = fa_intersect(a1f, a1t);
2959 if (concat_in_place(b1, &ms) < 0)
2962 /* Compute b2 = ss . ((sp . a2f) & a2t) */
2963 if (concat_in_place(sp, &a2f) < 0)
2965 b2 = fa_intersect(sp, a2t);
2968 if (concat_in_place(ss, &b2) < 0)
2973 /* The automaton we are really interested in */
2974 amb = fa_intersect(b1, b2);
2979 r = fa_example(amb, &s, &s_len);
2985 result_len = (s_len-1)/2 - 1;
2986 F(ALLOC_N(result, result_len + 1));
2989 for (i=0; s[2*i] == X; i++) {
2990 assert((t - result) < result_len);
2997 for ( ;s[2*i] == X; i++) {
2998 assert((t - result) < result_len);
3005 for (; 2*i+1 < s_len; i++) {
3006 assert((t - result) < result_len);
3013 /* Clean up intermediate automata */
3029 *upv_len = result_len;
3038 * Construct an fa from a regular expression
3040 static struct fa *fa_from_re(struct re *re) {
3041 struct fa *result = NULL;
3046 result = fa_from_re(re->exp1);
3049 struct fa *fa2 = fa_from_re(re->exp2);
3052 if (union_in_place(result, &fa2) < 0)
3058 result = fa_from_re(re->exp1);
3061 struct fa *fa2 = fa_from_re(re->exp2);
3064 if (concat_in_place(result, &fa2) < 0)
3069 result = fa_make_char_set(re->cset, re->negate);
3073 struct fa *fa = fa_from_re(re->exp);
3076 result = fa_iter(fa, re->min, re->max);
3081 result = fa_make_epsilon();
3084 result = fa_make_char(re->c);
3096 static void free_re(struct re *re) {
3099 assert(re->ref == 0);
3101 if (re->type == UNION || re->type == CONCAT) {
3104 } else if (re->type == ITER) {
3106 } else if (re->type == CSET) {
3107 bitset_free(re->cset);
3112 int fa_compile(const char *regexp, size_t size, struct fa **fa) {
3113 struct re *re = NULL;
3114 struct re_parse parse;
3119 parse.rend = regexp + size;
3120 parse.error = REG_NOERROR;
3122 re = parse_regexp(&parse);
3126 *fa = fa_from_re(re);
3129 if (*fa == NULL || collect(*fa) < 0)
3130 parse.error = REG_ESPACE;
3134 /* We represent a case-insensitive FA by using only transitions on
3135 * lower-case letters.
3137 int fa_nocase(struct fa *fa) {
3138 if (fa == NULL || fa->nocase)
3142 list_for_each(s, fa->initial) {
3143 int tused = s->tused;
3144 /* For every transition on characters in [A-Z] add a corresponding
3145 * transition on [a-z]; remove any portion covering [A-Z] */
3146 for (int i=0; i < tused; i++) {
3147 struct trans *t = s->trans + i;
3148 int lc_min = t->min < 'A' ? 'a' : tolower(t->min);
3149 int lc_max = t->max > 'Z' ? 'z' : tolower(t->max);
3151 if (t->min > 'Z' || t->max < 'A')
3153 if (t->min >= 'A' && t->max <= 'Z') {
3154 t->min = tolower(t->min);
3155 t->max = tolower(t->max);
3156 } else if (t->max <= 'Z') {
3159 F(add_new_trans(s, t->to, lc_min, lc_max));
3160 } else if (t->min >= 'A') {
3163 F(add_new_trans(s, t->to, lc_min, lc_max));
3165 /* t->min < 'A' && t->max > 'Z' */
3166 F(add_new_trans(s, t->to, 'Z' + 1, t->max));
3167 s->trans[i].max = 'A' - 1;
3168 F(add_new_trans(s, s->trans[i].to, lc_min, lc_max));
3178 int fa_is_nocase(struct fa *fa) {
3182 /* If FA is case-insensitive, turn it into a case-sensitive automaton by
3183 * adding transitions on upper-case letters for each existing transition on
3184 * lower-case letters */
3185 static int case_expand(struct fa *fa) {
3190 list_for_each(s, fa->initial) {
3191 int tused = s->tused;
3192 /* For every transition on characters in [a-z] add a corresponding
3193 * transition on [A-Z] */
3194 for (int i=0; i < tused; i++) {
3195 struct trans *t = s->trans + i;
3196 int lc_min = t->min < 'a' ? 'A' : toupper(t->min);
3197 int lc_max = t->max > 'z' ? 'Z' : toupper(t->max);
3199 if (t->min > 'z' || t->max < 'a')
3201 F(add_new_trans(s, t->to, lc_min, lc_max));
3211 * Regular expression parser
3214 static struct re *make_re(enum re_type type) {
3216 if (make_ref(re) == 0)
3221 static struct re *make_re_rep(struct re *exp, int min, int max) {
3222 struct re *re = make_re(ITER);
3233 static struct re *make_re_binop(enum re_type type, struct re *exp1,
3235 struct re *re = make_re(type);
3246 static struct re *make_re_char(uchar c) {
3247 struct re *re = make_re(CHAR);
3253 static struct re *make_re_char_set(bool negate, bool no_ranges) {
3254 struct re *re = make_re(CSET);
3256 re->negate = negate;
3257 re->no_ranges = no_ranges;
3258 re->cset = bitset_init(UCHAR_NUM);
3259 if (re->cset == NULL)
3265 static bool more(struct re_parse *parse) {
3266 return parse->rx < parse->rend;
3269 static bool match(struct re_parse *parse, char m) {
3272 if (*parse->rx == m) {
3279 static bool peek(struct re_parse *parse, const char *chars) {
3280 return *parse->rx != '\0' && strchr(chars, *parse->rx) != NULL;
3283 static bool next(struct re_parse *parse, char *c) {
3291 static bool parse_char(struct re_parse *parse, int quoted, char *c) {
3294 if (quoted && *parse->rx == '\\') {
3296 return next(parse, c);
3298 return next(parse, c);
3302 static void add_re_char(struct re *re, uchar from, uchar to) {
3303 assert(re->type == CSET);
3304 for (unsigned int c = from; c <= to; c++)
3305 bitset_set(re->cset, c);
3308 static void parse_char_class(struct re_parse *parse, struct re *re) {
3309 if (! more(parse)) {
3310 parse->error = REG_EBRACK;
3314 parse_char(parse, 0, &from);
3316 if (match(parse, '-')) {
3317 if (! more(parse)) {
3318 parse->error = REG_EBRACK;
3321 if (peek(parse, "]")) {
3323 parse->error = REG_ERANGE;
3326 add_re_char(re, from, to);
3327 add_re_char(re, '-', '-');
3329 } else if (!parse_char(parse, 0, &to)) {
3330 parse->error = REG_ERANGE;
3335 parse->error = REG_ERANGE;
3338 add_re_char(re, from, to);
3343 static struct re *parse_simple_exp(struct re_parse *parse) {
3344 struct re *re = NULL;
3346 if (match(parse, '[')) {
3347 bool negate = match(parse, '^');
3348 re = make_re_char_set(negate, parse->no_ranges);
3350 parse->error = REG_ESPACE;
3353 parse_char_class(parse, re);
3354 if (parse->error != REG_NOERROR)
3356 while (more(parse) && ! peek(parse, "]")) {
3357 parse_char_class(parse, re);
3358 if (parse->error != REG_NOERROR)
3361 if (! match(parse, ']')) {
3362 parse->error = REG_EBRACK;
3365 } else if (match(parse, '(')) {
3366 if (match(parse, ')')) {
3367 re = make_re(EPSILON);
3369 parse->error = REG_ESPACE;
3373 re = parse_regexp(parse);
3376 if (! match(parse, ')')) {
3377 parse->error = REG_EPAREN;
3381 } else if (match(parse, '.')) {
3382 re = make_re_char_set(1, parse->no_ranges);
3384 parse->error = REG_ESPACE;
3387 add_re_char(re, '\n', '\n');
3388 } else if (more(parse)) {
3390 if (!parse_char(parse, 1, &c)) {
3391 parse->error = REG_EESCAPE;
3394 re = make_re_char(c);
3396 parse->error = REG_ESPACE;
3400 re = make_re(EPSILON);
3402 parse->error = REG_ESPACE;
3412 static int parse_int(struct re_parse *parse) {
3418 /* We need to be careful that strtoul will never access
3419 * memory beyond parse->rend
3421 for (lim = parse->rx; lim < parse->rend && *lim >= '0' && *lim <= '9';
3423 if (lim < parse->rend) {
3424 l = strtoul(parse->rx, &end, 10);
3425 used = end - parse->rx;
3427 char *s = strndup(parse->rx, parse->rend - parse->rx);
3429 parse->error = REG_ESPACE;
3432 l = strtoul(s, &end, 10);
3440 if ((l<0) || (l > INT_MAX)) {
3441 parse->error = REG_BADBR;
3447 static struct re *parse_repeated_exp(struct re_parse *parse) {
3448 struct re *re = parse_simple_exp(parse);
3451 if (match(parse, '?')) {
3452 re = make_re_rep(re, 0, 1);
3453 } else if (match(parse, '*')) {
3454 re = make_re_rep(re, 0, -1);
3455 } else if (match(parse, '+')) {
3456 re = make_re_rep(re, 1, -1);
3457 } else if (match(parse, '{')) {
3459 min = parse_int(parse);
3462 if (match(parse, ',')) {
3463 max = parse_int(parse);
3465 max = -1; /* If it's not an int, it means 'unbounded' */
3466 if (! match(parse, '}')) {
3467 parse->error = REG_EBRACE;
3470 } else if (match(parse, '}')) {
3473 parse->error = REG_EBRACE;
3476 if (min > max && max != -1) {
3477 parse->error = REG_BADBR;
3480 re = make_re_rep(re, min, max);
3483 parse->error = REG_ESPACE;
3490 static struct re *parse_concat_exp(struct re_parse *parse) {
3491 struct re *re = parse_repeated_exp(parse);
3495 if (more(parse) && ! peek(parse, ")|")) {
3496 struct re *re2 = parse_concat_exp(parse);
3499 re = make_re_binop(CONCAT, re, re2);
3501 parse->error = REG_ESPACE;
3512 static struct re *parse_regexp(struct re_parse *parse) {
3513 struct re *re = NULL;
3515 /* Something like (|r) */
3516 if (peek(parse, "|"))
3517 re = make_re(EPSILON);
3519 re = parse_concat_exp(parse);
3523 if (match(parse, '|')) {
3524 struct re *re2 = NULL;
3525 /* Something like (r|) */
3526 if (peek(parse, ")"))
3527 re2 = make_re(EPSILON);
3529 re2 = parse_regexp(parse);
3533 re = make_re_binop(UNION, re, re2);
3535 parse->error = REG_ESPACE;
3547 * Convert a STRUCT RE to a string. Code is complicated by the fact that
3548 * we try to be clever and avoid unneeded parens and concatenation with
3551 static int re_as_string(const struct re *re, struct re_str *str);
3553 static int re_binop_count(enum re_type type, const struct re *re) {
3554 assert(type == CONCAT || type == UNION);
3555 if (re->type == type) {
3556 return re_binop_count(type, re->exp1) + re_binop_count(type, re->exp2);
3562 static int re_binop_store(enum re_type type, const struct re *re,
3563 const struct re **list) {
3565 if (type == re->type) {
3566 pos = re_binop_store(type, re->exp1, list);
3567 pos += re_binop_store(type, re->exp2, list + pos);
3575 static int re_union_as_string(const struct re *re, struct re_str *str) {
3576 assert(re->type == UNION);
3579 const struct re **res = NULL;
3580 struct re_str *strings = NULL;
3583 nre = re_binop_count(re->type, re);
3584 r = ALLOC_N(res, nre);
3588 re_binop_store(re->type, re, res);
3590 r = ALLOC_N(strings, nre);
3595 for (int i=0; i < nre; i++) {
3596 if (re_as_string(res[i], strings + i) < 0)
3598 str->len += strings[i].len;
3602 r = re_str_alloc(str);
3607 for (int i=0; i < nre; i++) {
3610 memcpy(p, strings[i].rx, strings[i].len);
3611 p += strings[i].len;
3617 if (strings != NULL) {
3618 for (int i=0; i < nre; i++)
3619 release_re_str(strings + i);
3624 release_re_str(str);
3630 static int re_needs_parens_in_concat(const struct re *re) {
3631 return (re->type != CHAR && re->type != CSET && re->type != ITER);
3634 static int re_concat_as_string(const struct re *re, struct re_str *str) {
3635 assert(re->type == CONCAT);
3637 const struct re **res = NULL;
3638 struct re_str *strings = NULL;
3642 nre = re_binop_count(re->type, re);
3643 r = ALLOC_N(res, nre);
3646 re_binop_store(re->type, re, res);
3648 r = ALLOC_N(strings, nre);
3653 for (int i=0; i < nre; i++) {
3654 if (res[i]->type == EPSILON)
3656 if (re_as_string(res[i], strings + i) < 0)
3658 str->len += strings[i].len;
3659 if (re_needs_parens_in_concat(res[i]))
3663 r = re_str_alloc(str);
3668 for (int i=0; i < nre; i++) {
3669 if (res[i]->type == EPSILON)
3671 if (re_needs_parens_in_concat(res[i]))
3673 p = memcpy(p, strings[i].rx, strings[i].len);
3674 p += strings[i].len;
3675 if (re_needs_parens_in_concat(res[i]))
3682 if (strings != NULL) {
3683 for (int i=0; i < nre; i++)
3684 release_re_str(strings + i);
3689 release_re_str(str);
3694 static bool cset_contains(const struct re *cset, int c) {
3695 return bitset_get(cset->cset, c) != cset->negate;
3698 static int re_cset_as_string(const struct re *re, struct re_str *str) {
3699 const uchar rbrack = ']';
3700 const uchar dash = '-';
3701 const uchar nul = '\0';
3703 static const char *const empty_set = "[]";
3704 static const char *const total_set = "(.|\n)";
3705 static const char *const not_newline = ".";
3708 int from, to, negate;
3710 int incl_rbrack, incl_dash;
3713 str->len = strlen(empty_set);
3715 /* We can not include NUL explicitly in a CSET since we use ordinary
3716 NUL delimited strings to represent them. That means that we need to
3717 use negated representation if NUL is to be included (and vice versa)
3719 negate = cset_contains(re, nul);
3721 for (from = UCHAR_MIN;
3722 from <= UCHAR_MAX && cset_contains(re, from);
3724 if (from > UCHAR_MAX) {
3725 /* Special case: the set matches every character */
3726 str->rx = strdup(total_set);
3731 from <= UCHAR_MAX && cset_contains(re, from);
3733 if (from > UCHAR_MAX) {
3734 /* Special case: the set matches everything but '\n' */
3735 str->rx = strdup(not_newline);
3741 /* See if ']' and '-' will be explicitly included in the character set
3742 (INCL_RBRACK, INCL_DASH) As we loop over the character set, we reset
3743 these flags if they are in the set, but not mentioned explicitly
3745 incl_rbrack = cset_contains(re, rbrack) != negate;
3746 incl_dash = cset_contains(re, dash) != negate;
3748 if (re->no_ranges) {
3749 for (from = UCHAR_MIN; from <= UCHAR_MAX; from++)
3750 if (cset_contains(re, from) != negate)
3753 for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
3754 while (from <= UCHAR_MAX && cset_contains(re, from) == negate)
3756 if (from > UCHAR_MAX)
3759 to < UCHAR_MAX && (cset_contains(re, to+1) != negate);
3762 if (to == from && (from == rbrack || from == dash))
3764 if (from == rbrack || from == dash)
3766 if (to == rbrack || to == dash)
3769 len = (to == from) ? 1 : ((to == from + 1) ? 2 : 3);
3771 if (from < rbrack && rbrack < to)
3773 if (from < dash && dash < to)
3777 str->len += incl_rbrack + incl_dash;
3780 str->len += 1; /* For the ^ */
3782 r = re_str_alloc(str);
3793 if (re->no_ranges) {
3794 for (from = UCHAR_MIN; from <= UCHAR_MAX; from++) {
3795 if (from == rbrack || from == dash)
3797 if (cset_contains(re, from) != negate)
3801 for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
3802 while (from <= UCHAR_MAX && cset_contains(re, from) == negate)
3804 if (from > UCHAR_MAX)
3807 to < UCHAR_MAX && (cset_contains(re, to+1) != negate);
3810 if (to == from && (from == rbrack || from == dash))
3812 if (from == rbrack || from == dash)
3814 if (to == rbrack || to == dash)
3819 } else if (to == from + 1) {
3834 if (str->rx == NULL)
3836 str->len = strlen(str->rx);
3839 release_re_str(str);
3843 static int re_iter_as_string(const struct re *re, struct re_str *str) {
3844 const char *quant = NULL;
3848 if (re_as_string(re->exp, str) < 0)
3851 if (re->min == 0 && re->max == -1) {
3853 } else if (re->min == 1 && re->max == -1) {
3855 } else if (re->min == 0 && re->max == 1) {
3857 } else if (re->max == -1) {
3858 r = asprintf(&iter, "{%d,}", re->min);
3863 r = asprintf(&iter, "{%d,%d}", re->min, re->max);
3869 if (re->exp->type == CHAR || re->exp->type == CSET) {
3870 if (REALLOC_N(str->rx, str->len + strlen(quant) + 1) < 0)
3872 strcpy(str->rx + str->len, quant);
3873 str->len += strlen(quant);
3875 /* Format '(' + str->rx ')' + quant */
3876 if (REALLOC_N(str->rx, str->len + strlen(quant) + 1 + 2) < 0)
3878 memmove(str->rx + 1, str->rx, str->len);
3880 str->rx[str->len + 1] = ')';
3882 strcpy(str->rx + str->len, quant);
3883 str->len += strlen(quant);
3891 release_re_str(str);
3895 static int re_as_string(const struct re *re, struct re_str *str) {
3896 /* Characters that must be escaped */
3897 static const char * const special_chars = ".()[]{}*|+?\\^$";
3902 result = re_union_as_string(re, str);
3905 result = re_concat_as_string(re, str);
3908 result = re_cset_as_string(re, str);
3911 if (re->c == '\0' || strchr(special_chars, re->c) == NULL) {
3912 if (ALLOC_N(str->rx, 2) < 0)
3917 if (ALLOC_N(str->rx, 3) < 0)
3921 str->len = strlen(str->rx);
3925 result = re_iter_as_string(re, str);
3928 if (ALLOC_N(str->rx, 3) < 0)
3930 strcpy(str->rx, "()");
3931 str->len = strlen(str->rx);
3940 release_re_str(str);
3944 static int convert_trans_to_re(struct state *s) {
3945 struct re *re = NULL;
3947 struct trans *trans = NULL;
3953 qsort(s->trans, s->tused, sizeof(*s->trans), trans_to_cmp);
3954 for (int i = 0; i < s->tused - 1; i++) {
3955 if (s->trans[i].to != s->trans[i+1].to)
3958 r = ALLOC_N(trans, nto);
3962 struct state *to = s->trans[0].to;
3964 for_each_trans(t, s) {
3966 trans[tind].to = to;
3967 trans[tind].re = re;
3973 re = make_re_char_set(0, 0);
3977 add_re_char(re, t->min, t->max);
3979 assert(nto == tind + 1);
3980 trans[tind].to = to;
3981 trans[tind].re = re;
3983 /* Simplify CSETs with a single char to a CHAR */
3984 for (int t=0; t < nto; t++) {
3986 uchar chr = UCHAR_MIN;
3987 for (int c = 0; c < UCHAR_NUM; c++) {
3988 if (bitset_get(trans[t].re->cset, c)) {
3994 re_unref(trans[t].re);
3995 trans[t].re = make_re_char(chr);
3996 if (trans[t].re == NULL)
4002 s->tused = s->tsize = nto;
4007 for (int i=0; i < nto; i++)
4008 unref(trans[i].re, re);
4013 ATTRIBUTE_RETURN_CHECK
4014 static int add_new_re_trans(struct state *s1, struct state *s2,
4017 r = add_new_trans(s1, s2, 0, 0);
4020 last_trans(s1)->re = re;
4024 /* Add the regular expression R1 . LOOP* . R2 to the transition
4026 static int re_collapse_trans(struct state *s1, struct state *s2,
4027 struct re *r1, struct re *loop, struct re *r2) {
4028 struct re *re = NULL;
4030 if (loop->type != EPSILON) {
4031 loop = make_re_rep(ref(loop), 0, -1);
4036 if (r1->type == EPSILON) {
4037 if (loop->type == EPSILON) {
4040 re = make_re_binop(CONCAT, loop, ref(r2));
4043 if (loop->type == EPSILON) {
4044 if (r2->type == EPSILON) {
4047 re = make_re_binop(CONCAT, ref(r1), ref(r2));
4050 re = make_re_binop(CONCAT, ref(r1), loop);
4051 if (re != NULL && r2->type != EPSILON) {
4052 re = make_re_binop(CONCAT, re, ref(r2));
4059 struct trans *t = NULL;
4060 for (t = s1->trans; t <= last_trans(s1) && t->to != s2; t += 1);
4061 if (t > last_trans(s1)) {
4062 if (add_new_re_trans(s1, s2, re) < 0)
4065 if (t->re == NULL) {
4068 t->re = make_re_binop(UNION, re, t->re);
4075 // FIXME: make sure we don't leak loop
4079 static int convert_strings(struct fa *fa) {
4080 struct state_set *worklist = state_set_init(-1, S_NONE);
4083 E(worklist == NULL);
4085 /* Abuse hash to count indegree, reachable to mark visited states */
4086 list_for_each(s, fa->initial) {
4091 list_for_each(s, fa->initial) {
4092 for_each_trans(t, s) {
4097 for (struct state *s = fa->initial;
4099 s = state_set_pop(worklist)) {
4100 for (int i=0; i < s->tused; i++) {
4101 struct trans *t = s->trans + i;
4102 struct state *to = t->to;
4103 while (to->hash == 1 && to->tused == 1 && ! to->accept) {
4104 if (t->re == NULL) {
4105 t->re = to->trans->re;
4106 to->trans->re = NULL;
4108 t->re = make_re_binop(CONCAT, t->re, to->trans->re);
4112 t->to = to->trans->to;
4116 for (int j=0; j < s->tused; j++) {
4117 if (j != i && s->trans[j].to == to) {
4118 /* Combine transitions i and j; remove trans j */
4119 t->re = make_re_binop(UNION, t->re, s->trans[j].re);
4122 memmove(s->trans + j, s->trans + j + 1,
4123 sizeof(s->trans[j]) * (s->tused - j - 1));
4134 if (! to->reachable) {
4136 F(state_set_push(worklist, to));
4141 for (struct state *s = fa->initial; s->next != NULL; ) {
4142 if (s->next->hash == 0 && s->next->tused == 0) {
4143 struct state *del = s->next;
4144 s->next = del->next;
4154 state_set_free(worklist);
4158 /* Convert an FA to a regular expression.
4159 * The strategy is the following:
4160 * (1) For all states S1 and S2, convert the transitions between them
4161 * into one transition whose regexp is a CSET
4162 * (2) Add a new initial state INI and a new final state FIN to the automaton,
4163 * a transition from INI to the old initial state of FA, and a transition
4164 * from all accepting states of FA to FIN; the regexp on those transitions
4165 * matches only the empty word
4166 * (3) Eliminate states S (except for INI and FIN) one by one:
4167 * Let LOOP the regexp for the transition S -> S if it exists, epsilon
4169 * For all S1, S2 different from S with S1 -> S -> S2
4170 * Let R1 the regexp of S1 -> S
4171 * R2 the regexp of S -> S2
4172 * R3 the regexp S1 -> S2 (or epsilon if no such transition)
4173 * set the regexp on the transition S1 -> S2 to
4174 * R1 . (LOOP)* . R2 | R3
4175 * (4) The regexp for the whole FA can now be found as the regexp of
4176 * the transition INI -> FIN
4177 * (5) Convert that STRUCT RE to a string with RE_AS_STRING
4179 int fa_as_regexp(struct fa *fa, char **regexp, size_t *regexp_len) {
4181 struct state *fin = NULL, *ini = NULL;
4182 struct re *eps = NULL;
4190 eps = make_re(EPSILON);
4194 fin = add_state(fa,1);
4200 list_for_each(s, fa->initial) {
4201 r = convert_trans_to_re(s);
4204 if (s->accept && s != fin) {
4205 r = add_new_re_trans(s, fin, ref(eps));
4212 ini = add_state(fa, 0);
4216 r = add_new_re_trans(ini, fa->initial, ref(eps));
4219 set_initial(fa, ini);
4221 convert_strings(fa);
4223 list_for_each(s, fa->initial->next) {
4227 struct re *loop = eps;
4228 for_each_trans(t, s) {
4232 list_for_each(s1, fa->initial) {
4235 for (int t1 = 0; t1 < s1->tused; t1++) {
4236 if (s1->trans[t1].to == s) {
4237 for (int t = 0; t < s->tused; t++) {
4238 if (s->trans[t].to == s)
4240 r = re_collapse_trans(s1, s->trans[t].to,
4254 for_each_trans(t, fa->initial) {
4258 if (re_as_string(t->re, &str) < 0)
4261 *regexp_len = str.len;
4265 list_for_each(s, fa->initial) {
4266 for_each_trans(t, s) {
4279 static int re_restrict_alphabet(struct re *re, uchar from, uchar to) {
4286 r1 = re_restrict_alphabet(re->exp1, from, to);
4287 r2 = re_restrict_alphabet(re->exp2, from, to);
4288 result = (r1 != 0) ? r1 : r2;
4293 bitset_negate(re->cset, UCHAR_NUM);
4295 for (int i=from; i <= to; i++)
4296 bitset_clr(re->cset, i);
4299 if (from <= re->c && re->c <= to)
4303 result = re_restrict_alphabet(re->exp, from, to);
4315 int fa_restrict_alphabet(const char *regexp, size_t regexp_len,
4316 char **newregexp, size_t *newregexp_len,
4317 char from, char to) {
4319 struct re *re = NULL;
4320 struct re_parse parse;
4326 parse.rend = regexp + regexp_len;
4327 parse.error = REG_NOERROR;
4328 re = parse_regexp(&parse);
4329 if (parse.error != REG_NOERROR)
4332 result = re_restrict_alphabet(re, from, to);
4339 result = re_as_string(re, &str);
4340 *newregexp = str.rx;
4341 *newregexp_len = str.len;
4347 int fa_expand_char_ranges(const char *regexp, size_t regexp_len,
4348 char **newregexp, size_t *newregexp_len) {
4350 struct re *re = NULL;
4351 struct re_parse parse;
4357 parse.rend = regexp + regexp_len;
4358 parse.error = REG_NOERROR;
4359 parse.no_ranges = 1;
4360 re = parse_regexp(&parse);
4361 if (parse.error != REG_NOERROR)
4365 result = re_as_string(re, &str);
4366 *newregexp = str.rx;
4367 *newregexp_len = str.len;
4372 /* Expand regexp so that it is case-insensitive in a case-sensitive match.
4374 * Return 1 when a change was made, -1 when an allocation failed, and 0
4375 * when no change was made.
4377 static int re_case_expand(struct re *re) {
4378 int result = 0, r1, r2;
4383 r1 = re_case_expand(re->exp1);
4384 r2 = re_case_expand(re->exp2);
4385 result = (r1 != 0) ? r1 : r2;
4388 for (int c = 'A'; c <= 'Z'; c++)
4389 if (bitset_get(re->cset, c)) {
4391 bitset_set(re->cset, tolower(c));
4393 for (int c = 'a'; c <= 'z'; c++)
4394 if (bitset_get(re->cset, c)) {
4396 bitset_set(re->cset, toupper(c));
4400 if (isalpha(re->c)) {
4405 re->cset = bitset_init(UCHAR_NUM);
4406 if (re->cset == NULL)
4408 bitset_set(re->cset, tolower(c));
4409 bitset_set(re->cset, toupper(c));
4414 result = re_case_expand(re->exp);
4426 int fa_expand_nocase(const char *regexp, size_t regexp_len,
4427 char **newregexp, size_t *newregexp_len) {
4429 struct re *re = NULL;
4430 struct re_parse parse;
4436 parse.rend = regexp + regexp_len;
4437 parse.error = REG_NOERROR;
4438 re = parse_regexp(&parse);
4439 if (parse.error != REG_NOERROR)
4442 r = re_case_expand(re);
4450 result = re_as_string(re, &str);
4451 *newregexp = str.rx;
4452 *newregexp_len = str.len;
4454 *newregexp = strndup(regexp, regexp_len);
4455 *newregexp_len = regexp_len;
4456 result = (*newregexp == NULL) ? REG_ESPACE : REG_NOERROR;
4462 static void print_char(FILE *out, uchar c) {
4463 /* We escape '/' as '\\/' since dot chokes on bare slashes in labels;
4464 Also, a space ' ' is shown as '\s' */
4465 static const char *const escape_from = " \n\t\v\b\r\f\a/\0";
4466 static const char *const escape_to = "sntvbrfa/0";
4467 char *p = strchr(escape_from, c);
4469 int i = p - escape_from;
4470 fprintf(out, "\\\\%c", escape_to[i]);
4471 } else if (! isprint(c)) {
4472 fprintf(out, "\\\\0%03o", (unsigned char) c);
4473 } else if (c == '"') {
4474 fprintf(out, "\\\"");
4480 void fa_dot(FILE *out, struct fa *fa) {
4481 fprintf(out, "digraph {\n rankdir=LR;");
4482 list_for_each(s, fa->initial) {
4484 fprintf(out, "\"%p\" [shape=doublecircle];\n", s);
4486 fprintf(out, "\"%p\" [shape=circle];\n", s);
4489 fprintf(out, "%s -> \"%p\";\n", fa->deterministic ? "dfa" : "nfa",
4494 list_for_each(s, fa->initial) {
4495 for_each_trans(t, s) {
4496 fprintf(out, "\"%p\" -> \"%p\" [ label = \"", s, t->to);
4498 re_as_string(t->re, &str);
4499 for (int i=0; i < str.len; i++) {
4500 print_char(out, str.rx[i]);
4502 release_re_str(&str);
4504 print_char(out, t->min);
4505 if (t->min != t->max) {
4507 print_char(out, t->max);
4510 fprintf(out, "\" ];\n");
4513 fprintf(out, "}\n");
4518 * indent-tabs-mode: nil