2 * fa.c: finite automata
4 * Copyright (C) 2007-2016 David Lutterkort
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * Author: David Lutterkort <dlutter@redhat.com>
24 * This implementation follows closely the Java dk.brics.automaton package
25 * by Anders Moeller. The project's website is
26 * http://www.brics.dk/automaton/.
28 * It is by no means a complete reimplementation of that package; only a
29 * subset of what Automaton provides is implemented here.
43 #define UCHAR_NUM (UCHAR_MAX+1)
45 typedef unsigned char uchar;
47 #define E(cond) if (cond) goto error
48 #define F(expr) if ((expr) < 0) goto error
50 /* Which algorithm to use in FA_MINIMIZE */
51 int fa_minimization_algorithm = FA_MIN_HOPCROFT;
53 /* A finite automaton. INITIAL is both the initial state and the head of
54 * the list of all states. Any state that is allocated for this automaton
55 * is put on this list. Dead/unreachable states are cleared from the list
56 * at opportune times (e.g., during minimization) It's poor man's garbage
59 * Normally, transitions are on a character range [min..max]; in
60 * fa_as_regexp, we store regexps on transitions in the re field of each
61 * transition. TRANS_RE indicates that we do that, and is used by fa_dot to
62 * produce proper graphs of an automaton transitioning on regexps.
64 * For case-insensitive regexps (nocase == 1), the FA never has transitions
65 * on uppercase letters [A-Z], effectively removing these letters from the
69 struct state *initial;
70 int deterministic : 1;
72 unsigned int nocase : 1;
76 /* A state in a finite automaton. Transitions are never shared between
77 states so that we can free the list when we need to free the state */
81 unsigned int accept : 1;
82 unsigned int live : 1;
83 unsigned int reachable : 1;
84 unsigned int visited : 1; /* Used in various places to track progress */
85 /* Array of transitions. The TUSED first entries are used, the array
86 has allocated room for TSIZE */
92 /* A transition. If the input has a character in the inclusive
93 * range [MIN, MAX], move to TO
109 #define UINT_BIT (sizeof(unsigned int) * CHAR_BIT)
111 typedef unsigned int bitset;
113 static bitset *bitset_init(size_t nbits) {
115 if (ALLOC_N(bs, (nbits + UINT_BIT) / UINT_BIT) == -1)
120 static inline void bitset_clr(bitset *bs, unsigned int bit) {
121 bs[bit/UINT_BIT] &= ~(1 << (bit % UINT_BIT));
124 static inline void bitset_set(bitset *bs, unsigned int bit) {
125 bs[bit/UINT_BIT] |= 1 << (bit % UINT_BIT);
129 static inline int bitset_get(const bitset * const bs, unsigned int bit) {
130 return (bs[bit/UINT_BIT] >> bit % UINT_BIT) & 1;
134 static inline bool bitset_disjoint(const bitset *const bs1,
135 const bitset *const bs2,
137 for (int i=0; i < (nbits + UINT_BIT) / UINT_BIT; i++) {
144 static void bitset_free(bitset *bs) {
148 static void bitset_negate(bitset *bs, size_t nbits) {
149 for (int i=0; i < (nbits + UINT_BIT) / UINT_BIT; i++)
154 * Representation of a parsed regular expression. The regular expression is
155 * parsed according to the following grammar by PARSE_REGEXP:
157 * regexp: concat_exp ('|' regexp)?
158 * concat_exp: repeated_exp concat_exp?
159 * repeated_exp: simple_exp
163 * | simple_exp '{' INT (',' INT '}')?
164 * simple_exp: char_class
168 * char_class: '[' char_exp ']'
169 * | '[' '^' char_exp ']'
170 * char_exp: CHAR '-' CHAR
183 #define re_unref(r) unref(r, re)
189 struct { /* UNION, CONCAT */
196 /* Whether we can use character ranges when converting back
198 unsigned int no_ranges:1;
211 /* Used to keep state of the regex parse; RX may contain NUL's */
213 const char *rx; /* Current position in regex */
214 const char *rend; /* Last char of rx+ 1 */
215 int error; /* error code */
216 /* Whether new CSET's should have the no_ranges flag set */
217 unsigned int no_ranges:1;
220 /* String with explicit length, used when converting re to string */
226 static struct re *parse_regexp(struct re_parse *parse);
228 /* A map from a set of states to a state. */
229 typedef hash_t state_set_hash;
231 static hash_val_t ptr_hash(const void *p);
233 static const int array_initial_size = 4;
234 static const int array_max_expansion = 128;
236 enum state_set_init_flags {
245 unsigned int sorted : 1;
246 unsigned int with_data : 1;
247 struct state **states;
251 struct state_set_list {
252 struct state_set_list *next;
253 struct state_set *set;
256 /* Clean up FA by removing dead transitions and states and reducing
257 * transitions. Unreachable states are freed. The return value is the same
258 * as FA; returning it is merely a convenience.
260 * Only automata in this state should be returned to the user
262 ATTRIBUTE_RETURN_CHECK
263 static int collect(struct fa *fa);
265 ATTRIBUTE_RETURN_CHECK
266 static int totalize(struct fa *fa);
268 /* Print an FA into a (fairly) fixed file if the environment variable
269 * FA_DOT_DIR is set. This code is only used for debugging
271 #define FA_DOT_DIR "FA_DOT_DIR"
274 static void fa_dot_debug(struct fa *fa, const char *tag) {
276 static int count = 0;
281 if ((dot_dir = getenv(FA_DOT_DIR)) == NULL)
284 r = asprintf(&fname, "%s/fa_%02d_%s.dot", dot_dir, count++, tag);
288 fp = fopen(fname, "w");
294 static void print_char_set(struct re *set) {
301 for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
302 while (bitset_get(set->cset, from) == set->negate)
304 if (from > UCHAR_MAX)
307 to < UCHAR_MAX && (bitset_get(set->cset, to+1) == !set->negate);
312 printf("%c-%c", from, to);
319 static void print_re(struct re *re) {
340 printf("){%d,%d}", re->min, re->max);
354 static void release_re_str(struct re_str *str) {
361 static void free_re_str(struct re_str *str) {
368 static struct re_str *make_re_str(const char *s) {
375 str->len = strlen(s);
376 if (str->rx == NULL) {
384 static int re_str_alloc(struct re_str *str) {
385 return ALLOC_N(str->rx, str->len + 1);
392 static void free_trans(struct state *s) {
395 s->tused = s->tsize = 0;
398 static void gut(struct fa *fa) {
399 list_for_each(s, fa->initial) {
402 list_free(fa->initial);
406 void fa_free(struct fa *fa) {
413 static struct state *make_state(void) {
417 s->hash = ptr_hash(s);
421 static struct state *add_state(struct fa *fa, int accept) {
422 struct state *s = make_state();
425 if (fa->initial == NULL) {
428 list_cons(fa->initial->next, s);
434 #define last_trans(s) ((s)->trans + (s)->tused - 1)
436 #define for_each_trans(t, s) \
437 for (struct trans *t = (s)->trans; \
438 (t - (s)->trans) < (s)->tused; \
441 ATTRIBUTE_RETURN_CHECK
442 static int add_new_trans(struct state *from, struct state *to,
443 uchar min, uchar max) {
446 if (from->tused == from->tsize) {
447 size_t tsize = from->tsize;
449 tsize = array_initial_size;
450 else if (from->tsize > array_max_expansion)
451 tsize += array_max_expansion;
454 if (REALLOC_N(from->trans, tsize) == -1)
458 from->trans[from->tused].to = to;
459 from->trans[from->tused].min = min;
460 from->trans[from->tused].max = max;
465 ATTRIBUTE_RETURN_CHECK
466 static int add_epsilon_trans(struct state *from,
469 from->accept |= to->accept;
470 for_each_trans(t, to) {
471 r = add_new_trans(from, t->to, t->min, t->max);
478 static void set_initial(struct fa *fa, struct state *s) {
479 list_remove(s, fa->initial);
480 list_cons(fa->initial, s);
483 /* Merge automaton FA2 into FA1. This simply adds FA2's states to FA1
484 and then frees FA2. It has no influence on the language accepted by FA1
486 static void fa_merge(struct fa *fa1, struct fa **fa2) {
487 list_append(fa1->initial, (*fa2)->initial);
493 * Operations on STATE_SET
495 static void state_set_free(struct state_set *set) {
503 static int state_set_init_data(struct state_set *set) {
505 if (set->data == NULL)
506 return ALLOC_N(set->data, set->size);
511 /* Create a new STATE_SET with an initial size of SIZE. If SIZE is -1, use
512 the default size ARRAY_INITIAL_SIZE. FLAGS is a bitmask indicating
514 - S_SORTED: keep the states in the set sorted by their address, and use
515 binary search for lookups. If it is not set, entries are kept in the
516 order in which they are added and lookups scan linearly through the
518 - S_DATA: allocate the DATA array in the set, and keep its size in sync
519 with the size of the STATES array.
521 static struct state_set *state_set_init(int size, int flags) {
522 struct state_set *set = NULL;
526 set->sorted = (flags & S_SORTED) ? 1 : 0;
527 set->with_data = (flags & S_DATA) ? 1 : 0;
530 F(ALLOC_N(set->states, set->size));
532 F(state_set_init_data(set));
540 ATTRIBUTE_RETURN_CHECK
541 static int state_set_expand(struct state_set *set) {
543 set->size = array_initial_size;
544 else if (set->size > array_max_expansion)
545 set->size += array_max_expansion;
548 if (REALLOC_N(set->states, set->size) < 0)
551 if (REALLOC_N(set->data, set->size) < 0)
555 /* We do this to provoke a SEGV as early as possible */
561 /* Return the index where S belongs in SET->STATES to keep it sorted. S
562 may not be in SET->STATES. The returned index is in the interval [0
563 .. SET->USED], with the latter indicating that S is larger than all
564 values in SET->STATES
566 static int state_set_pos(const struct state_set *set, const struct state *s) {
567 int l = 0, h = set->used;
570 if (set->states[m] > s)
572 else if (set->states[m] < s)
580 ATTRIBUTE_RETURN_CHECK
581 static int state_set_push(struct state_set *set, struct state *s) {
582 if (set->size == set->used)
583 if (state_set_expand(set) < 0)
586 int p = state_set_pos(set, s);
587 if (set->size == set->used)
588 if (state_set_expand(set) < 0)
590 while (p < set->used && set->states[p] <= s)
593 memmove(set->states + p + 1, set->states + p,
594 sizeof(*set->states) * (set->used - p));
595 if (set->data != NULL)
596 memmove(set->data + p + 1, set->data + p,
597 sizeof(*set->data) * (set->used - p));
603 set->states[set->used++] = s;
604 return set->used - 1;
608 ATTRIBUTE_RETURN_CHECK
609 static int state_set_push_data(struct state_set *set, struct state *s,
611 int i = state_set_push(set, s);
618 static int state_set_index(const struct state_set *set,
619 const struct state *s) {
621 int p = state_set_pos(set, s);
622 return (p < set->used && set->states[p] == s) ? p : -1;
624 for (int i=0; i < set->used; i++) {
625 if (set->states[i] == s)
632 static void state_set_remove(struct state_set *set,
633 const struct state *s) {
635 int p = state_set_index(set, s);
637 memmove(set->states + p, set->states + p + 1,
638 sizeof(*set->states) * (set->used - p - 1));
639 if (set->data != NULL)
640 memmove(set->data + p, set->data + p + 1,
641 sizeof(*set->data) * (set->used - p - 1));
643 int p = state_set_index(set, s);
645 set->states[p] = set->states[--set->used];
650 /* Only add S if it's not in SET yet. Return 1 if S was added, 0 if it was
651 already in the set and -1 on error. */
652 ATTRIBUTE_RETURN_CHECK
653 static int state_set_add(struct state_set *set, struct state *s) {
655 int p = state_set_pos(set, s);
656 if (p < set->used && set->states[p] == s)
658 if (set->size == set->used)
659 if (state_set_expand(set) < 0)
661 while (p < set->used && set->states[p] <= s)
664 memmove(set->states + p + 1, set->states + p,
665 sizeof(*set->states) * (set->used - p));
666 if (set->data != NULL)
667 memmove(set->data + p + 1, set->data + p,
668 sizeof(*set->data) * (set->used - p));
673 if (state_set_index(set, s) >= 0)
675 if (state_set_push(set, s) < 0)
680 /* We do this to provoke a SEGV as early as possible */
686 static struct state *state_set_pop(struct state_set *set) {
687 struct state *s = NULL;
689 s = set->states[--set->used];
693 static struct state *state_set_pop_data(struct state_set *set, void **d) {
694 struct state *s = NULL;
695 s = state_set_pop(set);
696 *d = set->data[set->used];
700 static void *state_set_find_data(struct state_set *set, struct state *s) {
701 int i = state_set_index(set, s);
708 static int state_set_equal(const struct state_set *set1,
709 const struct state_set *set2) {
710 if (set1->used != set2->used)
712 if (set1->sorted && set2->sorted) {
713 for (int i = 0; i < set1->used; i++)
714 if (set1->states[i] != set2->states[i])
718 for (int i=0; i < set1->used; i++)
719 if (state_set_index(set2, set1->states[i]) == -1)
726 static void state_set_compact(struct state_set *set) {
727 while (set->used > 0 && set->states[set->used] == NULL)
729 for (int i=0; i < set->used; i++) {
730 if (set->states[i] == NULL) {
732 set->states[i] = set->states[set->used];
734 set->data[i] = set->data[set->used];
736 while (set->used > 0 && set->states[set->used] == NULL)
742 /* Add an entry (FST, SND) to SET. FST is stored in SET->STATES, and SND is
743 stored in SET->DATA at the same index.
745 ATTRIBUTE_RETURN_CHECK
746 static int state_pair_push(struct state_set **set,
747 struct state *fst, struct state *snd) {
749 *set = state_set_init(-1, S_DATA);
752 int i = state_set_push(*set, fst);
755 (*set)->data[i] = snd;
760 /* Return the index of the pair (FST, SND) in SET, or -1 if SET contains no
763 static int state_pair_find(struct state_set *set, struct state *fst,
765 for (int i=0; i < set->used; i++)
766 if (set->states[i] == fst && set->data[i] == snd)
771 /* Jenkins' hash for void* */
772 static hash_val_t ptr_hash(const void *p) {
774 char *c = (char *) &p;
775 for (int i=0; i < sizeof(p); i++) {
777 hash += (hash << 10);
781 hash ^= (hash >> 11);
782 hash += (hash << 15);
786 typedef hash_t state_triple_hash;
788 static hash_val_t pair_hash(const void *key) {
789 register struct state *const *pair = key;
790 return pair[0]->hash + pair[1]->hash;
793 static int pair_cmp(const void *key1, const void *key2) {
794 return memcmp(key1, key2, 2*sizeof(struct state *));
797 static void state_triple_node_free(hnode_t *node, ATTRIBUTE_UNUSED void *ctx) {
798 free((void *) hnode_getkey(node));
802 static state_triple_hash *state_triple_init(void) {
803 state_triple_hash *hash;
805 hash = hash_create(HASHCOUNT_T_MAX, pair_cmp, pair_hash);
808 hash_set_allocator(hash, NULL, state_triple_node_free, NULL);
812 ATTRIBUTE_RETURN_CHECK
813 static int state_triple_push(state_triple_hash *hash,
818 if (ALLOC_N(pair, 2) < 0)
822 return hash_alloc_insert(hash, pair, s3);
825 static struct state * state_triple_thd(state_triple_hash *hash,
828 struct state *pair[2];
832 node = hash_lookup(hash, pair);
833 return (node == NULL) ? NULL : (struct state *) hnode_get(node);
836 static void state_triple_free(state_triple_hash *hash) {
838 hash_free_nodes(hash);
846 ATTRIBUTE_RETURN_CHECK
847 static int mark_reachable(struct fa *fa) {
848 struct state_set *worklist = state_set_init(-1, S_NONE);
853 list_for_each(s, fa->initial) {
856 fa->initial->reachable = 1;
858 for (struct state *s = fa->initial;
860 s = state_set_pop(worklist)) {
861 for_each_trans(t, s) {
862 if (! t->to->reachable) {
863 t->to->reachable = 1;
864 F(state_set_push(worklist, t->to));
871 state_set_free(worklist);
875 /* Return all reachable states. As a sideeffect, all states have their
876 REACHABLE flag set appropriately.
878 static struct state_set *fa_states(struct fa *fa) {
879 struct state_set *visited = state_set_init(-1, S_NONE);
882 r = mark_reachable(fa);
883 E(visited == NULL || r < 0);
885 list_for_each(s, fa->initial) {
887 F(state_set_push(visited, s));
891 state_set_free(visited);
895 /* Return all reachable accepting states. As a sideeffect, all states have
896 their REACHABLE flag set appropriately.
898 static struct state_set *fa_accept_states(struct fa *fa) {
899 struct state_set *accept = state_set_init(-1, S_NONE);
904 r = mark_reachable(fa);
907 list_for_each(s, fa->initial) {
908 if (s->reachable && s->accept)
909 F(state_set_push(accept, s));
913 state_set_free(accept);
917 /* Mark all live states, i.e. states from which an accepting state can be
918 reached. All states have their REACHABLE and LIVE flags set
921 ATTRIBUTE_RETURN_CHECK
922 static int mark_live(struct fa *fa) {
925 F(mark_reachable(fa));
927 list_for_each(s, fa->initial) {
928 s->live = s->reachable && s->accept;
933 list_for_each(s, fa->initial) {
934 if (! s->live && s->reachable) {
935 for_each_trans(t, s) {
951 * Reverse an automaton in place. Change FA so that it accepts the
952 * language that is the reverse of the input automaton.
954 * Returns a list of the new initial states of the automaton. The list must
955 * be freed by the caller.
957 static struct state_set *fa_reverse(struct fa *fa) {
958 struct state_set *all = NULL;
959 struct state_set *accept = NULL;
964 accept = fa_accept_states(fa);
967 F(state_set_init_data(all));
969 /* Reverse all transitions */
971 F(ALLOC_N(tused, all->used));
972 for (int i=0; i < all->used; i++) {
973 all->data[i] = all->states[i]->trans;
974 tused[i] = all->states[i]->tused;
975 all->states[i]->trans = NULL;
976 all->states[i]->tsize = 0;
977 all->states[i]->tused = 0;
979 for (int i=0; i < all->used; i++) {
980 struct state *s = all->states[i];
981 struct trans *t = all->data[i];
983 for (int j=0; j < tused[i]; j++) {
984 r = add_new_trans(t[j].to, s, t[j].min, t[j].max);
992 /* Make new initial and final states */
993 struct state *s = add_state(fa, 0);
996 fa->initial->accept = 1;
998 for (int i=0; i < accept->used; i++) {
999 r = add_epsilon_trans(s, accept->states[i]);
1004 fa->deterministic = 0;
1006 state_set_free(all);
1009 state_set_free(all);
1010 state_set_free(accept);
1015 * Return a sorted array of all interval start points in FA. The returned
1016 * array is a string (null terminated)
1018 static uchar* start_points(struct fa *fa, int *npoints) {
1019 char pointset[UCHAR_NUM];
1020 uchar *points = NULL;
1022 F(mark_reachable(fa));
1023 MEMZERO(pointset, UCHAR_NUM);
1024 list_for_each(s, fa->initial) {
1028 for_each_trans(t, s) {
1029 pointset[t->min] = 1;
1030 if (t->max < UCHAR_MAX)
1031 pointset[t->max+1] = 1;
1036 for(int i=0; i < UCHAR_NUM; *npoints += pointset[i], i++);
1038 F(ALLOC_N(points, *npoints+1));
1039 for (int i=0, n=0; i < UCHAR_NUM; i++) {
1041 points[n++] = (uchar) i;
1051 * Operations on STATE_SET_HASH
1053 static int state_set_hash_contains(state_set_hash *smap,
1054 struct state_set *set) {
1055 return hash_lookup(smap, set) != NULL;
1059 * Find the set in SMAP that has the same states as SET. If the two are
1060 * different, i.e. they point to different memory locations, free SET and
1061 * return the set found in SMAP
1063 static struct state_set *state_set_hash_uniq(state_set_hash *smap,
1064 struct state_set *set) {
1065 hnode_t *node = hash_lookup(smap, set);
1066 const struct state_set *orig_set = hnode_getkey(node);
1067 if (orig_set != set) {
1068 state_set_free(set);
1070 return (struct state_set *) orig_set;
1073 static struct state *state_set_hash_get_state(state_set_hash *smap,
1074 struct state_set *set) {
1075 hnode_t *node = hash_lookup(smap, set);
1076 return (struct state *) hnode_get(node);
1079 static hash_val_t set_hash(const void *key) {
1080 hash_val_t hash = 0;
1081 const struct state_set *set = key;
1083 for (int i = 0; i < set->used; i++) {
1084 hash += set->states[i]->hash;
1089 static int set_cmp(const void *key1, const void *key2) {
1090 const struct state_set *set1 = key1;
1091 const struct state_set *set2 = key2;
1093 return state_set_equal(set1, set2) ? 0 : 1;
1096 static void set_destroy(hnode_t *node, ATTRIBUTE_UNUSED void *ctx) {
1097 struct state_set *set = (struct state_set *) hnode_getkey(node);
1098 state_set_free(set);
1102 ATTRIBUTE_RETURN_CHECK
1103 static int state_set_hash_add(state_set_hash **smap,
1104 struct state_set *set, struct fa *fa) {
1105 if (*smap == NULL) {
1106 *smap = hash_create(HASHCOUNT_T_MAX, set_cmp, set_hash);
1108 hash_set_allocator(*smap, NULL, set_destroy, NULL);
1110 struct state *s = add_state(fa, 0);
1112 F(hash_alloc_insert(*smap, set, s));
1118 static void state_set_hash_free(state_set_hash *smap,
1119 struct state_set *protect) {
1120 if (protect != NULL) {
1121 hnode_t *node = hash_lookup(smap, protect);
1122 hash_delete(smap, node);
1123 hnode_getkey(node) = NULL;
1124 set_destroy(node, NULL);
1126 hash_free_nodes(smap);
1130 static int state_set_list_add(struct state_set_list **list,
1131 struct state_set *set) {
1132 struct state_set_list *elt;
1136 list_cons(*list, elt);
1140 static struct state_set *state_set_list_pop(struct state_set_list **list) {
1141 struct state_set_list *elt = *list;
1142 struct state_set *set = elt->set;
1149 /* Compare transitions lexicographically by (to, min, reverse max) */
1150 static int trans_to_cmp(const void *v1, const void *v2) {
1151 const struct trans *t1 = v1;
1152 const struct trans *t2 = v2;
1154 if (t1->to != t2->to) {
1155 return (t1->to < t2->to) ? -1 : 1;
1157 if (t1->min < t2->min)
1159 if (t1->min > t2->min)
1161 if (t1->max > t2->max)
1163 return (t1->max < t2->max) ? 1 : 0;
1166 /* Compare transitions lexicographically by (min, reverse max, to) */
1167 static int trans_intv_cmp(const void *v1, const void *v2) {
1168 const struct trans *t1 = v1;
1169 const struct trans *t2 = v2;
1171 if (t1->min < t2->min)
1173 if (t1->min > t2->min)
1175 if (t1->max > t2->max)
1177 if (t1->max < t2->max)
1179 if (t1->to != t2->to) {
1180 return (t1->to < t2->to) ? -1 : 1;
1186 * Reduce an automaton by combining overlapping and adjacent edge intervals
1187 * with the same destination.
1189 static void reduce(struct fa *fa) {
1190 list_for_each(s, fa->initial) {
1194 qsort(s->trans, s->tused, sizeof(*s->trans), trans_to_cmp);
1196 struct trans *t = s->trans;
1197 while (j < s->tused) {
1198 if (t[i].to == t[j].to && t[j].min <= t[i].max + 1) {
1199 if (t[j].max > t[i].max)
1200 t[i].max = t[j].max;
1205 memmove(s->trans + i, s->trans + j,
1206 sizeof(*s->trans) * (s->tused - j));
1212 /* Shrink if we use less than half the allocated size */
1213 if (s->tsize > array_initial_size && 2*s->tused < s->tsize) {
1215 r = REALLOC_N(s->trans, s->tsize);
1217 s->tsize = s->tused;
1223 * Remove dead transitions from an FA; a transition is dead is it does not
1224 * lead to a live state. This also removes any states that are not
1225 * reachable any longer from FA->INITIAL.
1227 * Returns the same FA as a convenience
1230 static void collect_trans(struct fa *fa) {
1231 list_for_each(s, fa->initial) {
1236 while (i < s->tused) {
1237 if (! s->trans[i].to->live) {
1239 memmove(s->trans + i, s->trans + s->tused,
1249 static void collect_dead_states(struct fa *fa) {
1250 /* Remove all dead states and free their storage */
1251 for (struct state *s = fa->initial; s->next != NULL; ) {
1252 if (! s->next->live) {
1253 struct state *del = s->next;
1254 s->next = del->next;
1263 static int collect(struct fa *fa) {
1266 if (! fa->initial->live) {
1267 /* This automaton accepts nothing, make it the canonical
1270 list_for_each(s, fa->initial) {
1273 list_free(fa->initial->next);
1274 fa->deterministic = 1;
1277 collect_dead_states(fa);
1285 static void swap_initial(struct fa *fa) {
1286 struct state *s = fa->initial;
1287 if (s->next != NULL) {
1288 fa->initial = s->next;
1289 s->next = fa->initial->next;
1290 fa->initial->next = s;
1295 * Make a finite automaton deterministic using the given set of initial
1296 * states with the subset construction. This also eliminates dead states
1297 * and transitions and reduces and orders the transitions for each state
1299 static int determinize(struct fa *fa, struct state_set *ini) {
1301 int make_ini = (ini == NULL);
1302 const uchar *points = NULL;
1303 state_set_hash *newstate = NULL;
1304 struct state_set_list *worklist = NULL;
1307 if (fa->deterministic)
1310 points = start_points(fa, &npoints);
1313 ini = state_set_init(-1, S_NONE);
1314 if (ini == NULL || state_set_push(ini, fa->initial) < 0) {
1315 state_set_free(ini);
1320 F(state_set_list_add(&worklist, ini));
1321 F(state_set_hash_add(&newstate, ini, fa));
1322 // Make the new state the initial state
1324 while (worklist != NULL) {
1325 struct state_set *sset = state_set_list_pop(&worklist);
1326 struct state *r = state_set_hash_get_state(newstate, sset);
1327 for (int q=0; q < sset->used; q++) {
1328 r->accept |= sset->states[q]->accept;
1330 for (int n=0; n < npoints; n++) {
1331 struct state_set *pset = state_set_init(-1, S_SORTED);
1333 for(int q=0 ; q < sset->used; q++) {
1334 for_each_trans(t, sset->states[q]) {
1335 if (t->min <= points[n] && points[n] <= t->max) {
1336 F(state_set_add(pset, t->to));
1340 if (!state_set_hash_contains(newstate, pset)) {
1341 F(state_set_list_add(&worklist, pset));
1342 F(state_set_hash_add(&newstate, pset, fa));
1344 pset = state_set_hash_uniq(newstate, pset);
1346 struct state *q = state_set_hash_get_state(newstate, pset);
1347 uchar min = points[n];
1348 uchar max = UCHAR_MAX;
1350 max = points[n+1] - 1;
1351 if (add_new_trans(r, q, min, max) < 0)
1355 fa->deterministic = 1;
1359 state_set_hash_free(newstate, make_ini ? NULL : ini);
1360 free((void *) points);
1361 if (collect(fa) < 0)
1370 * Minimization. As a sideeffect of minimization, the transitions are
1371 * reduced and ordered.
1374 static struct state *step(struct state *s, uchar c) {
1375 for_each_trans(t, s) {
1376 if (t->min <= c && c <= t->max)
1383 struct state_list_node *first;
1384 struct state_list_node *last;
1388 struct state_list_node {
1389 struct state_list *sl;
1390 struct state_list_node *next;
1391 struct state_list_node *prev;
1392 struct state *state;
1395 static struct state_list_node *state_list_add(struct state_list *sl,
1397 struct state_list_node *n;
1405 if (sl->size++ == 0) {
1416 static void state_list_remove(struct state_list_node *n) {
1417 struct state_list *sl = n->sl;
1420 sl->first = n->next;
1422 n->prev->next = n->next;
1426 n->next->prev = n->prev;
1431 static void state_list_free(struct state_list *sl) {
1433 list_free(sl->first);
1437 /* The linear index of element (q,c) in an NSTATES * NSIGMA matrix */
1438 #define INDEX(q, c) (q * nsigma + c)
1440 static int minimize_hopcroft(struct fa *fa) {
1441 struct state_set *states = NULL;
1442 uchar *sigma = NULL;
1443 struct state_set **reverse = NULL;
1444 bitset *reverse_nonempty = NULL;
1445 struct state_set **partition = NULL;
1446 unsigned int *block = NULL;
1447 struct state_list **active = NULL;
1448 struct state_list_node **active2 = NULL;
1449 int *pending = NULL;
1450 bitset *pending2 = NULL;
1451 struct state_set *split = NULL;
1452 bitset *split2 = NULL;
1454 bitset *refine2 = NULL;
1455 struct state_set **splitblock = NULL;
1456 struct state_set *newstates = NULL;
1460 unsigned int nstates = 0;
1463 F(determinize(fa, NULL));
1465 /* Total automaton, nothing to do */
1466 if (fa->initial->tused == 1
1467 && fa->initial->trans[0].to == fa->initial
1468 && fa->initial->trans[0].min == UCHAR_MIN
1469 && fa->initial->trans[0].max == UCHAR_MAX)
1474 /* make arrays for numbered states and effective alphabet */
1475 states = state_set_init(-1, S_NONE);
1478 list_for_each(s, fa->initial) {
1479 F(state_set_push(states, s));
1481 nstates = states->used;
1483 sigma = start_points(fa, &nsigma);
1486 /* initialize data structures */
1488 /* An ss->used x nsigma matrix of lists of states */
1489 F(ALLOC_N(reverse, nstates * nsigma));
1490 reverse_nonempty = bitset_init(nstates * nsigma);
1491 E(reverse_nonempty == NULL);
1492 F(ALLOC_N(partition, nstates));
1493 F(ALLOC_N(block, nstates));
1495 F(ALLOC_N(active, nstates * nsigma));
1496 F(ALLOC_N(active2, nstates * nsigma));
1498 /* PENDING is an array of pairs of ints. The i'th pair is stored in
1499 * PENDING[2*i] and PENDING[2*i + 1]. There are NPENDING pairs in
1500 * PENDING at any time. SPENDING is the maximum number of pairs
1501 * allocated for PENDING.
1503 size_t npending = 0, spending = 0;
1504 pending2 = bitset_init(nstates * nsigma);
1505 E(pending2 == NULL);
1507 split = state_set_init(-1, S_NONE);
1508 split2 = bitset_init(nstates);
1509 E(split == NULL || split2 == NULL);
1511 F(ALLOC_N(refine, nstates));
1512 refine2 = bitset_init(nstates);
1515 F(ALLOC_N(splitblock, nstates));
1517 for (int q = 0; q < nstates; q++) {
1518 splitblock[q] = state_set_init(-1, S_NONE);
1519 partition[q] = state_set_init(-1, S_NONE);
1520 E(splitblock[q] == NULL || partition[q] == NULL);
1521 for (int x = 0; x < nsigma; x++) {
1522 reverse[INDEX(q, x)] = state_set_init(-1, S_NONE);
1523 E(reverse[INDEX(q, x)] == NULL);
1524 F(ALLOC_N(active[INDEX(q, x)], 1));
1528 /* find initial partition and reverse edges */
1529 for (int q = 0; q < nstates; q++) {
1530 struct state *qq = states->states[q];
1536 F(state_set_push(partition[j], qq));
1538 for (int x = 0; x < nsigma; x++) {
1540 struct state *p = step(qq, y);
1542 int pn = state_set_index(states, p);
1544 F(state_set_push(reverse[INDEX(pn, x)], qq));
1545 bitset_set(reverse_nonempty, INDEX(pn, x));
1549 /* initialize active sets */
1550 for (int j = 0; j <= 1; j++)
1551 for (int x = 0; x < nsigma; x++)
1552 for (int q = 0; q < partition[j]->used; q++) {
1553 struct state *qq = partition[j]->states[q];
1554 int qn = state_set_index(states, qq);
1555 if (bitset_get(reverse_nonempty, INDEX(qn, x))) {
1556 active2[INDEX(qn, x)] =
1557 state_list_add(active[INDEX(j, x)], qq);
1558 E(active2[INDEX(qn, x)] == NULL);
1562 /* initialize pending */
1563 F(ALLOC_N(pending, 2*nsigma));
1566 for (int x = 0; x < nsigma; x++) {
1567 int a0 = active[INDEX(0,x)]->size;
1568 int a1 = active[INDEX(1,x)]->size;
1576 bitset_set(pending2, INDEX(j, x));
1579 /* process pending until fixed point */
1581 while (npending-- > 0) {
1582 int p = pending[2*npending];
1583 int x = pending[2*npending+1];
1584 bitset_clr(pending2, INDEX(p, x));
1586 /* find states that need to be split off their blocks */
1587 struct state_list *sh = active[INDEX(p,x)];
1588 for (struct state_list_node *m = sh->first; m != NULL; m = m->next) {
1589 int q = state_set_index(states, m->state);
1590 struct state_set *rev = reverse[INDEX(q, x)];
1591 for (int r =0; r < rev->used; r++) {
1592 struct state *rs = rev->states[r];
1593 int s = state_set_index(states, rs);
1594 if (! bitset_get(split2, s)) {
1595 bitset_set(split2, s);
1596 F(state_set_push(split, rs));
1598 F(state_set_push(splitblock[j], rs));
1599 if (!bitset_get(refine2, j)) {
1600 bitset_set(refine2, j);
1607 for(int rr=0; rr < ref; rr++) {
1609 struct state_set *sp = splitblock[j];
1610 if (sp->used < partition[j]->used) {
1611 struct state_set *b1 = partition[j];
1612 struct state_set *b2 = partition[k];
1613 for (int s = 0; s < sp->used; s++) {
1614 state_set_remove(b1, sp->states[s]);
1615 F(state_set_push(b2, sp->states[s]));
1616 int snum = state_set_index(states, sp->states[s]);
1618 for (int c = 0; c < nsigma; c++) {
1619 struct state_list_node *sn = active2[INDEX(snum, c)];
1620 if (sn != NULL && sn->sl == active[INDEX(j,c)]) {
1621 state_list_remove(sn);
1622 active2[INDEX(snum, c)] =
1623 state_list_add(active[INDEX(k, c)],
1625 E(active2[INDEX(snum, c)] == NULL);
1630 for (int c = 0; c < nsigma; c++) {
1631 int aj = active[INDEX(j, c)]->size;
1632 int ak = active[INDEX(k, c)]->size;
1633 if (npending + 1 > spending) {
1635 F(REALLOC_N(pending, 2 * spending));
1637 pending[2*npending + 1] = c;
1638 if (!bitset_get(pending2, INDEX(j, c))
1639 && 0 < aj && aj <= ak) {
1640 bitset_set(pending2, INDEX(j, c));
1641 pending[2*npending] = j;
1643 bitset_set(pending2, INDEX(k, c));
1644 pending[2*npending] = k;
1650 for (int s = 0; s < sp->used; s++) {
1651 int snum = state_set_index(states, sp->states[s]);
1652 bitset_clr(split2, snum);
1654 bitset_clr(refine2, j);
1660 /* make a new state for each equivalence class, set initial state */
1661 newstates = state_set_init(k, S_NONE);
1662 E(newstates == NULL);
1663 F(ALLOC_N(nsnum, k));
1664 F(ALLOC_N(nsind, nstates));
1666 for (int n = 0; n < k; n++) {
1667 struct state *s = make_state();
1669 newstates->states[n] = s;
1670 struct state_set *partn = partition[n];
1671 for (int q=0; q < partn->used; q++) {
1672 struct state *qs = partn->states[q];
1673 int qnum = state_set_index(states, qs);
1674 if (qs == fa->initial)
1675 s->live = 1; /* Abuse live to flag the new intial state */
1676 nsnum[n] = qnum; /* select representative */
1677 nsind[qnum] = n; /* and point from partition to new state */
1681 /* build transitions and set acceptance */
1682 for (int n = 0; n < k; n++) {
1683 struct state *s = newstates->states[n];
1684 s->accept = states->states[nsnum[n]]->accept;
1685 for_each_trans(t, states->states[nsnum[n]]) {
1686 int toind = state_set_index(states, t->to);
1687 struct state *nto = newstates->states[nsind[toind]];
1688 F(add_new_trans(s, nto, t->min, t->max));
1692 /* Get rid of old states and transitions and turn NEWTSTATES into
1695 for (int n=0; n < k; n++)
1696 if (newstates->states[n]->live) {
1697 struct state *ini = newstates->states[n];
1698 newstates->states[n] = newstates->states[0];
1699 newstates->states[0] = ini;
1701 for (int n=0; n < k-1; n++)
1702 newstates->states[n]->next = newstates->states[n+1];
1703 fa->initial = newstates->states[0];
1711 state_set_free(states);
1713 bitset_free(reverse_nonempty);
1715 for (int i=0; i < nstates*nsigma; i++) {
1717 state_set_free(reverse[i]);
1719 state_list_free(active[i]);
1725 bitset_free(pending2);
1726 state_set_free(split);
1727 bitset_free(split2);
1729 bitset_free(refine2);
1730 for (int q=0; q < nstates; q++) {
1732 state_set_free(splitblock[q]);
1734 state_set_free(partition[q]);
1738 state_set_free(newstates);
1740 if (collect(fa) < 0)
1748 static int minimize_brzozowski(struct fa *fa) {
1749 struct state_set *set;
1751 /* Minimize using Brzozowski's algorithm */
1752 set = fa_reverse(fa);
1754 F(determinize(fa, set));
1755 state_set_free(set);
1757 set = fa_reverse(fa);
1759 F(determinize(fa, set));
1760 state_set_free(set);
1766 int fa_minimize(struct fa *fa) {
1774 if (fa_minimization_algorithm == FA_MIN_BRZOZOWSKI) {
1775 r = minimize_brzozowski(fa);
1777 r = minimize_hopcroft(fa);
1786 * Construction of fa
1789 static struct fa *fa_make_empty(void) {
1794 if (add_state(fa, 0) == NULL) {
1798 fa->deterministic = 1;
1803 static struct fa *fa_make_epsilon(void) {
1804 struct fa *fa = fa_make_empty();
1806 fa->initial->accept = 1;
1807 fa->deterministic= 1;
1813 static struct fa *fa_make_char(uchar c) {
1814 struct fa *fa = fa_make_empty();
1818 struct state *s = fa->initial;
1819 struct state *t = add_state(fa, 1);
1825 r = add_new_trans(s, t, c, c);
1828 fa->deterministic = 1;
1836 struct fa *fa_make_basic(unsigned int basic) {
1839 if (basic == FA_EMPTY) {
1840 return fa_make_empty();
1841 } else if (basic == FA_EPSILON) {
1842 return fa_make_epsilon();
1843 } else if (basic == FA_TOTAL) {
1844 struct fa *fa = fa_make_epsilon();
1845 r = add_new_trans(fa->initial, fa->initial, UCHAR_MIN, UCHAR_MAX);
1855 int fa_is_basic(struct fa *fa, unsigned int basic) {
1856 if (basic == FA_EMPTY) {
1857 return ! fa->initial->accept && fa->initial->tused == 0;
1858 } else if (basic == FA_EPSILON) {
1859 return fa->initial->accept && fa->initial->tused == 0;
1860 } else if (basic == FA_TOTAL) {
1861 if (! fa->initial->accept)
1864 if (fa->initial->tused != 2)
1866 struct trans *t1 = fa->initial->trans;
1867 struct trans *t2 = fa->initial->trans + 1;
1868 if (t1->to != fa->initial || t2->to != fa->initial)
1870 if (t2->max != UCHAR_MAX) {
1872 t2 = fa->initial->trans;
1874 return (t1->min == UCHAR_MIN && t1->max == 'A' - 1 &&
1875 t2->min == 'Z' + 1 && t2->max == UCHAR_MAX);
1877 struct trans *t = fa->initial->trans;
1878 return fa->initial->tused == 1 &&
1879 t->to == fa->initial &&
1880 t->min == UCHAR_MIN && t->max == UCHAR_MAX;
1886 static struct fa *fa_clone(struct fa *fa) {
1887 struct fa *result = NULL;
1888 struct state_set *set = state_set_init(-1, S_DATA|S_SORTED);
1891 if (fa == NULL || set == NULL || ALLOC(result) < 0)
1894 result->deterministic = fa->deterministic;
1895 result->minimal = fa->minimal;
1896 result->nocase = fa->nocase;
1897 list_for_each(s, fa->initial) {
1898 int i = state_set_push(set, s);
1901 struct state *q = add_state(result, s->accept);
1906 q->reachable = s->reachable;
1908 for (int i=0; i < set->used; i++) {
1909 struct state *s = set->states[i];
1910 struct state *sc = set->data[i];
1911 for_each_trans(t, s) {
1912 int to = state_set_index(set, t->to);
1914 struct state *toc = set->data[to];
1915 r = add_new_trans(sc, toc, t->min, t->max);
1920 state_set_free(set);
1923 state_set_free(set);
1928 static int case_expand(struct fa *fa);
1930 /* Compute FA1|FA2 and set FA1 to that automaton. FA2 is freed */
1931 ATTRIBUTE_RETURN_CHECK
1932 static int union_in_place(struct fa *fa1, struct fa **fa2) {
1936 if (fa1->nocase != (*fa2)->nocase) {
1937 if (case_expand(fa1) < 0)
1939 if (case_expand(*fa2) < 0)
1943 s = add_state(fa1, 0);
1946 r = add_epsilon_trans(s, fa1->initial);
1949 r = add_epsilon_trans(s, (*fa2)->initial);
1953 fa1->deterministic = 0;
1957 set_initial(fa1, s);
1962 struct fa *fa_union(struct fa *fa1, struct fa *fa2) {
1963 fa1 = fa_clone(fa1);
1964 fa2 = fa_clone(fa2);
1965 if (fa1 == NULL || fa2 == NULL)
1968 F(union_in_place(fa1, &fa2));
1977 /* Concat FA2 onto FA1; frees FA2 and changes FA1 to FA1.FA2 */
1978 ATTRIBUTE_RETURN_CHECK
1979 static int concat_in_place(struct fa *fa1, struct fa **fa2) {
1982 if (fa1->nocase != (*fa2)->nocase) {
1983 if (case_expand(fa1) < 0)
1985 if (case_expand(*fa2) < 0)
1989 list_for_each(s, fa1->initial) {
1992 r = add_epsilon_trans(s, (*fa2)->initial);
1998 fa1->deterministic = 0;
2005 struct fa *fa_concat(struct fa *fa1, struct fa *fa2) {
2006 fa1 = fa_clone(fa1);
2007 fa2 = fa_clone(fa2);
2009 if (fa1 == NULL || fa2 == NULL)
2012 F(concat_in_place(fa1, &fa2));
2024 static struct fa *fa_make_char_set(bitset *cset, int negate) {
2025 struct fa *fa = fa_make_empty();
2029 struct state *s = fa->initial;
2030 struct state *t = add_state(fa, 1);
2037 while (from <= UCHAR_MAX) {
2038 while (from <= UCHAR_MAX && bitset_get(cset, from) == negate)
2040 if (from > UCHAR_MAX)
2043 while (to < UCHAR_MAX && (bitset_get(cset, to + 1) == !negate))
2045 r = add_new_trans(s, t, from, to);
2051 fa->deterministic = 1;
2060 static struct fa *fa_star(struct fa *fa) {
2068 s = add_state(fa, 1);
2072 r = add_epsilon_trans(s, fa->initial);
2077 list_for_each(p, fa->initial->next) {
2079 r = add_epsilon_trans(p, s);
2084 fa->deterministic = 0;
2094 /* Form the automaton (FA){N}; FA is not modified */
2095 static struct fa *repeat(struct fa *fa, int n) {
2097 return fa_make_epsilon();
2098 } else if (n == 1) {
2099 return fa_clone(fa);
2101 struct fa *cfa = fa_clone(fa);
2105 struct fa *tfa = fa_clone(fa);
2110 if (concat_in_place(cfa, &tfa) < 0) {
2121 struct fa *fa_iter(struct fa *fa, int min, int max) {
2127 if (min > max && max != -1) {
2128 return fa_make_empty();
2131 struct fa *sfa = fa_star(fa);
2136 struct fa *cfa = repeat(fa, min);
2141 if (concat_in_place(cfa, &sfa) < 0) {
2148 struct fa *cfa = NULL;
2151 cfa = repeat(fa, min);
2155 struct fa *cfa2 = fa_clone(fa);
2161 struct fa *cfa3 = fa_clone(fa);
2167 list_for_each(s, cfa3->initial) {
2169 r = add_epsilon_trans(s, cfa2->initial);
2178 fa_merge(cfa3, &cfa2);
2182 list_for_each(s, cfa->initial) {
2184 r = add_epsilon_trans(s, cfa2->initial);
2192 fa_merge(cfa, &cfa2);
2193 cfa->deterministic = 0;
2196 if (collect(cfa) < 0) {
2204 static void sort_transition_intervals(struct fa *fa) {
2205 list_for_each(s, fa->initial) {
2206 qsort(s->trans, s->tused, sizeof(*s->trans), trans_intv_cmp);
2210 struct fa *fa_intersect(struct fa *fa1, struct fa *fa2) {
2212 struct fa *fa = NULL;
2213 struct state_set *worklist = NULL;
2214 state_triple_hash *newstates = NULL;
2217 return fa_clone(fa1);
2219 if (fa_is_basic(fa1, FA_EMPTY) || fa_is_basic(fa2, FA_EMPTY))
2220 return fa_make_empty();
2222 if (fa1->nocase != fa2->nocase) {
2223 F(case_expand(fa1));
2224 F(case_expand(fa2));
2227 fa = fa_make_empty();
2228 worklist = state_set_init(-1, S_NONE);
2229 newstates = state_triple_init();
2230 if (fa == NULL || worklist == NULL || newstates == NULL)
2233 sort_transition_intervals(fa1);
2234 sort_transition_intervals(fa2);
2236 F(state_set_push(worklist, fa1->initial));
2237 F(state_set_push(worklist, fa2->initial));
2238 F(state_set_push(worklist, fa->initial));
2239 F(state_triple_push(newstates,
2240 fa1->initial, fa2->initial, fa->initial));
2241 while (worklist->used) {
2242 struct state *s = state_set_pop(worklist);
2243 struct state *p2 = state_set_pop(worklist);
2244 struct state *p1 = state_set_pop(worklist);
2245 s->accept = p1->accept && p2->accept;
2247 struct trans *t1 = p1->trans;
2248 struct trans *t2 = p2->trans;
2249 for (int n1 = 0, b2 = 0; n1 < p1->tused; n1++) {
2250 while (b2 < p2->tused && t2[b2].max < t1[n1].min)
2253 n2 < p2->tused && t1[n1].max >= t2[n2].min;
2255 if (t2[n2].max >= t1[n1].min) {
2256 struct state *r = state_triple_thd(newstates,
2257 t1[n1].to, t2[n2].to);
2259 r = add_state(fa, 0);
2261 F(state_set_push(worklist, t1[n1].to));
2262 F(state_set_push(worklist, t2[n2].to));
2263 F(state_set_push(worklist, r));
2264 F(state_triple_push(newstates,
2265 t1[n1].to, t2[n2].to, r));
2267 char min = t1[n1].min > t2[n2].min
2268 ? t1[n1].min : t2[n2].min;
2269 char max = t1[n1].max < t2[n2].max
2270 ? t1[n1].max : t2[n2].max;
2271 ret = add_new_trans(s, r, min, max);
2278 fa->deterministic = fa1->deterministic && fa2->deterministic;
2279 fa->nocase = fa1->nocase && fa2->nocase;
2281 state_set_free(worklist);
2282 state_triple_free(newstates);
2284 if (collect(fa) < 0) {
2297 int fa_contains(struct fa *fa1, struct fa *fa2) {
2299 struct state_set *worklist = NULL; /* List of pairs of states */
2300 struct state_set *visited = NULL; /* List of pairs of states */
2302 if (fa1 == NULL || fa2 == NULL)
2308 F(determinize(fa2, NULL));
2309 sort_transition_intervals(fa1);
2310 sort_transition_intervals(fa2);
2312 F(state_pair_push(&worklist, fa1->initial, fa2->initial));
2313 F(state_pair_push(&visited, fa1->initial, fa2->initial));
2314 while (worklist->used) {
2315 struct state *p1, *p2;
2317 p1 = state_set_pop_data(worklist, &v2);
2320 if (p1->accept && !p2->accept)
2323 struct trans *t1 = p1->trans;
2324 struct trans *t2 = p2->trans;
2325 for(int n1 = 0, b2 = 0; n1 < p1->tused; n1++) {
2326 while (b2 < p2->tused && t2[b2].max < t1[n1].min)
2328 int min1 = t1[n1].min, max1 = t1[n1].max;
2330 n2 < p2->tused && t1[n1].max >= t2[n2].min;
2332 if (t2[n2].min > min1)
2334 if (t2[n2].max < CHAR_MAX)
2335 min1 = t2[n2].max + 1;
2340 if (state_pair_find(visited, t1[n1].to, t2[n2].to) == -1) {
2341 F(state_pair_push(&worklist, t1[n1].to, t2[n2].to));
2342 F(state_pair_push(&visited, t1[n1].to, t2[n2].to));
2352 state_set_free(worklist);
2353 state_set_free(visited);
2360 static int add_crash_trans(struct fa *fa, struct state *s, struct state *crash,
2365 /* Never transition on anything in [A-Z] */
2366 if (min > 'Z' || max < 'A') {
2367 result = add_new_trans(s, crash, min, max);
2368 } else if (min >= 'A' && max <= 'Z') {
2370 } else if (max <= 'Z') {
2372 result = add_new_trans(s, crash, min, 'A' - 1);
2373 } else if (min >= 'A') {
2375 result = add_new_trans(s, crash, 'Z' + 1, max);
2377 /* min < 'A' && max > 'Z' */
2378 result = add_new_trans(s, crash, min, 'A' - 1);
2380 result = add_new_trans(s, crash, 'Z' + 1, max);
2383 result = add_new_trans(s, crash, min, max);
2388 static int totalize(struct fa *fa) {
2390 struct state *crash = add_state(fa, 0);
2393 F(mark_reachable(fa));
2394 sort_transition_intervals(fa);
2396 r = add_crash_trans(fa, crash, crash, UCHAR_MIN, UCHAR_MAX);
2400 list_for_each(s, fa->initial) {
2401 int next = UCHAR_MIN;
2402 int tused = s->tused;
2403 for (int i=0; i < tused; i++) {
2404 uchar min = s->trans[i].min, max = s->trans[i].max;
2406 r = add_crash_trans(fa, s, crash, next, min - 1);
2413 if (next <= UCHAR_MAX) {
2414 r = add_crash_trans(fa, s, crash, next, UCHAR_MAX);
2424 struct fa *fa_complement(struct fa *fa) {
2427 F(determinize(fa, NULL));
2429 list_for_each(s, fa->initial)
2430 s->accept = ! s->accept;
2439 struct fa *fa_minus(struct fa *fa1, struct fa *fa2) {
2440 if (fa1 == NULL || fa2 == NULL)
2443 if (fa_is_basic(fa1, FA_EMPTY) || fa1 == fa2)
2444 return fa_make_empty();
2445 if (fa_is_basic(fa2, FA_EMPTY))
2446 return fa_clone(fa1);
2448 struct fa *cfa2 = fa_complement(fa2);
2452 struct fa *result = fa_intersect(fa1, cfa2);
2457 static int accept_to_accept(struct fa *fa) {
2459 struct state *s = add_state(fa, 0);
2463 F(mark_reachable(fa));
2464 list_for_each(a, fa->initial) {
2465 if (a->accept && a->reachable) {
2466 r = add_epsilon_trans(s, a);
2473 fa->deterministic = fa->minimal = 0;
2479 struct fa *fa_overlap(struct fa *fa1, struct fa *fa2) {
2480 struct fa *fa = NULL, *eps = NULL, *result = NULL;
2481 struct state_set *map = NULL;
2483 if (fa1 == NULL || fa2 == NULL)
2486 fa1 = fa_clone(fa1);
2487 fa2 = fa_clone(fa2);
2488 if (fa1 == NULL || fa2 == NULL)
2491 if (determinize(fa1, NULL) < 0)
2493 if (accept_to_accept(fa1) < 0)
2496 map = fa_reverse(fa2);
2497 state_set_free(map);
2498 if (determinize(fa2, NULL) < 0)
2500 if (accept_to_accept(fa2) < 0)
2502 map = fa_reverse(fa2);
2503 state_set_free(map);
2504 if (determinize(fa2, NULL) < 0)
2507 fa = fa_intersect(fa1, fa2);
2511 eps = fa_make_epsilon();
2515 result = fa_minus(fa, eps);
2527 int fa_equals(struct fa *fa1, struct fa *fa2) {
2528 if (fa1 == NULL || fa2 == NULL)
2531 /* fa_contains(fa1, fa2) && fa_contains(fa2, fa1) with error checking */
2532 int c1 = fa_contains(fa1, fa2);
2537 return fa_contains(fa2, fa1);
2540 static unsigned int chr_score(char c) {
2543 } else if (isalnum(c)) {
2545 } else if (isprint(c)) {
2547 } else if (c == '\0') {
2554 static unsigned int str_score(const struct re_str *str) {
2555 unsigned int score = 0;
2556 for (int i=0; i < str->len; i++) {
2557 score += chr_score(str->rx[i]);
2562 /* See if we get a better string for DST by appending C to SRC. If DST is
2563 * NULL or empty, always use SRC + C
2565 static struct re_str *string_extend(struct re_str *dst,
2566 const struct re_str *src,
2570 || str_score(src) + chr_score(c) < str_score(dst)) {
2571 int slen = src->len;
2573 dst = make_re_str(NULL);
2576 if (REALLOC_N(dst->rx, slen+2) < 0) {
2580 memcpy(dst->rx, src->rx, slen);
2582 dst->rx[slen + 1] = '\0';
2583 dst->len = slen + 1;
2588 static char pick_char(struct trans *t) {
2589 for (int c = t->min; c <= t->max; c++)
2590 if (isalpha(c)) return c;
2591 for (int c = t->min; c <= t->max; c++)
2592 if (isalnum(c)) return c;
2593 for (int c = t->min; c <= t->max; c++)
2594 if (isprint(c)) return c;
2598 /* Generate an example string for FA. Traverse all transitions and record
2599 * at each turn the "best" word found for that state.
2601 int fa_example(struct fa *fa, char **example, size_t *example_len) {
2602 struct re_str *word = NULL;
2603 struct state_set *path = NULL, *worklist = NULL;
2604 struct re_str *str = NULL;
2609 /* Sort to avoid any ambiguity because of reordering of transitions */
2610 sort_transition_intervals(fa);
2612 /* Map from state to string */
2613 path = state_set_init(-1, S_DATA|S_SORTED);
2614 str = make_re_str("");
2615 if (path == NULL || str == NULL)
2617 F(state_set_push_data(path, fa->initial, str));
2620 /* List of states still to visit */
2621 worklist = state_set_init(-1, S_NONE);
2622 if (worklist == NULL)
2624 F(state_set_push(worklist, fa->initial));
2626 while (worklist->used) {
2627 struct state *s = state_set_pop(worklist);
2628 struct re_str *ps = state_set_find_data(path, s);
2629 for_each_trans(t, s) {
2630 char c = pick_char(t);
2631 int toind = state_set_index(path, t->to);
2633 struct re_str *w = string_extend(NULL, ps, c);
2635 F(state_set_push(worklist, t->to));
2636 F(state_set_push_data(path, t->to, w));
2638 path->data[toind] = string_extend(path->data[toind], ps, c);
2643 for (int i=0; i < path->used; i++) {
2644 struct state *p = path->states[i];
2645 struct re_str *ps = path->data[i];
2647 (word == NULL || word->len == 0
2648 || (ps->len > 0 && str_score(word) > str_score(ps)))) {
2655 state_set_free(path);
2656 state_set_free(worklist);
2658 *example_len = word->len;
2659 *example = word->rx;
2664 state_set_free(path);
2665 state_set_free(worklist);
2679 static int fa_enumerate_intl(struct state *s, struct enum_intl *ei, int pos) {
2682 if (ei->bsize <= pos + 1) {
2684 F(REALLOC_N(ei->buf, ei->bsize));
2687 ei->buf[pos] = '\0';
2688 for_each_trans(t, s) {
2692 for (int i=t->min; i <= t->max; i++) {
2694 if (t->to->accept) {
2695 if (ei->nwords >= ei->limit)
2697 ei->words[ei->nwords] = strdup(ei->buf);
2698 E(ei->words[ei->nwords] == NULL);
2701 result = fa_enumerate_intl(t->to, ei, pos+1);
2706 ei->buf[pos] = '\0';
2712 int fa_enumerate(struct fa *fa, int limit, char ***words) {
2713 struct enum_intl ei;
2718 ei.bsize = 8; /* Arbitrary initial size */
2720 F(ALLOC_N(ei.words, limit));
2721 F(ALLOC_N(ei.buf, ei.bsize));
2723 /* We use the visited bit to track which states we already visited
2724 * during the construction of a word to detect loops */
2725 list_for_each(s, fa->initial)
2727 fa->initial->visited = 1;
2728 if (fa->initial->accept) {
2729 if (ei.nwords >= limit)
2731 ei.words[0] = strdup("");
2732 E(ei.words[0] == NULL);
2735 result = fa_enumerate_intl(fa->initial, &ei, 0);
2746 for (int i=0; i < ei.nwords; i++)
2752 /* Expand the automaton FA by replacing every transition s(c) -> p from
2753 * state s to p on character c by two transitions s(X) -> r, r(c) -> p via
2755 * If ADD_MARKER is true, also add for each original state s a new a loop
2756 * s(Y) -> q and q(X) -> s through a new state q.
2758 * The end result is that an automaton accepting "a|ab" is turned into one
2759 * accepting "Xa|XaXb" if add_marker is false and "(YX)*Xa|(YX)*Xa(YX)*Xb"
2760 * when add_marker is true.
2762 * The returned automaton is a copy of FA, FA is not modified.
2764 static struct fa *expand_alphabet(struct fa *fa, int add_marker,
2772 F(mark_reachable(fa));
2773 list_for_each(p, fa->initial) {
2777 struct state *r = add_state(fa, 0);
2780 r->trans = p->trans;
2781 r->tused = p->tused;
2782 r->tsize = p->tsize;
2784 p->tused = p->tsize = 0;
2785 ret = add_new_trans(p, r, X, X);
2789 struct state *q = add_state(fa, 0);
2790 ret = add_new_trans(p, q, Y, Y);
2793 ret = add_new_trans(q, p, X, X);
2804 static bitset *alphabet(struct fa *fa) {
2805 bitset *bs = bitset_init(UCHAR_NUM);
2810 list_for_each(s, fa->initial) {
2811 for (int i=0; i < s->tused; i++) {
2812 for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2819 static bitset *last_chars(struct fa *fa) {
2820 bitset *bs = bitset_init(UCHAR_NUM);
2825 list_for_each(s, fa->initial) {
2826 for (int i=0; i < s->tused; i++) {
2827 if (s->trans[i].to->accept) {
2828 for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2836 static bitset *first_chars(struct fa *fa) {
2837 bitset *bs = bitset_init(UCHAR_NUM);
2838 struct state *s = fa->initial;
2843 for (int i=0; i < s->tused; i++) {
2844 for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2850 /* Return 1 if F1 and F2 are known to be unambiguously concatenable
2851 * according to simple heuristics. Return 0 if they need to be checked
2852 * further to decide ambiguity
2853 * Return -1 if an allocation fails
2855 static int is_splittable(struct fa *fa1, struct fa *fa2) {
2856 bitset *alpha1 = NULL;
2857 bitset *alpha2 = NULL;
2858 bitset *last1 = NULL;
2859 bitset *first2 = NULL;
2862 alpha2 = alphabet(fa2);
2863 last1 = last_chars(fa1);
2864 if (alpha2 == NULL || last1 == NULL)
2866 if (bitset_disjoint(last1, alpha2, UCHAR_NUM)) {
2871 alpha1 = alphabet(fa1);
2872 first2 = first_chars(fa2);
2873 if (alpha1 == NULL || first2 == NULL)
2875 if (bitset_disjoint(first2, alpha1, UCHAR_NUM)) {
2881 bitset_free(alpha1);
2882 bitset_free(alpha2);
2884 bitset_free(first2);
2888 /* This algorithm is due to Anders Moeller, and can be found in class
2889 * AutomatonOperations in dk.brics.grammar
2891 int fa_ambig_example(struct fa *fa1, struct fa *fa2,
2892 char **upv, size_t *upv_len,
2893 char **pv, char **v) {
2894 static const char X = '\001';
2895 static const char Y = '\002';
2896 char *result = NULL, *s = NULL;
2897 size_t result_len = 0;
2899 struct fa *mp = NULL, *ms = NULL, *sp = NULL, *ss = NULL, *amb = NULL;
2900 struct fa *a1f = NULL, *a1t = NULL, *a2f = NULL, *a2t = NULL;
2901 struct fa *b1 = NULL, *b2 = NULL;
2910 r = is_splittable(fa1, fa2);
2918 #define MPs Ys Xs "(" Xs "(.|\n))+"
2919 #define MSs Ys Xs "(" Xs "(.|\n))*"
2920 #define SPs "(" Xs "(.|\n))+" Ys Xs
2921 #define SSs "(" Xs "(.|\n))*" Ys Xs
2922 /* These could become static constants */
2923 r = fa_compile( MPs, strlen(MPs), &mp);
2924 if (r != REG_NOERROR)
2926 r = fa_compile( MSs, strlen(MSs), &ms);
2927 if (r != REG_NOERROR)
2929 r = fa_compile( SPs, strlen(SPs), &sp);
2930 if (r != REG_NOERROR)
2932 r = fa_compile( SSs, strlen(SSs), &ss);
2933 if (r != REG_NOERROR)
2942 a1f = expand_alphabet(fa1, 0, X, Y);
2943 a1t = expand_alphabet(fa1, 1, X, Y);
2944 a2f = expand_alphabet(fa2, 0, X, Y);
2945 a2t = expand_alphabet(fa2, 1, X, Y);
2946 if (a1f == NULL || a1t == NULL || a2f == NULL || a2t == NULL)
2949 /* Compute b1 = ((a1f . mp) & a1t) . ms */
2950 if (concat_in_place(a1f, &mp) < 0)
2952 b1 = fa_intersect(a1f, a1t);
2955 if (concat_in_place(b1, &ms) < 0)
2958 /* Compute b2 = ss . ((sp . a2f) & a2t) */
2959 if (concat_in_place(sp, &a2f) < 0)
2961 b2 = fa_intersect(sp, a2t);
2964 if (concat_in_place(ss, &b2) < 0)
2969 /* The automaton we are really interested in */
2970 amb = fa_intersect(b1, b2);
2975 r = fa_example(amb, &s, &s_len);
2981 result_len = (s_len-1)/2 - 1;
2982 F(ALLOC_N(result, result_len + 1));
2985 for (i=0; s[2*i] == X; i++) {
2986 assert((t - result) < result_len);
2993 for ( ;s[2*i] == X; i++) {
2994 assert((t - result) < result_len);
3001 for (; 2*i+1 < s_len; i++) {
3002 assert((t - result) < result_len);
3009 /* Clean up intermediate automata */
3025 *upv_len = result_len;
3034 * Construct an fa from a regular expression
3036 static struct fa *fa_from_re(struct re *re) {
3037 struct fa *result = NULL;
3042 result = fa_from_re(re->exp1);
3045 struct fa *fa2 = fa_from_re(re->exp2);
3048 if (union_in_place(result, &fa2) < 0)
3054 result = fa_from_re(re->exp1);
3057 struct fa *fa2 = fa_from_re(re->exp2);
3060 if (concat_in_place(result, &fa2) < 0)
3065 result = fa_make_char_set(re->cset, re->negate);
3069 struct fa *fa = fa_from_re(re->exp);
3072 result = fa_iter(fa, re->min, re->max);
3077 result = fa_make_epsilon();
3080 result = fa_make_char(re->c);
3092 static void free_re(struct re *re) {
3095 assert(re->ref == 0);
3097 if (re->type == UNION || re->type == CONCAT) {
3100 } else if (re->type == ITER) {
3102 } else if (re->type == CSET) {
3103 bitset_free(re->cset);
3108 int fa_compile(const char *regexp, size_t size, struct fa **fa) {
3109 struct re *re = NULL;
3110 struct re_parse parse;
3115 parse.rend = regexp + size;
3116 parse.error = REG_NOERROR;
3118 re = parse_regexp(&parse);
3122 *fa = fa_from_re(re);
3125 if (*fa == NULL || collect(*fa) < 0)
3126 parse.error = REG_ESPACE;
3130 /* We represent a case-insensitive FA by using only transitions on
3131 * lower-case letters.
3133 int fa_nocase(struct fa *fa) {
3134 if (fa == NULL || fa->nocase)
3138 list_for_each(s, fa->initial) {
3139 int tused = s->tused;
3140 /* For every transition on characters in [A-Z] add a corresponding
3141 * transition on [a-z]; remove any portion covering [A-Z] */
3142 for (int i=0; i < tused; i++) {
3143 struct trans *t = s->trans + i;
3144 int lc_min = t->min < 'A' ? 'a' : tolower(t->min);
3145 int lc_max = t->max > 'Z' ? 'z' : tolower(t->max);
3147 if (t->min > 'Z' || t->max < 'A')
3149 if (t->min >= 'A' && t->max <= 'Z') {
3150 t->min = tolower(t->min);
3151 t->max = tolower(t->max);
3152 } else if (t->max <= 'Z') {
3155 F(add_new_trans(s, t->to, lc_min, lc_max));
3156 } else if (t->min >= 'A') {
3159 F(add_new_trans(s, t->to, lc_min, lc_max));
3161 /* t->min < 'A' && t->max > 'Z' */
3162 F(add_new_trans(s, t->to, 'Z' + 1, t->max));
3163 s->trans[i].max = 'A' - 1;
3164 F(add_new_trans(s, s->trans[i].to, lc_min, lc_max));
3174 int fa_is_nocase(struct fa *fa) {
3178 /* If FA is case-insensitive, turn it into a case-sensitive automaton by
3179 * adding transitions on upper-case letters for each existing transition on
3180 * lower-case letters */
3181 static int case_expand(struct fa *fa) {
3186 list_for_each(s, fa->initial) {
3187 int tused = s->tused;
3188 /* For every transition on characters in [a-z] add a corresponding
3189 * transition on [A-Z] */
3190 for (int i=0; i < tused; i++) {
3191 struct trans *t = s->trans + i;
3192 int lc_min = t->min < 'a' ? 'A' : toupper(t->min);
3193 int lc_max = t->max > 'z' ? 'Z' : toupper(t->max);
3195 if (t->min > 'z' || t->max < 'a')
3197 F(add_new_trans(s, t->to, lc_min, lc_max));
3207 * Regular expression parser
3210 static struct re *make_re(enum re_type type) {
3212 if (make_ref(re) == 0)
3217 static struct re *make_re_rep(struct re *exp, int min, int max) {
3218 struct re *re = make_re(ITER);
3229 static struct re *make_re_binop(enum re_type type, struct re *exp1,
3231 struct re *re = make_re(type);
3242 static struct re *make_re_char(uchar c) {
3243 struct re *re = make_re(CHAR);
3249 static struct re *make_re_char_set(bool negate, bool no_ranges) {
3250 struct re *re = make_re(CSET);
3252 re->negate = negate;
3253 re->no_ranges = no_ranges;
3254 re->cset = bitset_init(UCHAR_NUM);
3255 if (re->cset == NULL)
3261 static bool more(struct re_parse *parse) {
3262 return parse->rx < parse->rend;
3265 static bool match(struct re_parse *parse, char m) {
3268 if (*parse->rx == m) {
3275 static bool peek(struct re_parse *parse, const char *chars) {
3276 return *parse->rx != '\0' && strchr(chars, *parse->rx) != NULL;
3279 static bool next(struct re_parse *parse, char *c) {
3287 static bool parse_char(struct re_parse *parse, int quoted, char *c) {
3290 if (quoted && *parse->rx == '\\') {
3292 return next(parse, c);
3294 return next(parse, c);
3298 static void add_re_char(struct re *re, uchar from, uchar to) {
3299 assert(re->type == CSET);
3300 for (unsigned int c = from; c <= to; c++)
3301 bitset_set(re->cset, c);
3304 static void parse_char_class(struct re_parse *parse, struct re *re) {
3305 if (! more(parse)) {
3306 parse->error = REG_EBRACK;
3310 parse_char(parse, 0, &from);
3312 if (match(parse, '-')) {
3313 if (! more(parse)) {
3314 parse->error = REG_EBRACK;
3317 if (peek(parse, "]")) {
3319 parse->error = REG_ERANGE;
3322 add_re_char(re, from, to);
3323 add_re_char(re, '-', '-');
3325 } else if (!parse_char(parse, 0, &to)) {
3326 parse->error = REG_ERANGE;
3331 parse->error = REG_ERANGE;
3334 add_re_char(re, from, to);
3339 static struct re *parse_simple_exp(struct re_parse *parse) {
3340 struct re *re = NULL;
3342 if (match(parse, '[')) {
3343 bool negate = match(parse, '^');
3344 re = make_re_char_set(negate, parse->no_ranges);
3346 parse->error = REG_ESPACE;
3349 parse_char_class(parse, re);
3350 if (parse->error != REG_NOERROR)
3352 while (more(parse) && ! peek(parse, "]")) {
3353 parse_char_class(parse, re);
3354 if (parse->error != REG_NOERROR)
3357 if (! match(parse, ']')) {
3358 parse->error = REG_EBRACK;
3361 } else if (match(parse, '(')) {
3362 if (match(parse, ')')) {
3363 re = make_re(EPSILON);
3365 parse->error = REG_ESPACE;
3369 re = parse_regexp(parse);
3372 if (! match(parse, ')')) {
3373 parse->error = REG_EPAREN;
3377 } else if (match(parse, '.')) {
3378 re = make_re_char_set(1, parse->no_ranges);
3380 parse->error = REG_ESPACE;
3383 add_re_char(re, '\n', '\n');
3384 } else if (more(parse)) {
3386 if (!parse_char(parse, 1, &c)) {
3387 parse->error = REG_EESCAPE;
3390 re = make_re_char(c);
3392 parse->error = REG_ESPACE;
3396 re = make_re(EPSILON);
3398 parse->error = REG_ESPACE;
3408 static int parse_int(struct re_parse *parse) {
3414 /* We need to be careful that strtoul will never access
3415 * memory beyond parse->rend
3417 for (lim = parse->rx; lim < parse->rend && *lim >= '0' && *lim <= '9';
3419 if (lim < parse->rend) {
3420 l = strtoul(parse->rx, &end, 10);
3421 used = end - parse->rx;
3423 char *s = strndup(parse->rx, parse->rend - parse->rx);
3425 parse->error = REG_ESPACE;
3428 l = strtoul(s, &end, 10);
3436 if ((l<0) || (l > INT_MAX)) {
3437 parse->error = REG_BADBR;
3443 static struct re *parse_repeated_exp(struct re_parse *parse) {
3444 struct re *re = parse_simple_exp(parse);
3447 if (match(parse, '?')) {
3448 re = make_re_rep(re, 0, 1);
3449 } else if (match(parse, '*')) {
3450 re = make_re_rep(re, 0, -1);
3451 } else if (match(parse, '+')) {
3452 re = make_re_rep(re, 1, -1);
3453 } else if (match(parse, '{')) {
3455 min = parse_int(parse);
3458 if (match(parse, ',')) {
3459 max = parse_int(parse);
3461 max = -1; /* If it's not an int, it means 'unbounded' */
3462 if (! match(parse, '}')) {
3463 parse->error = REG_EBRACE;
3466 } else if (match(parse, '}')) {
3469 parse->error = REG_EBRACE;
3472 if (min > max && max != -1) {
3473 parse->error = REG_BADBR;
3476 re = make_re_rep(re, min, max);
3479 parse->error = REG_ESPACE;
3486 static struct re *parse_concat_exp(struct re_parse *parse) {
3487 struct re *re = parse_repeated_exp(parse);
3491 if (more(parse) && ! peek(parse, ")|")) {
3492 struct re *re2 = parse_concat_exp(parse);
3495 re = make_re_binop(CONCAT, re, re2);
3497 parse->error = REG_ESPACE;
3508 static struct re *parse_regexp(struct re_parse *parse) {
3509 struct re *re = NULL;
3511 /* Something like (|r) */
3512 if (peek(parse, "|"))
3513 re = make_re(EPSILON);
3515 re = parse_concat_exp(parse);
3519 if (match(parse, '|')) {
3520 struct re *re2 = NULL;
3521 /* Something like (r|) */
3522 if (peek(parse, ")"))
3523 re2 = make_re(EPSILON);
3525 re2 = parse_regexp(parse);
3529 re = make_re_binop(UNION, re, re2);
3531 parse->error = REG_ESPACE;
3543 * Convert a STRUCT RE to a string. Code is complicated by the fact that
3544 * we try to be clever and avoid unneeded parens and concatenation with
3547 static int re_as_string(const struct re *re, struct re_str *str);
3549 static int re_binop_count(enum re_type type, const struct re *re) {
3550 assert(type == CONCAT || type == UNION);
3551 if (re->type == type) {
3552 return re_binop_count(type, re->exp1) + re_binop_count(type, re->exp2);
3558 static int re_binop_store(enum re_type type, const struct re *re,
3559 const struct re **list) {
3561 if (type == re->type) {
3562 pos = re_binop_store(type, re->exp1, list);
3563 pos += re_binop_store(type, re->exp2, list + pos);
3571 static int re_union_as_string(const struct re *re, struct re_str *str) {
3572 assert(re->type == UNION);
3575 const struct re **res = NULL;
3576 struct re_str *strings = NULL;
3579 nre = re_binop_count(re->type, re);
3580 r = ALLOC_N(res, nre);
3584 re_binop_store(re->type, re, res);
3586 r = ALLOC_N(strings, nre);
3591 for (int i=0; i < nre; i++) {
3592 if (re_as_string(res[i], strings + i) < 0)
3594 str->len += strings[i].len;
3598 r = re_str_alloc(str);
3603 for (int i=0; i < nre; i++) {
3606 memcpy(p, strings[i].rx, strings[i].len);
3607 p += strings[i].len;
3613 if (strings != NULL) {
3614 for (int i=0; i < nre; i++)
3615 release_re_str(strings + i);
3620 release_re_str(str);
3626 static int re_needs_parens_in_concat(const struct re *re) {
3627 return (re->type != CHAR && re->type != CSET && re->type != ITER);
3630 static int re_concat_as_string(const struct re *re, struct re_str *str) {
3631 assert(re->type == CONCAT);
3633 const struct re **res = NULL;
3634 struct re_str *strings = NULL;
3638 nre = re_binop_count(re->type, re);
3639 r = ALLOC_N(res, nre);
3642 re_binop_store(re->type, re, res);
3644 r = ALLOC_N(strings, nre);
3649 for (int i=0; i < nre; i++) {
3650 if (res[i]->type == EPSILON)
3652 if (re_as_string(res[i], strings + i) < 0)
3654 str->len += strings[i].len;
3655 if (re_needs_parens_in_concat(res[i]))
3659 r = re_str_alloc(str);
3664 for (int i=0; i < nre; i++) {
3665 if (res[i]->type == EPSILON)
3667 if (re_needs_parens_in_concat(res[i]))
3669 p = memcpy(p, strings[i].rx, strings[i].len);
3670 p += strings[i].len;
3671 if (re_needs_parens_in_concat(res[i]))
3678 if (strings != NULL) {
3679 for (int i=0; i < nre; i++)
3680 release_re_str(strings + i);
3685 release_re_str(str);
3690 static bool cset_contains(const struct re *cset, int c) {
3691 return bitset_get(cset->cset, c) != cset->negate;
3694 static int re_cset_as_string(const struct re *re, struct re_str *str) {
3695 const uchar rbrack = ']';
3696 const uchar dash = '-';
3697 const uchar nul = '\0';
3699 static const char *const empty_set = "[]";
3700 static const char *const total_set = "(.|\n)";
3701 static const char *const not_newline = ".";
3704 int from, to, negate;
3706 int incl_rbrack, incl_dash;
3709 str->len = strlen(empty_set);
3711 /* We can not include NUL explicitly in a CSET since we use ordinary
3712 NUL delimited strings to represent them. That means that we need to
3713 use negated representation if NUL is to be included (and vice versa)
3715 negate = cset_contains(re, nul);
3717 for (from = UCHAR_MIN;
3718 from <= UCHAR_MAX && cset_contains(re, from);
3720 if (from > UCHAR_MAX) {
3721 /* Special case: the set matches every character */
3722 str->rx = strdup(total_set);
3727 from <= UCHAR_MAX && cset_contains(re, from);
3729 if (from > UCHAR_MAX) {
3730 /* Special case: the set matches everything but '\n' */
3731 str->rx = strdup(not_newline);
3737 /* See if ']' and '-' will be explicitly included in the character set
3738 (INCL_RBRACK, INCL_DASH) As we loop over the character set, we reset
3739 these flags if they are in the set, but not mentioned explicitly
3741 incl_rbrack = cset_contains(re, rbrack) != negate;
3742 incl_dash = cset_contains(re, dash) != negate;
3744 if (re->no_ranges) {
3745 for (from = UCHAR_MIN; from <= UCHAR_MAX; from++)
3746 if (cset_contains(re, from) != negate)
3749 for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
3750 while (from <= UCHAR_MAX && cset_contains(re, from) == negate)
3752 if (from > UCHAR_MAX)
3755 to < UCHAR_MAX && (cset_contains(re, to+1) != negate);
3758 if (to == from && (from == rbrack || from == dash))
3760 if (from == rbrack || from == dash)
3762 if (to == rbrack || to == dash)
3765 len = (to == from) ? 1 : ((to == from + 1) ? 2 : 3);
3767 if (from < rbrack && rbrack < to)
3769 if (from < dash && dash < to)
3773 str->len += incl_rbrack + incl_dash;
3776 str->len += 1; /* For the ^ */
3778 r = re_str_alloc(str);
3789 if (re->no_ranges) {
3790 for (from = UCHAR_MIN; from <= UCHAR_MAX; from++) {
3791 if (from == rbrack || from == dash)
3793 if (cset_contains(re, from) != negate)
3797 for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
3798 while (from <= UCHAR_MAX && cset_contains(re, from) == negate)
3800 if (from > UCHAR_MAX)
3803 to < UCHAR_MAX && (cset_contains(re, to+1) != negate);
3806 if (to == from && (from == rbrack || from == dash))
3808 if (from == rbrack || from == dash)
3810 if (to == rbrack || to == dash)
3815 } else if (to == from + 1) {
3830 if (str->rx == NULL)
3832 str->len = strlen(str->rx);
3835 release_re_str(str);
3839 static int re_iter_as_string(const struct re *re, struct re_str *str) {
3840 const char *quant = NULL;
3844 if (re_as_string(re->exp, str) < 0)
3847 if (re->min == 0 && re->max == -1) {
3849 } else if (re->min == 1 && re->max == -1) {
3851 } else if (re->min == 0 && re->max == 1) {
3853 } else if (re->max == -1) {
3854 r = asprintf(&iter, "{%d,}", re->min);
3859 r = asprintf(&iter, "{%d,%d}", re->min, re->max);
3865 if (re->exp->type == CHAR || re->exp->type == CSET) {
3866 if (REALLOC_N(str->rx, str->len + strlen(quant) + 1) < 0)
3868 strcpy(str->rx + str->len, quant);
3869 str->len += strlen(quant);
3871 /* Format '(' + str->rx ')' + quant */
3872 if (REALLOC_N(str->rx, str->len + strlen(quant) + 1 + 2) < 0)
3874 memmove(str->rx + 1, str->rx, str->len);
3876 str->rx[str->len + 1] = ')';
3878 strcpy(str->rx + str->len, quant);
3879 str->len += strlen(quant);
3887 release_re_str(str);
3891 static int re_as_string(const struct re *re, struct re_str *str) {
3892 /* Characters that must be escaped */
3893 static const char * const special_chars = ".()[]{}*|+?\\^$";
3898 result = re_union_as_string(re, str);
3901 result = re_concat_as_string(re, str);
3904 result = re_cset_as_string(re, str);
3907 if (re->c == '\0' || strchr(special_chars, re->c) == NULL) {
3908 if (ALLOC_N(str->rx, 2) < 0)
3913 if (ALLOC_N(str->rx, 3) < 0)
3917 str->len = strlen(str->rx);
3921 result = re_iter_as_string(re, str);
3924 if (ALLOC_N(str->rx, 3) < 0)
3926 strcpy(str->rx, "()");
3927 str->len = strlen(str->rx);
3936 release_re_str(str);
3940 static int convert_trans_to_re(struct state *s) {
3941 struct re *re = NULL;
3943 struct trans *trans = NULL;
3949 qsort(s->trans, s->tused, sizeof(*s->trans), trans_to_cmp);
3950 for (int i = 0; i < s->tused - 1; i++) {
3951 if (s->trans[i].to != s->trans[i+1].to)
3954 r = ALLOC_N(trans, nto);
3958 struct state *to = s->trans[0].to;
3960 for_each_trans(t, s) {
3962 trans[tind].to = to;
3963 trans[tind].re = re;
3969 re = make_re_char_set(0, 0);
3973 add_re_char(re, t->min, t->max);
3975 assert(nto == tind + 1);
3976 trans[tind].to = to;
3977 trans[tind].re = re;
3979 /* Simplify CSETs with a single char to a CHAR */
3980 for (int t=0; t < nto; t++) {
3982 uchar chr = UCHAR_MIN;
3983 for (int c = 0; c < UCHAR_NUM; c++) {
3984 if (bitset_get(trans[t].re->cset, c)) {
3990 re_unref(trans[t].re);
3991 trans[t].re = make_re_char(chr);
3992 if (trans[t].re == NULL)
3998 s->tused = s->tsize = nto;
4003 for (int i=0; i < nto; i++)
4004 unref(trans[i].re, re);
4009 ATTRIBUTE_RETURN_CHECK
4010 static int add_new_re_trans(struct state *s1, struct state *s2,
4013 r = add_new_trans(s1, s2, 0, 0);
4016 last_trans(s1)->re = re;
4020 /* Add the regular expression R1 . LOOP* . R2 to the transition
4022 static int re_collapse_trans(struct state *s1, struct state *s2,
4023 struct re *r1, struct re *loop, struct re *r2) {
4024 struct re *re = NULL;
4026 if (loop->type != EPSILON) {
4027 loop = make_re_rep(ref(loop), 0, -1);
4032 if (r1->type == EPSILON) {
4033 if (loop->type == EPSILON) {
4036 re = make_re_binop(CONCAT, loop, ref(r2));
4039 if (loop->type == EPSILON) {
4040 if (r2->type == EPSILON) {
4043 re = make_re_binop(CONCAT, ref(r1), ref(r2));
4046 re = make_re_binop(CONCAT, ref(r1), loop);
4047 if (re != NULL && r2->type != EPSILON) {
4048 re = make_re_binop(CONCAT, re, ref(r2));
4055 struct trans *t = NULL;
4056 for (t = s1->trans; t <= last_trans(s1) && t->to != s2; t += 1);
4057 if (t > last_trans(s1)) {
4058 if (add_new_re_trans(s1, s2, re) < 0)
4061 if (t->re == NULL) {
4064 t->re = make_re_binop(UNION, re, t->re);
4071 // FIXME: make sure we don't leak loop
4075 static int convert_strings(struct fa *fa) {
4076 struct state_set *worklist = state_set_init(-1, S_NONE);
4079 E(worklist == NULL);
4081 /* Abuse hash to count indegree, reachable to mark visited states */
4082 list_for_each(s, fa->initial) {
4087 list_for_each(s, fa->initial) {
4088 for_each_trans(t, s) {
4093 for (struct state *s = fa->initial;
4095 s = state_set_pop(worklist)) {
4096 for (int i=0; i < s->tused; i++) {
4097 struct trans *t = s->trans + i;
4098 struct state *to = t->to;
4099 while (to->hash == 1 && to->tused == 1 && ! to->accept) {
4100 if (t->re == NULL) {
4101 t->re = to->trans->re;
4102 to->trans->re = NULL;
4104 t->re = make_re_binop(CONCAT, t->re, to->trans->re);
4108 t->to = to->trans->to;
4112 for (int j=0; j < s->tused; j++) {
4113 if (j != i && s->trans[j].to == to) {
4114 /* Combine transitions i and j; remove trans j */
4115 t->re = make_re_binop(UNION, t->re, s->trans[j].re);
4118 memmove(s->trans + j, s->trans + j + 1,
4119 sizeof(s->trans[j]) * (s->tused - j - 1));
4130 if (! to->reachable) {
4132 F(state_set_push(worklist, to));
4137 for (struct state *s = fa->initial; s->next != NULL; ) {
4138 if (s->next->hash == 0 && s->next->tused == 0) {
4139 struct state *del = s->next;
4140 s->next = del->next;
4150 state_set_free(worklist);
4154 /* Convert an FA to a regular expression.
4155 * The strategy is the following:
4156 * (1) For all states S1 and S2, convert the transitions between them
4157 * into one transition whose regexp is a CSET
4158 * (2) Add a new initial state INI and a new final state FIN to the automaton,
4159 * a transition from INI to the old initial state of FA, and a transition
4160 * from all accepting states of FA to FIN; the regexp on those transitions
4161 * matches only the empty word
4162 * (3) Eliminate states S (except for INI and FIN) one by one:
4163 * Let LOOP the regexp for the transition S -> S if it exists, epsilon
4165 * For all S1, S2 different from S with S1 -> S -> S2
4166 * Let R1 the regexp of S1 -> S
4167 * R2 the regexp of S -> S2
4168 * R3 the regexp S1 -> S2 (or epsilon if no such transition)
4169 * set the regexp on the transition S1 -> S2 to
4170 * R1 . (LOOP)* . R2 | R3
4171 * (4) The regexp for the whole FA can now be found as the regexp of
4172 * the transition INI -> FIN
4173 * (5) Convert that STRUCT RE to a string with RE_AS_STRING
4175 int fa_as_regexp(struct fa *fa, char **regexp, size_t *regexp_len) {
4177 struct state *fin = NULL, *ini = NULL;
4178 struct re *eps = NULL;
4186 eps = make_re(EPSILON);
4190 fin = add_state(fa,1);
4196 list_for_each(s, fa->initial) {
4197 r = convert_trans_to_re(s);
4200 if (s->accept && s != fin) {
4201 r = add_new_re_trans(s, fin, ref(eps));
4208 ini = add_state(fa, 0);
4212 r = add_new_re_trans(ini, fa->initial, ref(eps));
4215 set_initial(fa, ini);
4217 convert_strings(fa);
4219 list_for_each(s, fa->initial->next) {
4223 struct re *loop = eps;
4224 for_each_trans(t, s) {
4228 list_for_each(s1, fa->initial) {
4231 for (int t1 = 0; t1 < s1->tused; t1++) {
4232 if (s1->trans[t1].to == s) {
4233 for (int t = 0; t < s->tused; t++) {
4234 if (s->trans[t].to == s)
4236 r = re_collapse_trans(s1, s->trans[t].to,
4250 for_each_trans(t, fa->initial) {
4254 if (re_as_string(t->re, &str) < 0)
4257 *regexp_len = str.len;
4261 list_for_each(s, fa->initial) {
4262 for_each_trans(t, s) {
4275 static int re_restrict_alphabet(struct re *re, uchar from, uchar to) {
4282 r1 = re_restrict_alphabet(re->exp1, from, to);
4283 r2 = re_restrict_alphabet(re->exp2, from, to);
4284 result = (r1 != 0) ? r1 : r2;
4289 bitset_negate(re->cset, UCHAR_NUM);
4291 for (int i=from; i <= to; i++)
4292 bitset_clr(re->cset, i);
4295 if (from <= re->c && re->c <= to)
4299 result = re_restrict_alphabet(re->exp, from, to);
4311 int fa_restrict_alphabet(const char *regexp, size_t regexp_len,
4312 char **newregexp, size_t *newregexp_len,
4313 char from, char to) {
4315 struct re *re = NULL;
4316 struct re_parse parse;
4322 parse.rend = regexp + regexp_len;
4323 parse.error = REG_NOERROR;
4324 re = parse_regexp(&parse);
4325 if (parse.error != REG_NOERROR)
4328 result = re_restrict_alphabet(re, from, to);
4335 result = re_as_string(re, &str);
4336 *newregexp = str.rx;
4337 *newregexp_len = str.len;
4343 int fa_expand_char_ranges(const char *regexp, size_t regexp_len,
4344 char **newregexp, size_t *newregexp_len) {
4346 struct re *re = NULL;
4347 struct re_parse parse;
4353 parse.rend = regexp + regexp_len;
4354 parse.error = REG_NOERROR;
4355 parse.no_ranges = 1;
4356 re = parse_regexp(&parse);
4357 if (parse.error != REG_NOERROR)
4361 result = re_as_string(re, &str);
4362 *newregexp = str.rx;
4363 *newregexp_len = str.len;
4368 /* Expand regexp so that it is case-insensitive in a case-sensitive match.
4370 * Return 1 when a change was made, -1 when an allocation failed, and 0
4371 * when no change was made.
4373 static int re_case_expand(struct re *re) {
4374 int result = 0, r1, r2;
4379 r1 = re_case_expand(re->exp1);
4380 r2 = re_case_expand(re->exp2);
4381 result = (r1 != 0) ? r1 : r2;
4384 for (int c = 'A'; c <= 'Z'; c++)
4385 if (bitset_get(re->cset, c)) {
4387 bitset_set(re->cset, tolower(c));
4389 for (int c = 'a'; c <= 'z'; c++)
4390 if (bitset_get(re->cset, c)) {
4392 bitset_set(re->cset, toupper(c));
4396 if (isalpha(re->c)) {
4401 re->cset = bitset_init(UCHAR_NUM);
4402 if (re->cset == NULL)
4404 bitset_set(re->cset, tolower(c));
4405 bitset_set(re->cset, toupper(c));
4410 result = re_case_expand(re->exp);
4422 int fa_expand_nocase(const char *regexp, size_t regexp_len,
4423 char **newregexp, size_t *newregexp_len) {
4425 struct re *re = NULL;
4426 struct re_parse parse;
4432 parse.rend = regexp + regexp_len;
4433 parse.error = REG_NOERROR;
4434 re = parse_regexp(&parse);
4435 if (parse.error != REG_NOERROR)
4438 r = re_case_expand(re);
4446 result = re_as_string(re, &str);
4447 *newregexp = str.rx;
4448 *newregexp_len = str.len;
4450 *newregexp = strndup(regexp, regexp_len);
4451 *newregexp_len = regexp_len;
4452 result = (*newregexp == NULL) ? REG_ESPACE : REG_NOERROR;
4458 static void print_char(FILE *out, uchar c) {
4459 /* We escape '/' as '\\/' since dot chokes on bare slashes in labels;
4460 Also, a space ' ' is shown as '\s' */
4461 static const char *const escape_from = " \n\t\v\b\r\f\a/\0";
4462 static const char *const escape_to = "sntvbrfa/0";
4463 char *p = strchr(escape_from, c);
4465 int i = p - escape_from;
4466 fprintf(out, "\\\\%c", escape_to[i]);
4467 } else if (! isprint(c)) {
4468 fprintf(out, "\\\\0%03o", (unsigned char) c);
4469 } else if (c == '"') {
4470 fprintf(out, "\\\"");
4476 void fa_dot(FILE *out, struct fa *fa) {
4477 fprintf(out, "digraph {\n rankdir=LR;");
4478 list_for_each(s, fa->initial) {
4480 fprintf(out, "\"%p\" [shape=doublecircle];\n", s);
4482 fprintf(out, "\"%p\" [shape=circle];\n", s);
4485 fprintf(out, "%s -> \"%p\";\n", fa->deterministic ? "dfa" : "nfa",
4490 list_for_each(s, fa->initial) {
4491 for_each_trans(t, s) {
4492 fprintf(out, "\"%p\" -> \"%p\" [ label = \"", s, t->to);
4494 re_as_string(t->re, &str);
4495 for (int i=0; i < str.len; i++) {
4496 print_char(out, str.rx[i]);
4498 release_re_str(&str);
4500 print_char(out, t->min);
4501 if (t->min != t->max) {
4503 print_char(out, t->max);
4506 fprintf(out, "\" ];\n");
4509 fprintf(out, "}\n");
4514 * indent-tabs-mode: nil