2 * fa.c: finite automata
4 * Copyright (C) 2007-2015 David Lutterkort
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * Author: David Lutterkort <dlutter@redhat.com>
24 * This implementation follows closely the Java dk.brics.automaton package
25 * by Anders Moeller. The project's website is
26 * http://www.brics.dk/automaton/.
28 * It is by no means a complete reimplementation of that package; only a
29 * subset of what Automaton provides is implemented here.
43 #define UCHAR_NUM (UCHAR_MAX+1)
45 typedef unsigned char uchar;
47 #define E(cond) if (cond) goto error
48 #define F(expr) if ((expr) < 0) goto error
50 /* Which algorithm to use in FA_MINIMIZE */
51 int fa_minimization_algorithm = FA_MIN_HOPCROFT;
53 /* A finite automaton. INITIAL is both the initial state and the head of
54 * the list of all states. Any state that is allocated for this automaton
55 * is put on this list. Dead/unreachable states are cleared from the list
56 * at opportune times (e.g., during minimization) It's poor man's garbage
59 * Normally, transitions are on a character range [min..max]; in
60 * fa_as_regexp, we store regexps on transitions in the re field of each
61 * transition. TRANS_RE indicates that we do that, and is used by fa_dot to
62 * produce proper graphs of an automaton transitioning on regexps.
64 * For case-insensitive regexps (nocase == 1), the FA never has transitions
65 * on uppercase letters [A-Z], effectively removing these letters from the
69 struct state *initial;
70 int deterministic : 1;
72 unsigned int nocase : 1;
76 /* A state in a finite automaton. Transitions are never shared between
77 states so that we can free the list when we need to free the state */
81 unsigned int accept : 1;
82 unsigned int live : 1;
83 unsigned int reachable : 1;
84 unsigned int visited : 1; /* Used in various places to track progress */
85 /* Array of transitions. The TUSED first entries are used, the array
86 has allocated room for TSIZE */
92 /* A transition. If the input has a character in the inclusive
93 * range [MIN, MAX], move to TO
109 #define UINT_BIT (sizeof(unsigned int) * CHAR_BIT)
111 typedef unsigned int bitset;
113 static bitset *bitset_init(size_t nbits) {
115 if (ALLOC_N(bs, (nbits + UINT_BIT) / UINT_BIT) == -1)
120 static inline void bitset_clr(bitset *bs, unsigned int bit) {
121 bs[bit/UINT_BIT] &= ~(1 << (bit % UINT_BIT));
124 static inline void bitset_set(bitset *bs, unsigned int bit) {
125 bs[bit/UINT_BIT] |= 1 << (bit % UINT_BIT);
129 static inline int bitset_get(const bitset * const bs, unsigned int bit) {
130 return (bs[bit/UINT_BIT] >> bit % UINT_BIT) & 1;
134 static inline bool bitset_disjoint(const bitset *const bs1,
135 const bitset *const bs2,
137 for (int i=0; i < (nbits + UINT_BIT) / UINT_BIT; i++) {
144 static void bitset_free(bitset *bs) {
148 static void bitset_negate(bitset *bs, size_t nbits) {
149 for (int i=0; i < (nbits + UINT_BIT) / UINT_BIT; i++)
154 * Representation of a parsed regular expression. The regular expression is
155 * parsed according to the following grammar by PARSE_REGEXP:
157 * regexp: concat_exp ('|' regexp)?
158 * concat_exp: repeated_exp concat_exp?
159 * repeated_exp: simple_exp
163 * | simple_exp '{' INT (',' INT '}')?
164 * simple_exp: char_class
168 * char_class: '[' char_exp ']'
169 * | '[' '^' char_exp ']'
170 * char_exp: CHAR '-' CHAR
183 #define re_unref(r) unref(r, re)
189 struct { /* UNION, CONCAT */
196 /* Whether we can use character ranges when converting back
198 unsigned int no_ranges:1;
211 /* Used to keep state of the regex parse; RX may contain NUL's */
213 const char *rx; /* Current position in regex */
214 const char *rend; /* Last char of rx+ 1 */
215 int error; /* error code */
216 /* Whether new CSET's should have the no_ranges flag set */
217 unsigned int no_ranges:1;
220 /* String with explicit length, used when converting re to string */
226 static struct re *parse_regexp(struct re_parse *parse);
228 /* A map from a set of states to a state. */
229 typedef hash_t state_set_hash;
231 static hash_val_t ptr_hash(const void *p);
233 static const int array_initial_size = 4;
234 static const int array_max_expansion = 128;
236 enum state_set_init_flags {
245 unsigned int sorted : 1;
246 unsigned int with_data : 1;
247 struct state **states;
251 struct state_set_list {
252 struct state_set_list *next;
253 struct state_set *set;
256 /* Clean up FA by removing dead transitions and states and reducing
257 * transitions. Unreachable states are freed. The return value is the same
258 * as FA; returning it is merely a convenience.
260 * Only automata in this state should be returned to the user
262 ATTRIBUTE_RETURN_CHECK
263 static int collect(struct fa *fa);
265 ATTRIBUTE_RETURN_CHECK
266 static int totalize(struct fa *fa);
268 /* Print an FA into a (fairly) fixed file if the environment variable
269 * FA_DOT_DIR is set. This code is only used for debugging
271 #define FA_DOT_DIR "FA_DOT_DIR"
274 static void fa_dot_debug(struct fa *fa, const char *tag) {
276 static int count = 0;
281 if ((dot_dir = getenv(FA_DOT_DIR)) == NULL)
284 r = asprintf(&fname, "%s/fa_%02d_%s.dot", dot_dir, count++, tag);
288 fp = fopen(fname, "w");
294 static void print_char_set(struct re *set) {
301 for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
302 while (bitset_get(set->cset, from) == set->negate)
304 if (from > UCHAR_MAX)
307 to < UCHAR_MAX && (bitset_get(set->cset, to+1) == !set->negate);
312 printf("%c-%c", from, to);
319 static void print_re(struct re *re) {
340 printf("){%d,%d}", re->min, re->max);
354 static void release_re_str(struct re_str *str) {
361 static void free_re_str(struct re_str *str) {
368 static struct re_str *make_re_str(const char *s) {
375 str->len = strlen(s);
376 if (str->rx == NULL) {
384 static int re_str_alloc(struct re_str *str) {
385 return ALLOC_N(str->rx, str->len + 1);
392 static void free_trans(struct state *s) {
395 s->tused = s->tsize = 0;
398 static void gut(struct fa *fa) {
399 list_for_each(s, fa->initial) {
402 list_free(fa->initial);
406 void fa_free(struct fa *fa) {
413 static struct state *make_state(void) {
417 s->hash = ptr_hash(s);
421 static struct state *add_state(struct fa *fa, int accept) {
422 struct state *s = make_state();
425 if (fa->initial == NULL) {
428 list_cons(fa->initial->next, s);
434 #define last_trans(s) ((s)->trans + (s)->tused - 1)
436 #define for_each_trans(t, s) \
437 for (struct trans *t = (s)->trans; \
438 (t - (s)->trans) < (s)->tused; \
441 ATTRIBUTE_RETURN_CHECK
442 static int add_new_trans(struct state *from, struct state *to,
443 uchar min, uchar max) {
446 if (from->tused == from->tsize) {
447 size_t tsize = from->tsize;
449 tsize = array_initial_size;
450 else if (from->tsize > array_max_expansion)
451 tsize += array_max_expansion;
454 if (REALLOC_N(from->trans, tsize) == -1)
458 from->trans[from->tused].to = to;
459 from->trans[from->tused].min = min;
460 from->trans[from->tused].max = max;
465 ATTRIBUTE_RETURN_CHECK
466 static int add_epsilon_trans(struct state *from,
469 from->accept |= to->accept;
470 for_each_trans(t, to) {
471 r = add_new_trans(from, t->to, t->min, t->max);
478 static void set_initial(struct fa *fa, struct state *s) {
479 list_remove(s, fa->initial);
480 list_cons(fa->initial, s);
483 /* Merge automaton FA2 into FA1. This simply adds FA2's states to FA1
484 and then frees FA2. It has no influence on the language accepted by FA1
486 static void fa_merge(struct fa *fa1, struct fa **fa2) {
487 list_append(fa1->initial, (*fa2)->initial);
493 * Operations on STATE_SET
495 static void state_set_free(struct state_set *set) {
503 static int state_set_init_data(struct state_set *set) {
505 if (set->data == NULL)
506 return ALLOC_N(set->data, set->size);
511 /* Create a new STATE_SET with an initial size of SIZE. If SIZE is -1, use
512 the default size ARRAY_INITIAL_SIZE. FLAGS is a bitmask indicating
514 - S_SORTED: keep the states in the set sorted by their address, and use
515 binary search for lookups. If it is not set, entries are kept in the
516 order in which they are added and lookups scan linearly through the
518 - S_DATA: allocate the DATA array in the set, and keep its size in sync
519 with the size of the STATES array.
521 static struct state_set *state_set_init(int size, int flags) {
522 struct state_set *set = NULL;
526 set->sorted = (flags & S_SORTED) ? 1 : 0;
527 set->with_data = (flags & S_DATA) ? 1 : 0;
530 F(ALLOC_N(set->states, set->size));
532 F(state_set_init_data(set));
540 ATTRIBUTE_RETURN_CHECK
541 static int state_set_expand(struct state_set *set) {
543 set->size = array_initial_size;
544 else if (set->size > array_max_expansion)
545 set->size += array_max_expansion;
548 if (REALLOC_N(set->states, set->size) < 0)
551 if (REALLOC_N(set->data, set->size) < 0)
555 /* We do this to provoke a SEGV as early as possible */
561 /* Return the index where S belongs in SET->STATES to keep it sorted. S
562 may not be in SET->STATES. The returned index is in the interval [0
563 .. SET->USED], with the latter indicating that S is larger than all
564 values in SET->STATES
566 static int state_set_pos(const struct state_set *set, const struct state *s) {
567 int l = 0, h = set->used;
570 if (set->states[m] > s)
572 else if (set->states[m] < s)
580 ATTRIBUTE_RETURN_CHECK
581 static int state_set_push(struct state_set *set, struct state *s) {
582 if (set->size == set->used)
583 if (state_set_expand(set) < 0)
586 int p = state_set_pos(set, s);
587 if (set->size == set->used)
588 if (state_set_expand(set) < 0)
590 while (p < set->used && set->states[p] <= s)
593 memmove(set->states + p + 1, set->states + p,
594 sizeof(*set->states) * (set->used - p));
595 if (set->data != NULL)
596 memmove(set->data + p + 1, set->data + p,
597 sizeof(*set->data) * (set->used - p));
603 set->states[set->used++] = s;
604 return set->used - 1;
608 ATTRIBUTE_RETURN_CHECK
609 static int state_set_push_data(struct state_set *set, struct state *s,
611 int i = state_set_push(set, s);
618 static int state_set_index(const struct state_set *set,
619 const struct state *s) {
621 int p = state_set_pos(set, s);
622 return (p < set->used && set->states[p] == s) ? p : -1;
624 for (int i=0; i < set->used; i++) {
625 if (set->states[i] == s)
632 static void state_set_remove(struct state_set *set,
633 const struct state *s) {
635 int p = state_set_index(set, s);
637 memmove(set->states + p, set->states + p + 1,
638 sizeof(*set->states) * (set->used - p - 1));
639 if (set->data != NULL)
640 memmove(set->data + p, set->data + p + 1,
641 sizeof(*set->data) * (set->used - p - 1));
643 int p = state_set_index(set, s);
645 set->states[p] = set->states[--set->used];
650 /* Only add S if it's not in SET yet. Return 1 if S was added, 0 if it was
651 already in the set and -1 on error. */
652 ATTRIBUTE_RETURN_CHECK
653 static int state_set_add(struct state_set *set, struct state *s) {
655 int p = state_set_pos(set, s);
656 if (p < set->used && set->states[p] == s)
658 if (set->size == set->used)
659 if (state_set_expand(set) < 0)
661 while (p < set->used && set->states[p] <= s)
664 memmove(set->states + p + 1, set->states + p,
665 sizeof(*set->states) * (set->used - p));
666 if (set->data != NULL)
667 memmove(set->data + p + 1, set->data + p,
668 sizeof(*set->data) * (set->used - p));
673 if (state_set_index(set, s) >= 0)
675 if (state_set_push(set, s) < 0)
680 /* We do this to provoke a SEGV as early as possible */
686 static struct state *state_set_pop(struct state_set *set) {
687 struct state *s = NULL;
689 s = set->states[--set->used];
693 static struct state *state_set_pop_data(struct state_set *set, void **d) {
694 struct state *s = NULL;
695 s = state_set_pop(set);
696 *d = set->data[set->used];
700 static void *state_set_find_data(struct state_set *set, struct state *s) {
701 int i = state_set_index(set, s);
708 static int state_set_equal(const struct state_set *set1,
709 const struct state_set *set2) {
710 if (set1->used != set2->used)
712 if (set1->sorted && set2->sorted) {
713 for (int i = 0; i < set1->used; i++)
714 if (set1->states[i] != set2->states[i])
718 for (int i=0; i < set1->used; i++)
719 if (state_set_index(set2, set1->states[i]) == -1)
726 static void state_set_compact(struct state_set *set) {
727 while (set->used > 0 && set->states[set->used] == NULL)
729 for (int i=0; i < set->used; i++) {
730 if (set->states[i] == NULL) {
732 set->states[i] = set->states[set->used];
734 set->data[i] = set->data[set->used];
736 while (set->used > 0 && set->states[set->used] == NULL)
742 /* Add an entry (FST, SND) to SET. FST is stored in SET->STATES, and SND is
743 stored in SET->DATA at the same index.
745 ATTRIBUTE_RETURN_CHECK
746 static int state_pair_push(struct state_set **set,
747 struct state *fst, struct state *snd) {
749 *set = state_set_init(-1, S_DATA);
752 int i = state_set_push(*set, fst);
755 (*set)->data[i] = snd;
760 /* Return the index of the pair (FST, SND) in SET, or -1 if SET contains no
763 static int state_pair_find(struct state_set *set, struct state *fst,
765 for (int i=0; i < set->used; i++)
766 if (set->states[i] == fst && set->data[i] == snd)
771 /* Jenkins' hash for void* */
772 static hash_val_t ptr_hash(const void *p) {
774 char *c = (char *) &p;
775 for (int i=0; i < sizeof(p); i++) {
777 hash += (hash << 10);
781 hash ^= (hash >> 11);
782 hash += (hash << 15);
786 typedef hash_t state_triple_hash;
788 static hash_val_t pair_hash(const void *key) {
789 register struct state *const *pair = key;
790 return pair[0]->hash + pair[1]->hash;
793 static int pair_cmp(const void *key1, const void *key2) {
794 return memcmp(key1, key2, 2*sizeof(struct state *));
797 static void state_triple_node_free(hnode_t *node, ATTRIBUTE_UNUSED void *ctx) {
798 free((void *) hnode_getkey(node));
802 static state_triple_hash *state_triple_init(void) {
803 state_triple_hash *hash;
805 hash = hash_create(HASHCOUNT_T_MAX, pair_cmp, pair_hash);
808 hash_set_allocator(hash, NULL, state_triple_node_free, NULL);
812 ATTRIBUTE_RETURN_CHECK
813 static int state_triple_push(state_triple_hash *hash,
818 if (ALLOC_N(pair, 2) < 0)
822 return hash_alloc_insert(hash, pair, s3);
825 static struct state * state_triple_thd(state_triple_hash *hash,
828 struct state *pair[2];
832 node = hash_lookup(hash, pair);
833 return (node == NULL) ? NULL : (struct state *) hnode_get(node);
836 static void state_triple_free(state_triple_hash *hash) {
838 hash_free_nodes(hash);
846 ATTRIBUTE_RETURN_CHECK
847 static int mark_reachable(struct fa *fa) {
848 struct state_set *worklist = state_set_init(-1, S_NONE);
853 list_for_each(s, fa->initial) {
856 fa->initial->reachable = 1;
858 for (struct state *s = fa->initial;
860 s = state_set_pop(worklist)) {
861 for_each_trans(t, s) {
862 if (! t->to->reachable) {
863 t->to->reachable = 1;
864 F(state_set_push(worklist, t->to));
871 state_set_free(worklist);
875 /* Return all reachable states. As a sideeffect, all states have their
876 REACHABLE flag set appropriately.
878 static struct state_set *fa_states(struct fa *fa) {
879 struct state_set *visited = state_set_init(-1, S_NONE);
882 r = mark_reachable(fa);
883 E(visited == NULL || r < 0);
885 list_for_each(s, fa->initial) {
887 F(state_set_push(visited, s));
891 state_set_free(visited);
895 /* Return all reachable accepting states. As a sideeffect, all states have
896 their REACHABLE flag set appropriately.
898 static struct state_set *fa_accept_states(struct fa *fa) {
899 struct state_set *accept = state_set_init(-1, S_NONE);
904 r = mark_reachable(fa);
907 list_for_each(s, fa->initial) {
908 if (s->reachable && s->accept)
909 F(state_set_push(accept, s));
913 state_set_free(accept);
917 /* Mark all live states, i.e. states from which an accepting state can be
918 reached. All states have their REACHABLE and LIVE flags set
921 ATTRIBUTE_RETURN_CHECK
922 static int mark_live(struct fa *fa) {
925 F(mark_reachable(fa));
927 list_for_each(s, fa->initial) {
928 s->live = s->reachable && s->accept;
933 list_for_each(s, fa->initial) {
934 if (! s->live && s->reachable) {
935 for_each_trans(t, s) {
951 * Reverse an automaton in place. Change FA so that it accepts the
952 * language that is the reverse of the input automaton.
954 * Returns a list of the new initial states of the automaton. The list must
955 * be freed by the caller.
957 static struct state_set *fa_reverse(struct fa *fa) {
958 struct state_set *all = NULL;
959 struct state_set *accept = NULL;
964 accept = fa_accept_states(fa);
967 F(state_set_init_data(all));
969 /* Reverse all transitions */
971 F(ALLOC_N(tused, all->used));
972 for (int i=0; i < all->used; i++) {
973 all->data[i] = all->states[i]->trans;
974 tused[i] = all->states[i]->tused;
975 all->states[i]->trans = NULL;
976 all->states[i]->tsize = 0;
977 all->states[i]->tused = 0;
979 for (int i=0; i < all->used; i++) {
980 struct state *s = all->states[i];
981 struct trans *t = all->data[i];
983 for (int j=0; j < tused[i]; j++) {
984 r = add_new_trans(t[j].to, s, t[j].min, t[j].max);
992 /* Make new initial and final states */
993 struct state *s = add_state(fa, 0);
996 fa->initial->accept = 1;
998 for (int i=0; i < accept->used; i++) {
999 r = add_epsilon_trans(s, accept->states[i]);
1004 fa->deterministic = 0;
1006 state_set_free(all);
1009 state_set_free(all);
1010 state_set_free(accept);
1015 * Return a sorted array of all interval start points in FA. The returned
1016 * array is a string (null terminated)
1018 static uchar* start_points(struct fa *fa, int *npoints) {
1019 char pointset[UCHAR_NUM];
1020 uchar *points = NULL;
1022 F(mark_reachable(fa));
1023 MEMZERO(pointset, UCHAR_NUM);
1024 list_for_each(s, fa->initial) {
1028 for_each_trans(t, s) {
1029 pointset[t->min] = 1;
1030 if (t->max < UCHAR_MAX)
1031 pointset[t->max+1] = 1;
1036 for(int i=0; i < UCHAR_NUM; *npoints += pointset[i], i++);
1038 F(ALLOC_N(points, *npoints+1));
1039 for (int i=0, n=0; i < UCHAR_NUM; i++) {
1041 points[n++] = (uchar) i;
1051 * Operations on STATE_SET_HASH
1053 static int state_set_hash_contains(state_set_hash *smap,
1054 struct state_set *set) {
1055 return hash_lookup(smap, set) != NULL;
1059 * Find the set in SMAP that has the same states as SET. If the two are
1060 * different, i.e. they point to different memory locations, free SET and
1061 * return the set found in SMAP
1063 static struct state_set *state_set_hash_uniq(state_set_hash *smap,
1064 struct state_set *set) {
1065 hnode_t *node = hash_lookup(smap, set);
1066 const struct state_set *orig_set = hnode_getkey(node);
1067 if (orig_set != set) {
1068 state_set_free(set);
1070 return (struct state_set *) orig_set;
1073 static struct state *state_set_hash_get_state(state_set_hash *smap,
1074 struct state_set *set) {
1075 hnode_t *node = hash_lookup(smap, set);
1076 return (struct state *) hnode_get(node);
1079 static hash_val_t set_hash(const void *key) {
1080 hash_val_t hash = 0;
1081 const struct state_set *set = key;
1083 for (int i = 0; i < set->used; i++) {
1084 hash += set->states[i]->hash;
1089 static int set_cmp(const void *key1, const void *key2) {
1090 const struct state_set *set1 = key1;
1091 const struct state_set *set2 = key2;
1093 return state_set_equal(set1, set2) ? 0 : 1;
1096 static void set_destroy(hnode_t *node, ATTRIBUTE_UNUSED void *ctx) {
1097 struct state_set *set = (struct state_set *) hnode_getkey(node);
1098 state_set_free(set);
1102 ATTRIBUTE_RETURN_CHECK
1103 static int state_set_hash_add(state_set_hash **smap,
1104 struct state_set *set, struct fa *fa) {
1105 if (*smap == NULL) {
1106 *smap = hash_create(HASHCOUNT_T_MAX, set_cmp, set_hash);
1108 hash_set_allocator(*smap, NULL, set_destroy, NULL);
1110 struct state *s = add_state(fa, 0);
1112 F(hash_alloc_insert(*smap, set, s));
1118 static void state_set_hash_free(state_set_hash *smap,
1119 struct state_set *protect) {
1120 if (protect != NULL) {
1121 hnode_t *node = hash_lookup(smap, protect);
1122 hash_delete(smap, node);
1123 hnode_getkey(node) = NULL;
1124 set_destroy(node, NULL);
1126 hash_free_nodes(smap);
1130 static int state_set_list_add(struct state_set_list **list,
1131 struct state_set *set) {
1132 struct state_set_list *elt;
1136 list_cons(*list, elt);
1140 static struct state_set *state_set_list_pop(struct state_set_list **list) {
1141 struct state_set_list *elt = *list;
1142 struct state_set *set = elt->set;
1149 /* Compare transitions lexicographically by (to, min, reverse max) */
1150 static int trans_to_cmp(const void *v1, const void *v2) {
1151 const struct trans *t1 = v1;
1152 const struct trans *t2 = v2;
1154 if (t1->to != t2->to) {
1155 return (t1->to < t2->to) ? -1 : 1;
1157 if (t1->min < t2->min)
1159 if (t1->min > t2->min)
1161 if (t1->max > t2->max)
1163 return (t1->max < t2->max) ? 1 : 0;
1166 /* Compare transitions lexicographically by (min, reverse max, to) */
1167 static int trans_intv_cmp(const void *v1, const void *v2) {
1168 const struct trans *t1 = v1;
1169 const struct trans *t2 = v2;
1171 if (t1->min < t2->min)
1173 if (t1->min > t2->min)
1175 if (t1->max > t2->max)
1177 if (t1->max < t2->max)
1179 if (t1->to != t2->to) {
1180 return (t1->to < t2->to) ? -1 : 1;
1186 * Reduce an automaton by combining overlapping and adjacent edge intervals
1187 * with the same destination.
1189 static void reduce(struct fa *fa) {
1190 list_for_each(s, fa->initial) {
1194 qsort(s->trans, s->tused, sizeof(*s->trans), trans_to_cmp);
1196 struct trans *t = s->trans;
1197 while (j < s->tused) {
1198 if (t[i].to == t[j].to && t[j].min <= t[i].max + 1) {
1199 if (t[j].max > t[i].max)
1200 t[i].max = t[j].max;
1205 memmove(s->trans + i, s->trans + j,
1206 sizeof(*s->trans) * (s->tused - j));
1212 /* Shrink if we use less than half the allocated size */
1213 if (s->tsize > array_initial_size && 2*s->tused < s->tsize) {
1215 r = REALLOC_N(s->trans, s->tsize);
1217 s->tsize = s->tused;
1223 * Remove dead transitions from an FA; a transition is dead is it does not
1224 * lead to a live state. This also removes any states that are not
1225 * reachable any longer from FA->INITIAL.
1227 * Returns the same FA as a convenience
1230 static void collect_trans(struct fa *fa) {
1231 list_for_each(s, fa->initial) {
1236 while (i < s->tused) {
1237 if (! s->trans[i].to->live) {
1239 memmove(s->trans + i, s->trans + s->tused,
1249 static void collect_dead_states(struct fa *fa) {
1250 /* Remove all dead states and free their storage */
1251 for (struct state *s = fa->initial; s->next != NULL; ) {
1252 if (! s->next->live) {
1253 struct state *del = s->next;
1254 s->next = del->next;
1263 static int collect(struct fa *fa) {
1266 if (! fa->initial->live) {
1267 /* This automaton accepts nothing, make it the canonical
1270 list_for_each(s, fa->initial) {
1273 list_free(fa->initial->next);
1274 fa->deterministic = 1;
1277 collect_dead_states(fa);
1285 static void swap_initial(struct fa *fa) {
1286 struct state *s = fa->initial;
1287 if (s->next != NULL) {
1288 fa->initial = s->next;
1289 s->next = fa->initial->next;
1290 fa->initial->next = s;
1295 * Make a finite automaton deterministic using the given set of initial
1296 * states with the subset construction. This also eliminates dead states
1297 * and transitions and reduces and orders the transitions for each state
1299 static int determinize(struct fa *fa, struct state_set *ini) {
1301 int make_ini = (ini == NULL);
1302 const uchar *points = NULL;
1303 state_set_hash *newstate = NULL;
1304 struct state_set_list *worklist = NULL;
1307 if (fa->deterministic)
1310 points = start_points(fa, &npoints);
1313 ini = state_set_init(-1, S_NONE);
1314 if (ini == NULL || state_set_push(ini, fa->initial) < 0)
1318 F(state_set_list_add(&worklist, ini));
1319 F(state_set_hash_add(&newstate, ini, fa));
1320 // Make the new state the initial state
1322 while (worklist != NULL) {
1323 struct state_set *sset = state_set_list_pop(&worklist);
1324 struct state *r = state_set_hash_get_state(newstate, sset);
1325 for (int q=0; q < sset->used; q++) {
1326 r->accept |= sset->states[q]->accept;
1328 for (int n=0; n < npoints; n++) {
1329 struct state_set *pset = state_set_init(-1, S_SORTED);
1331 for(int q=0 ; q < sset->used; q++) {
1332 for_each_trans(t, sset->states[q]) {
1333 if (t->min <= points[n] && points[n] <= t->max) {
1334 F(state_set_add(pset, t->to));
1338 if (!state_set_hash_contains(newstate, pset)) {
1339 F(state_set_list_add(&worklist, pset));
1340 F(state_set_hash_add(&newstate, pset, fa));
1342 pset = state_set_hash_uniq(newstate, pset);
1344 struct state *q = state_set_hash_get_state(newstate, pset);
1345 uchar min = points[n];
1346 uchar max = UCHAR_MAX;
1348 max = points[n+1] - 1;
1349 if (add_new_trans(r, q, min, max) < 0)
1353 fa->deterministic = 1;
1357 state_set_hash_free(newstate, make_ini ? NULL : ini);
1358 free((void *) points);
1359 if (collect(fa) < 0)
1368 * Minimization. As a sideeffect of minimization, the transitions are
1369 * reduced and ordered.
1372 static struct state *step(struct state *s, uchar c) {
1373 for_each_trans(t, s) {
1374 if (t->min <= c && c <= t->max)
1381 struct state_list_node *first;
1382 struct state_list_node *last;
1386 struct state_list_node {
1387 struct state_list *sl;
1388 struct state_list_node *next;
1389 struct state_list_node *prev;
1390 struct state *state;
1393 static struct state_list_node *state_list_add(struct state_list *sl,
1395 struct state_list_node *n;
1403 if (sl->size++ == 0) {
1414 static void state_list_remove(struct state_list_node *n) {
1415 struct state_list *sl = n->sl;
1418 sl->first = n->next;
1420 n->prev->next = n->next;
1424 n->next->prev = n->prev;
1429 static void state_list_free(struct state_list *sl) {
1431 list_free(sl->first);
1435 /* The linear index of element (q,c) in an NSTATES * NSIGMA matrix */
1436 #define INDEX(q, c) (q * nsigma + c)
1438 static int minimize_hopcroft(struct fa *fa) {
1439 struct state_set *states = NULL;
1440 uchar *sigma = NULL;
1441 struct state_set **reverse = NULL;
1442 bitset *reverse_nonempty = NULL;
1443 struct state_set **partition = NULL;
1444 unsigned int *block = NULL;
1445 struct state_list **active = NULL;
1446 struct state_list_node **active2 = NULL;
1447 int *pending = NULL;
1448 bitset *pending2 = NULL;
1449 struct state_set *split = NULL;
1450 bitset *split2 = NULL;
1452 bitset *refine2 = NULL;
1453 struct state_set **splitblock = NULL;
1454 struct state_set *newstates = NULL;
1458 unsigned int nstates = 0;
1461 F(determinize(fa, NULL));
1463 /* Total automaton, nothing to do */
1464 if (fa->initial->tused == 1
1465 && fa->initial->trans[0].to == fa->initial
1466 && fa->initial->trans[0].min == UCHAR_MIN
1467 && fa->initial->trans[0].max == UCHAR_MAX)
1472 /* make arrays for numbered states and effective alphabet */
1473 states = state_set_init(-1, S_NONE);
1476 list_for_each(s, fa->initial) {
1477 F(state_set_push(states, s));
1479 nstates = states->used;
1481 sigma = start_points(fa, &nsigma);
1484 /* initialize data structures */
1486 /* An ss->used x nsigma matrix of lists of states */
1487 F(ALLOC_N(reverse, nstates * nsigma));
1488 reverse_nonempty = bitset_init(nstates * nsigma);
1489 E(reverse_nonempty == NULL);
1490 F(ALLOC_N(partition, nstates));
1491 F(ALLOC_N(block, nstates));
1493 F(ALLOC_N(active, nstates * nsigma));
1494 F(ALLOC_N(active2, nstates * nsigma));
1496 /* PENDING is an array of pairs of ints. The i'th pair is stored in
1497 * PENDING[2*i] and PENDING[2*i + 1]. There are NPENDING pairs in
1498 * PENDING at any time. SPENDING is the maximum number of pairs
1499 * allocated for PENDING.
1501 size_t npending = 0, spending = 0;
1502 pending2 = bitset_init(nstates * nsigma);
1503 E(pending2 == NULL);
1505 split = state_set_init(-1, S_NONE);
1506 split2 = bitset_init(nstates);
1507 E(split == NULL || split2 == NULL);
1509 F(ALLOC_N(refine, nstates));
1510 refine2 = bitset_init(nstates);
1513 F(ALLOC_N(splitblock, nstates));
1515 for (int q = 0; q < nstates; q++) {
1516 splitblock[q] = state_set_init(-1, S_NONE);
1517 partition[q] = state_set_init(-1, S_NONE);
1518 E(splitblock[q] == NULL || partition[q] == NULL);
1519 for (int x = 0; x < nsigma; x++) {
1520 reverse[INDEX(q, x)] = state_set_init(-1, S_NONE);
1521 E(reverse[INDEX(q, x)] == NULL);
1522 F(ALLOC_N(active[INDEX(q, x)], 1));
1526 /* find initial partition and reverse edges */
1527 for (int q = 0; q < nstates; q++) {
1528 struct state *qq = states->states[q];
1534 F(state_set_push(partition[j], qq));
1536 for (int x = 0; x < nsigma; x++) {
1538 struct state *p = step(qq, y);
1540 int pn = state_set_index(states, p);
1542 F(state_set_push(reverse[INDEX(pn, x)], qq));
1543 bitset_set(reverse_nonempty, INDEX(pn, x));
1547 /* initialize active sets */
1548 for (int j = 0; j <= 1; j++)
1549 for (int x = 0; x < nsigma; x++)
1550 for (int q = 0; q < partition[j]->used; q++) {
1551 struct state *qq = partition[j]->states[q];
1552 int qn = state_set_index(states, qq);
1553 if (bitset_get(reverse_nonempty, INDEX(qn, x))) {
1554 active2[INDEX(qn, x)] =
1555 state_list_add(active[INDEX(j, x)], qq);
1556 E(active2[INDEX(qn, x)] == NULL);
1560 /* initialize pending */
1561 F(ALLOC_N(pending, 2*nsigma));
1564 for (int x = 0; x < nsigma; x++) {
1565 int a0 = active[INDEX(0,x)]->size;
1566 int a1 = active[INDEX(1,x)]->size;
1574 bitset_set(pending2, INDEX(j, x));
1577 /* process pending until fixed point */
1579 while (npending-- > 0) {
1580 int p = pending[2*npending];
1581 int x = pending[2*npending+1];
1582 bitset_clr(pending2, INDEX(p, x));
1584 /* find states that need to be split off their blocks */
1585 struct state_list *sh = active[INDEX(p,x)];
1586 for (struct state_list_node *m = sh->first; m != NULL; m = m->next) {
1587 int q = state_set_index(states, m->state);
1588 struct state_set *rev = reverse[INDEX(q, x)];
1589 for (int r =0; r < rev->used; r++) {
1590 struct state *rs = rev->states[r];
1591 int s = state_set_index(states, rs);
1592 if (! bitset_get(split2, s)) {
1593 bitset_set(split2, s);
1594 F(state_set_push(split, rs));
1596 F(state_set_push(splitblock[j], rs));
1597 if (!bitset_get(refine2, j)) {
1598 bitset_set(refine2, j);
1605 for(int rr=0; rr < ref; rr++) {
1607 struct state_set *sp = splitblock[j];
1608 if (sp->used < partition[j]->used) {
1609 struct state_set *b1 = partition[j];
1610 struct state_set *b2 = partition[k];
1611 for (int s = 0; s < sp->used; s++) {
1612 state_set_remove(b1, sp->states[s]);
1613 F(state_set_push(b2, sp->states[s]));
1614 int snum = state_set_index(states, sp->states[s]);
1616 for (int c = 0; c < nsigma; c++) {
1617 struct state_list_node *sn = active2[INDEX(snum, c)];
1618 if (sn != NULL && sn->sl == active[INDEX(j,c)]) {
1619 state_list_remove(sn);
1620 active2[INDEX(snum, c)] =
1621 state_list_add(active[INDEX(k, c)],
1623 E(active2[INDEX(snum, c)] == NULL);
1628 for (int c = 0; c < nsigma; c++) {
1629 int aj = active[INDEX(j, c)]->size;
1630 int ak = active[INDEX(k, c)]->size;
1631 if (npending + 1 > spending) {
1633 F(REALLOC_N(pending, 2 * spending));
1635 pending[2*npending + 1] = c;
1636 if (!bitset_get(pending2, INDEX(j, c))
1637 && 0 < aj && aj <= ak) {
1638 bitset_set(pending2, INDEX(j, c));
1639 pending[2*npending] = j;
1641 bitset_set(pending2, INDEX(k, c));
1642 pending[2*npending] = k;
1648 for (int s = 0; s < sp->used; s++) {
1649 int snum = state_set_index(states, sp->states[s]);
1650 bitset_clr(split2, snum);
1652 bitset_clr(refine2, j);
1658 /* make a new state for each equivalence class, set initial state */
1659 newstates = state_set_init(k, S_NONE);
1660 E(newstates == NULL);
1661 F(ALLOC_N(nsnum, k));
1662 F(ALLOC_N(nsind, nstates));
1664 for (int n = 0; n < k; n++) {
1665 struct state *s = make_state();
1667 newstates->states[n] = s;
1668 struct state_set *partn = partition[n];
1669 for (int q=0; q < partn->used; q++) {
1670 struct state *qs = partn->states[q];
1671 int qnum = state_set_index(states, qs);
1672 if (qs == fa->initial)
1673 s->live = 1; /* Abuse live to flag the new intial state */
1674 nsnum[n] = qnum; /* select representative */
1675 nsind[qnum] = n; /* and point from partition to new state */
1679 /* build transitions and set acceptance */
1680 for (int n = 0; n < k; n++) {
1681 struct state *s = newstates->states[n];
1682 s->accept = states->states[nsnum[n]]->accept;
1683 for_each_trans(t, states->states[nsnum[n]]) {
1684 int toind = state_set_index(states, t->to);
1685 struct state *nto = newstates->states[nsind[toind]];
1686 F(add_new_trans(s, nto, t->min, t->max));
1690 /* Get rid of old states and transitions and turn NEWTSTATES into
1693 for (int n=0; n < k; n++)
1694 if (newstates->states[n]->live) {
1695 struct state *ini = newstates->states[n];
1696 newstates->states[n] = newstates->states[0];
1697 newstates->states[0] = ini;
1699 for (int n=0; n < k-1; n++)
1700 newstates->states[n]->next = newstates->states[n+1];
1701 fa->initial = newstates->states[0];
1709 state_set_free(states);
1711 bitset_free(reverse_nonempty);
1713 for (int i=0; i < nstates*nsigma; i++) {
1715 state_set_free(reverse[i]);
1717 state_list_free(active[i]);
1723 bitset_free(pending2);
1724 state_set_free(split);
1725 bitset_free(split2);
1727 bitset_free(refine2);
1728 for (int q=0; q < nstates; q++) {
1730 state_set_free(splitblock[q]);
1732 state_set_free(partition[q]);
1736 state_set_free(newstates);
1738 if (collect(fa) < 0)
1746 static int minimize_brzozowski(struct fa *fa) {
1747 struct state_set *set;
1749 /* Minimize using Brzozowski's algorithm */
1750 set = fa_reverse(fa);
1752 F(determinize(fa, set));
1753 state_set_free(set);
1755 set = fa_reverse(fa);
1757 F(determinize(fa, set));
1758 state_set_free(set);
1764 int fa_minimize(struct fa *fa) {
1772 if (fa_minimization_algorithm == FA_MIN_BRZOZOWSKI) {
1773 r = minimize_brzozowski(fa);
1775 r = minimize_hopcroft(fa);
1784 * Construction of fa
1787 static struct fa *fa_make_empty(void) {
1792 if (add_state(fa, 0) == NULL) {
1796 fa->deterministic = 1;
1801 static struct fa *fa_make_epsilon(void) {
1802 struct fa *fa = fa_make_empty();
1804 fa->initial->accept = 1;
1805 fa->deterministic= 1;
1811 static struct fa *fa_make_char(uchar c) {
1812 struct fa *fa = fa_make_empty();
1816 struct state *s = fa->initial;
1817 struct state *t = add_state(fa, 1);
1823 r = add_new_trans(s, t, c, c);
1826 fa->deterministic = 1;
1834 struct fa *fa_make_basic(unsigned int basic) {
1837 if (basic == FA_EMPTY) {
1838 return fa_make_empty();
1839 } else if (basic == FA_EPSILON) {
1840 return fa_make_epsilon();
1841 } else if (basic == FA_TOTAL) {
1842 struct fa *fa = fa_make_epsilon();
1843 r = add_new_trans(fa->initial, fa->initial, UCHAR_MIN, UCHAR_MAX);
1853 int fa_is_basic(struct fa *fa, unsigned int basic) {
1854 if (basic == FA_EMPTY) {
1855 return ! fa->initial->accept && fa->initial->tused == 0;
1856 } else if (basic == FA_EPSILON) {
1857 return fa->initial->accept && fa->initial->tused == 0;
1858 } else if (basic == FA_TOTAL) {
1859 if (! fa->initial->accept)
1862 if (fa->initial->tused != 2)
1864 struct trans *t1 = fa->initial->trans;
1865 struct trans *t2 = fa->initial->trans + 1;
1866 if (t1->to != fa->initial || t2->to != fa->initial)
1868 if (t2->max != UCHAR_MAX) {
1870 t2 = fa->initial->trans;
1872 return (t1->min == UCHAR_MIN && t1->max == 'A' - 1 &&
1873 t2->min == 'Z' + 1 && t2->max == UCHAR_MAX);
1875 struct trans *t = fa->initial->trans;
1876 return fa->initial->tused == 1 &&
1877 t->to == fa->initial &&
1878 t->min == UCHAR_MIN && t->max == UCHAR_MAX;
1884 static struct fa *fa_clone(struct fa *fa) {
1885 struct fa *result = NULL;
1886 struct state_set *set = state_set_init(-1, S_DATA|S_SORTED);
1889 if (fa == NULL || set == NULL || ALLOC(result) < 0)
1892 result->deterministic = fa->deterministic;
1893 result->minimal = fa->minimal;
1894 result->nocase = fa->nocase;
1895 list_for_each(s, fa->initial) {
1896 int i = state_set_push(set, s);
1899 struct state *q = add_state(result, s->accept);
1904 q->reachable = s->reachable;
1906 for (int i=0; i < set->used; i++) {
1907 struct state *s = set->states[i];
1908 struct state *sc = set->data[i];
1909 for_each_trans(t, s) {
1910 int to = state_set_index(set, t->to);
1912 struct state *toc = set->data[to];
1913 r = add_new_trans(sc, toc, t->min, t->max);
1918 state_set_free(set);
1921 state_set_free(set);
1926 static int case_expand(struct fa *fa);
1928 /* Compute FA1|FA2 and set FA1 to that automaton. FA2 is freed */
1929 ATTRIBUTE_RETURN_CHECK
1930 static int union_in_place(struct fa *fa1, struct fa **fa2) {
1934 if (fa1->nocase != (*fa2)->nocase) {
1935 if (case_expand(fa1) < 0)
1937 if (case_expand(*fa2) < 0)
1941 s = add_state(fa1, 0);
1944 r = add_epsilon_trans(s, fa1->initial);
1947 r = add_epsilon_trans(s, (*fa2)->initial);
1951 fa1->deterministic = 0;
1955 set_initial(fa1, s);
1960 struct fa *fa_union(struct fa *fa1, struct fa *fa2) {
1961 fa1 = fa_clone(fa1);
1962 fa2 = fa_clone(fa2);
1963 if (fa1 == NULL || fa2 == NULL)
1966 F(union_in_place(fa1, &fa2));
1975 /* Concat FA2 onto FA1; frees FA2 and changes FA1 to FA1.FA2 */
1976 ATTRIBUTE_RETURN_CHECK
1977 static int concat_in_place(struct fa *fa1, struct fa **fa2) {
1980 if (fa1->nocase != (*fa2)->nocase) {
1981 if (case_expand(fa1) < 0)
1983 if (case_expand(*fa2) < 0)
1987 list_for_each(s, fa1->initial) {
1990 r = add_epsilon_trans(s, (*fa2)->initial);
1996 fa1->deterministic = 0;
2003 struct fa *fa_concat(struct fa *fa1, struct fa *fa2) {
2004 fa1 = fa_clone(fa1);
2005 fa2 = fa_clone(fa2);
2007 if (fa1 == NULL || fa2 == NULL)
2010 F(concat_in_place(fa1, &fa2));
2022 static struct fa *fa_make_char_set(bitset *cset, int negate) {
2023 struct fa *fa = fa_make_empty();
2027 struct state *s = fa->initial;
2028 struct state *t = add_state(fa, 1);
2035 while (from <= UCHAR_MAX) {
2036 while (from <= UCHAR_MAX && bitset_get(cset, from) == negate)
2038 if (from > UCHAR_MAX)
2041 while (to < UCHAR_MAX && (bitset_get(cset, to + 1) == !negate))
2043 r = add_new_trans(s, t, from, to);
2049 fa->deterministic = 1;
2058 static struct fa *fa_star(struct fa *fa) {
2066 s = add_state(fa, 1);
2070 r = add_epsilon_trans(s, fa->initial);
2075 list_for_each(p, fa->initial->next) {
2077 r = add_epsilon_trans(p, s);
2082 fa->deterministic = 0;
2092 /* Form the automaton (FA){N}; FA is not modified */
2093 static struct fa *repeat(struct fa *fa, int n) {
2095 return fa_make_epsilon();
2096 } else if (n == 1) {
2097 return fa_clone(fa);
2099 struct fa *cfa = fa_clone(fa);
2103 struct fa *tfa = fa_clone(fa);
2108 if (concat_in_place(cfa, &tfa) < 0) {
2119 struct fa *fa_iter(struct fa *fa, int min, int max) {
2125 if (min > max && max != -1) {
2126 return fa_make_empty();
2129 struct fa *sfa = fa_star(fa);
2134 struct fa *cfa = repeat(fa, min);
2139 if (concat_in_place(cfa, &sfa) < 0) {
2146 struct fa *cfa = NULL;
2149 cfa = repeat(fa, min);
2153 struct fa *cfa2 = fa_clone(fa);
2159 struct fa *cfa3 = fa_clone(fa);
2165 list_for_each(s, cfa3->initial) {
2167 r = add_epsilon_trans(s, cfa2->initial);
2176 fa_merge(cfa3, &cfa2);
2180 list_for_each(s, cfa->initial) {
2182 r = add_epsilon_trans(s, cfa2->initial);
2190 fa_merge(cfa, &cfa2);
2191 cfa->deterministic = 0;
2194 if (collect(cfa) < 0) {
2202 static void sort_transition_intervals(struct fa *fa) {
2203 list_for_each(s, fa->initial) {
2204 qsort(s->trans, s->tused, sizeof(*s->trans), trans_intv_cmp);
2208 struct fa *fa_intersect(struct fa *fa1, struct fa *fa2) {
2210 struct fa *fa = NULL;
2211 struct state_set *worklist = NULL;
2212 state_triple_hash *newstates = NULL;
2215 return fa_clone(fa1);
2217 if (fa_is_basic(fa1, FA_EMPTY) || fa_is_basic(fa2, FA_EMPTY))
2218 return fa_make_empty();
2220 if (fa1->nocase != fa2->nocase) {
2221 F(case_expand(fa1));
2222 F(case_expand(fa2));
2225 fa = fa_make_empty();
2226 worklist = state_set_init(-1, S_NONE);
2227 newstates = state_triple_init();
2228 if (fa == NULL || worklist == NULL || newstates == NULL)
2231 sort_transition_intervals(fa1);
2232 sort_transition_intervals(fa2);
2234 F(state_set_push(worklist, fa1->initial));
2235 F(state_set_push(worklist, fa2->initial));
2236 F(state_set_push(worklist, fa->initial));
2237 F(state_triple_push(newstates,
2238 fa1->initial, fa2->initial, fa->initial));
2239 while (worklist->used) {
2240 struct state *s = state_set_pop(worklist);
2241 struct state *p2 = state_set_pop(worklist);
2242 struct state *p1 = state_set_pop(worklist);
2243 s->accept = p1->accept && p2->accept;
2245 struct trans *t1 = p1->trans;
2246 struct trans *t2 = p2->trans;
2247 for (int n1 = 0, b2 = 0; n1 < p1->tused; n1++) {
2248 while (b2 < p2->tused && t2[b2].max < t1[n1].min)
2251 n2 < p2->tused && t1[n1].max >= t2[n2].min;
2253 if (t2[n2].max >= t1[n1].min) {
2254 struct state *r = state_triple_thd(newstates,
2255 t1[n1].to, t2[n2].to);
2257 r = add_state(fa, 0);
2259 F(state_set_push(worklist, t1[n1].to));
2260 F(state_set_push(worklist, t2[n2].to));
2261 F(state_set_push(worklist, r));
2262 F(state_triple_push(newstates,
2263 t1[n1].to, t2[n2].to, r));
2265 char min = t1[n1].min > t2[n2].min
2266 ? t1[n1].min : t2[n2].min;
2267 char max = t1[n1].max < t2[n2].max
2268 ? t1[n1].max : t2[n2].max;
2269 ret = add_new_trans(s, r, min, max);
2276 fa->deterministic = fa1->deterministic && fa2->deterministic;
2277 fa->nocase = fa1->nocase && fa2->nocase;
2279 state_set_free(worklist);
2280 state_triple_free(newstates);
2282 if (collect(fa) < 0) {
2295 int fa_contains(struct fa *fa1, struct fa *fa2) {
2297 struct state_set *worklist = NULL; /* List of pairs of states */
2298 struct state_set *visited = NULL; /* List of pairs of states */
2300 if (fa1 == NULL || fa2 == NULL)
2306 F(determinize(fa2, NULL));
2307 sort_transition_intervals(fa1);
2308 sort_transition_intervals(fa2);
2310 F(state_pair_push(&worklist, fa1->initial, fa2->initial));
2311 F(state_pair_push(&visited, fa1->initial, fa2->initial));
2312 while (worklist->used) {
2313 struct state *p1, *p2;
2315 p1 = state_set_pop_data(worklist, &v2);
2318 if (p1->accept && !p2->accept)
2321 struct trans *t1 = p1->trans;
2322 struct trans *t2 = p2->trans;
2323 for(int n1 = 0, b2 = 0; n1 < p1->tused; n1++) {
2324 while (b2 < p2->tused && t2[b2].max < t1[n1].min)
2326 int min1 = t1[n1].min, max1 = t1[n1].max;
2328 n2 < p2->tused && t1[n1].max >= t2[n2].min;
2330 if (t2[n2].min > min1)
2332 if (t2[n2].max < CHAR_MAX)
2333 min1 = t2[n2].max + 1;
2338 if (state_pair_find(visited, t1[n1].to, t2[n2].to) == -1) {
2339 F(state_pair_push(&worklist, t1[n1].to, t2[n2].to));
2340 F(state_pair_push(&visited, t1[n1].to, t2[n2].to));
2350 state_set_free(worklist);
2351 state_set_free(visited);
2358 static int add_crash_trans(struct fa *fa, struct state *s, struct state *crash,
2363 /* Never transition on anything in [A-Z] */
2364 if (min > 'Z' || max < 'A') {
2365 result = add_new_trans(s, crash, min, max);
2366 } else if (min >= 'A' && max <= 'Z') {
2368 } else if (max <= 'Z') {
2370 result = add_new_trans(s, crash, min, 'A' - 1);
2371 } else if (min >= 'A') {
2373 result = add_new_trans(s, crash, 'Z' + 1, max);
2375 /* min < 'A' && max > 'Z' */
2376 result = add_new_trans(s, crash, min, 'A' - 1);
2378 result = add_new_trans(s, crash, 'Z' + 1, max);
2381 result = add_new_trans(s, crash, min, max);
2386 static int totalize(struct fa *fa) {
2388 struct state *crash = add_state(fa, 0);
2391 F(mark_reachable(fa));
2392 sort_transition_intervals(fa);
2394 r = add_crash_trans(fa, crash, crash, UCHAR_MIN, UCHAR_MAX);
2398 list_for_each(s, fa->initial) {
2399 int next = UCHAR_MIN;
2400 int tused = s->tused;
2401 for (int i=0; i < tused; i++) {
2402 uchar min = s->trans[i].min, max = s->trans[i].max;
2404 r = add_crash_trans(fa, s, crash, next, min - 1);
2411 if (next <= UCHAR_MAX) {
2412 r = add_crash_trans(fa, s, crash, next, UCHAR_MAX);
2422 struct fa *fa_complement(struct fa *fa) {
2425 F(determinize(fa, NULL));
2427 list_for_each(s, fa->initial)
2428 s->accept = ! s->accept;
2437 struct fa *fa_minus(struct fa *fa1, struct fa *fa2) {
2438 if (fa1 == NULL || fa2 == NULL)
2441 if (fa_is_basic(fa1, FA_EMPTY) || fa1 == fa2)
2442 return fa_make_empty();
2443 if (fa_is_basic(fa2, FA_EMPTY))
2444 return fa_clone(fa1);
2446 struct fa *cfa2 = fa_complement(fa2);
2450 struct fa *result = fa_intersect(fa1, cfa2);
2455 static int accept_to_accept(struct fa *fa) {
2457 struct state *s = add_state(fa, 0);
2461 F(mark_reachable(fa));
2462 list_for_each(a, fa->initial) {
2463 if (a->accept && a->reachable) {
2464 r = add_epsilon_trans(s, a);
2471 fa->deterministic = fa->minimal = 0;
2477 struct fa *fa_overlap(struct fa *fa1, struct fa *fa2) {
2478 struct fa *fa = NULL, *eps = NULL, *result = NULL;
2479 struct state_set *map = NULL;
2481 if (fa1 == NULL || fa2 == NULL)
2484 fa1 = fa_clone(fa1);
2485 fa2 = fa_clone(fa2);
2486 if (fa1 == NULL || fa2 == NULL)
2489 if (determinize(fa1, NULL) < 0)
2491 if (accept_to_accept(fa1) < 0)
2494 map = fa_reverse(fa2);
2495 state_set_free(map);
2496 if (determinize(fa2, NULL) < 0)
2498 if (accept_to_accept(fa2) < 0)
2500 map = fa_reverse(fa2);
2501 state_set_free(map);
2502 if (determinize(fa2, NULL) < 0)
2505 fa = fa_intersect(fa1, fa2);
2509 eps = fa_make_epsilon();
2513 result = fa_minus(fa, eps);
2525 int fa_equals(struct fa *fa1, struct fa *fa2) {
2526 if (fa1 == NULL || fa2 == NULL)
2529 /* fa_contains(fa1, fa2) && fa_contains(fa2, fa1) with error checking */
2530 int c1 = fa_contains(fa1, fa2);
2535 return fa_contains(fa2, fa1);
2538 static unsigned int chr_score(char c) {
2541 } else if (isalnum(c)) {
2543 } else if (isprint(c)) {
2545 } else if (c == '\0') {
2552 static unsigned int str_score(const struct re_str *str) {
2553 unsigned int score = 0;
2554 for (int i=0; i < str->len; i++) {
2555 score += chr_score(str->rx[i]);
2560 /* See if we get a better string for DST by appending C to SRC. If DST is
2561 * NULL or empty, always use SRC + C
2563 static struct re_str *string_extend(struct re_str *dst,
2564 const struct re_str *src,
2568 || str_score(src) + chr_score(c) < str_score(dst)) {
2569 int slen = src->len;
2571 dst = make_re_str(NULL);
2574 if (REALLOC_N(dst->rx, slen+2) < 0) {
2578 memcpy(dst->rx, src->rx, slen);
2580 dst->rx[slen + 1] = '\0';
2581 dst->len = slen + 1;
2586 static char pick_char(struct trans *t) {
2587 for (int c = t->min; c <= t->max; c++)
2588 if (isalpha(c)) return c;
2589 for (int c = t->min; c <= t->max; c++)
2590 if (isalnum(c)) return c;
2591 for (int c = t->min; c <= t->max; c++)
2592 if (isprint(c)) return c;
2596 /* Generate an example string for FA. Traverse all transitions and record
2597 * at each turn the "best" word found for that state.
2599 int fa_example(struct fa *fa, char **example, size_t *example_len) {
2600 struct re_str *word = NULL;
2601 struct state_set *path = NULL, *worklist = NULL;
2602 struct re_str *str = NULL;
2607 /* Sort to avoid any ambiguity because of reordering of transitions */
2608 sort_transition_intervals(fa);
2610 /* Map from state to string */
2611 path = state_set_init(-1, S_DATA|S_SORTED);
2612 str = make_re_str("");
2613 if (path == NULL || str == NULL)
2615 F(state_set_push_data(path, fa->initial, str));
2618 /* List of states still to visit */
2619 worklist = state_set_init(-1, S_NONE);
2620 if (worklist == NULL)
2622 F(state_set_push(worklist, fa->initial));
2624 while (worklist->used) {
2625 struct state *s = state_set_pop(worklist);
2626 struct re_str *ps = state_set_find_data(path, s);
2627 for_each_trans(t, s) {
2628 char c = pick_char(t);
2629 int toind = state_set_index(path, t->to);
2631 struct re_str *w = string_extend(NULL, ps, c);
2633 F(state_set_push(worklist, t->to));
2634 F(state_set_push_data(path, t->to, w));
2636 path->data[toind] = string_extend(path->data[toind], ps, c);
2641 for (int i=0; i < path->used; i++) {
2642 struct state *p = path->states[i];
2643 struct re_str *ps = path->data[i];
2645 (word == NULL || word->len == 0
2646 || (ps->len > 0 && str_score(word) > str_score(ps)))) {
2653 state_set_free(path);
2654 state_set_free(worklist);
2656 *example_len = word->len;
2657 *example = word->rx;
2662 state_set_free(path);
2663 state_set_free(worklist);
2677 static int fa_enumerate_intl(struct state *s, struct enum_intl *ei, int pos) {
2680 if (ei->bsize <= pos + 1) {
2682 F(REALLOC_N(ei->buf, ei->bsize));
2685 ei->buf[pos] = '\0';
2686 for_each_trans(t, s) {
2690 for (int i=t->min; i <= t->max; i++) {
2692 if (t->to->accept) {
2693 if (ei->nwords >= ei->limit)
2695 ei->words[ei->nwords] = strdup(ei->buf);
2696 E(ei->words[ei->nwords] == NULL);
2699 result = fa_enumerate_intl(t->to, ei, pos+1);
2704 ei->buf[pos] = '\0';
2710 int fa_enumerate(struct fa *fa, int limit, char ***words) {
2711 struct enum_intl ei;
2716 ei.bsize = 8; /* Arbitrary initial size */
2718 F(ALLOC_N(ei.words, limit));
2719 F(ALLOC_N(ei.buf, ei.bsize));
2721 /* We use the visited bit to track which states we already visited
2722 * during the construction of a word to detect loops */
2723 list_for_each(s, fa->initial)
2725 fa->initial->visited = 1;
2726 if (fa->initial->accept) {
2727 if (ei.nwords >= limit)
2729 ei.words[0] = strdup("");
2730 E(ei.words[0] == NULL);
2733 result = fa_enumerate_intl(fa->initial, &ei, 0);
2744 for (int i=0; i < ei.nwords; i++)
2750 /* Expand the automaton FA by replacing every transition s(c) -> p from
2751 * state s to p on character c by two transitions s(X) -> r, r(c) -> p via
2753 * If ADD_MARKER is true, also add for each original state s a new a loop
2754 * s(Y) -> q and q(X) -> s through a new state q.
2756 * The end result is that an automaton accepting "a|ab" is turned into one
2757 * accepting "Xa|XaXb" if add_marker is false and "(YX)*Xa|(YX)*Xa(YX)*Xb"
2758 * when add_marker is true.
2760 * The returned automaton is a copy of FA, FA is not modified.
2762 static struct fa *expand_alphabet(struct fa *fa, int add_marker,
2770 F(mark_reachable(fa));
2771 list_for_each(p, fa->initial) {
2775 struct state *r = add_state(fa, 0);
2778 r->trans = p->trans;
2779 r->tused = p->tused;
2780 r->tsize = p->tsize;
2782 p->tused = p->tsize = 0;
2783 ret = add_new_trans(p, r, X, X);
2787 struct state *q = add_state(fa, 0);
2788 ret = add_new_trans(p, q, Y, Y);
2791 ret = add_new_trans(q, p, X, X);
2802 static bitset *alphabet(struct fa *fa) {
2803 bitset *bs = bitset_init(UCHAR_NUM);
2808 list_for_each(s, fa->initial) {
2809 for (int i=0; i < s->tused; i++) {
2810 for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2817 static bitset *last_chars(struct fa *fa) {
2818 bitset *bs = bitset_init(UCHAR_NUM);
2823 list_for_each(s, fa->initial) {
2824 for (int i=0; i < s->tused; i++) {
2825 if (s->trans[i].to->accept) {
2826 for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2834 static bitset *first_chars(struct fa *fa) {
2835 bitset *bs = bitset_init(UCHAR_NUM);
2836 struct state *s = fa->initial;
2841 for (int i=0; i < s->tused; i++) {
2842 for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2848 /* Return 1 if F1 and F2 are known to be unambiguously concatenable
2849 * according to simple heuristics. Return 0 if they need to be checked
2850 * further to decide ambiguity
2851 * Return -1 if an allocation fails
2853 static int is_splittable(struct fa *fa1, struct fa *fa2) {
2854 bitset *alpha1 = NULL;
2855 bitset *alpha2 = NULL;
2856 bitset *last1 = NULL;
2857 bitset *first2 = NULL;
2860 alpha2 = alphabet(fa2);
2861 last1 = last_chars(fa1);
2862 if (alpha2 == NULL || last1 == NULL)
2864 if (bitset_disjoint(last1, alpha2, UCHAR_NUM)) {
2869 alpha1 = alphabet(fa1);
2870 first2 = first_chars(fa2);
2871 if (alpha1 == NULL || first2 == NULL)
2873 if (bitset_disjoint(first2, alpha1, UCHAR_NUM)) {
2879 bitset_free(alpha1);
2880 bitset_free(alpha2);
2882 bitset_free(first2);
2886 /* This algorithm is due to Anders Moeller, and can be found in class
2887 * AutomatonOperations in dk.brics.grammar
2889 int fa_ambig_example(struct fa *fa1, struct fa *fa2,
2890 char **upv, size_t *upv_len,
2891 char **pv, char **v) {
2892 static const char X = '\001';
2893 static const char Y = '\002';
2894 char *result = NULL, *s = NULL;
2895 size_t result_len = 0;
2897 struct fa *mp = NULL, *ms = NULL, *sp = NULL, *ss = NULL, *amb = NULL;
2898 struct fa *a1f = NULL, *a1t = NULL, *a2f = NULL, *a2t = NULL;
2899 struct fa *b1 = NULL, *b2 = NULL;
2908 r = is_splittable(fa1, fa2);
2916 #define MPs Ys Xs "(" Xs "(.|\n))+"
2917 #define MSs Ys Xs "(" Xs "(.|\n))*"
2918 #define SPs "(" Xs "(.|\n))+" Ys Xs
2919 #define SSs "(" Xs "(.|\n))*" Ys Xs
2920 /* These could become static constants */
2921 r = fa_compile( MPs, strlen(MPs), &mp);
2922 if (r != REG_NOERROR)
2924 r = fa_compile( MSs, strlen(MSs), &ms);
2925 if (r != REG_NOERROR)
2927 r = fa_compile( SPs, strlen(SPs), &sp);
2928 if (r != REG_NOERROR)
2930 r = fa_compile( SSs, strlen(SSs), &ss);
2931 if (r != REG_NOERROR)
2940 a1f = expand_alphabet(fa1, 0, X, Y);
2941 a1t = expand_alphabet(fa1, 1, X, Y);
2942 a2f = expand_alphabet(fa2, 0, X, Y);
2943 a2t = expand_alphabet(fa2, 1, X, Y);
2944 if (a1f == NULL || a1t == NULL || a2f == NULL || a2t == NULL)
2947 /* Compute b1 = ((a1f . mp) & a1t) . ms */
2948 if (concat_in_place(a1f, &mp) < 0)
2950 b1 = fa_intersect(a1f, a1t);
2953 if (concat_in_place(b1, &ms) < 0)
2956 /* Compute b2 = ss . ((sp . a2f) & a2t) */
2957 if (concat_in_place(sp, &a2f) < 0)
2959 b2 = fa_intersect(sp, a2t);
2962 if (concat_in_place(ss, &b2) < 0)
2967 /* The automaton we are really interested in */
2968 amb = fa_intersect(b1, b2);
2973 r = fa_example(amb, &s, &s_len);
2979 result_len = (s_len-1)/2 - 1;
2980 F(ALLOC_N(result, result_len + 1));
2983 for (i=0; s[2*i] == X; i++) {
2984 assert((t - result) < result_len);
2991 for ( ;s[2*i] == X; i++) {
2992 assert((t - result) < result_len);
2999 for (; 2*i+1 < s_len; i++) {
3000 assert((t - result) < result_len);
3007 /* Clean up intermediate automata */
3023 *upv_len = result_len;
3032 * Construct an fa from a regular expression
3034 static struct fa *fa_from_re(struct re *re) {
3035 struct fa *result = NULL;
3040 result = fa_from_re(re->exp1);
3043 struct fa *fa2 = fa_from_re(re->exp2);
3046 if (union_in_place(result, &fa2) < 0)
3052 result = fa_from_re(re->exp1);
3055 struct fa *fa2 = fa_from_re(re->exp2);
3058 if (concat_in_place(result, &fa2) < 0)
3063 result = fa_make_char_set(re->cset, re->negate);
3067 struct fa *fa = fa_from_re(re->exp);
3070 result = fa_iter(fa, re->min, re->max);
3075 result = fa_make_epsilon();
3078 result = fa_make_char(re->c);
3090 static void free_re(struct re *re) {
3093 assert(re->ref == 0);
3095 if (re->type == UNION || re->type == CONCAT) {
3098 } else if (re->type == ITER) {
3100 } else if (re->type == CSET) {
3101 bitset_free(re->cset);
3106 int fa_compile(const char *regexp, size_t size, struct fa **fa) {
3107 struct re *re = NULL;
3108 struct re_parse parse;
3113 parse.rend = regexp + size;
3114 parse.error = REG_NOERROR;
3116 re = parse_regexp(&parse);
3120 *fa = fa_from_re(re);
3123 if (*fa == NULL || collect(*fa) < 0)
3124 parse.error = REG_ESPACE;
3128 /* We represent a case-insensitive FA by using only transitions on
3129 * lower-case letters.
3131 int fa_nocase(struct fa *fa) {
3132 if (fa == NULL || fa->nocase)
3136 list_for_each(s, fa->initial) {
3137 int tused = s->tused;
3138 /* For every transition on characters in [A-Z] add a corresponding
3139 * transition on [a-z]; remove any portion covering [A-Z] */
3140 for (int i=0; i < tused; i++) {
3141 struct trans *t = s->trans + i;
3142 int lc_min = t->min < 'A' ? 'a' : tolower(t->min);
3143 int lc_max = t->max > 'Z' ? 'z' : tolower(t->max);
3145 if (t->min > 'Z' || t->max < 'A')
3147 if (t->min >= 'A' && t->max <= 'Z') {
3148 t->min = tolower(t->min);
3149 t->max = tolower(t->max);
3150 } else if (t->max <= 'Z') {
3153 F(add_new_trans(s, t->to, lc_min, lc_max));
3154 } else if (t->min >= 'A') {
3157 F(add_new_trans(s, t->to, lc_min, lc_max));
3159 /* t->min < 'A' && t->max > 'Z' */
3160 F(add_new_trans(s, t->to, 'Z' + 1, t->max));
3161 s->trans[i].max = 'A' - 1;
3162 F(add_new_trans(s, s->trans[i].to, lc_min, lc_max));
3172 int fa_is_nocase(struct fa *fa) {
3176 /* If FA is case-insensitive, turn it into a case-sensitive automaton by
3177 * adding transitions on upper-case letters for each existing transition on
3178 * lower-case letters */
3179 static int case_expand(struct fa *fa) {
3184 list_for_each(s, fa->initial) {
3185 int tused = s->tused;
3186 /* For every transition on characters in [a-z] add a corresponding
3187 * transition on [A-Z] */
3188 for (int i=0; i < tused; i++) {
3189 struct trans *t = s->trans + i;
3190 int lc_min = t->min < 'a' ? 'A' : toupper(t->min);
3191 int lc_max = t->max > 'z' ? 'Z' : toupper(t->max);
3193 if (t->min > 'z' || t->max < 'a')
3195 F(add_new_trans(s, t->to, lc_min, lc_max));
3205 * Regular expression parser
3208 static struct re *make_re(enum re_type type) {
3210 if (make_ref(re) == 0)
3215 static struct re *make_re_rep(struct re *exp, int min, int max) {
3216 struct re *re = make_re(ITER);
3227 static struct re *make_re_binop(enum re_type type, struct re *exp1,
3229 struct re *re = make_re(type);
3240 static struct re *make_re_char(uchar c) {
3241 struct re *re = make_re(CHAR);
3247 static struct re *make_re_char_set(bool negate, bool no_ranges) {
3248 struct re *re = make_re(CSET);
3250 re->negate = negate;
3251 re->no_ranges = no_ranges;
3252 re->cset = bitset_init(UCHAR_NUM);
3253 if (re->cset == NULL)
3259 static bool more(struct re_parse *parse) {
3260 return parse->rx < parse->rend;
3263 static bool match(struct re_parse *parse, char m) {
3266 if (*parse->rx == m) {
3273 static bool peek(struct re_parse *parse, const char *chars) {
3274 return *parse->rx != '\0' && strchr(chars, *parse->rx) != NULL;
3277 static bool next(struct re_parse *parse, char *c) {
3285 static bool parse_char(struct re_parse *parse, int quoted, char *c) {
3288 if (quoted && *parse->rx == '\\') {
3290 return next(parse, c);
3292 return next(parse, c);
3296 static void add_re_char(struct re *re, uchar from, uchar to) {
3297 assert(re->type == CSET);
3298 for (unsigned int c = from; c <= to; c++)
3299 bitset_set(re->cset, c);
3302 static void parse_char_class(struct re_parse *parse, struct re *re) {
3303 if (! more(parse)) {
3304 parse->error = REG_EBRACK;
3308 parse_char(parse, 0, &from);
3310 if (match(parse, '-')) {
3311 if (! more(parse)) {
3312 parse->error = REG_EBRACK;
3315 if (peek(parse, "]")) {
3317 parse->error = REG_ERANGE;
3320 add_re_char(re, from, to);
3321 add_re_char(re, '-', '-');
3323 } else if (!parse_char(parse, 0, &to)) {
3324 parse->error = REG_ERANGE;
3329 parse->error = REG_ERANGE;
3332 add_re_char(re, from, to);
3337 static struct re *parse_simple_exp(struct re_parse *parse) {
3338 struct re *re = NULL;
3340 if (match(parse, '[')) {
3341 bool negate = match(parse, '^');
3342 re = make_re_char_set(negate, parse->no_ranges);
3344 parse->error = REG_ESPACE;
3347 parse_char_class(parse, re);
3348 if (parse->error != REG_NOERROR)
3350 while (more(parse) && ! peek(parse, "]")) {
3351 parse_char_class(parse, re);
3352 if (parse->error != REG_NOERROR)
3355 if (! match(parse, ']')) {
3356 parse->error = REG_EBRACK;
3359 } else if (match(parse, '(')) {
3360 if (match(parse, ')')) {
3361 re = make_re(EPSILON);
3363 parse->error = REG_ESPACE;
3367 re = parse_regexp(parse);
3370 if (! match(parse, ')')) {
3371 parse->error = REG_EPAREN;
3375 } else if (match(parse, '.')) {
3376 re = make_re_char_set(1, parse->no_ranges);
3378 parse->error = REG_ESPACE;
3381 add_re_char(re, '\n', '\n');
3382 } else if (more(parse)) {
3384 if (!parse_char(parse, 1, &c)) {
3385 parse->error = REG_EESCAPE;
3388 re = make_re_char(c);
3390 parse->error = REG_ESPACE;
3394 re = make_re(EPSILON);
3396 parse->error = REG_ESPACE;
3406 static int parse_int(struct re_parse *parse) {
3412 /* We need to be careful that strtoul will never access
3413 * memory beyond parse->rend
3415 for (lim = parse->rx; lim < parse->rend && *lim >= '0' && *lim <= '9';
3417 if (lim < parse->rend) {
3418 l = strtoul(parse->rx, &end, 10);
3419 used = end - parse->rx;
3421 char *s = strndup(parse->rx, parse->rend - parse->rx);
3423 parse->error = REG_ESPACE;
3426 l = strtoul(s, &end, 10);
3434 if ((l<0) || (l > INT_MAX)) {
3435 parse->error = REG_BADBR;
3441 static struct re *parse_repeated_exp(struct re_parse *parse) {
3442 struct re *re = parse_simple_exp(parse);
3445 if (match(parse, '?')) {
3446 re = make_re_rep(re, 0, 1);
3447 } else if (match(parse, '*')) {
3448 re = make_re_rep(re, 0, -1);
3449 } else if (match(parse, '+')) {
3450 re = make_re_rep(re, 1, -1);
3451 } else if (match(parse, '{')) {
3453 min = parse_int(parse);
3456 if (match(parse, ',')) {
3457 max = parse_int(parse);
3459 max = -1; /* If it's not an int, it means 'unbounded' */
3460 if (! match(parse, '}')) {
3461 parse->error = REG_EBRACE;
3464 } else if (match(parse, '}')) {
3467 parse->error = REG_EBRACE;
3470 if (min > max && max != -1) {
3471 parse->error = REG_BADBR;
3474 re = make_re_rep(re, min, max);
3477 parse->error = REG_ESPACE;
3484 static struct re *parse_concat_exp(struct re_parse *parse) {
3485 struct re *re = parse_repeated_exp(parse);
3489 if (more(parse) && ! peek(parse, ")|")) {
3490 struct re *re2 = parse_concat_exp(parse);
3493 re = make_re_binop(CONCAT, re, re2);
3495 parse->error = REG_ESPACE;
3506 static struct re *parse_regexp(struct re_parse *parse) {
3507 struct re *re = NULL;
3509 /* Something like (|r) */
3510 if (peek(parse, "|"))
3511 re = make_re(EPSILON);
3513 re = parse_concat_exp(parse);
3517 if (match(parse, '|')) {
3518 struct re *re2 = NULL;
3519 /* Something like (r|) */
3520 if (peek(parse, ")"))
3521 re2 = make_re(EPSILON);
3523 re2 = parse_regexp(parse);
3527 re = make_re_binop(UNION, re, re2);
3529 parse->error = REG_ESPACE;
3541 * Convert a STRUCT RE to a string. Code is complicated by the fact that
3542 * we try to be clever and avoid unneeded parens and concatenation with
3545 static int re_as_string(const struct re *re, struct re_str *str);
3547 static int re_binop_count(enum re_type type, const struct re *re) {
3548 assert(type == CONCAT || type == UNION);
3549 if (re->type == type) {
3550 return re_binop_count(type, re->exp1) + re_binop_count(type, re->exp2);
3556 static int re_binop_store(enum re_type type, const struct re *re,
3557 const struct re **list) {
3559 if (type == re->type) {
3560 pos = re_binop_store(type, re->exp1, list);
3561 pos += re_binop_store(type, re->exp2, list + pos);
3569 static int re_union_as_string(const struct re *re, struct re_str *str) {
3570 assert(re->type == UNION);
3573 const struct re **res = NULL;
3574 struct re_str *strings = NULL;
3577 nre = re_binop_count(re->type, re);
3578 r = ALLOC_N(res, nre);
3582 re_binop_store(re->type, re, res);
3584 r = ALLOC_N(strings, nre);
3589 for (int i=0; i < nre; i++) {
3590 if (re_as_string(res[i], strings + i) < 0)
3592 str->len += strings[i].len;
3596 r = re_str_alloc(str);
3601 for (int i=0; i < nre; i++) {
3604 memcpy(p, strings[i].rx, strings[i].len);
3605 p += strings[i].len;
3611 if (strings != NULL) {
3612 for (int i=0; i < nre; i++)
3613 release_re_str(strings + i);
3618 release_re_str(str);
3624 static int re_needs_parens_in_concat(const struct re *re) {
3625 return (re->type != CHAR && re->type != CSET && re->type != ITER);
3628 static int re_concat_as_string(const struct re *re, struct re_str *str) {
3629 assert(re->type == CONCAT);
3631 const struct re **res = NULL;
3632 struct re_str *strings = NULL;
3636 nre = re_binop_count(re->type, re);
3637 r = ALLOC_N(res, nre);
3640 re_binop_store(re->type, re, res);
3642 r = ALLOC_N(strings, nre);
3647 for (int i=0; i < nre; i++) {
3648 if (res[i]->type == EPSILON)
3650 if (re_as_string(res[i], strings + i) < 0)
3652 str->len += strings[i].len;
3653 if (re_needs_parens_in_concat(res[i]))
3657 r = re_str_alloc(str);
3662 for (int i=0; i < nre; i++) {
3663 if (res[i]->type == EPSILON)
3665 if (re_needs_parens_in_concat(res[i]))
3667 p = memcpy(p, strings[i].rx, strings[i].len);
3668 p += strings[i].len;
3669 if (re_needs_parens_in_concat(res[i]))
3676 if (strings != NULL) {
3677 for (int i=0; i < nre; i++)
3678 release_re_str(strings + i);
3683 release_re_str(str);
3688 static bool cset_contains(const struct re *cset, int c) {
3689 return bitset_get(cset->cset, c) != cset->negate;
3692 static int re_cset_as_string(const struct re *re, struct re_str *str) {
3693 const uchar rbrack = ']';
3694 const uchar dash = '-';
3695 const uchar nul = '\0';
3697 static const char *const empty_set = "[]";
3698 static const char *const total_set = "(.|\n)";
3699 static const char *const not_newline = ".";
3702 int from, to, negate;
3704 int incl_rbrack, incl_dash;
3707 str->len = strlen(empty_set);
3709 /* We can not include NUL explicitly in a CSET since we use ordinary
3710 NUL delimited strings to represent them. That means that we need to
3711 use negated representation if NUL is to be included (and vice versa)
3713 negate = cset_contains(re, nul);
3715 for (from = UCHAR_MIN;
3716 from <= UCHAR_MAX && cset_contains(re, from);
3718 if (from > UCHAR_MAX) {
3719 /* Special case: the set matches every character */
3720 str->rx = strdup(total_set);
3725 from <= UCHAR_MAX && cset_contains(re, from);
3727 if (from > UCHAR_MAX) {
3728 /* Special case: the set matches everything but '\n' */
3729 str->rx = strdup(not_newline);
3735 /* See if ']' and '-' will be explicitly included in the character set
3736 (INCL_RBRACK, INCL_DASH) As we loop over the character set, we reset
3737 these flags if they are in the set, but not mentioned explicitly
3739 incl_rbrack = cset_contains(re, rbrack) != negate;
3740 incl_dash = cset_contains(re, dash) != negate;
3742 if (re->no_ranges) {
3743 for (from = UCHAR_MIN; from <= UCHAR_MAX; from++)
3744 if (cset_contains(re, from) != negate)
3747 for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
3748 while (from <= UCHAR_MAX && cset_contains(re, from) == negate)
3750 if (from > UCHAR_MAX)
3753 to < UCHAR_MAX && (cset_contains(re, to+1) != negate);
3756 if (to == from && (from == rbrack || from == dash))
3758 if (from == rbrack || from == dash)
3760 if (to == rbrack || to == dash)
3763 len = (to == from) ? 1 : ((to == from + 1) ? 2 : 3);
3765 if (from < rbrack && rbrack < to)
3767 if (from < dash && dash < to)
3771 str->len += incl_rbrack + incl_dash;
3774 str->len += 1; /* For the ^ */
3776 r = re_str_alloc(str);
3787 if (re->no_ranges) {
3788 for (from = UCHAR_MIN; from <= UCHAR_MAX; from++) {
3789 if (from == rbrack || from == dash)
3791 if (cset_contains(re, from) != negate)
3795 for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
3796 while (from <= UCHAR_MAX && cset_contains(re, from) == negate)
3798 if (from > UCHAR_MAX)
3801 to < UCHAR_MAX && (cset_contains(re, to+1) != negate);
3804 if (to == from && (from == rbrack || from == dash))
3806 if (from == rbrack || from == dash)
3808 if (to == rbrack || to == dash)
3813 } else if (to == from + 1) {
3828 if (str->rx == NULL)
3830 str->len = strlen(str->rx);
3833 release_re_str(str);
3837 static int re_iter_as_string(const struct re *re, struct re_str *str) {
3838 const char *quant = NULL;
3842 if (re_as_string(re->exp, str) < 0)
3845 if (re->min == 0 && re->max == -1) {
3847 } else if (re->min == 1 && re->max == -1) {
3849 } else if (re->min == 0 && re->max == 1) {
3851 } else if (re->max == -1) {
3852 r = asprintf(&iter, "{%d,}", re->min);
3857 r = asprintf(&iter, "{%d,%d}", re->min, re->max);
3863 if (re->exp->type == CHAR || re->exp->type == CSET) {
3864 if (REALLOC_N(str->rx, str->len + strlen(quant) + 1) < 0)
3866 strcpy(str->rx + str->len, quant);
3867 str->len += strlen(quant);
3869 /* Format '(' + str->rx ')' + quant */
3870 if (REALLOC_N(str->rx, str->len + strlen(quant) + 1 + 2) < 0)
3872 memmove(str->rx + 1, str->rx, str->len);
3874 str->rx[str->len + 1] = ')';
3876 strcpy(str->rx + str->len, quant);
3877 str->len += strlen(quant);
3885 release_re_str(str);
3889 static int re_as_string(const struct re *re, struct re_str *str) {
3890 /* Characters that must be escaped */
3891 static const char * const special_chars = ".()[]{}*|+?\\^$";
3896 result = re_union_as_string(re, str);
3899 result = re_concat_as_string(re, str);
3902 result = re_cset_as_string(re, str);
3905 if (re->c == '\0' || strchr(special_chars, re->c) == NULL) {
3906 if (ALLOC_N(str->rx, 2) < 0)
3911 if (ALLOC_N(str->rx, 3) < 0)
3915 str->len = strlen(str->rx);
3919 result = re_iter_as_string(re, str);
3922 if (ALLOC_N(str->rx, 3) < 0)
3924 strcpy(str->rx, "()");
3925 str->len = strlen(str->rx);
3934 release_re_str(str);
3938 static int convert_trans_to_re(struct state *s) {
3939 struct re *re = NULL;
3941 struct trans *trans = NULL;
3947 qsort(s->trans, s->tused, sizeof(*s->trans), trans_to_cmp);
3948 for (int i = 0; i < s->tused - 1; i++) {
3949 if (s->trans[i].to != s->trans[i+1].to)
3952 r = ALLOC_N(trans, nto);
3956 struct state *to = s->trans[0].to;
3958 for_each_trans(t, s) {
3960 trans[tind].to = to;
3961 trans[tind].re = re;
3967 re = make_re_char_set(0, 0);
3971 add_re_char(re, t->min, t->max);
3973 assert(nto == tind + 1);
3974 trans[tind].to = to;
3975 trans[tind].re = re;
3977 /* Simplify CSETs with a single char to a CHAR */
3978 for (int t=0; t < nto; t++) {
3980 uchar chr = UCHAR_MIN;
3981 for (int c = 0; c < UCHAR_NUM; c++) {
3982 if (bitset_get(trans[t].re->cset, c)) {
3988 re_unref(trans[t].re);
3989 trans[t].re = make_re_char(chr);
3990 if (trans[t].re == NULL)
3996 s->tused = s->tsize = nto;
4001 for (int i=0; i < nto; i++)
4002 unref(trans[i].re, re);
4007 ATTRIBUTE_RETURN_CHECK
4008 static int add_new_re_trans(struct state *s1, struct state *s2,
4011 r = add_new_trans(s1, s2, 0, 0);
4014 last_trans(s1)->re = re;
4018 /* Add the regular expression R1 . LOOP* . R2 to the transition
4020 static int re_collapse_trans(struct state *s1, struct state *s2,
4021 struct re *r1, struct re *loop, struct re *r2) {
4022 struct re *re = NULL;
4024 if (loop->type != EPSILON) {
4025 loop = make_re_rep(ref(loop), 0, -1);
4030 if (r1->type == EPSILON) {
4031 if (loop->type == EPSILON) {
4034 re = make_re_binop(CONCAT, loop, ref(r2));
4037 if (loop->type == EPSILON) {
4038 if (r2->type == EPSILON) {
4041 re = make_re_binop(CONCAT, ref(r1), ref(r2));
4044 re = make_re_binop(CONCAT, ref(r1), loop);
4045 if (re != NULL && r2->type != EPSILON) {
4046 re = make_re_binop(CONCAT, re, ref(r2));
4053 struct trans *t = NULL;
4054 for (t = s1->trans; t <= last_trans(s1) && t->to != s2; t += 1);
4055 if (t > last_trans(s1)) {
4056 if (add_new_re_trans(s1, s2, re) < 0)
4059 if (t->re == NULL) {
4062 t->re = make_re_binop(UNION, re, t->re);
4069 // FIXME: make sure we don't leak loop
4073 static int convert_strings(struct fa *fa) {
4074 struct state_set *worklist = state_set_init(-1, S_NONE);
4077 E(worklist == NULL);
4079 /* Abuse hash to count indegree, reachable to mark visited states */
4080 list_for_each(s, fa->initial) {
4085 list_for_each(s, fa->initial) {
4086 for_each_trans(t, s) {
4091 for (struct state *s = fa->initial;
4093 s = state_set_pop(worklist)) {
4094 for (int i=0; i < s->tused; i++) {
4095 struct trans *t = s->trans + i;
4096 struct state *to = t->to;
4097 while (to->hash == 1 && to->tused == 1 && ! to->accept) {
4098 if (t->re == NULL) {
4099 t->re = to->trans->re;
4100 to->trans->re = NULL;
4102 t->re = make_re_binop(CONCAT, t->re, to->trans->re);
4106 t->to = to->trans->to;
4110 for (int j=0; j < s->tused; j++) {
4111 if (j != i && s->trans[j].to == to) {
4112 /* Combine transitions i and j; remove trans j */
4113 t->re = make_re_binop(UNION, t->re, s->trans[j].re);
4116 memmove(s->trans + j, s->trans + j + 1,
4117 sizeof(s->trans[j]) * (s->tused - j - 1));
4128 if (! to->reachable) {
4130 F(state_set_push(worklist, to));
4135 for (struct state *s = fa->initial; s->next != NULL; ) {
4136 if (s->next->hash == 0 && s->next->tused == 0) {
4137 struct state *del = s->next;
4138 s->next = del->next;
4148 state_set_free(worklist);
4152 /* Convert an FA to a regular expression.
4153 * The strategy is the following:
4154 * (1) For all states S1 and S2, convert the transitions between them
4155 * into one transition whose regexp is a CSET
4156 * (2) Add a new initial state INI and a new final state FIN to the automaton,
4157 * a transition from INI to the old initial state of FA, and a transition
4158 * from all accepting states of FA to FIN; the regexp on those transitions
4159 * matches only the empty word
4160 * (3) Eliminate states S (except for INI and FIN) one by one:
4161 * Let LOOP the regexp for the transition S -> S if it exists, epsilon
4163 * For all S1, S2 different from S with S1 -> S -> S2
4164 * Let R1 the regexp of S1 -> S
4165 * R2 the regexp of S -> S2
4166 * R3 the regexp S1 -> S2 (or epsilon if no such transition)
4167 * set the regexp on the transition S1 -> S2 to
4168 * R1 . (LOOP)* . R2 | R3
4169 * (4) The regexp for the whole FA can now be found as the regexp of
4170 * the transition INI -> FIN
4171 * (5) Convert that STRUCT RE to a string with RE_AS_STRING
4173 int fa_as_regexp(struct fa *fa, char **regexp, size_t *regexp_len) {
4175 struct state *fin = NULL, *ini = NULL;
4176 struct re *eps = NULL;
4184 eps = make_re(EPSILON);
4188 fin = add_state(fa,1);
4194 list_for_each(s, fa->initial) {
4195 r = convert_trans_to_re(s);
4198 if (s->accept && s != fin) {
4199 r = add_new_re_trans(s, fin, ref(eps));
4206 ini = add_state(fa, 0);
4210 r = add_new_re_trans(ini, fa->initial, ref(eps));
4213 set_initial(fa, ini);
4215 convert_strings(fa);
4217 list_for_each(s, fa->initial->next) {
4221 struct re *loop = eps;
4222 for_each_trans(t, s) {
4226 list_for_each(s1, fa->initial) {
4229 for (int t1 = 0; t1 < s1->tused; t1++) {
4230 if (s1->trans[t1].to == s) {
4231 for (int t = 0; t < s->tused; t++) {
4232 if (s->trans[t].to == s)
4234 r = re_collapse_trans(s1, s->trans[t].to,
4248 for_each_trans(t, fa->initial) {
4252 if (re_as_string(t->re, &str) < 0)
4255 *regexp_len = str.len;
4259 list_for_each(s, fa->initial) {
4260 for_each_trans(t, s) {
4273 static int re_restrict_alphabet(struct re *re, uchar from, uchar to) {
4280 r1 = re_restrict_alphabet(re->exp1, from, to);
4281 r2 = re_restrict_alphabet(re->exp2, from, to);
4282 result = (r1 != 0) ? r1 : r2;
4287 bitset_negate(re->cset, UCHAR_NUM);
4289 for (int i=from; i <= to; i++)
4290 bitset_clr(re->cset, i);
4293 if (from <= re->c && re->c <= to)
4297 result = re_restrict_alphabet(re->exp, from, to);
4309 int fa_restrict_alphabet(const char *regexp, size_t regexp_len,
4310 char **newregexp, size_t *newregexp_len,
4311 char from, char to) {
4313 struct re *re = NULL;
4314 struct re_parse parse;
4320 parse.rend = regexp + regexp_len;
4321 parse.error = REG_NOERROR;
4322 re = parse_regexp(&parse);
4323 if (parse.error != REG_NOERROR)
4326 result = re_restrict_alphabet(re, from, to);
4333 result = re_as_string(re, &str);
4334 *newregexp = str.rx;
4335 *newregexp_len = str.len;
4341 int fa_expand_char_ranges(const char *regexp, size_t regexp_len,
4342 char **newregexp, size_t *newregexp_len) {
4344 struct re *re = NULL;
4345 struct re_parse parse;
4351 parse.rend = regexp + regexp_len;
4352 parse.error = REG_NOERROR;
4353 parse.no_ranges = 1;
4354 re = parse_regexp(&parse);
4355 if (parse.error != REG_NOERROR)
4359 result = re_as_string(re, &str);
4360 *newregexp = str.rx;
4361 *newregexp_len = str.len;
4366 /* Expand regexp so that it is case-insensitive in a case-sensitive match.
4368 * Return 1 when a change was made, -1 when an allocation failed, and 0
4369 * when no change was made.
4371 static int re_case_expand(struct re *re) {
4372 int result = 0, r1, r2;
4377 r1 = re_case_expand(re->exp1);
4378 r2 = re_case_expand(re->exp2);
4379 result = (r1 != 0) ? r1 : r2;
4382 for (int c = 'A'; c <= 'Z'; c++)
4383 if (bitset_get(re->cset, c)) {
4385 bitset_set(re->cset, tolower(c));
4387 for (int c = 'a'; c <= 'z'; c++)
4388 if (bitset_get(re->cset, c)) {
4390 bitset_set(re->cset, toupper(c));
4394 if (isalpha(re->c)) {
4399 re->cset = bitset_init(UCHAR_NUM);
4400 if (re->cset == NULL)
4402 bitset_set(re->cset, tolower(c));
4403 bitset_set(re->cset, toupper(c));
4408 result = re_case_expand(re->exp);
4420 int fa_expand_nocase(const char *regexp, size_t regexp_len,
4421 char **newregexp, size_t *newregexp_len) {
4423 struct re *re = NULL;
4424 struct re_parse parse;
4430 parse.rend = regexp + regexp_len;
4431 parse.error = REG_NOERROR;
4432 re = parse_regexp(&parse);
4433 if (parse.error != REG_NOERROR)
4436 r = re_case_expand(re);
4444 result = re_as_string(re, &str);
4445 *newregexp = str.rx;
4446 *newregexp_len = str.len;
4448 *newregexp = strndup(regexp, regexp_len);
4449 *newregexp_len = regexp_len;
4450 result = (*newregexp == NULL) ? REG_ESPACE : REG_NOERROR;
4456 static void print_char(FILE *out, uchar c) {
4457 /* We escape '/' as '\\/' since dot chokes on bare slashes in labels;
4458 Also, a space ' ' is shown as '\s' */
4459 static const char *const escape_from = " \n\t\v\b\r\f\a/\0";
4460 static const char *const escape_to = "sntvbrfa/0";
4461 char *p = strchr(escape_from, c);
4463 int i = p - escape_from;
4464 fprintf(out, "\\\\%c", escape_to[i]);
4465 } else if (! isprint(c)) {
4466 fprintf(out, "\\\\0%03o", (unsigned char) c);
4467 } else if (c == '"') {
4468 fprintf(out, "\\\"");
4474 void fa_dot(FILE *out, struct fa *fa) {
4475 fprintf(out, "digraph {\n rankdir=LR;");
4476 list_for_each(s, fa->initial) {
4478 fprintf(out, "\"%p\" [shape=doublecircle];\n", s);
4480 fprintf(out, "\"%p\" [shape=circle];\n", s);
4483 fprintf(out, "%s -> \"%p\";\n", fa->deterministic ? "dfa" : "nfa",
4488 list_for_each(s, fa->initial) {
4489 for_each_trans(t, s) {
4490 fprintf(out, "\"%p\" -> \"%p\" [ label = \"", s, t->to);
4492 re_as_string(t->re, &str);
4493 for (int i=0; i < str.len; i++) {
4494 print_char(out, str.rx[i]);
4496 release_re_str(&str);
4498 print_char(out, t->min);
4499 if (t->min != t->max) {
4501 print_char(out, t->max);
4504 fprintf(out, "\" ];\n");
4507 fprintf(out, "}\n");
4512 * indent-tabs-mode: nil