2 * fa.c: finite automata
4 * Copyright (C) 2007-2015 David Lutterkort
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * Author: David Lutterkort <dlutter@redhat.com>
24 * This implementation follows closely the Java dk.brics.automaton package
25 * by Anders Moeller. The project's website is
26 * http://www.brics.dk/automaton/.
28 * It is by no means a complete reimplementation of that package; only a
29 * subset of what Automaton provides is implemented here.
43 #define UCHAR_NUM (UCHAR_MAX+1)
45 typedef unsigned char uchar;
47 #define E(cond) if (cond) goto error
48 #define F(expr) if ((expr) < 0) goto error
50 /* Which algorithm to use in FA_MINIMIZE */
51 int fa_minimization_algorithm = FA_MIN_HOPCROFT;
53 /* A finite automaton. INITIAL is both the initial state and the head of
54 * the list of all states. Any state that is allocated for this automaton
55 * is put on this list. Dead/unreachable states are cleared from the list
56 * at opportune times (e.g., during minimization) It's poor man's garbage
59 * Normally, transitions are on a character range [min..max]; in
60 * fa_as_regexp, we store regexps on transitions in the re field of each
61 * transition. TRANS_RE indicates that we do that, and is used by fa_dot to
62 * produce proper graphs of an automaton transitioning on regexps.
64 * For case-insensitive regexps (nocase == 1), the FA never has transitions
65 * on uppercase letters [A-Z], effectively removing these letters from the
69 struct state *initial;
70 int deterministic : 1;
72 unsigned int nocase : 1;
76 /* A state in a finite automaton. Transitions are never shared between
77 states so that we can free the list when we need to free the state */
81 unsigned int accept : 1;
82 unsigned int live : 1;
83 unsigned int reachable : 1;
84 unsigned int visited : 1; /* Used in various places to track progress */
85 /* Array of transitions. The TUSED first entries are used, the array
86 has allocated room for TSIZE */
92 /* A transition. If the input has a character in the inclusive
93 * range [MIN, MAX], move to TO
109 #define UINT_BIT (sizeof(unsigned int) * CHAR_BIT)
111 typedef unsigned int bitset;
113 static bitset *bitset_init(size_t nbits) {
115 if (ALLOC_N(bs, (nbits + UINT_BIT) / UINT_BIT) == -1)
120 static inline void bitset_clr(bitset *bs, unsigned int bit) {
121 bs[bit/UINT_BIT] &= ~(1 << (bit % UINT_BIT));
124 static inline void bitset_set(bitset *bs, unsigned int bit) {
125 bs[bit/UINT_BIT] |= 1 << (bit % UINT_BIT);
129 static inline int bitset_get(const bitset * const bs, unsigned int bit) {
130 return (bs[bit/UINT_BIT] >> bit % UINT_BIT) & 1;
134 static inline bool bitset_disjoint(const bitset *const bs1,
135 const bitset *const bs2,
137 for (int i=0; i < (nbits + UINT_BIT) / UINT_BIT; i++) {
144 static void bitset_free(bitset *bs) {
148 static void bitset_negate(bitset *bs, size_t nbits) {
149 for (int i=0; i < (nbits + UINT_BIT) / UINT_BIT; i++)
154 * Representation of a parsed regular expression. The regular expression is
155 * parsed according to the following grammar by PARSE_REGEXP:
157 * regexp: concat_exp ('|' regexp)?
158 * concat_exp: repeated_exp concat_exp?
159 * repeated_exp: simple_exp
163 * | simple_exp '{' INT (',' INT '}')?
164 * simple_exp: char_class
168 * char_class: '[' char_exp ']'
169 * | '[' '^' char_exp ']'
170 * char_exp: CHAR '-' CHAR
183 #define re_unref(r) unref(r, re)
189 struct { /* UNION, CONCAT */
196 /* Whether we can use character ranges when converting back
198 unsigned int no_ranges:1;
211 /* Used to keep state of the regex parse; RX may contain NUL's */
213 const char *rx; /* Current position in regex */
214 const char *rend; /* Last char of rx+ 1 */
215 int error; /* error code */
216 /* Whether new CSET's should have the no_ranges flag set */
217 unsigned int no_ranges:1;
220 /* String with explicit length, used when converting re to string */
226 static struct re *parse_regexp(struct re_parse *parse);
228 /* A map from a set of states to a state. */
229 typedef hash_t state_set_hash;
231 static hash_val_t ptr_hash(const void *p);
233 static const int array_initial_size = 4;
234 static const int array_max_expansion = 128;
236 enum state_set_init_flags {
245 unsigned int sorted : 1;
246 unsigned int with_data : 1;
247 struct state **states;
251 struct state_set_list {
252 struct state_set_list *next;
253 struct state_set *set;
256 /* Clean up FA by removing dead transitions and states and reducing
257 * transitions. Unreachable states are freed. The return value is the same
258 * as FA; returning it is merely a convenience.
260 * Only automata in this state should be returned to the user
262 ATTRIBUTE_RETURN_CHECK
263 static int collect(struct fa *fa);
265 ATTRIBUTE_RETURN_CHECK
266 static int totalize(struct fa *fa);
268 /* Print an FA into a (fairly) fixed file if the environment variable
269 * FA_DOT_DIR is set. This code is only used for debugging
271 #define FA_DOT_DIR "FA_DOT_DIR"
274 static void fa_dot_debug(struct fa *fa, const char *tag) {
276 static int count = 0;
281 if ((dot_dir = getenv(FA_DOT_DIR)) == NULL)
284 r = asprintf(&fname, "%s/fa_%02d_%s.dot", dot_dir, count++, tag);
288 fp = fopen(fname, "w");
294 static void print_char_set(struct re *set) {
301 for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
302 while (bitset_get(set->cset, from) == set->negate)
304 if (from > UCHAR_MAX)
307 to < UCHAR_MAX && (bitset_get(set->cset, to+1) == !set->negate);
312 printf("%c-%c", from, to);
319 static void print_re(struct re *re) {
340 printf("){%d,%d}", re->min, re->max);
354 static void release_re_str(struct re_str *str) {
361 static void free_re_str(struct re_str *str) {
368 static struct re_str *make_re_str(const char *s) {
375 str->len = strlen(s);
376 if (str->rx == NULL) {
384 static int re_str_alloc(struct re_str *str) {
385 return ALLOC_N(str->rx, str->len + 1);
392 static void free_trans(struct state *s) {
395 s->tused = s->tsize = 0;
398 static void gut(struct fa *fa) {
399 list_for_each(s, fa->initial) {
402 list_free(fa->initial);
406 void fa_free(struct fa *fa) {
413 static struct state *make_state(void) {
417 s->hash = ptr_hash(s);
421 static struct state *add_state(struct fa *fa, int accept) {
422 struct state *s = make_state();
425 if (fa->initial == NULL) {
428 list_cons(fa->initial->next, s);
434 #define last_trans(s) ((s)->trans + (s)->tused - 1)
436 #define for_each_trans(t, s) \
437 for (struct trans *t = (s)->trans; \
438 (t - (s)->trans) < (s)->tused; \
441 ATTRIBUTE_RETURN_CHECK
442 static int add_new_trans(struct state *from, struct state *to,
443 uchar min, uchar max) {
446 if (from->tused == from->tsize) {
447 size_t tsize = from->tsize;
449 tsize = array_initial_size;
450 else if (from->tsize > array_max_expansion)
451 tsize += array_max_expansion;
454 if (REALLOC_N(from->trans, tsize) == -1)
458 from->trans[from->tused].to = to;
459 from->trans[from->tused].min = min;
460 from->trans[from->tused].max = max;
465 ATTRIBUTE_RETURN_CHECK
466 static int add_epsilon_trans(struct state *from,
469 from->accept |= to->accept;
470 for_each_trans(t, to) {
471 r = add_new_trans(from, t->to, t->min, t->max);
478 static void set_initial(struct fa *fa, struct state *s) {
479 list_remove(s, fa->initial);
480 list_cons(fa->initial, s);
483 /* Merge automaton FA2 into FA1. This simply adds FA2's states to FA1
484 and then frees FA2. It has no influence on the language accepted by FA1
486 static void fa_merge(struct fa *fa1, struct fa **fa2) {
487 list_append(fa1->initial, (*fa2)->initial);
493 * Operations on STATE_SET
495 static void state_set_free(struct state_set *set) {
503 static int state_set_init_data(struct state_set *set) {
505 if (set->data == NULL)
506 return ALLOC_N(set->data, set->size);
511 /* Create a new STATE_SET with an initial size of SIZE. If SIZE is -1, use
512 the default size ARRAY_INITIAL_SIZE. FLAGS is a bitmask indicating
514 - S_SORTED: keep the states in the set sorted by their address, and use
515 binary search for lookups. If it is not set, entries are kept in the
516 order in which they are added and lookups scan linearly through the
518 - S_DATA: allocate the DATA array in the set, and keep its size in sync
519 with the size of the STATES array.
521 static struct state_set *state_set_init(int size, int flags) {
522 struct state_set *set = NULL;
526 set->sorted = (flags & S_SORTED) ? 1 : 0;
527 set->with_data = (flags & S_DATA) ? 1 : 0;
530 F(ALLOC_N(set->states, set->size));
532 F(state_set_init_data(set));
540 ATTRIBUTE_RETURN_CHECK
541 static int state_set_expand(struct state_set *set) {
543 set->size = array_initial_size;
544 else if (set->size > array_max_expansion)
545 set->size += array_max_expansion;
548 if (REALLOC_N(set->states, set->size) < 0)
551 if (REALLOC_N(set->data, set->size) < 0)
555 /* We do this to provoke a SEGV as early as possible */
561 /* Return the index where S belongs in SET->STATES to keep it sorted. S
562 may not be in SET->STATES. The returned index is in the interval [0
563 .. SET->USED], with the latter indicating that S is larger than all
564 values in SET->STATES
566 static int state_set_pos(const struct state_set *set, const struct state *s) {
567 int l = 0, h = set->used;
570 if (set->states[m] > s)
572 else if (set->states[m] < s)
580 ATTRIBUTE_RETURN_CHECK
581 static int state_set_push(struct state_set *set, struct state *s) {
582 if (set->size == set->used)
583 if (state_set_expand(set) < 0)
586 int p = state_set_pos(set, s);
587 if (set->size == set->used)
588 if (state_set_expand(set) < 0)
590 while (p < set->used && set->states[p] <= s)
593 memmove(set->states + p + 1, set->states + p,
594 sizeof(*set->states) * (set->used - p));
595 if (set->data != NULL)
596 memmove(set->data + p + 1, set->data + p,
597 sizeof(*set->data) * (set->used - p));
603 set->states[set->used++] = s;
604 return set->used - 1;
608 ATTRIBUTE_RETURN_CHECK
609 static int state_set_push_data(struct state_set *set, struct state *s,
611 int i = state_set_push(set, s);
618 static int state_set_index(const struct state_set *set,
619 const struct state *s) {
621 int p = state_set_pos(set, s);
622 return (p < set->used && set->states[p] == s) ? p : -1;
624 for (int i=0; i < set->used; i++) {
625 if (set->states[i] == s)
632 static void state_set_remove(struct state_set *set,
633 const struct state *s) {
635 int p = state_set_index(set, s);
637 memmove(set->states + p, set->states + p + 1,
638 sizeof(*set->states) * (set->used - p - 1));
639 if (set->data != NULL)
640 memmove(set->data + p, set->data + p + 1,
641 sizeof(*set->data) * (set->used - p - 1));
643 int p = state_set_index(set, s);
645 set->states[p] = set->states[--set->used];
650 /* Only add S if it's not in SET yet. Return 1 if S was added, 0 if it was
651 already in the set and -1 on error. */
652 ATTRIBUTE_RETURN_CHECK
653 static int state_set_add(struct state_set *set, struct state *s) {
655 int p = state_set_pos(set, s);
656 if (p < set->used && set->states[p] == s)
658 if (set->size == set->used)
659 if (state_set_expand(set) < 0)
661 while (p < set->used && set->states[p] <= s)
664 memmove(set->states + p + 1, set->states + p,
665 sizeof(*set->states) * (set->used - p));
666 if (set->data != NULL)
667 memmove(set->data + p + 1, set->data + p,
668 sizeof(*set->data) * (set->used - p));
673 if (state_set_index(set, s) >= 0)
675 if (state_set_push(set, s) < 0)
680 /* We do this to provoke a SEGV as early as possible */
686 static struct state *state_set_pop(struct state_set *set) {
687 struct state *s = NULL;
689 s = set->states[--set->used];
693 static struct state *state_set_pop_data(struct state_set *set, void **d) {
694 struct state *s = NULL;
695 s = state_set_pop(set);
696 *d = set->data[set->used];
700 static void *state_set_find_data(struct state_set *set, struct state *s) {
701 int i = state_set_index(set, s);
708 static int state_set_equal(const struct state_set *set1,
709 const struct state_set *set2) {
710 if (set1->used != set2->used)
712 if (set1->sorted && set2->sorted) {
713 for (int i = 0; i < set1->used; i++)
714 if (set1->states[i] != set2->states[i])
718 for (int i=0; i < set1->used; i++)
719 if (state_set_index(set2, set1->states[i]) == -1)
726 static void state_set_compact(struct state_set *set) {
727 while (set->used > 0 && set->states[set->used] == NULL)
729 for (int i=0; i < set->used; i++) {
730 if (set->states[i] == NULL) {
732 set->states[i] = set->states[set->used];
734 set->data[i] = set->data[set->used];
736 while (set->used > 0 && set->states[set->used] == NULL)
742 /* Add an entry (FST, SND) to SET. FST is stored in SET->STATES, and SND is
743 stored in SET->DATA at the same index.
745 ATTRIBUTE_RETURN_CHECK
746 static int state_pair_push(struct state_set **set,
747 struct state *fst, struct state *snd) {
749 *set = state_set_init(-1, S_DATA);
752 int i = state_set_push(*set, fst);
755 (*set)->data[i] = snd;
760 /* Return the index of the pair (FST, SND) in SET, or -1 if SET contains no
763 static int state_pair_find(struct state_set *set, struct state *fst,
765 for (int i=0; i < set->used; i++)
766 if (set->states[i] == fst && set->data[i] == snd)
771 /* Jenkins' hash for void* */
772 static hash_val_t ptr_hash(const void *p) {
774 char *c = (char *) &p;
775 for (int i=0; i < sizeof(p); i++) {
777 hash += (hash << 10);
781 hash ^= (hash >> 11);
782 hash += (hash << 15);
786 typedef hash_t state_triple_hash;
788 static hash_val_t pair_hash(const void *key) {
789 register struct state *const *pair = key;
790 return pair[0]->hash + pair[1]->hash;
793 static int pair_cmp(const void *key1, const void *key2) {
794 return memcmp(key1, key2, 2*sizeof(struct state *));
797 static void state_triple_node_free(hnode_t *node, ATTRIBUTE_UNUSED void *ctx) {
798 free((void *) hnode_getkey(node));
802 static state_triple_hash *state_triple_init(void) {
803 state_triple_hash *hash;
805 hash = hash_create(HASHCOUNT_T_MAX, pair_cmp, pair_hash);
808 hash_set_allocator(hash, NULL, state_triple_node_free, NULL);
812 ATTRIBUTE_RETURN_CHECK
813 static int state_triple_push(state_triple_hash *hash,
818 if (ALLOC_N(pair, 2) < 0)
822 return hash_alloc_insert(hash, pair, s3);
825 static struct state * state_triple_thd(state_triple_hash *hash,
828 struct state *pair[2];
832 node = hash_lookup(hash, pair);
833 return (node == NULL) ? NULL : (struct state *) hnode_get(node);
836 static void state_triple_free(state_triple_hash *hash) {
838 hash_free_nodes(hash);
846 ATTRIBUTE_RETURN_CHECK
847 static int mark_reachable(struct fa *fa) {
848 struct state_set *worklist = state_set_init(-1, S_NONE);
853 list_for_each(s, fa->initial) {
856 fa->initial->reachable = 1;
858 for (struct state *s = fa->initial;
860 s = state_set_pop(worklist)) {
861 for_each_trans(t, s) {
862 if (! t->to->reachable) {
863 t->to->reachable = 1;
864 F(state_set_push(worklist, t->to));
871 state_set_free(worklist);
875 /* Return all reachable states. As a sideeffect, all states have their
876 REACHABLE flag set appropriately.
878 static struct state_set *fa_states(struct fa *fa) {
879 struct state_set *visited = state_set_init(-1, S_NONE);
882 r = mark_reachable(fa);
883 E(visited == NULL || r < 0);
885 list_for_each(s, fa->initial) {
887 F(state_set_push(visited, s));
891 state_set_free(visited);
895 /* Return all reachable accepting states. As a sideeffect, all states have
896 their REACHABLE flag set appropriately.
898 static struct state_set *fa_accept_states(struct fa *fa) {
899 struct state_set *accept = state_set_init(-1, S_NONE);
904 r = mark_reachable(fa);
907 list_for_each(s, fa->initial) {
908 if (s->reachable && s->accept)
909 F(state_set_push(accept, s));
913 state_set_free(accept);
917 /* Mark all live states, i.e. states from which an accepting state can be
918 reached. All states have their REACHABLE and LIVE flags set
921 ATTRIBUTE_RETURN_CHECK
922 static int mark_live(struct fa *fa) {
925 F(mark_reachable(fa));
927 list_for_each(s, fa->initial) {
928 s->live = s->reachable && s->accept;
933 list_for_each(s, fa->initial) {
934 if (! s->live && s->reachable) {
935 for_each_trans(t, s) {
951 * Reverse an automaton in place. Change FA so that it accepts the
952 * language that is the reverse of the input automaton.
954 * Returns a list of the new initial states of the automaton. The list must
955 * be freed by the caller.
957 static struct state_set *fa_reverse(struct fa *fa) {
958 struct state_set *all = NULL;
959 struct state_set *accept = NULL;
964 accept = fa_accept_states(fa);
967 F(state_set_init_data(all));
969 /* Reverse all transitions */
971 F(ALLOC_N(tused, all->used));
972 for (int i=0; i < all->used; i++) {
973 all->data[i] = all->states[i]->trans;
974 tused[i] = all->states[i]->tused;
975 all->states[i]->trans = NULL;
976 all->states[i]->tsize = 0;
977 all->states[i]->tused = 0;
979 for (int i=0; i < all->used; i++) {
980 struct state *s = all->states[i];
981 struct trans *t = all->data[i];
983 for (int j=0; j < tused[i]; j++) {
984 r = add_new_trans(t[j].to, s, t[j].min, t[j].max);
992 /* Make new initial and final states */
993 struct state *s = add_state(fa, 0);
996 fa->initial->accept = 1;
998 for (int i=0; i < accept->used; i++) {
999 r = add_epsilon_trans(s, accept->states[i]);
1004 fa->deterministic = 0;
1006 state_set_free(all);
1009 state_set_free(all);
1010 state_set_free(accept);
1015 * Return a sorted array of all interval start points in FA. The returned
1016 * array is a string (null terminated)
1018 static uchar* start_points(struct fa *fa, int *npoints) {
1019 char pointset[UCHAR_NUM];
1020 uchar *points = NULL;
1022 F(mark_reachable(fa));
1023 MEMZERO(pointset, UCHAR_NUM);
1024 list_for_each(s, fa->initial) {
1028 for_each_trans(t, s) {
1029 pointset[t->min] = 1;
1030 if (t->max < UCHAR_MAX)
1031 pointset[t->max+1] = 1;
1036 for(int i=0; i < UCHAR_NUM; *npoints += pointset[i], i++);
1038 F(ALLOC_N(points, *npoints+1));
1039 for (int i=0, n=0; i < UCHAR_NUM; i++) {
1041 points[n++] = (uchar) i;
1051 * Operations on STATE_SET_HASH
1053 static int state_set_hash_contains(state_set_hash *smap,
1054 struct state_set *set) {
1055 return hash_lookup(smap, set) != NULL;
1059 * Find the set in SMAP that has the same states as SET. If the two are
1060 * different, i.e. they point to different memory locations, free SET and
1061 * return the set found in SMAP
1063 static struct state_set *state_set_hash_uniq(state_set_hash *smap,
1064 struct state_set *set) {
1065 hnode_t *node = hash_lookup(smap, set);
1066 const struct state_set *orig_set = hnode_getkey(node);
1067 if (orig_set != set) {
1068 state_set_free(set);
1070 return (struct state_set *) orig_set;
1073 static struct state *state_set_hash_get_state(state_set_hash *smap,
1074 struct state_set *set) {
1075 hnode_t *node = hash_lookup(smap, set);
1076 return (struct state *) hnode_get(node);
1079 static hash_val_t set_hash(const void *key) {
1080 hash_val_t hash = 0;
1081 const struct state_set *set = key;
1083 for (int i = 0; i < set->used; i++) {
1084 hash += set->states[i]->hash;
1089 static int set_cmp(const void *key1, const void *key2) {
1090 const struct state_set *set1 = key1;
1091 const struct state_set *set2 = key2;
1093 return state_set_equal(set1, set2) ? 0 : 1;
1096 static void set_destroy(hnode_t *node, ATTRIBUTE_UNUSED void *ctx) {
1097 struct state_set *set = (struct state_set *) hnode_getkey(node);
1098 state_set_free(set);
1102 ATTRIBUTE_RETURN_CHECK
1103 static int state_set_hash_add(state_set_hash **smap,
1104 struct state_set *set, struct fa *fa) {
1105 if (*smap == NULL) {
1106 *smap = hash_create(HASHCOUNT_T_MAX, set_cmp, set_hash);
1108 hash_set_allocator(*smap, NULL, set_destroy, NULL);
1110 struct state *s = add_state(fa, 0);
1112 F(hash_alloc_insert(*smap, set, s));
1118 static void state_set_hash_free(state_set_hash *smap,
1119 struct state_set *protect) {
1120 if (protect != NULL) {
1121 hnode_t *node = hash_lookup(smap, protect);
1122 hash_delete(smap, node);
1123 hnode_getkey(node) = NULL;
1124 set_destroy(node, NULL);
1126 hash_free_nodes(smap);
1130 static int state_set_list_add(struct state_set_list **list,
1131 struct state_set *set) {
1132 struct state_set_list *elt;
1136 list_cons(*list, elt);
1140 static struct state_set *state_set_list_pop(struct state_set_list **list) {
1141 struct state_set_list *elt = *list;
1142 struct state_set *set = elt->set;
1149 /* Compare transitions lexicographically by (to, min, reverse max) */
1150 static int trans_to_cmp(const void *v1, const void *v2) {
1151 const struct trans *t1 = v1;
1152 const struct trans *t2 = v2;
1154 if (t1->to != t2->to) {
1155 return (t1->to < t2->to) ? -1 : 1;
1157 if (t1->min < t2->min)
1159 if (t1->min > t2->min)
1161 if (t1->max > t2->max)
1163 return (t1->max < t2->max) ? 1 : 0;
1166 /* Compare transitions lexicographically by (min, reverse max, to) */
1167 static int trans_intv_cmp(const void *v1, const void *v2) {
1168 const struct trans *t1 = v1;
1169 const struct trans *t2 = v2;
1171 if (t1->min < t2->min)
1173 if (t1->min > t2->min)
1175 if (t1->max > t2->max)
1177 if (t1->max < t2->max)
1179 if (t1->to != t2->to) {
1180 return (t1->to < t2->to) ? -1 : 1;
1186 * Reduce an automaton by combining overlapping and adjacent edge intervals
1187 * with the same destination.
1189 static void reduce(struct fa *fa) {
1190 list_for_each(s, fa->initial) {
1194 qsort(s->trans, s->tused, sizeof(*s->trans), trans_to_cmp);
1196 struct trans *t = s->trans;
1197 while (j < s->tused) {
1198 if (t[i].to == t[j].to && t[j].min <= t[i].max + 1) {
1199 if (t[j].max > t[i].max)
1200 t[i].max = t[j].max;
1205 memmove(s->trans + i, s->trans + j,
1206 sizeof(*s->trans) * (s->tused - j));
1212 /* Shrink if we use less than half the allocated size */
1213 if (s->tsize > array_initial_size && 2*s->tused < s->tsize) {
1215 r = REALLOC_N(s->trans, s->tsize);
1217 s->tsize = s->tused;
1223 * Remove dead transitions from an FA; a transition is dead is it does not
1224 * lead to a live state. This also removes any states that are not
1225 * reachable any longer from FA->INITIAL.
1227 * Returns the same FA as a convenience
1230 static void collect_trans(struct fa *fa) {
1231 list_for_each(s, fa->initial) {
1236 while (i < s->tused) {
1237 if (! s->trans[i].to->live) {
1239 memmove(s->trans + i, s->trans + s->tused,
1249 static void collect_dead_states(struct fa *fa) {
1250 /* Remove all dead states and free their storage */
1251 for (struct state *s = fa->initial; s->next != NULL; ) {
1252 if (! s->next->live) {
1253 struct state *del = s->next;
1254 s->next = del->next;
1263 static int collect(struct fa *fa) {
1266 if (! fa->initial->live) {
1267 /* This automaton accepts nothing, make it the canonical
1270 list_for_each(s, fa->initial) {
1273 list_free(fa->initial->next);
1274 fa->deterministic = 1;
1277 collect_dead_states(fa);
1285 static void swap_initial(struct fa *fa) {
1286 struct state *s = fa->initial;
1287 if (s->next != NULL) {
1288 fa->initial = s->next;
1289 s->next = fa->initial->next;
1290 fa->initial->next = s;
1295 * Make a finite automaton deterministic using the given set of initial
1296 * states with the subset construction. This also eliminates dead states
1297 * and transitions and reduces and orders the transitions for each state
1299 static int determinize(struct fa *fa, struct state_set *ini) {
1301 int make_ini = (ini == NULL);
1302 const uchar *points = NULL;
1303 state_set_hash *newstate = NULL;
1304 struct state_set_list *worklist = NULL;
1307 if (fa->deterministic)
1310 points = start_points(fa, &npoints);
1313 ini = state_set_init(-1, S_NONE);
1314 if (ini == NULL || state_set_push(ini, fa->initial) < 0)
1318 F(state_set_list_add(&worklist, ini));
1319 F(state_set_hash_add(&newstate, ini, fa));
1320 // Make the new state the initial state
1322 while (worklist != NULL) {
1323 struct state_set *sset = state_set_list_pop(&worklist);
1324 struct state *r = state_set_hash_get_state(newstate, sset);
1325 for (int q=0; q < sset->used; q++) {
1326 r->accept |= sset->states[q]->accept;
1328 for (int n=0; n < npoints; n++) {
1329 struct state_set *pset = state_set_init(-1, S_SORTED);
1331 for(int q=0 ; q < sset->used; q++) {
1332 for_each_trans(t, sset->states[q]) {
1333 if (t->min <= points[n] && points[n] <= t->max) {
1334 F(state_set_add(pset, t->to));
1338 if (!state_set_hash_contains(newstate, pset)) {
1339 F(state_set_list_add(&worklist, pset));
1340 F(state_set_hash_add(&newstate, pset, fa));
1342 pset = state_set_hash_uniq(newstate, pset);
1344 struct state *q = state_set_hash_get_state(newstate, pset);
1345 uchar min = points[n];
1346 uchar max = UCHAR_MAX;
1348 max = points[n+1] - 1;
1349 if (add_new_trans(r, q, min, max) < 0)
1353 fa->deterministic = 1;
1357 state_set_hash_free(newstate, make_ini ? NULL : ini);
1358 free((void *) points);
1359 if (collect(fa) < 0)
1368 * Minimization. As a sideeffect of minimization, the transitions are
1369 * reduced and ordered.
1372 static struct state *step(struct state *s, uchar c) {
1373 for_each_trans(t, s) {
1374 if (t->min <= c && c <= t->max)
1381 struct state_list_node *first;
1382 struct state_list_node *last;
1386 struct state_list_node {
1387 struct state_list *sl;
1388 struct state_list_node *next;
1389 struct state_list_node *prev;
1390 struct state *state;
1393 static struct state_list_node *state_list_add(struct state_list *sl,
1395 struct state_list_node *n;
1403 if (sl->size++ == 0) {
1414 static void state_list_remove(struct state_list_node *n) {
1415 struct state_list *sl = n->sl;
1418 sl->first = n->next;
1420 n->prev->next = n->next;
1424 n->next->prev = n->prev;
1429 static void state_list_free(struct state_list *sl) {
1431 list_free(sl->first);
1435 /* The linear index of element (q,c) in an NSTATES * NSIGMA matrix */
1436 #define INDEX(q, c) (q * nsigma + c)
1438 static int minimize_hopcroft(struct fa *fa) {
1439 struct state_set *states = NULL;
1440 uchar *sigma = NULL;
1441 struct state_set **reverse = NULL;
1442 bitset *reverse_nonempty = NULL;
1443 struct state_set **partition = NULL;
1444 unsigned int *block = NULL;
1445 struct state_list **active = NULL;
1446 struct state_list_node **active2 = NULL;
1447 int *pending = NULL;
1448 bitset *pending2 = NULL;
1449 struct state_set *split = NULL;
1450 bitset *split2 = NULL;
1452 bitset *refine2 = NULL;
1453 struct state_set **splitblock = NULL;
1454 struct state_set *newstates = NULL;
1458 unsigned int nstates = 0;
1461 F(determinize(fa, NULL));
1463 /* Total automaton, nothing to do */
1464 if (fa->initial->tused == 1
1465 && fa->initial->trans[0].to == fa->initial
1466 && fa->initial->trans[0].min == UCHAR_MIN
1467 && fa->initial->trans[0].max == UCHAR_MAX)
1472 /* make arrays for numbered states and effective alphabet */
1473 states = state_set_init(-1, S_NONE);
1476 list_for_each(s, fa->initial) {
1477 F(state_set_push(states, s));
1479 nstates = states->used;
1481 sigma = start_points(fa, &nsigma);
1484 /* initialize data structures */
1486 /* An ss->used x nsigma matrix of lists of states */
1487 F(ALLOC_N(reverse, nstates * nsigma));
1488 reverse_nonempty = bitset_init(nstates * nsigma);
1489 E(reverse_nonempty == NULL);
1490 F(ALLOC_N(partition, nstates));
1491 F(ALLOC_N(block, nstates));
1493 F(ALLOC_N(active, nstates * nsigma));
1494 F(ALLOC_N(active2, nstates * nsigma));
1496 /* PENDING is an array of pairs of ints. The i'th pair is stored in
1497 * PENDING[2*i] and PENDING[2*i + 1]. There are NPENDING pairs in
1498 * PENDING at any time. SPENDING is the maximum number of pairs
1499 * allocated for PENDING.
1501 size_t npending = 0, spending = 0;
1502 pending2 = bitset_init(nstates * nsigma);
1503 E(pending2 == NULL);
1505 split = state_set_init(-1, S_NONE);
1506 split2 = bitset_init(nstates);
1507 E(split == NULL || split2 == NULL);
1509 F(ALLOC_N(refine, nstates));
1510 refine2 = bitset_init(nstates);
1513 F(ALLOC_N(splitblock, nstates));
1515 for (int q = 0; q < nstates; q++) {
1516 splitblock[q] = state_set_init(-1, S_NONE);
1517 partition[q] = state_set_init(-1, S_NONE);
1518 E(splitblock[q] == NULL || partition[q] == NULL);
1519 for (int x = 0; x < nsigma; x++) {
1520 reverse[INDEX(q, x)] = state_set_init(-1, S_NONE);
1521 E(reverse[INDEX(q, x)] == NULL);
1522 F(ALLOC_N(active[INDEX(q, x)], 1));
1526 /* find initial partition and reverse edges */
1527 for (int q = 0; q < nstates; q++) {
1528 struct state *qq = states->states[q];
1534 F(state_set_push(partition[j], qq));
1536 for (int x = 0; x < nsigma; x++) {
1538 struct state *p = step(qq, y);
1540 int pn = state_set_index(states, p);
1542 F(state_set_push(reverse[INDEX(pn, x)], qq));
1543 bitset_set(reverse_nonempty, INDEX(pn, x));
1547 /* initialize active sets */
1548 for (int j = 0; j <= 1; j++)
1549 for (int x = 0; x < nsigma; x++)
1550 for (int q = 0; q < partition[j]->used; q++) {
1551 struct state *qq = partition[j]->states[q];
1552 int qn = state_set_index(states, qq);
1553 if (bitset_get(reverse_nonempty, INDEX(qn, x))) {
1554 active2[INDEX(qn, x)] =
1555 state_list_add(active[INDEX(j, x)], qq);
1556 E(active2[INDEX(qn, x)] == NULL);
1560 /* initialize pending */
1561 F(ALLOC_N(pending, 2*nsigma));
1564 for (int x = 0; x < nsigma; x++) {
1565 int a0 = active[INDEX(0,x)]->size;
1566 int a1 = active[INDEX(1,x)]->size;
1574 bitset_set(pending2, INDEX(j, x));
1577 /* process pending until fixed point */
1579 while (npending-- > 0) {
1580 int p = pending[2*npending];
1581 int x = pending[2*npending+1];
1582 bitset_clr(pending2, INDEX(p, x));
1584 /* find states that need to be split off their blocks */
1585 struct state_list *sh = active[INDEX(p,x)];
1586 for (struct state_list_node *m = sh->first; m != NULL; m = m->next) {
1587 int q = state_set_index(states, m->state);
1588 struct state_set *rev = reverse[INDEX(q, x)];
1589 for (int r =0; r < rev->used; r++) {
1590 struct state *rs = rev->states[r];
1591 int s = state_set_index(states, rs);
1592 if (! bitset_get(split2, s)) {
1593 bitset_set(split2, s);
1594 F(state_set_push(split, rs));
1596 F(state_set_push(splitblock[j], rs));
1597 if (!bitset_get(refine2, j)) {
1598 bitset_set(refine2, j);
1605 for(int rr=0; rr < ref; rr++) {
1607 struct state_set *sp = splitblock[j];
1608 if (sp->used < partition[j]->used) {
1609 struct state_set *b1 = partition[j];
1610 struct state_set *b2 = partition[k];
1611 for (int s = 0; s < sp->used; s++) {
1612 state_set_remove(b1, sp->states[s]);
1613 F(state_set_push(b2, sp->states[s]));
1614 int snum = state_set_index(states, sp->states[s]);
1616 for (int c = 0; c < nsigma; c++) {
1617 struct state_list_node *sn = active2[INDEX(snum, c)];
1618 if (sn != NULL && sn->sl == active[INDEX(j,c)]) {
1619 state_list_remove(sn);
1620 active2[INDEX(snum, c)] =
1621 state_list_add(active[INDEX(k, c)],
1623 E(active2[INDEX(snum, c)] == NULL);
1628 for (int c = 0; c < nsigma; c++) {
1629 int aj = active[INDEX(j, c)]->size;
1630 int ak = active[INDEX(k, c)]->size;
1631 if (npending + 1 > spending) {
1633 F(REALLOC_N(pending, 2 * spending));
1635 pending[2*npending + 1] = c;
1636 if (!bitset_get(pending2, INDEX(j, c))
1637 && 0 < aj && aj <= ak) {
1638 bitset_set(pending2, INDEX(j, c));
1639 pending[2*npending] = j;
1641 bitset_set(pending2, INDEX(k, c));
1642 pending[2*npending] = k;
1648 for (int s = 0; s < sp->used; s++) {
1649 int snum = state_set_index(states, sp->states[s]);
1650 bitset_clr(split2, snum);
1652 bitset_clr(refine2, j);
1658 /* make a new state for each equivalence class, set initial state */
1659 newstates = state_set_init(k, S_NONE);
1660 E(newstates == NULL);
1661 F(ALLOC_N(nsnum, k));
1662 F(ALLOC_N(nsind, nstates));
1664 for (int n = 0; n < k; n++) {
1665 struct state *s = make_state();
1667 newstates->states[n] = s;
1668 struct state_set *partn = partition[n];
1669 for (int q=0; q < partn->used; q++) {
1670 struct state *qs = partn->states[q];
1671 int qnum = state_set_index(states, qs);
1672 if (qs == fa->initial)
1673 s->live = 1; /* Abuse live to flag the new intial state */
1674 nsnum[n] = qnum; /* select representative */
1675 nsind[qnum] = n; /* and point from partition to new state */
1679 /* build transitions and set acceptance */
1680 for (int n = 0; n < k; n++) {
1681 struct state *s = newstates->states[n];
1682 s->accept = states->states[nsnum[n]]->accept;
1683 for_each_trans(t, states->states[nsnum[n]]) {
1684 int toind = state_set_index(states, t->to);
1685 struct state *nto = newstates->states[nsind[toind]];
1686 F(add_new_trans(s, nto, t->min, t->max));
1690 /* Get rid of old states and transitions and turn NEWTSTATES into
1693 for (int n=0; n < k; n++)
1694 if (newstates->states[n]->live) {
1695 struct state *ini = newstates->states[n];
1696 newstates->states[n] = newstates->states[0];
1697 newstates->states[0] = ini;
1699 for (int n=0; n < k-1; n++)
1700 newstates->states[n]->next = newstates->states[n+1];
1701 fa->initial = newstates->states[0];
1709 state_set_free(states);
1711 bitset_free(reverse_nonempty);
1713 for (int i=0; i < nstates*nsigma; i++) {
1715 state_set_free(reverse[i]);
1717 state_list_free(active[i]);
1723 bitset_free(pending2);
1724 state_set_free(split);
1725 bitset_free(split2);
1727 bitset_free(refine2);
1728 for (int q=0; q < nstates; q++) {
1730 state_set_free(splitblock[q]);
1732 state_set_free(partition[q]);
1736 state_set_free(newstates);
1738 if (collect(fa) < 0)
1746 static int minimize_brzozowski(struct fa *fa) {
1747 struct state_set *set;
1749 /* Minimize using Brzozowski's algorithm */
1750 set = fa_reverse(fa);
1752 F(determinize(fa, set));
1753 state_set_free(set);
1755 set = fa_reverse(fa);
1757 F(determinize(fa, set));
1758 state_set_free(set);
1764 int fa_minimize(struct fa *fa) {
1772 if (fa_minimization_algorithm == FA_MIN_BRZOZOWSKI) {
1773 r = minimize_brzozowski(fa);
1775 r = minimize_hopcroft(fa);
1784 * Construction of fa
1787 static struct fa *fa_make_empty(void) {
1792 if (add_state(fa, 0) == NULL) {
1796 fa->deterministic = 1;
1801 static struct fa *fa_make_epsilon(void) {
1802 struct fa *fa = fa_make_empty();
1804 fa->initial->accept = 1;
1805 fa->deterministic= 1;
1811 static struct fa *fa_make_char(uchar c) {
1812 struct fa *fa = fa_make_empty();
1816 struct state *s = fa->initial;
1817 struct state *t = add_state(fa, 1);
1823 r = add_new_trans(s, t, c, c);
1826 fa->deterministic = 1;
1834 struct fa *fa_make_basic(unsigned int basic) {
1837 if (basic == FA_EMPTY) {
1838 return fa_make_empty();
1839 } else if (basic == FA_EPSILON) {
1840 return fa_make_epsilon();
1841 } else if (basic == FA_TOTAL) {
1842 struct fa *fa = fa_make_epsilon();
1843 r = add_new_trans(fa->initial, fa->initial, UCHAR_MIN, UCHAR_MAX);
1853 int fa_is_basic(struct fa *fa, unsigned int basic) {
1854 if (basic == FA_EMPTY) {
1855 return ! fa->initial->accept && fa->initial->tused == 0;
1856 } else if (basic == FA_EPSILON) {
1857 return fa->initial->accept && fa->initial->tused == 0;
1858 } else if (basic == FA_TOTAL) {
1859 if (! fa->initial->accept)
1862 if (fa->initial->tused != 2)
1864 struct trans *t1 = fa->initial->trans;
1865 struct trans *t2 = fa->initial->trans + 1;
1866 if (t1->to != fa->initial || t2->to != fa->initial)
1868 if (t2->max != UCHAR_MAX) {
1870 t2 = fa->initial->trans;
1872 return (t1->min == UCHAR_MIN && t1->max == 'A' - 1 &&
1873 t2->min == 'Z' + 1 && t2->max == UCHAR_MAX);
1875 struct trans *t = fa->initial->trans;
1876 return fa->initial->tused == 1 &&
1877 t->to == fa->initial &&
1878 t->min == UCHAR_MIN && t->max == UCHAR_MAX;
1884 static struct fa *fa_clone(struct fa *fa) {
1885 struct fa *result = NULL;
1886 struct state_set *set = state_set_init(-1, S_DATA|S_SORTED);
1889 if (fa == NULL || set == NULL || ALLOC(result) < 0)
1892 result->deterministic = fa->deterministic;
1893 result->minimal = fa->minimal;
1894 result->nocase = fa->nocase;
1895 list_for_each(s, fa->initial) {
1896 int i = state_set_push(set, s);
1899 struct state *q = add_state(result, s->accept);
1904 q->reachable = s->reachable;
1906 for (int i=0; i < set->used; i++) {
1907 struct state *s = set->states[i];
1908 struct state *sc = set->data[i];
1909 for_each_trans(t, s) {
1910 int to = state_set_index(set, t->to);
1912 struct state *toc = set->data[to];
1913 r = add_new_trans(sc, toc, t->min, t->max);
1918 state_set_free(set);
1921 state_set_free(set);
1926 static int case_expand(struct fa *fa);
1928 /* Compute FA1|FA2 and set FA1 to that automaton. FA2 is freed */
1929 ATTRIBUTE_RETURN_CHECK
1930 static int union_in_place(struct fa *fa1, struct fa **fa2) {
1934 if (fa1->nocase != (*fa2)->nocase) {
1935 if (case_expand(fa1) < 0)
1937 if (case_expand(*fa2) < 0)
1941 s = add_state(fa1, 0);
1944 r = add_epsilon_trans(s, fa1->initial);
1947 r = add_epsilon_trans(s, (*fa2)->initial);
1951 fa1->deterministic = 0;
1955 set_initial(fa1, s);
1960 struct fa *fa_union(struct fa *fa1, struct fa *fa2) {
1961 fa1 = fa_clone(fa1);
1962 fa2 = fa_clone(fa2);
1963 if (fa1 == NULL || fa2 == NULL)
1966 F(union_in_place(fa1, &fa2));
1975 /* Concat FA2 onto FA1; frees FA2 and changes FA1 to FA1.FA2 */
1976 ATTRIBUTE_RETURN_CHECK
1977 static int concat_in_place(struct fa *fa1, struct fa **fa2) {
1980 if (fa1->nocase != (*fa2)->nocase) {
1981 if (case_expand(fa1) < 0)
1983 if (case_expand(*fa2) < 0)
1987 list_for_each(s, fa1->initial) {
1990 r = add_epsilon_trans(s, (*fa2)->initial);
1996 fa1->deterministic = 0;
2003 struct fa *fa_concat(struct fa *fa1, struct fa *fa2) {
2004 fa1 = fa_clone(fa1);
2005 fa2 = fa_clone(fa2);
2007 if (fa1 == NULL || fa2 == NULL)
2010 F(concat_in_place(fa1, &fa2));
2022 static struct fa *fa_make_char_set(bitset *cset, int negate) {
2023 struct fa *fa = fa_make_empty();
2027 struct state *s = fa->initial;
2028 struct state *t = add_state(fa, 1);
2035 while (from <= UCHAR_MAX) {
2036 while (from <= UCHAR_MAX && bitset_get(cset, from) == negate)
2038 if (from > UCHAR_MAX)
2041 while (to < UCHAR_MAX && (bitset_get(cset, to + 1) == !negate))
2043 r = add_new_trans(s, t, from, to);
2049 fa->deterministic = 1;
2058 static struct fa *fa_star(struct fa *fa) {
2066 s = add_state(fa, 1);
2070 r = add_epsilon_trans(s, fa->initial);
2075 list_for_each(p, fa->initial->next) {
2077 r = add_epsilon_trans(p, s);
2082 fa->deterministic = 0;
2092 /* Form the automaton (FA){N}; FA is not modified */
2093 static struct fa *repeat(struct fa *fa, int n) {
2095 return fa_make_epsilon();
2096 } else if (n == 1) {
2097 return fa_clone(fa);
2099 struct fa *cfa = fa_clone(fa);
2103 struct fa *tfa = fa_clone(fa);
2108 if (concat_in_place(cfa, &tfa) < 0) {
2119 struct fa *fa_iter(struct fa *fa, int min, int max) {
2125 if (min > max && max != -1) {
2126 return fa_make_empty();
2129 struct fa *sfa = fa_star(fa);
2134 struct fa *cfa = repeat(fa, min);
2139 if (concat_in_place(cfa, &sfa) < 0) {
2146 struct fa *cfa = NULL;
2149 cfa = repeat(fa, min);
2153 struct fa *cfa2 = fa_clone(fa);
2159 struct fa *cfa3 = fa_clone(fa);
2165 list_for_each(s, cfa3->initial) {
2167 r = add_epsilon_trans(s, cfa2->initial);
2176 fa_merge(cfa3, &cfa2);
2180 list_for_each(s, cfa->initial) {
2182 r = add_epsilon_trans(s, cfa2->initial);
2190 fa_merge(cfa, &cfa2);
2191 cfa->deterministic = 0;
2194 if (collect(cfa) < 0) {
2202 static void sort_transition_intervals(struct fa *fa) {
2203 list_for_each(s, fa->initial) {
2204 qsort(s->trans, s->tused, sizeof(*s->trans), trans_intv_cmp);
2208 struct fa *fa_intersect(struct fa *fa1, struct fa *fa2) {
2210 struct fa *fa = NULL;
2211 struct state_set *worklist = NULL;
2212 state_triple_hash *newstates = NULL;
2215 return fa_clone(fa1);
2217 if (fa_is_basic(fa1, FA_EMPTY) || fa_is_basic(fa2, FA_EMPTY))
2218 return fa_make_empty();
2220 if (fa1->nocase != fa2->nocase) {
2221 F(case_expand(fa1));
2222 F(case_expand(fa2));
2225 fa = fa_make_empty();
2226 worklist = state_set_init(-1, S_NONE);
2227 newstates = state_triple_init();
2228 if (fa == NULL || worklist == NULL || newstates == NULL)
2231 sort_transition_intervals(fa1);
2232 sort_transition_intervals(fa2);
2234 F(state_set_push(worklist, fa1->initial));
2235 F(state_set_push(worklist, fa2->initial));
2236 F(state_set_push(worklist, fa->initial));
2237 F(state_triple_push(newstates,
2238 fa1->initial, fa2->initial, fa->initial));
2239 while (worklist->used) {
2240 struct state *s = state_set_pop(worklist);
2241 struct state *p2 = state_set_pop(worklist);
2242 struct state *p1 = state_set_pop(worklist);
2243 s->accept = p1->accept && p2->accept;
2245 struct trans *t1 = p1->trans;
2246 struct trans *t2 = p2->trans;
2247 for (int n1 = 0, b2 = 0; n1 < p1->tused; n1++) {
2248 while (b2 < p2->tused && t2[b2].max < t1[n1].min)
2251 n2 < p2->tused && t1[n1].max >= t2[n2].min;
2253 if (t2[n2].max >= t1[n1].min) {
2254 struct state *r = state_triple_thd(newstates,
2255 t1[n1].to, t2[n2].to);
2257 r = add_state(fa, 0);
2259 F(state_set_push(worklist, t1[n1].to));
2260 F(state_set_push(worklist, t2[n2].to));
2261 F(state_set_push(worklist, r));
2262 F(state_triple_push(newstates,
2263 t1[n1].to, t2[n2].to, r));
2265 char min = t1[n1].min > t2[n2].min
2266 ? t1[n1].min : t2[n2].min;
2267 char max = t1[n1].max < t2[n2].max
2268 ? t1[n1].max : t2[n2].max;
2269 ret = add_new_trans(s, r, min, max);
2276 fa->deterministic = fa1->deterministic && fa2->deterministic;
2277 fa->nocase = fa1->nocase && fa2->nocase;
2279 state_set_free(worklist);
2280 state_triple_free(newstates);
2282 if (collect(fa) < 0) {
2295 int fa_contains(struct fa *fa1, struct fa *fa2) {
2297 struct state_set *worklist = NULL; /* List of pairs of states */
2298 struct state_set *visited = NULL; /* List of pairs of states */
2300 if (fa1 == NULL || fa2 == NULL)
2306 F(determinize(fa2, NULL));
2307 sort_transition_intervals(fa1);
2308 sort_transition_intervals(fa2);
2310 F(state_pair_push(&worklist, fa1->initial, fa2->initial));
2311 F(state_pair_push(&visited, fa1->initial, fa2->initial));
2312 while (worklist->used) {
2313 struct state *p1, *p2;
2315 p1 = state_set_pop_data(worklist, &v2);
2318 if (p1->accept && !p2->accept)
2321 struct trans *t1 = p1->trans;
2322 struct trans *t2 = p2->trans;
2323 for(int n1 = 0, b2 = 0; n1 < p1->tused; n1++) {
2324 while (b2 < p2->tused && t2[b2].max < t1[n1].min)
2326 int min1 = t1[n1].min, max1 = t1[n1].max;
2328 n2 < p2->tused && t1[n1].max >= t2[n2].min;
2330 if (t2[n2].min > min1)
2332 if (t2[n2].max < CHAR_MAX)
2333 min1 = t2[n2].max + 1;
2338 if (state_pair_find(visited, t1[n1].to, t2[n2].to) == -1) {
2339 F(state_pair_push(&worklist, t1[n1].to, t2[n2].to));
2340 F(state_pair_push(&visited, t1[n1].to, t2[n2].to));
2350 state_set_free(worklist);
2351 state_set_free(visited);
2358 static int add_crash_trans(struct fa *fa, struct state *s, struct state *crash,
2363 /* Never transition on anything in [A-Z] */
2364 if (min > 'Z' || max < 'A') {
2365 result = add_new_trans(s, crash, min, max);
2366 } else if (min >= 'A' && max <= 'Z') {
2368 } else if (max <= 'Z') {
2370 result = add_new_trans(s, crash, min, 'A' - 1);
2371 } else if (min >= 'A') {
2373 result = add_new_trans(s, crash, 'Z' + 1, max);
2375 /* min < 'A' && max > 'Z' */
2376 result = add_new_trans(s, crash, min, 'A' - 1);
2378 result = add_new_trans(s, crash, 'Z' + 1, max);
2381 result = add_new_trans(s, crash, min, max);
2386 static int totalize(struct fa *fa) {
2388 struct state *crash = add_state(fa, 0);
2391 F(mark_reachable(fa));
2392 sort_transition_intervals(fa);
2394 r = add_crash_trans(fa, crash, crash, UCHAR_MIN, UCHAR_MAX);
2398 list_for_each(s, fa->initial) {
2399 int next = UCHAR_MIN;
2400 int tused = s->tused;
2401 for (int i=0; i < tused; i++) {
2402 uchar min = s->trans[i].min, max = s->trans[i].max;
2404 r = add_crash_trans(fa, s, crash, next, min - 1);
2411 if (next <= UCHAR_MAX) {
2412 r = add_crash_trans(fa, s, crash, next, UCHAR_MAX);
2422 struct fa *fa_complement(struct fa *fa) {
2425 F(determinize(fa, NULL));
2427 list_for_each(s, fa->initial)
2428 s->accept = ! s->accept;
2437 struct fa *fa_minus(struct fa *fa1, struct fa *fa2) {
2438 if (fa1 == NULL || fa2 == NULL)
2441 if (fa_is_basic(fa1, FA_EMPTY) || fa1 == fa2)
2442 return fa_make_empty();
2443 if (fa_is_basic(fa2, FA_EMPTY))
2444 return fa_clone(fa1);
2446 struct fa *cfa2 = fa_complement(fa2);
2450 struct fa *result = fa_intersect(fa1, cfa2);
2455 static int accept_to_accept(struct fa *fa) {
2457 struct state *s = add_state(fa, 0);
2461 F(mark_reachable(fa));
2462 list_for_each(a, fa->initial) {
2463 if (a->accept && a->reachable) {
2464 r = add_epsilon_trans(s, a);
2471 fa->deterministic = fa->minimal = 0;
2477 struct fa *fa_overlap(struct fa *fa1, struct fa *fa2) {
2478 struct fa *fa = NULL, *eps = NULL, *result = NULL;
2479 struct state_set *map = NULL;
2481 if (fa1 == NULL || fa2 == NULL)
2484 fa1 = fa_clone(fa1);
2485 fa2 = fa_clone(fa2);
2486 if (fa1 == NULL || fa2 == NULL)
2489 if (determinize(fa1, NULL) < 0)
2491 if (accept_to_accept(fa1) < 0)
2494 map = fa_reverse(fa2);
2495 state_set_free(map);
2496 if (determinize(fa2, NULL) < 0)
2498 if (accept_to_accept(fa2) < 0)
2500 map = fa_reverse(fa2);
2501 state_set_free(map);
2502 if (determinize(fa2, NULL) < 0)
2505 fa = fa_intersect(fa1, fa2);
2509 eps = fa_make_epsilon();
2513 result = fa_minus(fa, eps);
2525 int fa_equals(struct fa *fa1, struct fa *fa2) {
2526 if (fa1 == NULL || fa2 == NULL)
2529 /* fa_contains(fa1, fa2) && fa_contains(fa2, fa1) with error checking */
2530 int c1 = fa_contains(fa1, fa2);
2535 return fa_contains(fa2, fa1);
2538 static unsigned int chr_score(char c) {
2541 } else if (isalnum(c)) {
2543 } else if (isprint(c)) {
2545 } else if (c == '\0') {
2552 static unsigned int str_score(const struct re_str *str) {
2553 unsigned int score = 0;
2554 for (int i=0; i < str->len; i++) {
2555 score += chr_score(str->rx[i]);
2560 /* See if we get a better string for DST by appending C to SRC. If DST is
2561 * NULL or empty, always use SRC + C
2563 static struct re_str *string_extend(struct re_str *dst,
2564 const struct re_str *src,
2568 || str_score(src) + chr_score(c) < str_score(dst)) {
2569 int slen = src->len;
2571 dst = make_re_str(NULL);
2574 if (REALLOC_N(dst->rx, slen+2) < 0) {
2578 memcpy(dst->rx, src->rx, slen);
2580 dst->rx[slen + 1] = '\0';
2581 dst->len = slen + 1;
2586 static char pick_char(struct trans *t) {
2587 for (int c = t->min; c <= t->max; c++)
2588 if (isalpha(c)) return c;
2589 for (int c = t->min; c <= t->max; c++)
2590 if (isalnum(c)) return c;
2591 for (int c = t->min; c <= t->max; c++)
2592 if (isprint(c)) return c;
2596 /* Generate an example string for FA. Traverse all transitions and record
2597 * at each turn the "best" word found for that state.
2599 int fa_example(struct fa *fa, char **example, size_t *example_len) {
2600 struct re_str *word = NULL;
2601 struct state_set *path = NULL, *worklist = NULL;
2602 struct re_str *str = NULL;
2607 /* Sort to avoid any ambiguity because of reordering of transitions */
2608 sort_transition_intervals(fa);
2610 /* Map from state to string */
2611 path = state_set_init(-1, S_DATA|S_SORTED);
2612 str = make_re_str("");
2613 if (path == NULL || str == NULL)
2615 F(state_set_push_data(path, fa->initial, str));
2618 /* List of states still to visit */
2619 worklist = state_set_init(-1, S_NONE);
2620 if (worklist == NULL)
2622 F(state_set_push(worklist, fa->initial));
2624 while (worklist->used) {
2625 struct state *s = state_set_pop(worklist);
2626 struct re_str *ps = state_set_find_data(path, s);
2627 for_each_trans(t, s) {
2628 char c = pick_char(t);
2629 int toind = state_set_index(path, t->to);
2631 struct re_str *w = string_extend(NULL, ps, c);
2633 F(state_set_push(worklist, t->to));
2634 F(state_set_push_data(path, t->to, w));
2636 path->data[toind] = string_extend(path->data[toind], ps, c);
2641 for (int i=0; i < path->used; i++) {
2642 struct state *p = path->states[i];
2643 struct re_str *ps = path->data[i];
2645 (word == NULL || word->len == 0
2646 || (ps->len > 0 && str_score(word) > str_score(ps)))) {
2653 state_set_free(path);
2654 state_set_free(worklist);
2656 *example_len = word->len;
2657 *example = word->rx;
2662 state_set_free(path);
2663 state_set_free(worklist);
2677 static int fa_enumerate_intl(struct state *s, struct enum_intl *ei, int pos) {
2680 if (ei->bsize <= pos + 1) {
2682 F(REALLOC_N(ei->buf, ei->bsize));
2685 ei->buf[pos] = '\0';
2686 for_each_trans(t, s) {
2690 for (int i=t->min; i <= t->max; i++) {
2692 if (t->to->accept) {
2693 if (ei->nwords >= ei->limit)
2695 ei->words[ei->nwords] = strdup(ei->buf);
2696 E(ei->words[ei->nwords] == NULL);
2699 result = fa_enumerate_intl(t->to, ei, pos+1);
2704 ei->buf[pos] = '\0';
2710 int fa_enumerate(struct fa *fa, int limit, char ***words) {
2711 struct enum_intl ei;
2716 ei.bsize = 8; /* Arbitrary initial size */
2718 F(ALLOC_N(ei.words, limit));
2719 F(ALLOC_N(ei.buf, ei.bsize));
2721 /* We use the visited bit to track which states we already visited
2722 * during the construction of a word to detect loops */
2723 list_for_each(s, fa->initial)
2725 fa->initial->visited = 1;
2726 if (fa->initial->accept) {
2727 if (ei.nwords >= limit)
2729 ei.words[0] = strdup("");
2730 E(ei.words[0] == NULL);
2733 result = fa_enumerate_intl(fa->initial, &ei, 0);
2744 for (int i=0; i < ei.nwords; i++)
2750 /* Expand the automaton FA by replacing every transition s(c) -> p from
2751 * state s to p on character c by two transitions s(X) -> r, r(c) -> p via
2753 * If ADD_MARKER is true, also add for each original state s a new a loop
2754 * s(Y) -> q and q(X) -> s through a new state q.
2756 * The end result is that an automaton accepting "a|ab" is turned into one
2757 * accepting "Xa|XaXb" if add_marker is false and "(YX)*Xa|(YX)*Xa(YX)*Xb"
2758 * when add_marker is true.
2760 * The returned automaton is a copy of FA, FA is not modified.
2762 static struct fa *expand_alphabet(struct fa *fa, int add_marker,
2770 F(mark_reachable(fa));
2771 list_for_each(p, fa->initial) {
2775 struct state *r = add_state(fa, 0);
2776 r->trans = p->trans;
2777 r->tused = p->tused;
2778 r->tsize = p->tsize;
2780 p->tused = p->tsize = 0;
2781 ret = add_new_trans(p, r, X, X);
2785 struct state *q = add_state(fa, 0);
2786 ret = add_new_trans(p, q, Y, Y);
2789 ret = add_new_trans(q, p, X, X);
2800 static bitset *alphabet(struct fa *fa) {
2801 bitset *bs = bitset_init(UCHAR_NUM);
2806 list_for_each(s, fa->initial) {
2807 for (int i=0; i < s->tused; i++) {
2808 for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2815 static bitset *last_chars(struct fa *fa) {
2816 bitset *bs = bitset_init(UCHAR_NUM);
2821 list_for_each(s, fa->initial) {
2822 for (int i=0; i < s->tused; i++) {
2823 if (s->trans[i].to->accept) {
2824 for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2832 static bitset *first_chars(struct fa *fa) {
2833 bitset *bs = bitset_init(UCHAR_NUM);
2834 struct state *s = fa->initial;
2839 for (int i=0; i < s->tused; i++) {
2840 for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2846 /* Return 1 if F1 and F2 are known to be unambiguously concatenable
2847 * according to simple heuristics. Return 0 if they need to be checked
2848 * further to decide ambiguity
2849 * Return -1 if an allocation fails
2851 static int is_splittable(struct fa *fa1, struct fa *fa2) {
2852 bitset *alpha1 = NULL;
2853 bitset *alpha2 = NULL;
2854 bitset *last1 = NULL;
2855 bitset *first2 = NULL;
2858 alpha2 = alphabet(fa2);
2859 last1 = last_chars(fa1);
2860 if (alpha2 == NULL || last1 == NULL)
2862 if (bitset_disjoint(last1, alpha2, UCHAR_NUM)) {
2867 alpha1 = alphabet(fa1);
2868 first2 = first_chars(fa2);
2869 if (alpha1 == NULL || first2 == NULL)
2871 if (bitset_disjoint(first2, alpha1, UCHAR_NUM)) {
2877 bitset_free(alpha1);
2878 bitset_free(alpha2);
2880 bitset_free(first2);
2884 /* This algorithm is due to Anders Moeller, and can be found in class
2885 * AutomatonOperations in dk.brics.grammar
2887 int fa_ambig_example(struct fa *fa1, struct fa *fa2,
2888 char **upv, size_t *upv_len,
2889 char **pv, char **v) {
2890 static const char X = '\001';
2891 static const char Y = '\002';
2892 char *result = NULL, *s = NULL;
2893 size_t result_len = 0;
2895 struct fa *mp = NULL, *ms = NULL, *sp = NULL, *ss = NULL, *amb = NULL;
2896 struct fa *a1f = NULL, *a1t = NULL, *a2f = NULL, *a2t = NULL;
2897 struct fa *b1 = NULL, *b2 = NULL;
2906 r = is_splittable(fa1, fa2);
2914 #define MPs Ys Xs "(" Xs "(.|\n))+"
2915 #define MSs Ys Xs "(" Xs "(.|\n))*"
2916 #define SPs "(" Xs "(.|\n))+" Ys Xs
2917 #define SSs "(" Xs "(.|\n))*" Ys Xs
2918 /* These could become static constants */
2919 r = fa_compile( MPs, strlen(MPs), &mp);
2920 if (r != REG_NOERROR)
2922 r = fa_compile( MSs, strlen(MSs), &ms);
2923 if (r != REG_NOERROR)
2925 r = fa_compile( SPs, strlen(SPs), &sp);
2926 if (r != REG_NOERROR)
2928 r = fa_compile( SSs, strlen(SSs), &ss);
2929 if (r != REG_NOERROR)
2938 a1f = expand_alphabet(fa1, 0, X, Y);
2939 a1t = expand_alphabet(fa1, 1, X, Y);
2940 a2f = expand_alphabet(fa2, 0, X, Y);
2941 a2t = expand_alphabet(fa2, 1, X, Y);
2942 if (a1f == NULL || a1t == NULL || a2f == NULL || a2t == NULL)
2945 /* Compute b1 = ((a1f . mp) & a1t) . ms */
2946 if (concat_in_place(a1f, &mp) < 0)
2948 b1 = fa_intersect(a1f, a1t);
2951 if (concat_in_place(b1, &ms) < 0)
2954 /* Compute b2 = ss . ((sp . a2f) & a2t) */
2955 if (concat_in_place(sp, &a2f) < 0)
2957 b2 = fa_intersect(sp, a2t);
2960 if (concat_in_place(ss, &b2) < 0)
2965 /* The automaton we are really interested in */
2966 amb = fa_intersect(b1, b2);
2971 r = fa_example(amb, &s, &s_len);
2977 result_len = (s_len-1)/2 - 1;
2978 F(ALLOC_N(result, result_len + 1));
2981 for (i=0; s[2*i] == X; i++) {
2982 assert((t - result) < result_len);
2989 for ( ;s[2*i] == X; i++) {
2990 assert((t - result) < result_len);
2997 for (; 2*i+1 < s_len; i++) {
2998 assert((t - result) < result_len);
3005 /* Clean up intermediate automata */
3021 *upv_len = result_len;
3030 * Construct an fa from a regular expression
3032 static struct fa *fa_from_re(struct re *re) {
3033 struct fa *result = NULL;
3038 result = fa_from_re(re->exp1);
3041 struct fa *fa2 = fa_from_re(re->exp2);
3044 if (union_in_place(result, &fa2) < 0)
3050 result = fa_from_re(re->exp1);
3053 struct fa *fa2 = fa_from_re(re->exp2);
3056 if (concat_in_place(result, &fa2) < 0)
3061 result = fa_make_char_set(re->cset, re->negate);
3065 struct fa *fa = fa_from_re(re->exp);
3068 result = fa_iter(fa, re->min, re->max);
3073 result = fa_make_epsilon();
3076 result = fa_make_char(re->c);
3088 static void free_re(struct re *re) {
3091 assert(re->ref == 0);
3093 if (re->type == UNION || re->type == CONCAT) {
3096 } else if (re->type == ITER) {
3098 } else if (re->type == CSET) {
3099 bitset_free(re->cset);
3104 int fa_compile(const char *regexp, size_t size, struct fa **fa) {
3105 struct re *re = NULL;
3106 struct re_parse parse;
3111 parse.rend = regexp + size;
3112 parse.error = REG_NOERROR;
3114 re = parse_regexp(&parse);
3118 *fa = fa_from_re(re);
3121 if (*fa == NULL || collect(*fa) < 0)
3122 parse.error = REG_ESPACE;
3126 /* We represent a case-insensitive FA by using only transitions on
3127 * lower-case letters.
3129 int fa_nocase(struct fa *fa) {
3130 if (fa == NULL || fa->nocase)
3134 list_for_each(s, fa->initial) {
3135 int tused = s->tused;
3136 /* For every transition on characters in [A-Z] add a corresponding
3137 * transition on [a-z]; remove any portion covering [A-Z] */
3138 for (int i=0; i < tused; i++) {
3139 struct trans *t = s->trans + i;
3140 int lc_min = t->min < 'A' ? 'a' : tolower(t->min);
3141 int lc_max = t->max > 'Z' ? 'z' : tolower(t->max);
3143 if (t->min > 'Z' || t->max < 'A')
3145 if (t->min >= 'A' && t->max <= 'Z') {
3146 t->min = tolower(t->min);
3147 t->max = tolower(t->max);
3148 } else if (t->max <= 'Z') {
3151 F(add_new_trans(s, t->to, lc_min, lc_max));
3152 } else if (t->min >= 'A') {
3155 F(add_new_trans(s, t->to, lc_min, lc_max));
3157 /* t->min < 'A' && t->max > 'Z' */
3158 F(add_new_trans(s, t->to, 'Z' + 1, t->max));
3159 s->trans[i].max = 'A' - 1;
3160 F(add_new_trans(s, s->trans[i].to, lc_min, lc_max));
3170 int fa_is_nocase(struct fa *fa) {
3174 /* If FA is case-insensitive, turn it into a case-sensitive automaton by
3175 * adding transitions on upper-case letters for each existing transition on
3176 * lower-case letters */
3177 static int case_expand(struct fa *fa) {
3182 list_for_each(s, fa->initial) {
3183 int tused = s->tused;
3184 /* For every transition on characters in [a-z] add a corresponding
3185 * transition on [A-Z] */
3186 for (int i=0; i < tused; i++) {
3187 struct trans *t = s->trans + i;
3188 int lc_min = t->min < 'a' ? 'A' : toupper(t->min);
3189 int lc_max = t->max > 'z' ? 'Z' : toupper(t->max);
3191 if (t->min > 'z' || t->max < 'a')
3193 F(add_new_trans(s, t->to, lc_min, lc_max));
3203 * Regular expression parser
3206 static struct re *make_re(enum re_type type) {
3208 if (make_ref(re) == 0)
3213 static struct re *make_re_rep(struct re *exp, int min, int max) {
3214 struct re *re = make_re(ITER);
3225 static struct re *make_re_binop(enum re_type type, struct re *exp1,
3227 struct re *re = make_re(type);
3238 static struct re *make_re_char(uchar c) {
3239 struct re *re = make_re(CHAR);
3245 static struct re *make_re_char_set(bool negate, bool no_ranges) {
3246 struct re *re = make_re(CSET);
3248 re->negate = negate;
3249 re->no_ranges = no_ranges;
3250 re->cset = bitset_init(UCHAR_NUM);
3251 if (re->cset == NULL)
3257 static bool more(struct re_parse *parse) {
3258 return parse->rx < parse->rend;
3261 static bool match(struct re_parse *parse, char m) {
3264 if (*parse->rx == m) {
3271 static bool peek(struct re_parse *parse, const char *chars) {
3272 return *parse->rx != '\0' && strchr(chars, *parse->rx) != NULL;
3275 static bool next(struct re_parse *parse, char *c) {
3283 static bool parse_char(struct re_parse *parse, int quoted, char *c) {
3286 if (quoted && *parse->rx == '\\') {
3288 return next(parse, c);
3290 return next(parse, c);
3294 static void add_re_char(struct re *re, uchar from, uchar to) {
3295 assert(re->type == CSET);
3296 for (unsigned int c = from; c <= to; c++)
3297 bitset_set(re->cset, c);
3300 static void parse_char_class(struct re_parse *parse, struct re *re) {
3301 if (! more(parse)) {
3302 parse->error = REG_EBRACK;
3306 parse_char(parse, 0, &from);
3308 if (match(parse, '-')) {
3309 if (! more(parse)) {
3310 parse->error = REG_EBRACK;
3313 if (peek(parse, "]")) {
3315 parse->error = REG_ERANGE;
3318 add_re_char(re, from, to);
3319 add_re_char(re, '-', '-');
3321 } else if (!parse_char(parse, 0, &to)) {
3322 parse->error = REG_ERANGE;
3327 parse->error = REG_ERANGE;
3330 add_re_char(re, from, to);
3335 static struct re *parse_simple_exp(struct re_parse *parse) {
3336 struct re *re = NULL;
3338 if (match(parse, '[')) {
3339 bool negate = match(parse, '^');
3340 re = make_re_char_set(negate, parse->no_ranges);
3342 parse->error = REG_ESPACE;
3345 parse_char_class(parse, re);
3346 if (parse->error != REG_NOERROR)
3348 while (more(parse) && ! peek(parse, "]")) {
3349 parse_char_class(parse, re);
3350 if (parse->error != REG_NOERROR)
3353 if (! match(parse, ']')) {
3354 parse->error = REG_EBRACK;
3357 } else if (match(parse, '(')) {
3358 if (match(parse, ')')) {
3359 re = make_re(EPSILON);
3361 parse->error = REG_ESPACE;
3365 re = parse_regexp(parse);
3368 if (! match(parse, ')')) {
3369 parse->error = REG_EPAREN;
3373 } else if (match(parse, '.')) {
3374 re = make_re_char_set(1, parse->no_ranges);
3376 parse->error = REG_ESPACE;
3379 add_re_char(re, '\n', '\n');
3380 } else if (more(parse)) {
3382 if (!parse_char(parse, 1, &c)) {
3383 parse->error = REG_EESCAPE;
3386 re = make_re_char(c);
3388 parse->error = REG_ESPACE;
3392 re = make_re(EPSILON);
3394 parse->error = REG_ESPACE;
3404 static int parse_int(struct re_parse *parse) {
3410 /* We need to be careful that strtoul will never access
3411 * memory beyond parse->rend
3413 for (lim = parse->rx; lim < parse->rend && *lim >= '0' && *lim <= '9';
3415 if (lim < parse->rend) {
3416 l = strtoul(parse->rx, &end, 10);
3417 used = end - parse->rx;
3419 char *s = strndup(parse->rx, parse->rend - parse->rx);
3421 parse->error = REG_ESPACE;
3424 l = strtoul(s, &end, 10);
3432 if ((l<0) || (l > INT_MAX)) {
3433 parse->error = REG_BADBR;
3439 static struct re *parse_repeated_exp(struct re_parse *parse) {
3440 struct re *re = parse_simple_exp(parse);
3443 if (match(parse, '?')) {
3444 re = make_re_rep(re, 0, 1);
3445 } else if (match(parse, '*')) {
3446 re = make_re_rep(re, 0, -1);
3447 } else if (match(parse, '+')) {
3448 re = make_re_rep(re, 1, -1);
3449 } else if (match(parse, '{')) {
3451 min = parse_int(parse);
3454 if (match(parse, ',')) {
3455 max = parse_int(parse);
3457 max = -1; /* If it's not an int, it means 'unbounded' */
3458 if (! match(parse, '}')) {
3459 parse->error = REG_EBRACE;
3462 } else if (match(parse, '}')) {
3465 parse->error = REG_EBRACE;
3468 if (min > max && max != -1) {
3469 parse->error = REG_BADBR;
3472 re = make_re_rep(re, min, max);
3475 parse->error = REG_ESPACE;
3482 static struct re *parse_concat_exp(struct re_parse *parse) {
3483 struct re *re = parse_repeated_exp(parse);
3487 if (more(parse) && ! peek(parse, ")|")) {
3488 struct re *re2 = parse_concat_exp(parse);
3491 re = make_re_binop(CONCAT, re, re2);
3493 parse->error = REG_ESPACE;
3504 static struct re *parse_regexp(struct re_parse *parse) {
3505 struct re *re = NULL;
3507 /* Something like (|r) */
3508 if (peek(parse, "|"))
3509 re = make_re(EPSILON);
3511 re = parse_concat_exp(parse);
3515 if (match(parse, '|')) {
3516 struct re *re2 = NULL;
3517 /* Something like (r|) */
3518 if (peek(parse, ")"))
3519 re2 = make_re(EPSILON);
3521 re2 = parse_regexp(parse);
3525 re = make_re_binop(UNION, re, re2);
3527 parse->error = REG_ESPACE;
3539 * Convert a STRUCT RE to a string. Code is complicated by the fact that
3540 * we try to be clever and avoid unneeded parens and concatenation with
3543 static int re_as_string(const struct re *re, struct re_str *str);
3545 static int re_binop_count(enum re_type type, const struct re *re) {
3546 assert(type == CONCAT || type == UNION);
3547 if (re->type == type) {
3548 return re_binop_count(type, re->exp1) + re_binop_count(type, re->exp2);
3554 static int re_binop_store(enum re_type type, const struct re *re,
3555 const struct re **list) {
3557 if (type == re->type) {
3558 pos = re_binop_store(type, re->exp1, list);
3559 pos += re_binop_store(type, re->exp2, list + pos);
3567 static int re_union_as_string(const struct re *re, struct re_str *str) {
3568 assert(re->type == UNION);
3571 const struct re **res = NULL;
3572 struct re_str *strings = NULL;
3575 nre = re_binop_count(re->type, re);
3576 r = ALLOC_N(res, nre);
3580 re_binop_store(re->type, re, res);
3582 r = ALLOC_N(strings, nre);
3587 for (int i=0; i < nre; i++) {
3588 if (re_as_string(res[i], strings + i) < 0)
3590 str->len += strings[i].len;
3594 r = re_str_alloc(str);
3599 for (int i=0; i < nre; i++) {
3602 memcpy(p, strings[i].rx, strings[i].len);
3603 p += strings[i].len;
3609 if (strings != NULL) {
3610 for (int i=0; i < nre; i++)
3611 release_re_str(strings + i);
3616 release_re_str(str);
3622 static int re_needs_parens_in_concat(const struct re *re) {
3623 return (re->type != CHAR && re->type != CSET && re->type != ITER);
3626 static int re_concat_as_string(const struct re *re, struct re_str *str) {
3627 assert(re->type == CONCAT);
3629 const struct re **res = NULL;
3630 struct re_str *strings = NULL;
3634 nre = re_binop_count(re->type, re);
3635 r = ALLOC_N(res, nre);
3638 re_binop_store(re->type, re, res);
3640 r = ALLOC_N(strings, nre);
3645 for (int i=0; i < nre; i++) {
3646 if (res[i]->type == EPSILON)
3648 if (re_as_string(res[i], strings + i) < 0)
3650 str->len += strings[i].len;
3651 if (re_needs_parens_in_concat(res[i]))
3655 r = re_str_alloc(str);
3660 for (int i=0; i < nre; i++) {
3661 if (res[i]->type == EPSILON)
3663 if (re_needs_parens_in_concat(res[i]))
3665 p = memcpy(p, strings[i].rx, strings[i].len);
3666 p += strings[i].len;
3667 if (re_needs_parens_in_concat(res[i]))
3674 if (strings != NULL) {
3675 for (int i=0; i < nre; i++)
3676 release_re_str(strings + i);
3681 release_re_str(str);
3686 static bool cset_contains(const struct re *cset, int c) {
3687 return bitset_get(cset->cset, c) != cset->negate;
3690 static int re_cset_as_string(const struct re *re, struct re_str *str) {
3691 const uchar rbrack = ']';
3692 const uchar dash = '-';
3693 const uchar nul = '\0';
3695 static const char *const empty_set = "[]";
3696 static const char *const total_set = "(.|\n)";
3697 static const char *const not_newline = ".";
3700 int from, to, negate;
3702 int incl_rbrack, incl_dash;
3705 str->len = strlen(empty_set);
3707 /* We can not include NUL explicitly in a CSET since we use ordinary
3708 NUL delimited strings to represent them. That means that we need to
3709 use negated representation if NUL is to be included (and vice versa)
3711 negate = cset_contains(re, nul);
3713 for (from = UCHAR_MIN;
3714 from <= UCHAR_MAX && cset_contains(re, from);
3716 if (from > UCHAR_MAX) {
3717 /* Special case: the set matches every character */
3718 str->rx = strdup(total_set);
3723 from <= UCHAR_MAX && cset_contains(re, from);
3725 if (from > UCHAR_MAX) {
3726 /* Special case: the set matches everything but '\n' */
3727 str->rx = strdup(not_newline);
3733 /* See if ']' and '-' will be explicitly included in the character set
3734 (INCL_RBRACK, INCL_DASH) As we loop over the character set, we reset
3735 these flags if they are in the set, but not mentioned explicitly
3737 incl_rbrack = cset_contains(re, rbrack) != negate;
3738 incl_dash = cset_contains(re, dash) != negate;
3740 if (re->no_ranges) {
3741 for (from = UCHAR_MIN; from <= UCHAR_MAX; from++)
3742 if (cset_contains(re, from) != negate)
3745 for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
3746 while (from <= UCHAR_MAX && cset_contains(re, from) == negate)
3748 if (from > UCHAR_MAX)
3751 to < UCHAR_MAX && (cset_contains(re, to+1) != negate);
3754 if (to == from && (from == rbrack || from == dash))
3756 if (from == rbrack || from == dash)
3758 if (to == rbrack || to == dash)
3761 len = (to == from) ? 1 : ((to == from + 1) ? 2 : 3);
3763 if (from < rbrack && rbrack < to)
3765 if (from < dash && dash < to)
3769 str->len += incl_rbrack + incl_dash;
3772 str->len += 1; /* For the ^ */
3774 r = re_str_alloc(str);
3785 if (re->no_ranges) {
3786 for (from = UCHAR_MIN; from <= UCHAR_MAX; from++) {
3787 if (from == rbrack || from == dash)
3789 if (cset_contains(re, from) != negate)
3793 for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
3794 while (from <= UCHAR_MAX && cset_contains(re, from) == negate)
3796 if (from > UCHAR_MAX)
3799 to < UCHAR_MAX && (cset_contains(re, to+1) != negate);
3802 if (to == from && (from == rbrack || from == dash))
3804 if (from == rbrack || from == dash)
3806 if (to == rbrack || to == dash)
3811 } else if (to == from + 1) {
3826 if (str->rx == NULL)
3828 str->len = strlen(str->rx);
3831 release_re_str(str);
3835 static int re_iter_as_string(const struct re *re, struct re_str *str) {
3836 const char *quant = NULL;
3840 if (re_as_string(re->exp, str) < 0)
3843 if (re->min == 0 && re->max == -1) {
3845 } else if (re->min == 1 && re->max == -1) {
3847 } else if (re->min == 0 && re->max == 1) {
3849 } else if (re->max == -1) {
3850 r = asprintf(&iter, "{%d,}", re->min);
3855 r = asprintf(&iter, "{%d,%d}", re->min, re->max);
3861 if (re->exp->type == CHAR || re->exp->type == CSET) {
3862 if (REALLOC_N(str->rx, str->len + strlen(quant) + 1) < 0)
3864 strcpy(str->rx + str->len, quant);
3865 str->len += strlen(quant);
3867 /* Format '(' + str->rx ')' + quant */
3868 if (REALLOC_N(str->rx, str->len + strlen(quant) + 1 + 2) < 0)
3870 memmove(str->rx + 1, str->rx, str->len);
3872 str->rx[str->len + 1] = ')';
3874 strcpy(str->rx + str->len, quant);
3875 str->len += strlen(quant);
3883 release_re_str(str);
3887 static int re_as_string(const struct re *re, struct re_str *str) {
3888 /* Characters that must be escaped */
3889 static const char * const special_chars = ".()[]{}*|+?\\^$";
3894 result = re_union_as_string(re, str);
3897 result = re_concat_as_string(re, str);
3900 result = re_cset_as_string(re, str);
3903 if (re->c == '\0' || strchr(special_chars, re->c) == NULL) {
3904 if (ALLOC_N(str->rx, 2) < 0)
3909 if (ALLOC_N(str->rx, 3) < 0)
3913 str->len = strlen(str->rx);
3917 result = re_iter_as_string(re, str);
3920 if (ALLOC_N(str->rx, 3) < 0)
3922 strcpy(str->rx, "()");
3923 str->len = strlen(str->rx);
3932 release_re_str(str);
3936 static int convert_trans_to_re(struct state *s) {
3937 struct re *re = NULL;
3939 struct trans *trans = NULL;
3945 qsort(s->trans, s->tused, sizeof(*s->trans), trans_to_cmp);
3946 for (int i = 0; i < s->tused - 1; i++) {
3947 if (s->trans[i].to != s->trans[i+1].to)
3950 r = ALLOC_N(trans, nto);
3954 struct state *to = s->trans[0].to;
3956 for_each_trans(t, s) {
3958 trans[tind].to = to;
3959 trans[tind].re = re;
3965 re = make_re_char_set(0, 0);
3969 add_re_char(re, t->min, t->max);
3971 assert(nto == tind + 1);
3972 trans[tind].to = to;
3973 trans[tind].re = re;
3975 /* Simplify CSETs with a single char to a CHAR */
3976 for (int t=0; t < nto; t++) {
3978 uchar chr = UCHAR_MIN;
3979 for (int c = 0; c < UCHAR_NUM; c++) {
3980 if (bitset_get(trans[t].re->cset, c)) {
3986 re_unref(trans[t].re);
3987 trans[t].re = make_re_char(chr);
3988 if (trans[t].re == NULL)
3994 s->tused = s->tsize = nto;
3999 for (int i=0; i < nto; i++)
4000 unref(trans[i].re, re);
4005 ATTRIBUTE_RETURN_CHECK
4006 static int add_new_re_trans(struct state *s1, struct state *s2,
4009 r = add_new_trans(s1, s2, 0, 0);
4012 last_trans(s1)->re = re;
4016 /* Add the regular expression R1 . LOOP* . R2 to the transition
4018 static int re_collapse_trans(struct state *s1, struct state *s2,
4019 struct re *r1, struct re *loop, struct re *r2) {
4020 struct re *re = NULL;
4022 if (loop->type != EPSILON) {
4023 loop = make_re_rep(ref(loop), 0, -1);
4028 if (r1->type == EPSILON) {
4029 if (loop->type == EPSILON) {
4032 re = make_re_binop(CONCAT, loop, ref(r2));
4035 if (loop->type == EPSILON) {
4036 if (r2->type == EPSILON) {
4039 re = make_re_binop(CONCAT, ref(r1), ref(r2));
4042 re = make_re_binop(CONCAT, ref(r1), loop);
4043 if (re != NULL && r2->type != EPSILON) {
4044 re = make_re_binop(CONCAT, re, ref(r2));
4051 struct trans *t = NULL;
4052 for (t = s1->trans; t <= last_trans(s1) && t->to != s2; t += 1);
4053 if (t > last_trans(s1)) {
4054 if (add_new_re_trans(s1, s2, re) < 0)
4057 if (t->re == NULL) {
4060 t->re = make_re_binop(UNION, re, t->re);
4067 // FIXME: make sure we don't leak loop
4071 static int convert_strings(struct fa *fa) {
4072 struct state_set *worklist = state_set_init(-1, S_NONE);
4075 E(worklist == NULL);
4077 /* Abuse hash to count indegree, reachable to mark visited states */
4078 list_for_each(s, fa->initial) {
4083 list_for_each(s, fa->initial) {
4084 for_each_trans(t, s) {
4089 for (struct state *s = fa->initial;
4091 s = state_set_pop(worklist)) {
4092 for (int i=0; i < s->tused; i++) {
4093 struct trans *t = s->trans + i;
4094 struct state *to = t->to;
4095 while (to->hash == 1 && to->tused == 1 && ! to->accept) {
4096 if (t->re == NULL) {
4097 t->re = to->trans->re;
4098 to->trans->re = NULL;
4100 t->re = make_re_binop(CONCAT, t->re, to->trans->re);
4104 t->to = to->trans->to;
4108 for (int j=0; j < s->tused; j++) {
4109 if (j != i && s->trans[j].to == to) {
4110 /* Combine transitions i and j; remove trans j */
4111 t->re = make_re_binop(UNION, t->re, s->trans[j].re);
4114 memmove(s->trans + j, s->trans + j + 1,
4115 sizeof(s->trans[j]) * (s->tused - j - 1));
4126 if (! to->reachable) {
4128 F(state_set_push(worklist, to));
4133 for (struct state *s = fa->initial; s->next != NULL; ) {
4134 if (s->next->hash == 0 && s->next->tused == 0) {
4135 struct state *del = s->next;
4136 s->next = del->next;
4146 state_set_free(worklist);
4150 /* Convert an FA to a regular expression.
4151 * The strategy is the following:
4152 * (1) For all states S1 and S2, convert the transitions between them
4153 * into one transition whose regexp is a CSET
4154 * (2) Add a new initial state INI and a new final state FIN to the automaton,
4155 * a transition from INI to the old initial state of FA, and a transition
4156 * from all accepting states of FA to FIN; the regexp on those transitions
4157 * matches only the empty word
4158 * (3) Eliminate states S (except for INI and FIN) one by one:
4159 * Let LOOP the regexp for the transition S -> S if it exists, epsilon
4161 * For all S1, S2 different from S with S1 -> S -> S2
4162 * Let R1 the regexp of S1 -> S
4163 * R2 the regexp of S -> S2
4164 * R3 the regexp S1 -> S2 (or epsilon if no such transition)
4165 * set the regexp on the transition S1 -> S2 to
4166 * R1 . (LOOP)* . R2 | R3
4167 * (4) The regexp for the whole FA can now be found as the regexp of
4168 * the transition INI -> FIN
4169 * (5) Convert that STRUCT RE to a string with RE_AS_STRING
4171 int fa_as_regexp(struct fa *fa, char **regexp, size_t *regexp_len) {
4173 struct state *fin = NULL, *ini = NULL;
4174 struct re *eps = NULL;
4182 eps = make_re(EPSILON);
4186 fin = add_state(fa,1);
4192 list_for_each(s, fa->initial) {
4193 r = convert_trans_to_re(s);
4196 if (s->accept && s != fin) {
4197 r = add_new_re_trans(s, fin, ref(eps));
4204 ini = add_state(fa, 0);
4208 r = add_new_re_trans(ini, fa->initial, ref(eps));
4211 set_initial(fa, ini);
4213 convert_strings(fa);
4215 list_for_each(s, fa->initial->next) {
4219 struct re *loop = eps;
4220 for_each_trans(t, s) {
4224 list_for_each(s1, fa->initial) {
4227 for (int t1 = 0; t1 < s1->tused; t1++) {
4228 if (s1->trans[t1].to == s) {
4229 for (int t = 0; t < s->tused; t++) {
4230 if (s->trans[t].to == s)
4232 r = re_collapse_trans(s1, s->trans[t].to,
4246 for_each_trans(t, fa->initial) {
4250 if (re_as_string(t->re, &str) < 0)
4253 *regexp_len = str.len;
4257 list_for_each(s, fa->initial) {
4258 for_each_trans(t, s) {
4271 static int re_restrict_alphabet(struct re *re, uchar from, uchar to) {
4278 r1 = re_restrict_alphabet(re->exp1, from, to);
4279 r2 = re_restrict_alphabet(re->exp2, from, to);
4280 result = (r1 != 0) ? r1 : r2;
4285 bitset_negate(re->cset, UCHAR_NUM);
4287 for (int i=from; i <= to; i++)
4288 bitset_clr(re->cset, i);
4291 if (from <= re->c && re->c <= to)
4295 result = re_restrict_alphabet(re->exp, from, to);
4307 int fa_restrict_alphabet(const char *regexp, size_t regexp_len,
4308 char **newregexp, size_t *newregexp_len,
4309 char from, char to) {
4311 struct re *re = NULL;
4312 struct re_parse parse;
4318 parse.rend = regexp + regexp_len;
4319 parse.error = REG_NOERROR;
4320 re = parse_regexp(&parse);
4321 if (parse.error != REG_NOERROR)
4324 result = re_restrict_alphabet(re, from, to);
4331 result = re_as_string(re, &str);
4332 *newregexp = str.rx;
4333 *newregexp_len = str.len;
4339 int fa_expand_char_ranges(const char *regexp, size_t regexp_len,
4340 char **newregexp, size_t *newregexp_len) {
4342 struct re *re = NULL;
4343 struct re_parse parse;
4349 parse.rend = regexp + regexp_len;
4350 parse.error = REG_NOERROR;
4351 parse.no_ranges = 1;
4352 re = parse_regexp(&parse);
4353 if (parse.error != REG_NOERROR)
4357 result = re_as_string(re, &str);
4358 *newregexp = str.rx;
4359 *newregexp_len = str.len;
4364 /* Expand regexp so that it is case-insensitive in a case-sensitive match.
4366 * Return 1 when a change was made, -1 when an allocation failed, and 0
4367 * when no change was made.
4369 static int re_case_expand(struct re *re) {
4370 int result = 0, r1, r2;
4375 r1 = re_case_expand(re->exp1);
4376 r2 = re_case_expand(re->exp2);
4377 result = (r1 != 0) ? r1 : r2;
4380 for (int c = 'A'; c <= 'Z'; c++)
4381 if (bitset_get(re->cset, c)) {
4383 bitset_set(re->cset, tolower(c));
4385 for (int c = 'a'; c <= 'z'; c++)
4386 if (bitset_get(re->cset, c)) {
4388 bitset_set(re->cset, toupper(c));
4392 if (isalpha(re->c)) {
4397 re->cset = bitset_init(UCHAR_NUM);
4398 if (re->cset == NULL)
4400 bitset_set(re->cset, tolower(c));
4401 bitset_set(re->cset, toupper(c));
4406 result = re_case_expand(re->exp);
4418 int fa_expand_nocase(const char *regexp, size_t regexp_len,
4419 char **newregexp, size_t *newregexp_len) {
4421 struct re *re = NULL;
4422 struct re_parse parse;
4428 parse.rend = regexp + regexp_len;
4429 parse.error = REG_NOERROR;
4430 re = parse_regexp(&parse);
4431 if (parse.error != REG_NOERROR)
4434 r = re_case_expand(re);
4442 result = re_as_string(re, &str);
4443 *newregexp = str.rx;
4444 *newregexp_len = str.len;
4446 *newregexp = strndup(regexp, regexp_len);
4447 *newregexp_len = regexp_len;
4448 result = (*newregexp == NULL) ? REG_ESPACE : REG_NOERROR;
4454 static void print_char(FILE *out, uchar c) {
4455 /* We escape '/' as '\\/' since dot chokes on bare slashes in labels;
4456 Also, a space ' ' is shown as '\s' */
4457 static const char *const escape_from = " \n\t\v\b\r\f\a/\0";
4458 static const char *const escape_to = "sntvbrfa/0";
4459 char *p = strchr(escape_from, c);
4461 int i = p - escape_from;
4462 fprintf(out, "\\\\%c", escape_to[i]);
4463 } else if (! isprint(c)) {
4464 fprintf(out, "\\\\0%03o", (unsigned char) c);
4465 } else if (c == '"') {
4466 fprintf(out, "\\\"");
4472 void fa_dot(FILE *out, struct fa *fa) {
4473 fprintf(out, "digraph {\n rankdir=LR;");
4474 list_for_each(s, fa->initial) {
4476 fprintf(out, "\"%p\" [shape=doublecircle];\n", s);
4478 fprintf(out, "\"%p\" [shape=circle];\n", s);
4481 fprintf(out, "%s -> \"%p\";\n", fa->deterministic ? "dfa" : "nfa",
4486 list_for_each(s, fa->initial) {
4487 for_each_trans(t, s) {
4488 fprintf(out, "\"%p\" -> \"%p\" [ label = \"", s, t->to);
4490 re_as_string(t->re, &str);
4491 for (int i=0; i < str.len; i++) {
4492 print_char(out, str.rx[i]);
4494 release_re_str(&str);
4496 print_char(out, t->min);
4497 if (t->min != t->max) {
4499 print_char(out, t->max);
4502 fprintf(out, "\" ];\n");
4505 fprintf(out, "}\n");
4510 * indent-tabs-mode: nil