2 * fa.c: finite automata
4 * Copyright (C) 2007-2016 David Lutterkort
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * Author: David Lutterkort <dlutter@redhat.com>
24 * This implementation follows closely the Java dk.brics.automaton package
25 * by Anders Moeller. The project's website is
26 * http://www.brics.dk/automaton/.
28 * It is by no means a complete reimplementation of that package; only a
29 * subset of what Automaton provides is implemented here.
43 #define UCHAR_NUM (UCHAR_MAX+1)
45 typedef unsigned char uchar;
47 #define E(cond) if (cond) goto error
48 #define F(expr) if ((expr) < 0) goto error
50 /* Which algorithm to use in FA_MINIMIZE */
51 int fa_minimization_algorithm = FA_MIN_HOPCROFT;
53 /* A finite automaton. INITIAL is both the initial state and the head of
54 * the list of all states. Any state that is allocated for this automaton
55 * is put on this list. Dead/unreachable states are cleared from the list
56 * at opportune times (e.g., during minimization) It's poor man's garbage
59 * Normally, transitions are on a character range [min..max]; in
60 * fa_as_regexp, we store regexps on transitions in the re field of each
61 * transition. TRANS_RE indicates that we do that, and is used by fa_dot to
62 * produce proper graphs of an automaton transitioning on regexps.
64 * For case-insensitive regexps (nocase == 1), the FA never has transitions
65 * on uppercase letters [A-Z], effectively removing these letters from the
69 struct state *initial;
70 int deterministic : 1;
72 unsigned int nocase : 1;
76 /* A state in a finite automaton. Transitions are never shared between
77 states so that we can free the list when we need to free the state */
81 unsigned int accept : 1;
82 unsigned int live : 1;
83 unsigned int reachable : 1;
84 unsigned int visited : 1; /* Used in various places to track progress */
85 /* Array of transitions. The TUSED first entries are used, the array
86 has allocated room for TSIZE */
92 /* A transition. If the input has a character in the inclusive
93 * range [MIN, MAX], move to TO
109 #define UINT_BIT (sizeof(unsigned int) * CHAR_BIT)
111 typedef unsigned int bitset;
113 static bitset *bitset_init(size_t nbits) {
115 if (ALLOC_N(bs, (nbits + UINT_BIT) / UINT_BIT) == -1)
120 static inline void bitset_clr(bitset *bs, unsigned int bit) {
121 bs[bit/UINT_BIT] &= ~(1 << (bit % UINT_BIT));
124 static inline void bitset_set(bitset *bs, unsigned int bit) {
125 bs[bit/UINT_BIT] |= 1 << (bit % UINT_BIT);
129 static inline int bitset_get(const bitset * const bs, unsigned int bit) {
130 return (bs[bit/UINT_BIT] >> bit % UINT_BIT) & 1;
134 static inline bool bitset_disjoint(const bitset *const bs1,
135 const bitset *const bs2,
137 for (int i=0; i < (nbits + UINT_BIT) / UINT_BIT; i++) {
144 static void bitset_free(bitset *bs) {
148 static void bitset_negate(bitset *bs, size_t nbits) {
149 for (int i=0; i < (nbits + UINT_BIT) / UINT_BIT; i++)
154 * Representation of a parsed regular expression. The regular expression is
155 * parsed according to the following grammar by PARSE_REGEXP:
157 * regexp: concat_exp ('|' regexp)?
158 * concat_exp: repeated_exp concat_exp?
159 * repeated_exp: simple_exp
163 * | simple_exp '{' INT (',' INT)? '}'
164 * simple_exp: char_class
168 * char_class: '[' char_exp+ ']'
169 * | '[' '^' char_exp+ ']'
170 * char_exp: CHAR '-' CHAR
183 #define re_unref(r) unref(r, re)
189 struct { /* UNION, CONCAT */
196 /* Whether we can use character ranges when converting back
198 unsigned int no_ranges:1;
211 /* Used to keep state of the regex parse; RX may contain NUL's */
213 const char *rx; /* Current position in regex */
214 const char *rend; /* Last char of rx+ 1 */
215 int error; /* error code */
216 /* Whether new CSET's should have the no_ranges flag set */
217 unsigned int no_ranges:1;
220 /* String with explicit length, used when converting re to string */
226 static struct re *parse_regexp(struct re_parse *parse);
228 /* A map from a set of states to a state. */
229 typedef hash_t state_set_hash;
231 static hash_val_t ptr_hash(const void *p);
233 static const int array_initial_size = 4;
234 static const int array_max_expansion = 128;
236 enum state_set_init_flags {
245 unsigned int sorted : 1;
246 unsigned int with_data : 1;
247 struct state **states;
251 struct state_set_list {
252 struct state_set_list *next;
253 struct state_set *set;
256 /* Clean up FA by removing dead transitions and states and reducing
257 * transitions. Unreachable states are freed. The return value is the same
258 * as FA; returning it is merely a convenience.
260 * Only automata in this state should be returned to the user
262 ATTRIBUTE_RETURN_CHECK
263 static int collect(struct fa *fa);
265 ATTRIBUTE_RETURN_CHECK
266 static int totalize(struct fa *fa);
268 /* Print an FA into a (fairly) fixed file if the environment variable
269 * FA_DOT_DIR is set. This code is only used for debugging
271 #define FA_DOT_DIR "FA_DOT_DIR"
274 static void fa_dot_debug(struct fa *fa, const char *tag) {
276 static int count = 0;
281 if ((dot_dir = getenv(FA_DOT_DIR)) == NULL)
284 r = asprintf(&fname, "%s/fa_%02d_%s.dot", dot_dir, count++, tag);
288 fp = fopen(fname, "w");
299 static void print_char_set(struct re *set) {
306 for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
307 while (bitset_get(set->cset, from) == set->negate)
309 if (from > UCHAR_MAX)
312 to < UCHAR_MAX && (bitset_get(set->cset, to+1) == !set->negate);
317 printf("%c-%c", from, to);
324 static void print_re(struct re *re) {
345 printf("){%d,%d}", re->min, re->max);
359 static void release_re_str(struct re_str *str) {
366 static void free_re_str(struct re_str *str) {
373 static struct re_str *make_re_str(const char *s) {
380 str->len = strlen(s);
381 if (str->rx == NULL) {
389 static int re_str_alloc(struct re_str *str) {
390 return ALLOC_N(str->rx, str->len + 1);
397 static void free_trans(struct state *s) {
400 s->tused = s->tsize = 0;
403 static void gut(struct fa *fa) {
404 list_for_each(s, fa->initial) {
407 list_free(fa->initial);
411 void fa_free(struct fa *fa) {
418 static struct state *make_state(void) {
422 s->hash = ptr_hash(s);
426 static struct state *add_state(struct fa *fa, int accept) {
427 struct state *s = make_state();
430 if (fa->initial == NULL) {
433 list_cons(fa->initial->next, s);
439 #define last_trans(s) ((s)->trans + (s)->tused - 1)
441 #define for_each_trans(t, s) \
442 for (struct trans *t = (s)->trans; \
443 (t - (s)->trans) < (s)->tused; \
446 ATTRIBUTE_RETURN_CHECK
447 static int add_new_trans(struct state *from, struct state *to,
448 uchar min, uchar max) {
451 if (from->tused == from->tsize) {
452 size_t tsize = from->tsize;
454 tsize = array_initial_size;
455 else if (from->tsize > array_max_expansion)
456 tsize += array_max_expansion;
459 if (REALLOC_N(from->trans, tsize) == -1)
463 from->trans[from->tused].to = to;
464 from->trans[from->tused].min = min;
465 from->trans[from->tused].max = max;
470 ATTRIBUTE_RETURN_CHECK
471 static int add_epsilon_trans(struct state *from,
474 from->accept |= to->accept;
475 for_each_trans(t, to) {
476 r = add_new_trans(from, t->to, t->min, t->max);
483 static void set_initial(struct fa *fa, struct state *s) {
484 list_remove(s, fa->initial);
485 list_cons(fa->initial, s);
488 /* Merge automaton FA2 into FA1. This simply adds FA2's states to FA1
489 and then frees FA2. It has no influence on the language accepted by FA1
491 static void fa_merge(struct fa *fa1, struct fa **fa2) {
492 list_append(fa1->initial, (*fa2)->initial);
498 * Operations on STATE_SET
500 static void state_set_free(struct state_set *set) {
508 static int state_set_init_data(struct state_set *set) {
510 if (set->data == NULL)
511 return ALLOC_N(set->data, set->size);
516 /* Create a new STATE_SET with an initial size of SIZE. If SIZE is -1, use
517 the default size ARRAY_INITIAL_SIZE. FLAGS is a bitmask indicating
519 - S_SORTED: keep the states in the set sorted by their address, and use
520 binary search for lookups. If it is not set, entries are kept in the
521 order in which they are added and lookups scan linearly through the
523 - S_DATA: allocate the DATA array in the set, and keep its size in sync
524 with the size of the STATES array.
526 static struct state_set *state_set_init(int size, int flags) {
527 struct state_set *set = NULL;
531 set->sorted = (flags & S_SORTED) ? 1 : 0;
532 set->with_data = (flags & S_DATA) ? 1 : 0;
535 F(ALLOC_N(set->states, set->size));
537 F(state_set_init_data(set));
545 ATTRIBUTE_RETURN_CHECK
546 static int state_set_expand(struct state_set *set) {
548 set->size = array_initial_size;
549 else if (set->size > array_max_expansion)
550 set->size += array_max_expansion;
553 if (REALLOC_N(set->states, set->size) < 0)
556 if (REALLOC_N(set->data, set->size) < 0)
560 /* We do this to provoke a SEGV as early as possible */
566 /* Return the index where S belongs in SET->STATES to keep it sorted. S
567 may not be in SET->STATES. The returned index is in the interval [0
568 .. SET->USED], with the latter indicating that S is larger than all
569 values in SET->STATES
571 static int state_set_pos(const struct state_set *set, const struct state *s) {
572 int l = 0, h = set->used;
575 if (set->states[m] > s)
577 else if (set->states[m] < s)
585 ATTRIBUTE_RETURN_CHECK
586 static int state_set_push(struct state_set *set, struct state *s) {
587 if (set->size == set->used)
588 if (state_set_expand(set) < 0)
591 int p = state_set_pos(set, s);
592 if (set->size == set->used)
593 if (state_set_expand(set) < 0)
595 while (p < set->used && set->states[p] <= s)
598 memmove(set->states + p + 1, set->states + p,
599 sizeof(*set->states) * (set->used - p));
600 if (set->data != NULL)
601 memmove(set->data + p + 1, set->data + p,
602 sizeof(*set->data) * (set->used - p));
608 set->states[set->used++] = s;
609 return set->used - 1;
613 ATTRIBUTE_RETURN_CHECK
614 static int state_set_push_data(struct state_set *set, struct state *s,
616 int i = state_set_push(set, s);
623 static int state_set_index(const struct state_set *set,
624 const struct state *s) {
626 int p = state_set_pos(set, s);
627 return (p < set->used && set->states[p] == s) ? p : -1;
629 for (int i=0; i < set->used; i++) {
630 if (set->states[i] == s)
637 static void state_set_remove(struct state_set *set,
638 const struct state *s) {
640 int p = state_set_index(set, s);
642 memmove(set->states + p, set->states + p + 1,
643 sizeof(*set->states) * (set->used - p - 1));
644 if (set->data != NULL)
645 memmove(set->data + p, set->data + p + 1,
646 sizeof(*set->data) * (set->used - p - 1));
648 int p = state_set_index(set, s);
650 set->states[p] = set->states[--set->used];
655 /* Only add S if it's not in SET yet. Return 1 if S was added, 0 if it was
656 already in the set and -1 on error. */
657 ATTRIBUTE_RETURN_CHECK
658 static int state_set_add(struct state_set *set, struct state *s) {
660 int p = state_set_pos(set, s);
661 if (p < set->used && set->states[p] == s)
663 if (set->size == set->used)
664 if (state_set_expand(set) < 0)
666 while (p < set->used && set->states[p] <= s)
669 memmove(set->states + p + 1, set->states + p,
670 sizeof(*set->states) * (set->used - p));
671 if (set->data != NULL)
672 memmove(set->data + p + 1, set->data + p,
673 sizeof(*set->data) * (set->used - p));
678 if (state_set_index(set, s) >= 0)
680 if (state_set_push(set, s) < 0)
685 /* We do this to provoke a SEGV as early as possible */
691 static struct state *state_set_pop(struct state_set *set) {
692 struct state *s = NULL;
694 s = set->states[--set->used];
698 static struct state *state_set_pop_data(struct state_set *set, void **d) {
699 struct state *s = NULL;
700 s = state_set_pop(set);
701 *d = set->data[set->used];
705 static void *state_set_find_data(struct state_set *set, struct state *s) {
706 int i = state_set_index(set, s);
713 static int state_set_equal(const struct state_set *set1,
714 const struct state_set *set2) {
715 if (set1->used != set2->used)
717 if (set1->sorted && set2->sorted) {
718 for (int i = 0; i < set1->used; i++)
719 if (set1->states[i] != set2->states[i])
723 for (int i=0; i < set1->used; i++)
724 if (state_set_index(set2, set1->states[i]) == -1)
731 static void state_set_compact(struct state_set *set) {
732 while (set->used > 0 && set->states[set->used] == NULL)
734 for (int i=0; i < set->used; i++) {
735 if (set->states[i] == NULL) {
737 set->states[i] = set->states[set->used];
739 set->data[i] = set->data[set->used];
741 while (set->used > 0 && set->states[set->used] == NULL)
747 /* Add an entry (FST, SND) to SET. FST is stored in SET->STATES, and SND is
748 stored in SET->DATA at the same index.
750 ATTRIBUTE_RETURN_CHECK
751 static int state_pair_push(struct state_set **set,
752 struct state *fst, struct state *snd) {
754 *set = state_set_init(-1, S_DATA);
757 int i = state_set_push(*set, fst);
760 (*set)->data[i] = snd;
765 /* Return the index of the pair (FST, SND) in SET, or -1 if SET contains no
768 static int state_pair_find(struct state_set *set, struct state *fst,
770 for (int i=0; i < set->used; i++)
771 if (set->states[i] == fst && set->data[i] == snd)
776 /* Jenkins' hash for void* */
777 static hash_val_t ptr_hash(const void *p) {
779 char *c = (char *) &p;
780 for (int i=0; i < sizeof(p); i++) {
782 hash += (hash << 10);
786 hash ^= (hash >> 11);
787 hash += (hash << 15);
791 typedef hash_t state_triple_hash;
793 static hash_val_t pair_hash(const void *key) {
794 register struct state *const *pair = key;
795 return pair[0]->hash + pair[1]->hash;
798 static int pair_cmp(const void *key1, const void *key2) {
799 return memcmp(key1, key2, 2*sizeof(struct state *));
802 static void state_triple_node_free(hnode_t *node, ATTRIBUTE_UNUSED void *ctx) {
803 free((void *) hnode_getkey(node));
807 static state_triple_hash *state_triple_init(void) {
808 state_triple_hash *hash;
810 hash = hash_create(HASHCOUNT_T_MAX, pair_cmp, pair_hash);
813 hash_set_allocator(hash, NULL, state_triple_node_free, NULL);
817 ATTRIBUTE_RETURN_CHECK
818 static int state_triple_push(state_triple_hash *hash,
823 if (ALLOC_N(pair, 2) < 0)
827 return hash_alloc_insert(hash, pair, s3);
830 static struct state * state_triple_thd(state_triple_hash *hash,
833 struct state *pair[2];
837 node = hash_lookup(hash, pair);
838 return (node == NULL) ? NULL : (struct state *) hnode_get(node);
841 static void state_triple_free(state_triple_hash *hash) {
843 hash_free_nodes(hash);
851 ATTRIBUTE_RETURN_CHECK
852 static int mark_reachable(struct fa *fa) {
853 struct state_set *worklist = state_set_init(-1, S_NONE);
858 list_for_each(s, fa->initial) {
861 fa->initial->reachable = 1;
863 for (struct state *s = fa->initial;
865 s = state_set_pop(worklist)) {
866 for_each_trans(t, s) {
867 if (! t->to->reachable) {
868 t->to->reachable = 1;
869 F(state_set_push(worklist, t->to));
876 state_set_free(worklist);
880 /* Return all reachable states. As a sideeffect, all states have their
881 REACHABLE flag set appropriately.
883 static struct state_set *fa_states(struct fa *fa) {
884 struct state_set *visited = state_set_init(-1, S_NONE);
887 r = mark_reachable(fa);
888 E(visited == NULL || r < 0);
890 list_for_each(s, fa->initial) {
892 F(state_set_push(visited, s));
896 state_set_free(visited);
900 /* Return all reachable accepting states. As a sideeffect, all states have
901 their REACHABLE flag set appropriately.
903 static struct state_set *fa_accept_states(struct fa *fa) {
904 struct state_set *accept = state_set_init(-1, S_NONE);
909 r = mark_reachable(fa);
912 list_for_each(s, fa->initial) {
913 if (s->reachable && s->accept)
914 F(state_set_push(accept, s));
918 state_set_free(accept);
922 /* Mark all live states, i.e. states from which an accepting state can be
923 reached. All states have their REACHABLE and LIVE flags set
926 ATTRIBUTE_RETURN_CHECK
927 static int mark_live(struct fa *fa) {
930 F(mark_reachable(fa));
932 list_for_each(s, fa->initial) {
933 s->live = s->reachable && s->accept;
938 list_for_each(s, fa->initial) {
939 if (! s->live && s->reachable) {
940 for_each_trans(t, s) {
956 * Reverse an automaton in place. Change FA so that it accepts the
957 * language that is the reverse of the input automaton.
959 * Returns a list of the new initial states of the automaton. The list must
960 * be freed by the caller.
962 static struct state_set *fa_reverse(struct fa *fa) {
963 struct state_set *all = NULL;
964 struct state_set *accept = NULL;
969 accept = fa_accept_states(fa);
972 F(state_set_init_data(all));
974 /* Reverse all transitions */
976 F(ALLOC_N(tused, all->used));
977 for (int i=0; i < all->used; i++) {
978 all->data[i] = all->states[i]->trans;
979 tused[i] = all->states[i]->tused;
980 all->states[i]->trans = NULL;
981 all->states[i]->tsize = 0;
982 all->states[i]->tused = 0;
984 for (int i=0; i < all->used; i++) {
985 struct state *s = all->states[i];
986 struct trans *t = all->data[i];
988 for (int j=0; j < tused[i]; j++) {
989 r = add_new_trans(t[j].to, s, t[j].min, t[j].max);
997 /* Make new initial and final states */
998 struct state *s = add_state(fa, 0);
1001 fa->initial->accept = 1;
1003 for (int i=0; i < accept->used; i++) {
1004 r = add_epsilon_trans(s, accept->states[i]);
1009 fa->deterministic = 0;
1011 state_set_free(all);
1014 state_set_free(all);
1015 state_set_free(accept);
1020 * Return a sorted array of all interval start points in FA. The returned
1021 * array is a string (null terminated)
1023 static uchar* start_points(struct fa *fa, int *npoints) {
1024 char pointset[UCHAR_NUM];
1025 uchar *points = NULL;
1027 F(mark_reachable(fa));
1028 MEMZERO(pointset, UCHAR_NUM);
1029 list_for_each(s, fa->initial) {
1033 for_each_trans(t, s) {
1034 pointset[t->min] = 1;
1035 if (t->max < UCHAR_MAX)
1036 pointset[t->max+1] = 1;
1041 for(int i=0; i < UCHAR_NUM; *npoints += pointset[i], i++);
1043 F(ALLOC_N(points, *npoints+1));
1044 for (int i=0, n=0; i < UCHAR_NUM; i++) {
1046 points[n++] = (uchar) i;
1056 * Operations on STATE_SET_HASH
1058 static int state_set_hash_contains(state_set_hash *smap,
1059 struct state_set *set) {
1060 return hash_lookup(smap, set) != NULL;
1064 * Find the set in SMAP that has the same states as SET. If the two are
1065 * different, i.e. they point to different memory locations, free SET and
1066 * return the set found in SMAP
1068 static struct state_set *state_set_hash_uniq(state_set_hash *smap,
1069 struct state_set *set) {
1070 hnode_t *node = hash_lookup(smap, set);
1071 const struct state_set *orig_set = hnode_getkey(node);
1072 if (orig_set != set) {
1073 state_set_free(set);
1075 return (struct state_set *) orig_set;
1078 static struct state *state_set_hash_get_state(state_set_hash *smap,
1079 struct state_set *set) {
1080 hnode_t *node = hash_lookup(smap, set);
1081 return (struct state *) hnode_get(node);
1084 static hash_val_t set_hash(const void *key) {
1085 hash_val_t hash = 0;
1086 const struct state_set *set = key;
1088 for (int i = 0; i < set->used; i++) {
1089 hash += set->states[i]->hash;
1094 static int set_cmp(const void *key1, const void *key2) {
1095 const struct state_set *set1 = key1;
1096 const struct state_set *set2 = key2;
1098 return state_set_equal(set1, set2) ? 0 : 1;
1101 static void set_destroy(hnode_t *node, ATTRIBUTE_UNUSED void *ctx) {
1102 struct state_set *set = (struct state_set *) hnode_getkey(node);
1103 state_set_free(set);
1107 ATTRIBUTE_RETURN_CHECK
1108 static int state_set_hash_add(state_set_hash **smap,
1109 struct state_set *set, struct fa *fa) {
1110 if (*smap == NULL) {
1111 *smap = hash_create(HASHCOUNT_T_MAX, set_cmp, set_hash);
1113 hash_set_allocator(*smap, NULL, set_destroy, NULL);
1115 struct state *s = add_state(fa, 0);
1117 F(hash_alloc_insert(*smap, set, s));
1123 static void state_set_hash_free(state_set_hash *smap,
1124 struct state_set *protect) {
1125 if (protect != NULL) {
1126 hnode_t *node = hash_lookup(smap, protect);
1127 hash_delete(smap, node);
1128 hnode_getkey(node) = NULL;
1129 set_destroy(node, NULL);
1131 hash_free_nodes(smap);
1135 static int state_set_list_add(struct state_set_list **list,
1136 struct state_set *set) {
1137 struct state_set_list *elt;
1141 list_cons(*list, elt);
1145 static struct state_set *state_set_list_pop(struct state_set_list **list) {
1146 struct state_set_list *elt = *list;
1147 struct state_set *set = elt->set;
1154 /* Compare transitions lexicographically by (to, min, reverse max) */
1155 static int trans_to_cmp(const void *v1, const void *v2) {
1156 const struct trans *t1 = v1;
1157 const struct trans *t2 = v2;
1159 if (t1->to != t2->to) {
1160 return (t1->to < t2->to) ? -1 : 1;
1162 if (t1->min < t2->min)
1164 if (t1->min > t2->min)
1166 if (t1->max > t2->max)
1168 return (t1->max < t2->max) ? 1 : 0;
1171 /* Compare transitions lexicographically by (min, reverse max, to) */
1172 static int trans_intv_cmp(const void *v1, const void *v2) {
1173 const struct trans *t1 = v1;
1174 const struct trans *t2 = v2;
1176 if (t1->min < t2->min)
1178 if (t1->min > t2->min)
1180 if (t1->max > t2->max)
1182 if (t1->max < t2->max)
1184 if (t1->to != t2->to) {
1185 return (t1->to < t2->to) ? -1 : 1;
1191 * Reduce an automaton by combining overlapping and adjacent edge intervals
1192 * with the same destination.
1194 static void reduce(struct fa *fa) {
1195 list_for_each(s, fa->initial) {
1199 qsort(s->trans, s->tused, sizeof(*s->trans), trans_to_cmp);
1201 struct trans *t = s->trans;
1202 while (j < s->tused) {
1203 if (t[i].to == t[j].to && t[j].min <= t[i].max + 1) {
1204 if (t[j].max > t[i].max)
1205 t[i].max = t[j].max;
1210 memmove(s->trans + i, s->trans + j,
1211 sizeof(*s->trans) * (s->tused - j));
1217 /* Shrink if we use less than half the allocated size */
1218 if (s->tsize > array_initial_size && 2*s->tused < s->tsize) {
1220 r = REALLOC_N(s->trans, s->tsize);
1222 s->tsize = s->tused;
1228 * Remove dead transitions from an FA; a transition is dead is it does not
1229 * lead to a live state. This also removes any states that are not
1230 * reachable any longer from FA->INITIAL.
1232 * Returns the same FA as a convenience
1235 static void collect_trans(struct fa *fa) {
1236 list_for_each(s, fa->initial) {
1241 while (i < s->tused) {
1242 if (! s->trans[i].to->live) {
1244 memmove(s->trans + i, s->trans + s->tused,
1254 static void collect_dead_states(struct fa *fa) {
1255 /* Remove all dead states and free their storage */
1256 for (struct state *s = fa->initial; s->next != NULL; ) {
1257 if (! s->next->live) {
1258 struct state *del = s->next;
1259 s->next = del->next;
1268 static int collect(struct fa *fa) {
1271 if (! fa->initial->live) {
1272 /* This automaton accepts nothing, make it the canonical
1275 list_for_each(s, fa->initial) {
1278 list_free(fa->initial->next);
1279 fa->deterministic = 1;
1282 collect_dead_states(fa);
1290 static void swap_initial(struct fa *fa) {
1291 struct state *s = fa->initial;
1292 if (s->next != NULL) {
1293 fa->initial = s->next;
1294 s->next = fa->initial->next;
1295 fa->initial->next = s;
1300 * Make a finite automaton deterministic using the given set of initial
1301 * states with the subset construction. This also eliminates dead states
1302 * and transitions and reduces and orders the transitions for each state
1304 static int determinize(struct fa *fa, struct state_set *ini) {
1306 int make_ini = (ini == NULL);
1307 const uchar *points = NULL;
1308 state_set_hash *newstate = NULL;
1309 struct state_set_list *worklist = NULL;
1312 if (fa->deterministic)
1315 points = start_points(fa, &npoints);
1318 ini = state_set_init(-1, S_NONE);
1319 if (ini == NULL || state_set_push(ini, fa->initial) < 0) {
1320 state_set_free(ini);
1325 F(state_set_list_add(&worklist, ini));
1326 F(state_set_hash_add(&newstate, ini, fa));
1327 // Make the new state the initial state
1329 while (worklist != NULL) {
1330 struct state_set *sset = state_set_list_pop(&worklist);
1331 struct state *r = state_set_hash_get_state(newstate, sset);
1332 for (int q=0; q < sset->used; q++) {
1333 r->accept |= sset->states[q]->accept;
1335 for (int n=0; n < npoints; n++) {
1336 struct state_set *pset = state_set_init(-1, S_SORTED);
1338 for(int q=0 ; q < sset->used; q++) {
1339 for_each_trans(t, sset->states[q]) {
1340 if (t->min <= points[n] && points[n] <= t->max) {
1341 F(state_set_add(pset, t->to));
1345 if (!state_set_hash_contains(newstate, pset)) {
1346 F(state_set_list_add(&worklist, pset));
1347 F(state_set_hash_add(&newstate, pset, fa));
1349 pset = state_set_hash_uniq(newstate, pset);
1351 struct state *q = state_set_hash_get_state(newstate, pset);
1352 uchar min = points[n];
1353 uchar max = UCHAR_MAX;
1355 max = points[n+1] - 1;
1356 if (add_new_trans(r, q, min, max) < 0)
1360 fa->deterministic = 1;
1364 state_set_hash_free(newstate, make_ini ? NULL : ini);
1365 free((void *) points);
1366 if (collect(fa) < 0)
1375 * Minimization. As a sideeffect of minimization, the transitions are
1376 * reduced and ordered.
1379 static struct state *step(struct state *s, uchar c) {
1380 for_each_trans(t, s) {
1381 if (t->min <= c && c <= t->max)
1388 struct state_list_node *first;
1389 struct state_list_node *last;
1393 struct state_list_node {
1394 struct state_list *sl;
1395 struct state_list_node *next;
1396 struct state_list_node *prev;
1397 struct state *state;
1400 static struct state_list_node *state_list_add(struct state_list *sl,
1402 struct state_list_node *n;
1410 if (sl->size++ == 0) {
1421 static void state_list_remove(struct state_list_node *n) {
1422 struct state_list *sl = n->sl;
1425 sl->first = n->next;
1427 n->prev->next = n->next;
1431 n->next->prev = n->prev;
1436 static void state_list_free(struct state_list *sl) {
1438 list_free(sl->first);
1442 /* The linear index of element (q,c) in an NSTATES * NSIGMA matrix */
1443 #define INDEX(q, c) (q * nsigma + c)
1445 static int minimize_hopcroft(struct fa *fa) {
1446 struct state_set *states = NULL;
1447 uchar *sigma = NULL;
1448 struct state_set **reverse = NULL;
1449 bitset *reverse_nonempty = NULL;
1450 struct state_set **partition = NULL;
1451 unsigned int *block = NULL;
1452 struct state_list **active = NULL;
1453 struct state_list_node **active2 = NULL;
1454 int *pending = NULL;
1455 bitset *pending2 = NULL;
1456 struct state_set *split = NULL;
1457 bitset *split2 = NULL;
1459 bitset *refine2 = NULL;
1460 struct state_set **splitblock = NULL;
1461 struct state_set *newstates = NULL;
1465 unsigned int nstates = 0;
1468 F(determinize(fa, NULL));
1470 /* Total automaton, nothing to do */
1471 if (fa->initial->tused == 1
1472 && fa->initial->trans[0].to == fa->initial
1473 && fa->initial->trans[0].min == UCHAR_MIN
1474 && fa->initial->trans[0].max == UCHAR_MAX)
1479 /* make arrays for numbered states and effective alphabet */
1480 states = state_set_init(-1, S_NONE);
1483 list_for_each(s, fa->initial) {
1484 F(state_set_push(states, s));
1486 nstates = states->used;
1488 sigma = start_points(fa, &nsigma);
1491 /* initialize data structures */
1493 /* An ss->used x nsigma matrix of lists of states */
1494 F(ALLOC_N(reverse, nstates * nsigma));
1495 reverse_nonempty = bitset_init(nstates * nsigma);
1496 E(reverse_nonempty == NULL);
1497 F(ALLOC_N(partition, nstates));
1498 F(ALLOC_N(block, nstates));
1500 F(ALLOC_N(active, nstates * nsigma));
1501 F(ALLOC_N(active2, nstates * nsigma));
1503 /* PENDING is an array of pairs of ints. The i'th pair is stored in
1504 * PENDING[2*i] and PENDING[2*i + 1]. There are NPENDING pairs in
1505 * PENDING at any time. SPENDING is the maximum number of pairs
1506 * allocated for PENDING.
1508 size_t npending = 0, spending = 0;
1509 pending2 = bitset_init(nstates * nsigma);
1510 E(pending2 == NULL);
1512 split = state_set_init(-1, S_NONE);
1513 split2 = bitset_init(nstates);
1514 E(split == NULL || split2 == NULL);
1516 F(ALLOC_N(refine, nstates));
1517 refine2 = bitset_init(nstates);
1520 F(ALLOC_N(splitblock, nstates));
1522 for (int q = 0; q < nstates; q++) {
1523 splitblock[q] = state_set_init(-1, S_NONE);
1524 partition[q] = state_set_init(-1, S_NONE);
1525 E(splitblock[q] == NULL || partition[q] == NULL);
1526 for (int x = 0; x < nsigma; x++) {
1527 reverse[INDEX(q, x)] = state_set_init(-1, S_NONE);
1528 E(reverse[INDEX(q, x)] == NULL);
1529 F(ALLOC_N(active[INDEX(q, x)], 1));
1533 /* find initial partition and reverse edges */
1534 for (int q = 0; q < nstates; q++) {
1535 struct state *qq = states->states[q];
1541 F(state_set_push(partition[j], qq));
1543 for (int x = 0; x < nsigma; x++) {
1545 struct state *p = step(qq, y);
1547 int pn = state_set_index(states, p);
1549 F(state_set_push(reverse[INDEX(pn, x)], qq));
1550 bitset_set(reverse_nonempty, INDEX(pn, x));
1554 /* initialize active sets */
1555 for (int j = 0; j <= 1; j++)
1556 for (int x = 0; x < nsigma; x++)
1557 for (int q = 0; q < partition[j]->used; q++) {
1558 struct state *qq = partition[j]->states[q];
1559 int qn = state_set_index(states, qq);
1560 if (bitset_get(reverse_nonempty, INDEX(qn, x))) {
1561 active2[INDEX(qn, x)] =
1562 state_list_add(active[INDEX(j, x)], qq);
1563 E(active2[INDEX(qn, x)] == NULL);
1567 /* initialize pending */
1568 F(ALLOC_N(pending, 2*nsigma));
1571 for (int x = 0; x < nsigma; x++) {
1572 int a0 = active[INDEX(0,x)]->size;
1573 int a1 = active[INDEX(1,x)]->size;
1581 bitset_set(pending2, INDEX(j, x));
1584 /* process pending until fixed point */
1586 while (npending-- > 0) {
1587 int p = pending[2*npending];
1588 int x = pending[2*npending+1];
1589 bitset_clr(pending2, INDEX(p, x));
1591 /* find states that need to be split off their blocks */
1592 struct state_list *sh = active[INDEX(p,x)];
1593 for (struct state_list_node *m = sh->first; m != NULL; m = m->next) {
1594 int q = state_set_index(states, m->state);
1595 struct state_set *rev = reverse[INDEX(q, x)];
1596 for (int r =0; r < rev->used; r++) {
1597 struct state *rs = rev->states[r];
1598 int s = state_set_index(states, rs);
1599 if (! bitset_get(split2, s)) {
1600 bitset_set(split2, s);
1601 F(state_set_push(split, rs));
1603 F(state_set_push(splitblock[j], rs));
1604 if (!bitset_get(refine2, j)) {
1605 bitset_set(refine2, j);
1612 for(int rr=0; rr < ref; rr++) {
1614 struct state_set *sp = splitblock[j];
1615 if (sp->used < partition[j]->used) {
1616 struct state_set *b1 = partition[j];
1617 struct state_set *b2 = partition[k];
1618 for (int s = 0; s < sp->used; s++) {
1619 state_set_remove(b1, sp->states[s]);
1620 F(state_set_push(b2, sp->states[s]));
1621 int snum = state_set_index(states, sp->states[s]);
1623 for (int c = 0; c < nsigma; c++) {
1624 struct state_list_node *sn = active2[INDEX(snum, c)];
1625 if (sn != NULL && sn->sl == active[INDEX(j,c)]) {
1626 state_list_remove(sn);
1627 active2[INDEX(snum, c)] =
1628 state_list_add(active[INDEX(k, c)],
1630 E(active2[INDEX(snum, c)] == NULL);
1635 for (int c = 0; c < nsigma; c++) {
1636 int aj = active[INDEX(j, c)]->size;
1637 int ak = active[INDEX(k, c)]->size;
1638 if (npending + 1 > spending) {
1640 F(REALLOC_N(pending, 2 * spending));
1642 pending[2*npending + 1] = c;
1643 if (!bitset_get(pending2, INDEX(j, c))
1644 && 0 < aj && aj <= ak) {
1645 bitset_set(pending2, INDEX(j, c));
1646 pending[2*npending] = j;
1648 bitset_set(pending2, INDEX(k, c));
1649 pending[2*npending] = k;
1655 for (int s = 0; s < sp->used; s++) {
1656 int snum = state_set_index(states, sp->states[s]);
1657 bitset_clr(split2, snum);
1659 bitset_clr(refine2, j);
1665 /* make a new state for each equivalence class, set initial state */
1666 newstates = state_set_init(k, S_NONE);
1667 E(newstates == NULL);
1668 F(ALLOC_N(nsnum, k));
1669 F(ALLOC_N(nsind, nstates));
1671 for (int n = 0; n < k; n++) {
1672 struct state *s = make_state();
1674 newstates->states[n] = s;
1675 struct state_set *partn = partition[n];
1676 for (int q=0; q < partn->used; q++) {
1677 struct state *qs = partn->states[q];
1678 int qnum = state_set_index(states, qs);
1679 if (qs == fa->initial)
1680 s->live = 1; /* Abuse live to flag the new initial state */
1681 nsnum[n] = qnum; /* select representative */
1682 nsind[qnum] = n; /* and point from partition to new state */
1686 /* build transitions and set acceptance */
1687 for (int n = 0; n < k; n++) {
1688 struct state *s = newstates->states[n];
1689 s->accept = states->states[nsnum[n]]->accept;
1690 for_each_trans(t, states->states[nsnum[n]]) {
1691 int toind = state_set_index(states, t->to);
1692 struct state *nto = newstates->states[nsind[toind]];
1693 F(add_new_trans(s, nto, t->min, t->max));
1697 /* Get rid of old states and transitions and turn NEWTSTATES into
1700 for (int n=0; n < k; n++)
1701 if (newstates->states[n]->live) {
1702 struct state *ini = newstates->states[n];
1703 newstates->states[n] = newstates->states[0];
1704 newstates->states[0] = ini;
1706 for (int n=0; n < k-1; n++)
1707 newstates->states[n]->next = newstates->states[n+1];
1708 fa->initial = newstates->states[0];
1716 state_set_free(states);
1718 bitset_free(reverse_nonempty);
1720 for (int i=0; i < nstates*nsigma; i++) {
1722 state_set_free(reverse[i]);
1724 state_list_free(active[i]);
1730 bitset_free(pending2);
1731 state_set_free(split);
1732 bitset_free(split2);
1734 bitset_free(refine2);
1735 for (int q=0; q < nstates; q++) {
1737 state_set_free(splitblock[q]);
1739 state_set_free(partition[q]);
1743 state_set_free(newstates);
1745 if (collect(fa) < 0)
1753 static int minimize_brzozowski(struct fa *fa) {
1754 struct state_set *set;
1756 /* Minimize using Brzozowski's algorithm */
1757 set = fa_reverse(fa);
1759 F(determinize(fa, set));
1760 state_set_free(set);
1762 set = fa_reverse(fa);
1764 F(determinize(fa, set));
1765 state_set_free(set);
1771 int fa_minimize(struct fa *fa) {
1779 if (fa_minimization_algorithm == FA_MIN_BRZOZOWSKI) {
1780 r = minimize_brzozowski(fa);
1782 r = minimize_hopcroft(fa);
1791 * Construction of fa
1794 static struct fa *fa_make_empty(void) {
1799 if (add_state(fa, 0) == NULL) {
1803 /* Even though, technically, this fa is both minimal and deterministic,
1804 * this function is also used to allocate new fa's which are then modified
1805 * further. Rather than risk erroneously marking such an fa as minimal
1806 * and deterministic, we do not do that here and take the minor hit if
1807 * that should ever need to be determined for an actual empty fa
1812 static struct fa *fa_make_epsilon(void) {
1813 struct fa *fa = fa_make_empty();
1815 fa->initial->accept = 1;
1816 fa->deterministic= 1;
1822 static struct fa *fa_make_char(uchar c) {
1823 struct fa *fa = fa_make_empty();
1827 struct state *s = fa->initial;
1828 struct state *t = add_state(fa, 1);
1834 r = add_new_trans(s, t, c, c);
1837 fa->deterministic = 1;
1845 struct fa *fa_make_basic(unsigned int basic) {
1848 if (basic == FA_EMPTY) {
1849 return fa_make_empty();
1850 } else if (basic == FA_EPSILON) {
1851 return fa_make_epsilon();
1852 } else if (basic == FA_TOTAL) {
1853 struct fa *fa = fa_make_epsilon();
1854 r = add_new_trans(fa->initial, fa->initial, UCHAR_MIN, UCHAR_MAX);
1864 int fa_is_basic(struct fa *fa, unsigned int basic) {
1865 if (basic == FA_EMPTY) {
1866 return ! fa->initial->accept && fa->initial->tused == 0;
1867 } else if (basic == FA_EPSILON) {
1868 return fa->initial->accept && fa->initial->tused == 0;
1869 } else if (basic == FA_TOTAL) {
1870 if (! fa->initial->accept)
1873 if (fa->initial->tused != 2)
1875 struct trans *t1 = fa->initial->trans;
1876 struct trans *t2 = fa->initial->trans + 1;
1877 if (t1->to != fa->initial || t2->to != fa->initial)
1879 if (t2->max != UCHAR_MAX) {
1881 t2 = fa->initial->trans;
1883 return (t1->min == UCHAR_MIN && t1->max == 'A' - 1 &&
1884 t2->min == 'Z' + 1 && t2->max == UCHAR_MAX);
1886 struct trans *t = fa->initial->trans;
1887 return fa->initial->tused == 1 &&
1888 t->to == fa->initial &&
1889 t->min == UCHAR_MIN && t->max == UCHAR_MAX;
1895 static struct fa *fa_clone(struct fa *fa) {
1896 struct fa *result = NULL;
1897 struct state_set *set = state_set_init(-1, S_DATA|S_SORTED);
1900 if (fa == NULL || set == NULL || ALLOC(result) < 0)
1903 result->deterministic = fa->deterministic;
1904 result->minimal = fa->minimal;
1905 result->nocase = fa->nocase;
1906 list_for_each(s, fa->initial) {
1907 int i = state_set_push(set, s);
1910 struct state *q = add_state(result, s->accept);
1915 q->reachable = s->reachable;
1917 for (int i=0; i < set->used; i++) {
1918 struct state *s = set->states[i];
1919 struct state *sc = set->data[i];
1920 for_each_trans(t, s) {
1921 int to = state_set_index(set, t->to);
1923 struct state *toc = set->data[to];
1924 r = add_new_trans(sc, toc, t->min, t->max);
1929 state_set_free(set);
1932 state_set_free(set);
1937 static int case_expand(struct fa *fa);
1939 /* Compute FA1|FA2 and set FA1 to that automaton. FA2 is freed */
1940 ATTRIBUTE_RETURN_CHECK
1941 static int union_in_place(struct fa *fa1, struct fa **fa2) {
1945 if (fa1->nocase != (*fa2)->nocase) {
1946 if (case_expand(fa1) < 0)
1948 if (case_expand(*fa2) < 0)
1952 s = add_state(fa1, 0);
1955 r = add_epsilon_trans(s, fa1->initial);
1958 r = add_epsilon_trans(s, (*fa2)->initial);
1962 fa1->deterministic = 0;
1966 set_initial(fa1, s);
1971 struct fa *fa_union(struct fa *fa1, struct fa *fa2) {
1972 fa1 = fa_clone(fa1);
1973 fa2 = fa_clone(fa2);
1974 if (fa1 == NULL || fa2 == NULL)
1977 F(union_in_place(fa1, &fa2));
1986 /* Concat FA2 onto FA1; frees FA2 and changes FA1 to FA1.FA2 */
1987 ATTRIBUTE_RETURN_CHECK
1988 static int concat_in_place(struct fa *fa1, struct fa **fa2) {
1991 if (fa1->nocase != (*fa2)->nocase) {
1992 if (case_expand(fa1) < 0)
1994 if (case_expand(*fa2) < 0)
1998 list_for_each(s, fa1->initial) {
2001 r = add_epsilon_trans(s, (*fa2)->initial);
2007 fa1->deterministic = 0;
2014 struct fa *fa_concat(struct fa *fa1, struct fa *fa2) {
2015 fa1 = fa_clone(fa1);
2016 fa2 = fa_clone(fa2);
2018 if (fa1 == NULL || fa2 == NULL)
2021 F(concat_in_place(fa1, &fa2));
2033 static struct fa *fa_make_char_set(bitset *cset, int negate) {
2034 struct fa *fa = fa_make_empty();
2038 struct state *s = fa->initial;
2039 struct state *t = add_state(fa, 1);
2046 while (from <= UCHAR_MAX) {
2047 while (from <= UCHAR_MAX && bitset_get(cset, from) == negate)
2049 if (from > UCHAR_MAX)
2052 while (to < UCHAR_MAX && (bitset_get(cset, to + 1) == !negate))
2054 r = add_new_trans(s, t, from, to);
2060 fa->deterministic = 1;
2069 static struct fa *fa_star(struct fa *fa) {
2077 s = add_state(fa, 1);
2081 r = add_epsilon_trans(s, fa->initial);
2086 list_for_each(p, fa->initial->next) {
2088 r = add_epsilon_trans(p, s);
2093 fa->deterministic = 0;
2103 /* Form the automaton (FA){N}; FA is not modified */
2104 static struct fa *repeat(struct fa *fa, int n) {
2106 return fa_make_epsilon();
2107 } else if (n == 1) {
2108 return fa_clone(fa);
2110 struct fa *cfa = fa_clone(fa);
2114 struct fa *tfa = fa_clone(fa);
2119 if (concat_in_place(cfa, &tfa) < 0) {
2130 struct fa *fa_iter(struct fa *fa, int min, int max) {
2136 if (min > max && max != -1) {
2137 return fa_make_empty();
2140 struct fa *sfa = fa_star(fa);
2145 struct fa *cfa = repeat(fa, min);
2150 if (concat_in_place(cfa, &sfa) < 0) {
2157 struct fa *cfa = NULL;
2160 cfa = repeat(fa, min);
2164 struct fa *cfa2 = fa_clone(fa);
2170 struct fa *cfa3 = fa_clone(fa);
2176 list_for_each(s, cfa3->initial) {
2178 r = add_epsilon_trans(s, cfa2->initial);
2187 fa_merge(cfa3, &cfa2);
2191 list_for_each(s, cfa->initial) {
2193 r = add_epsilon_trans(s, cfa2->initial);
2201 fa_merge(cfa, &cfa2);
2202 cfa->deterministic = 0;
2205 if (collect(cfa) < 0) {
2213 static void sort_transition_intervals(struct fa *fa) {
2214 list_for_each(s, fa->initial) {
2215 qsort(s->trans, s->tused, sizeof(*s->trans), trans_intv_cmp);
2219 struct fa *fa_intersect(struct fa *fa1, struct fa *fa2) {
2221 struct fa *fa = NULL;
2222 struct state_set *worklist = NULL;
2223 state_triple_hash *newstates = NULL;
2226 return fa_clone(fa1);
2228 if (fa_is_basic(fa1, FA_EMPTY) || fa_is_basic(fa2, FA_EMPTY))
2229 return fa_make_empty();
2231 if (fa1->nocase != fa2->nocase) {
2232 F(case_expand(fa1));
2233 F(case_expand(fa2));
2236 fa = fa_make_empty();
2237 worklist = state_set_init(-1, S_NONE);
2238 newstates = state_triple_init();
2239 if (fa == NULL || worklist == NULL || newstates == NULL)
2242 sort_transition_intervals(fa1);
2243 sort_transition_intervals(fa2);
2245 F(state_set_push(worklist, fa1->initial));
2246 F(state_set_push(worklist, fa2->initial));
2247 F(state_set_push(worklist, fa->initial));
2248 F(state_triple_push(newstates,
2249 fa1->initial, fa2->initial, fa->initial));
2250 while (worklist->used) {
2251 struct state *s = state_set_pop(worklist);
2252 struct state *p2 = state_set_pop(worklist);
2253 struct state *p1 = state_set_pop(worklist);
2254 s->accept = p1->accept && p2->accept;
2256 struct trans *t1 = p1->trans;
2257 struct trans *t2 = p2->trans;
2258 for (int n1 = 0, b2 = 0; n1 < p1->tused; n1++) {
2259 while (b2 < p2->tused && t2[b2].max < t1[n1].min)
2262 n2 < p2->tused && t1[n1].max >= t2[n2].min;
2264 if (t2[n2].max >= t1[n1].min) {
2265 struct state *r = state_triple_thd(newstates,
2266 t1[n1].to, t2[n2].to);
2268 r = add_state(fa, 0);
2270 F(state_set_push(worklist, t1[n1].to));
2271 F(state_set_push(worklist, t2[n2].to));
2272 F(state_set_push(worklist, r));
2273 F(state_triple_push(newstates,
2274 t1[n1].to, t2[n2].to, r));
2276 char min = t1[n1].min > t2[n2].min
2277 ? t1[n1].min : t2[n2].min;
2278 char max = t1[n1].max < t2[n2].max
2279 ? t1[n1].max : t2[n2].max;
2280 ret = add_new_trans(s, r, min, max);
2287 fa->deterministic = fa1->deterministic && fa2->deterministic;
2288 fa->nocase = fa1->nocase && fa2->nocase;
2290 state_set_free(worklist);
2291 state_triple_free(newstates);
2293 if (collect(fa) < 0) {
2306 int fa_contains(struct fa *fa1, struct fa *fa2) {
2308 struct state_set *worklist = NULL; /* List of pairs of states */
2309 struct state_set *visited = NULL; /* List of pairs of states */
2311 if (fa1 == NULL || fa2 == NULL)
2317 F(determinize(fa2, NULL));
2318 sort_transition_intervals(fa1);
2319 sort_transition_intervals(fa2);
2321 F(state_pair_push(&worklist, fa1->initial, fa2->initial));
2322 F(state_pair_push(&visited, fa1->initial, fa2->initial));
2323 while (worklist->used) {
2324 struct state *p1, *p2;
2326 p1 = state_set_pop_data(worklist, &v2);
2329 if (p1->accept && !p2->accept)
2332 struct trans *t1 = p1->trans;
2333 struct trans *t2 = p2->trans;
2334 for(int n1 = 0, b2 = 0; n1 < p1->tused; n1++) {
2335 while (b2 < p2->tused && t2[b2].max < t1[n1].min)
2337 int min1 = t1[n1].min, max1 = t1[n1].max;
2339 n2 < p2->tused && t1[n1].max >= t2[n2].min;
2341 if (t2[n2].min > min1)
2343 if (t2[n2].max < CHAR_MAX)
2344 min1 = t2[n2].max + 1;
2349 if (state_pair_find(visited, t1[n1].to, t2[n2].to) == -1) {
2350 F(state_pair_push(&worklist, t1[n1].to, t2[n2].to));
2351 F(state_pair_push(&visited, t1[n1].to, t2[n2].to));
2361 state_set_free(worklist);
2362 state_set_free(visited);
2369 static int add_crash_trans(struct fa *fa, struct state *s, struct state *crash,
2374 /* Never transition on anything in [A-Z] */
2375 if (min > 'Z' || max < 'A') {
2376 result = add_new_trans(s, crash, min, max);
2377 } else if (min >= 'A' && max <= 'Z') {
2379 } else if (max <= 'Z') {
2381 result = add_new_trans(s, crash, min, 'A' - 1);
2382 } else if (min >= 'A') {
2384 result = add_new_trans(s, crash, 'Z' + 1, max);
2386 /* min < 'A' && max > 'Z' */
2387 result = add_new_trans(s, crash, min, 'A' - 1);
2389 result = add_new_trans(s, crash, 'Z' + 1, max);
2392 result = add_new_trans(s, crash, min, max);
2397 static int totalize(struct fa *fa) {
2399 struct state *crash = add_state(fa, 0);
2402 F(mark_reachable(fa));
2403 sort_transition_intervals(fa);
2405 r = add_crash_trans(fa, crash, crash, UCHAR_MIN, UCHAR_MAX);
2409 list_for_each(s, fa->initial) {
2410 int next = UCHAR_MIN;
2411 int tused = s->tused;
2412 for (int i=0; i < tused; i++) {
2413 uchar min = s->trans[i].min, max = s->trans[i].max;
2415 r = add_crash_trans(fa, s, crash, next, min - 1);
2422 if (next <= UCHAR_MAX) {
2423 r = add_crash_trans(fa, s, crash, next, UCHAR_MAX);
2433 struct fa *fa_complement(struct fa *fa) {
2436 F(determinize(fa, NULL));
2438 list_for_each(s, fa->initial)
2439 s->accept = ! s->accept;
2448 struct fa *fa_minus(struct fa *fa1, struct fa *fa2) {
2449 if (fa1 == NULL || fa2 == NULL)
2452 if (fa_is_basic(fa1, FA_EMPTY) || fa1 == fa2)
2453 return fa_make_empty();
2454 if (fa_is_basic(fa2, FA_EMPTY))
2455 return fa_clone(fa1);
2457 struct fa *cfa2 = fa_complement(fa2);
2461 struct fa *result = fa_intersect(fa1, cfa2);
2466 static int accept_to_accept(struct fa *fa) {
2468 struct state *s = add_state(fa, 0);
2472 F(mark_reachable(fa));
2473 list_for_each(a, fa->initial) {
2474 if (a->accept && a->reachable) {
2475 r = add_epsilon_trans(s, a);
2482 fa->deterministic = fa->minimal = 0;
2488 struct fa *fa_overlap(struct fa *fa1, struct fa *fa2) {
2489 struct fa *fa = NULL, *eps = NULL, *result = NULL;
2490 struct state_set *map = NULL;
2492 if (fa1 == NULL || fa2 == NULL)
2495 fa1 = fa_clone(fa1);
2496 fa2 = fa_clone(fa2);
2497 if (fa1 == NULL || fa2 == NULL)
2500 if (determinize(fa1, NULL) < 0)
2502 if (accept_to_accept(fa1) < 0)
2505 map = fa_reverse(fa2);
2506 state_set_free(map);
2507 if (determinize(fa2, NULL) < 0)
2509 if (accept_to_accept(fa2) < 0)
2511 map = fa_reverse(fa2);
2512 state_set_free(map);
2513 if (determinize(fa2, NULL) < 0)
2516 fa = fa_intersect(fa1, fa2);
2520 eps = fa_make_epsilon();
2524 result = fa_minus(fa, eps);
2536 int fa_equals(struct fa *fa1, struct fa *fa2) {
2537 if (fa1 == NULL || fa2 == NULL)
2540 /* fa_contains(fa1, fa2) && fa_contains(fa2, fa1) with error checking */
2541 int c1 = fa_contains(fa1, fa2);
2546 return fa_contains(fa2, fa1);
2549 static unsigned int chr_score(char c) {
2552 } else if (isalnum(c)) {
2554 } else if (isprint(c)) {
2556 } else if (c == '\0') {
2563 static unsigned int str_score(const struct re_str *str) {
2564 unsigned int score = 0;
2565 for (int i=0; i < str->len; i++) {
2566 score += chr_score(str->rx[i]);
2571 /* See if we get a better string for DST by appending C to SRC. If DST is
2572 * NULL or empty, always use SRC + C
2574 static struct re_str *string_extend(struct re_str *dst,
2575 const struct re_str *src,
2579 || str_score(src) + chr_score(c) < str_score(dst)) {
2580 int slen = src->len;
2582 dst = make_re_str(NULL);
2585 if (REALLOC_N(dst->rx, slen+2) < 0) {
2589 memcpy(dst->rx, src->rx, slen);
2591 dst->rx[slen + 1] = '\0';
2592 dst->len = slen + 1;
2597 static char pick_char(struct trans *t) {
2598 for (int c = t->min; c <= t->max; c++)
2599 if (isalpha(c)) return c;
2600 for (int c = t->min; c <= t->max; c++)
2601 if (isalnum(c)) return c;
2602 for (int c = t->min; c <= t->max; c++)
2603 if (isprint(c)) return c;
2607 /* Generate an example string for FA. Traverse all transitions and record
2608 * at each turn the "best" word found for that state.
2610 int fa_example(struct fa *fa, char **example, size_t *example_len) {
2611 struct re_str *word = NULL;
2612 struct state_set *path = NULL, *worklist = NULL;
2613 struct re_str *str = NULL;
2618 /* Sort to avoid any ambiguity because of reordering of transitions */
2619 sort_transition_intervals(fa);
2621 /* Map from state to string */
2622 path = state_set_init(-1, S_DATA|S_SORTED);
2623 str = make_re_str("");
2624 if (path == NULL || str == NULL)
2626 F(state_set_push_data(path, fa->initial, str));
2629 /* List of states still to visit */
2630 worklist = state_set_init(-1, S_NONE);
2631 if (worklist == NULL)
2633 F(state_set_push(worklist, fa->initial));
2635 while (worklist->used) {
2636 struct state *s = state_set_pop(worklist);
2637 struct re_str *ps = state_set_find_data(path, s);
2638 for_each_trans(t, s) {
2639 char c = pick_char(t);
2640 int toind = state_set_index(path, t->to);
2642 struct re_str *w = string_extend(NULL, ps, c);
2644 F(state_set_push(worklist, t->to));
2645 F(state_set_push_data(path, t->to, w));
2647 path->data[toind] = string_extend(path->data[toind], ps, c);
2652 for (int i=0; i < path->used; i++) {
2653 struct state *p = path->states[i];
2654 struct re_str *ps = path->data[i];
2656 (word == NULL || word->len == 0
2657 || (ps->len > 0 && str_score(word) > str_score(ps)))) {
2664 state_set_free(path);
2665 state_set_free(worklist);
2667 *example_len = word->len;
2668 *example = word->rx;
2673 state_set_free(path);
2674 state_set_free(worklist);
2688 static int fa_enumerate_intl(struct state *s, struct enum_intl *ei, int pos) {
2691 if (ei->bsize <= pos + 1) {
2693 F(REALLOC_N(ei->buf, ei->bsize));
2696 ei->buf[pos] = '\0';
2697 for_each_trans(t, s) {
2701 for (int i=t->min; i <= t->max; i++) {
2703 if (t->to->accept) {
2704 if (ei->nwords >= ei->limit)
2706 ei->words[ei->nwords] = strdup(ei->buf);
2707 E(ei->words[ei->nwords] == NULL);
2710 result = fa_enumerate_intl(t->to, ei, pos+1);
2715 ei->buf[pos] = '\0';
2721 int fa_enumerate(struct fa *fa, int limit, char ***words) {
2722 struct enum_intl ei;
2727 ei.bsize = 8; /* Arbitrary initial size */
2729 F(ALLOC_N(ei.words, limit));
2730 F(ALLOC_N(ei.buf, ei.bsize));
2732 /* We use the visited bit to track which states we already visited
2733 * during the construction of a word to detect loops */
2734 list_for_each(s, fa->initial)
2736 fa->initial->visited = 1;
2737 if (fa->initial->accept) {
2738 if (ei.nwords >= limit)
2740 ei.words[0] = strdup("");
2741 E(ei.words[0] == NULL);
2744 result = fa_enumerate_intl(fa->initial, &ei, 0);
2755 for (int i=0; i < ei.nwords; i++)
2761 /* Expand the automaton FA by replacing every transition s(c) -> p from
2762 * state s to p on character c by two transitions s(X) -> r, r(c) -> p via
2764 * If ADD_MARKER is true, also add for each original state s a new a loop
2765 * s(Y) -> q and q(X) -> s through a new state q.
2767 * The end result is that an automaton accepting "a|ab" is turned into one
2768 * accepting "Xa|XaXb" if add_marker is false and "(YX)*Xa|(YX)*Xa(YX)*Xb"
2769 * when add_marker is true.
2771 * The returned automaton is a copy of FA, FA is not modified.
2773 static struct fa *expand_alphabet(struct fa *fa, int add_marker,
2781 F(mark_reachable(fa));
2782 list_for_each(p, fa->initial) {
2786 struct state *r = add_state(fa, 0);
2789 r->trans = p->trans;
2790 r->tused = p->tused;
2791 r->tsize = p->tsize;
2793 p->tused = p->tsize = 0;
2794 ret = add_new_trans(p, r, X, X);
2798 struct state *q = add_state(fa, 0);
2799 ret = add_new_trans(p, q, Y, Y);
2802 ret = add_new_trans(q, p, X, X);
2813 static bitset *alphabet(struct fa *fa) {
2814 bitset *bs = bitset_init(UCHAR_NUM);
2819 list_for_each(s, fa->initial) {
2820 for (int i=0; i < s->tused; i++) {
2821 for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2828 static bitset *last_chars(struct fa *fa) {
2829 bitset *bs = bitset_init(UCHAR_NUM);
2834 list_for_each(s, fa->initial) {
2835 for (int i=0; i < s->tused; i++) {
2836 if (s->trans[i].to->accept) {
2837 for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2845 static bitset *first_chars(struct fa *fa) {
2846 bitset *bs = bitset_init(UCHAR_NUM);
2847 struct state *s = fa->initial;
2852 for (int i=0; i < s->tused; i++) {
2853 for (uint c = s->trans[i].min; c <= s->trans[i].max; c++)
2859 /* Return 1 if F1 and F2 are known to be unambiguously concatenable
2860 * according to simple heuristics. Return 0 if they need to be checked
2861 * further to decide ambiguity
2862 * Return -1 if an allocation fails
2864 static int is_splittable(struct fa *fa1, struct fa *fa2) {
2865 bitset *alpha1 = NULL;
2866 bitset *alpha2 = NULL;
2867 bitset *last1 = NULL;
2868 bitset *first2 = NULL;
2871 alpha2 = alphabet(fa2);
2872 last1 = last_chars(fa1);
2873 if (alpha2 == NULL || last1 == NULL)
2875 if (bitset_disjoint(last1, alpha2, UCHAR_NUM)) {
2880 alpha1 = alphabet(fa1);
2881 first2 = first_chars(fa2);
2882 if (alpha1 == NULL || first2 == NULL)
2884 if (bitset_disjoint(first2, alpha1, UCHAR_NUM)) {
2890 bitset_free(alpha1);
2891 bitset_free(alpha2);
2893 bitset_free(first2);
2897 /* This algorithm is due to Anders Moeller, and can be found in class
2898 * AutomatonOperations in dk.brics.grammar
2900 int fa_ambig_example(struct fa *fa1, struct fa *fa2,
2901 char **upv, size_t *upv_len,
2902 char **pv, char **v) {
2903 static const char X = '\001';
2904 static const char Y = '\002';
2905 char *result = NULL, *s = NULL;
2906 size_t result_len = 0;
2908 struct fa *mp = NULL, *ms = NULL, *sp = NULL, *ss = NULL, *amb = NULL;
2909 struct fa *a1f = NULL, *a1t = NULL, *a2f = NULL, *a2t = NULL;
2910 struct fa *b1 = NULL, *b2 = NULL;
2919 r = is_splittable(fa1, fa2);
2927 #define MPs Ys Xs "(" Xs "(.|\n))+"
2928 #define MSs Ys Xs "(" Xs "(.|\n))*"
2929 #define SPs "(" Xs "(.|\n))+" Ys Xs
2930 #define SSs "(" Xs "(.|\n))*" Ys Xs
2931 /* These could become static constants */
2932 r = fa_compile( MPs, strlen(MPs), &mp);
2933 if (r != REG_NOERROR)
2935 r = fa_compile( MSs, strlen(MSs), &ms);
2936 if (r != REG_NOERROR)
2938 r = fa_compile( SPs, strlen(SPs), &sp);
2939 if (r != REG_NOERROR)
2941 r = fa_compile( SSs, strlen(SSs), &ss);
2942 if (r != REG_NOERROR)
2951 a1f = expand_alphabet(fa1, 0, X, Y);
2952 a1t = expand_alphabet(fa1, 1, X, Y);
2953 a2f = expand_alphabet(fa2, 0, X, Y);
2954 a2t = expand_alphabet(fa2, 1, X, Y);
2955 if (a1f == NULL || a1t == NULL || a2f == NULL || a2t == NULL)
2958 /* Compute b1 = ((a1f . mp) & a1t) . ms */
2959 if (concat_in_place(a1f, &mp) < 0)
2961 b1 = fa_intersect(a1f, a1t);
2964 if (concat_in_place(b1, &ms) < 0)
2966 if (fa_is_basic(b1, FA_EMPTY)) {
2967 /* We are done - amb which we take an example from below
2968 * will be empty, and there can therefore not be an ambiguity */
2973 /* Compute b2 = ss . ((sp . a2f) & a2t) */
2974 if (concat_in_place(sp, &a2f) < 0)
2976 b2 = fa_intersect(sp, a2t);
2979 if (concat_in_place(ss, &b2) < 0)
2984 /* The automaton we are really interested in */
2985 amb = fa_intersect(b1, b2);
2990 r = fa_example(amb, &s, &s_len);
2996 result_len = (s_len-1)/2 - 1;
2997 F(ALLOC_N(result, result_len + 1));
3000 for (i=0; s[2*i] == X; i++) {
3001 assert((t - result) < result_len);
3008 for ( ;s[2*i] == X; i++) {
3009 assert((t - result) < result_len);
3016 for (; 2*i+1 < s_len; i++) {
3017 assert((t - result) < result_len);
3024 /* Clean up intermediate automata */
3040 *upv_len = result_len;
3049 * Construct an fa from a regular expression
3051 static struct fa *fa_from_re(struct re *re) {
3052 struct fa *result = NULL;
3057 result = fa_from_re(re->exp1);
3060 struct fa *fa2 = fa_from_re(re->exp2);
3063 if (union_in_place(result, &fa2) < 0)
3069 result = fa_from_re(re->exp1);
3072 struct fa *fa2 = fa_from_re(re->exp2);
3075 if (concat_in_place(result, &fa2) < 0)
3080 result = fa_make_char_set(re->cset, re->negate);
3084 struct fa *fa = fa_from_re(re->exp);
3087 result = fa_iter(fa, re->min, re->max);
3092 result = fa_make_epsilon();
3095 result = fa_make_char(re->c);
3107 static void free_re(struct re *re) {
3110 assert(re->ref == 0);
3112 if (re->type == UNION || re->type == CONCAT) {
3115 } else if (re->type == ITER) {
3117 } else if (re->type == CSET) {
3118 bitset_free(re->cset);
3123 int fa_compile(const char *regexp, size_t size, struct fa **fa) {
3124 struct re *re = NULL;
3125 struct re_parse parse;
3130 parse.rend = regexp + size;
3131 parse.error = REG_NOERROR;
3133 re = parse_regexp(&parse);
3137 *fa = fa_from_re(re);
3140 if (*fa == NULL || collect(*fa) < 0)
3141 parse.error = REG_ESPACE;
3145 /* We represent a case-insensitive FA by using only transitions on
3146 * lower-case letters.
3148 int fa_nocase(struct fa *fa) {
3149 if (fa == NULL || fa->nocase)
3153 list_for_each(s, fa->initial) {
3154 int tused = s->tused;
3155 /* For every transition on characters in [A-Z] add a corresponding
3156 * transition on [a-z]; remove any portion covering [A-Z] */
3157 for (int i=0; i < tused; i++) {
3158 struct trans *t = s->trans + i;
3159 int lc_min = t->min < 'A' ? 'a' : tolower(t->min);
3160 int lc_max = t->max > 'Z' ? 'z' : tolower(t->max);
3162 if (t->min > 'Z' || t->max < 'A')
3164 if (t->min >= 'A' && t->max <= 'Z') {
3165 t->min = tolower(t->min);
3166 t->max = tolower(t->max);
3167 } else if (t->max <= 'Z') {
3170 F(add_new_trans(s, t->to, lc_min, lc_max));
3171 } else if (t->min >= 'A') {
3174 F(add_new_trans(s, t->to, lc_min, lc_max));
3176 /* t->min < 'A' && t->max > 'Z' */
3177 F(add_new_trans(s, t->to, 'Z' + 1, t->max));
3178 s->trans[i].max = 'A' - 1;
3179 F(add_new_trans(s, s->trans[i].to, lc_min, lc_max));
3189 int fa_is_nocase(struct fa *fa) {
3193 /* If FA is case-insensitive, turn it into a case-sensitive automaton by
3194 * adding transitions on upper-case letters for each existing transition on
3195 * lower-case letters */
3196 static int case_expand(struct fa *fa) {
3201 list_for_each(s, fa->initial) {
3202 int tused = s->tused;
3203 /* For every transition on characters in [a-z] add a corresponding
3204 * transition on [A-Z] */
3205 for (int i=0; i < tused; i++) {
3206 struct trans *t = s->trans + i;
3207 int lc_min = t->min < 'a' ? 'A' : toupper(t->min);
3208 int lc_max = t->max > 'z' ? 'Z' : toupper(t->max);
3210 if (t->min > 'z' || t->max < 'a')
3212 F(add_new_trans(s, t->to, lc_min, lc_max));
3222 * Regular expression parser
3225 static struct re *make_re(enum re_type type) {
3227 if (make_ref(re) == 0)
3232 static struct re *make_re_rep(struct re *exp, int min, int max) {
3233 struct re *re = make_re(ITER);
3244 static struct re *make_re_binop(enum re_type type, struct re *exp1,
3246 struct re *re = make_re(type);
3257 static struct re *make_re_char(uchar c) {
3258 struct re *re = make_re(CHAR);
3264 static struct re *make_re_char_set(bool negate, bool no_ranges) {
3265 struct re *re = make_re(CSET);
3267 re->negate = negate;
3268 re->no_ranges = no_ranges;
3269 re->cset = bitset_init(UCHAR_NUM);
3270 if (re->cset == NULL)
3276 static bool more(struct re_parse *parse) {
3277 return parse->rx < parse->rend;
3280 static bool match(struct re_parse *parse, char m) {
3283 if (*parse->rx == m) {
3290 static bool peek(struct re_parse *parse, const char *chars) {
3291 return *parse->rx != '\0' && strchr(chars, *parse->rx) != NULL;
3294 static bool next(struct re_parse *parse, char *c) {
3302 static bool parse_char(struct re_parse *parse, int quoted, char *c) {
3305 if (quoted && *parse->rx == '\\') {
3307 return next(parse, c);
3309 return next(parse, c);
3313 static void add_re_char(struct re *re, uchar from, uchar to) {
3314 assert(re->type == CSET);
3315 for (unsigned int c = from; c <= to; c++)
3316 bitset_set(re->cset, c);
3319 static void parse_char_class(struct re_parse *parse, struct re *re) {
3320 if (! more(parse)) {
3321 parse->error = REG_EBRACK;
3325 parse_char(parse, 0, &from);
3327 if (match(parse, '-')) {
3328 if (! more(parse)) {
3329 parse->error = REG_EBRACK;
3332 if (peek(parse, "]")) {
3334 parse->error = REG_ERANGE;
3337 add_re_char(re, from, to);
3338 add_re_char(re, '-', '-');
3340 } else if (!parse_char(parse, 0, &to)) {
3341 parse->error = REG_ERANGE;
3346 parse->error = REG_ERANGE;
3349 add_re_char(re, from, to);
3354 static struct re *parse_simple_exp(struct re_parse *parse) {
3355 struct re *re = NULL;
3357 if (match(parse, '[')) {
3358 bool negate = match(parse, '^');
3359 re = make_re_char_set(negate, parse->no_ranges);
3361 parse->error = REG_ESPACE;
3364 parse_char_class(parse, re);
3365 if (parse->error != REG_NOERROR)
3367 while (more(parse) && ! peek(parse, "]")) {
3368 parse_char_class(parse, re);
3369 if (parse->error != REG_NOERROR)
3372 if (! match(parse, ']')) {
3373 parse->error = REG_EBRACK;
3376 } else if (match(parse, '(')) {
3377 if (match(parse, ')')) {
3378 re = make_re(EPSILON);
3380 parse->error = REG_ESPACE;
3384 re = parse_regexp(parse);
3387 if (! match(parse, ')')) {
3388 parse->error = REG_EPAREN;
3392 } else if (match(parse, '.')) {
3393 re = make_re_char_set(1, parse->no_ranges);
3395 parse->error = REG_ESPACE;
3398 add_re_char(re, '\n', '\n');
3399 } else if (more(parse)) {
3401 if (!parse_char(parse, 1, &c)) {
3402 parse->error = REG_EESCAPE;
3405 re = make_re_char(c);
3407 parse->error = REG_ESPACE;
3411 re = make_re(EPSILON);
3413 parse->error = REG_ESPACE;
3423 static int parse_int(struct re_parse *parse) {
3429 /* We need to be careful that strtoul will never access
3430 * memory beyond parse->rend
3432 for (lim = parse->rx; lim < parse->rend && *lim >= '0' && *lim <= '9';
3434 if (lim < parse->rend) {
3435 l = strtoul(parse->rx, &end, 10);
3436 used = end - parse->rx;
3438 char *s = strndup(parse->rx, parse->rend - parse->rx);
3440 parse->error = REG_ESPACE;
3443 l = strtoul(s, &end, 10);
3451 if ((l<0) || (l > INT_MAX)) {
3452 parse->error = REG_BADBR;
3458 static struct re *parse_repeated_exp(struct re_parse *parse) {
3459 struct re *re = parse_simple_exp(parse);
3462 if (match(parse, '?')) {
3463 re = make_re_rep(re, 0, 1);
3464 } else if (match(parse, '*')) {
3465 re = make_re_rep(re, 0, -1);
3466 } else if (match(parse, '+')) {
3467 re = make_re_rep(re, 1, -1);
3468 } else if (match(parse, '{')) {
3470 min = parse_int(parse);
3473 if (match(parse, ',')) {
3474 max = parse_int(parse);
3476 max = -1; /* If it's not an int, it means 'unbounded' */
3477 if (! match(parse, '}')) {
3478 parse->error = REG_EBRACE;
3481 } else if (match(parse, '}')) {
3484 parse->error = REG_EBRACE;
3487 if (min > max && max != -1) {
3488 parse->error = REG_BADBR;
3491 re = make_re_rep(re, min, max);
3494 parse->error = REG_ESPACE;
3501 static struct re *parse_concat_exp(struct re_parse *parse) {
3502 struct re *re = parse_repeated_exp(parse);
3506 if (more(parse) && ! peek(parse, ")|")) {
3507 struct re *re2 = parse_concat_exp(parse);
3510 re = make_re_binop(CONCAT, re, re2);
3512 parse->error = REG_ESPACE;
3523 static struct re *parse_regexp(struct re_parse *parse) {
3524 struct re *re = NULL;
3526 /* Something like (|r) */
3527 if (peek(parse, "|"))
3528 re = make_re(EPSILON);
3530 re = parse_concat_exp(parse);
3534 if (match(parse, '|')) {
3535 struct re *re2 = NULL;
3536 /* Something like (r|) */
3537 if (peek(parse, ")"))
3538 re2 = make_re(EPSILON);
3540 re2 = parse_regexp(parse);
3544 re = make_re_binop(UNION, re, re2);
3546 parse->error = REG_ESPACE;
3558 * Convert a STRUCT RE to a string. Code is complicated by the fact that
3559 * we try to be clever and avoid unneeded parens and concatenation with
3562 static int re_as_string(const struct re *re, struct re_str *str);
3564 static int re_binop_count(enum re_type type, const struct re *re) {
3565 assert(type == CONCAT || type == UNION);
3566 if (re->type == type) {
3567 return re_binop_count(type, re->exp1) + re_binop_count(type, re->exp2);
3573 static int re_binop_store(enum re_type type, const struct re *re,
3574 const struct re **list) {
3576 if (type == re->type) {
3577 pos = re_binop_store(type, re->exp1, list);
3578 pos += re_binop_store(type, re->exp2, list + pos);
3586 static int re_union_as_string(const struct re *re, struct re_str *str) {
3587 assert(re->type == UNION);
3590 const struct re **res = NULL;
3591 struct re_str *strings = NULL;
3594 nre = re_binop_count(re->type, re);
3595 r = ALLOC_N(res, nre);
3599 re_binop_store(re->type, re, res);
3601 r = ALLOC_N(strings, nre);
3606 for (int i=0; i < nre; i++) {
3607 if (re_as_string(res[i], strings + i) < 0)
3609 str->len += strings[i].len;
3613 r = re_str_alloc(str);
3618 for (int i=0; i < nre; i++) {
3621 memcpy(p, strings[i].rx, strings[i].len);
3622 p += strings[i].len;
3628 if (strings != NULL) {
3629 for (int i=0; i < nre; i++)
3630 release_re_str(strings + i);
3635 release_re_str(str);
3641 static int re_needs_parens_in_concat(const struct re *re) {
3642 return (re->type != CHAR && re->type != CSET && re->type != ITER);
3645 static int re_concat_as_string(const struct re *re, struct re_str *str) {
3646 assert(re->type == CONCAT);
3648 const struct re **res = NULL;
3649 struct re_str *strings = NULL;
3653 nre = re_binop_count(re->type, re);
3654 r = ALLOC_N(res, nre);
3657 re_binop_store(re->type, re, res);
3659 r = ALLOC_N(strings, nre);
3664 for (int i=0; i < nre; i++) {
3665 if (res[i]->type == EPSILON)
3667 if (re_as_string(res[i], strings + i) < 0)
3669 str->len += strings[i].len;
3670 if (re_needs_parens_in_concat(res[i]))
3674 r = re_str_alloc(str);
3679 for (int i=0; i < nre; i++) {
3680 if (res[i]->type == EPSILON)
3682 if (re_needs_parens_in_concat(res[i]))
3684 p = memcpy(p, strings[i].rx, strings[i].len);
3685 p += strings[i].len;
3686 if (re_needs_parens_in_concat(res[i]))
3693 if (strings != NULL) {
3694 for (int i=0; i < nre; i++)
3695 release_re_str(strings + i);
3700 release_re_str(str);
3705 static bool cset_contains(const struct re *cset, int c) {
3706 return bitset_get(cset->cset, c) != cset->negate;
3709 static int re_cset_as_string(const struct re *re, struct re_str *str) {
3710 const uchar rbrack = ']';
3711 const uchar dash = '-';
3712 const uchar nul = '\0';
3714 static const char *const empty_set = "[]";
3715 static const char *const total_set = "(.|\n)";
3716 static const char *const not_newline = ".";
3719 int from, to, negate;
3721 int incl_rbrack, incl_dash;
3724 str->len = strlen(empty_set);
3726 /* We can not include NUL explicitly in a CSET since we use ordinary
3727 NUL delimited strings to represent them. That means that we need to
3728 use negated representation if NUL is to be included (and vice versa)
3730 negate = cset_contains(re, nul);
3732 for (from = UCHAR_MIN;
3733 from <= UCHAR_MAX && cset_contains(re, from);
3735 if (from > UCHAR_MAX) {
3736 /* Special case: the set matches every character */
3737 str->rx = strdup(total_set);
3742 from <= UCHAR_MAX && cset_contains(re, from);
3744 if (from > UCHAR_MAX) {
3745 /* Special case: the set matches everything but '\n' */
3746 str->rx = strdup(not_newline);
3752 /* See if ']' and '-' will be explicitly included in the character set
3753 (INCL_RBRACK, INCL_DASH) As we loop over the character set, we reset
3754 these flags if they are in the set, but not mentioned explicitly
3756 incl_rbrack = cset_contains(re, rbrack) != negate;
3757 incl_dash = cset_contains(re, dash) != negate;
3759 if (re->no_ranges) {
3760 for (from = UCHAR_MIN; from <= UCHAR_MAX; from++)
3761 if (cset_contains(re, from) != negate)
3764 for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
3765 while (from <= UCHAR_MAX && cset_contains(re, from) == negate)
3767 if (from > UCHAR_MAX)
3770 to < UCHAR_MAX && (cset_contains(re, to+1) != negate);
3773 if (to == from && (from == rbrack || from == dash))
3775 if (from == rbrack || from == dash)
3777 if (to == rbrack || to == dash)
3780 len = (to == from) ? 1 : ((to == from + 1) ? 2 : 3);
3782 if (from < rbrack && rbrack < to)
3784 if (from < dash && dash < to)
3788 str->len += incl_rbrack + incl_dash;
3791 str->len += 1; /* For the ^ */
3793 r = re_str_alloc(str);
3804 if (re->no_ranges) {
3805 for (from = UCHAR_MIN; from <= UCHAR_MAX; from++) {
3806 if (from == rbrack || from == dash)
3808 if (cset_contains(re, from) != negate)
3812 for (from = UCHAR_MIN; from <= UCHAR_MAX; from = to+1) {
3813 while (from <= UCHAR_MAX && cset_contains(re, from) == negate)
3815 if (from > UCHAR_MAX)
3818 to < UCHAR_MAX && (cset_contains(re, to+1) != negate);
3821 if (to == from && (from == rbrack || from == dash))
3823 if (from == rbrack || from == dash)
3825 if (to == rbrack || to == dash)
3830 } else if (to == from + 1) {
3845 if (str->rx == NULL)
3847 str->len = strlen(str->rx);
3850 release_re_str(str);
3854 static int re_iter_as_string(const struct re *re, struct re_str *str) {
3855 const char *quant = NULL;
3859 if (re_as_string(re->exp, str) < 0)
3862 if (re->min == 0 && re->max == -1) {
3864 } else if (re->min == 1 && re->max == -1) {
3866 } else if (re->min == 0 && re->max == 1) {
3868 } else if (re->max == -1) {
3869 r = asprintf(&iter, "{%d,}", re->min);
3874 r = asprintf(&iter, "{%d,%d}", re->min, re->max);
3880 if (re->exp->type == CHAR || re->exp->type == CSET) {
3881 if (REALLOC_N(str->rx, str->len + strlen(quant) + 1) < 0)
3883 strcpy(str->rx + str->len, quant);
3884 str->len += strlen(quant);
3886 /* Format '(' + str->rx ')' + quant */
3887 if (REALLOC_N(str->rx, str->len + strlen(quant) + 1 + 2) < 0)
3889 memmove(str->rx + 1, str->rx, str->len);
3891 str->rx[str->len + 1] = ')';
3893 strcpy(str->rx + str->len, quant);
3894 str->len += strlen(quant);
3902 release_re_str(str);
3906 static int re_as_string(const struct re *re, struct re_str *str) {
3907 /* Characters that must be escaped */
3908 static const char * const special_chars = ".()[]{}*|+?\\^$";
3913 result = re_union_as_string(re, str);
3916 result = re_concat_as_string(re, str);
3919 result = re_cset_as_string(re, str);
3922 if (re->c == '\0' || strchr(special_chars, re->c) == NULL) {
3923 if (ALLOC_N(str->rx, 2) < 0)
3928 if (ALLOC_N(str->rx, 3) < 0)
3932 str->len = strlen(str->rx);
3936 result = re_iter_as_string(re, str);
3939 if (ALLOC_N(str->rx, 3) < 0)
3941 strcpy(str->rx, "()");
3942 str->len = strlen(str->rx);
3951 release_re_str(str);
3955 static int convert_trans_to_re(struct state *s) {
3956 struct re *re = NULL;
3958 struct trans *trans = NULL;
3964 qsort(s->trans, s->tused, sizeof(*s->trans), trans_to_cmp);
3965 for (int i = 0; i < s->tused - 1; i++) {
3966 if (s->trans[i].to != s->trans[i+1].to)
3969 r = ALLOC_N(trans, nto);
3973 struct state *to = s->trans[0].to;
3975 for_each_trans(t, s) {
3977 trans[tind].to = to;
3978 trans[tind].re = re;
3984 re = make_re_char_set(0, 0);
3988 add_re_char(re, t->min, t->max);
3990 assert(nto == tind + 1);
3991 trans[tind].to = to;
3992 trans[tind].re = re;
3994 /* Simplify CSETs with a single char to a CHAR */
3995 for (int t=0; t < nto; t++) {
3997 uchar chr = UCHAR_MIN;
3998 for (int c = 0; c < UCHAR_NUM; c++) {
3999 if (bitset_get(trans[t].re->cset, c)) {
4005 re_unref(trans[t].re);
4006 trans[t].re = make_re_char(chr);
4007 if (trans[t].re == NULL)
4013 s->tused = s->tsize = nto;
4018 for (int i=0; i < nto; i++)
4019 unref(trans[i].re, re);
4024 ATTRIBUTE_RETURN_CHECK
4025 static int add_new_re_trans(struct state *s1, struct state *s2,
4028 r = add_new_trans(s1, s2, 0, 0);
4031 last_trans(s1)->re = re;
4035 /* Add the regular expression R1 . LOOP* . R2 to the transition
4037 static int re_collapse_trans(struct state *s1, struct state *s2,
4038 struct re *r1, struct re *loop, struct re *r2) {
4039 struct re *re = NULL;
4041 if (loop->type != EPSILON) {
4042 loop = make_re_rep(ref(loop), 0, -1);
4047 if (r1->type == EPSILON) {
4048 if (loop->type == EPSILON) {
4051 re = make_re_binop(CONCAT, loop, ref(r2));
4054 if (loop->type == EPSILON) {
4055 if (r2->type == EPSILON) {
4058 re = make_re_binop(CONCAT, ref(r1), ref(r2));
4061 re = make_re_binop(CONCAT, ref(r1), loop);
4062 if (re != NULL && r2->type != EPSILON) {
4063 re = make_re_binop(CONCAT, re, ref(r2));
4070 struct trans *t = NULL;
4071 for (t = s1->trans; t <= last_trans(s1) && t->to != s2; t += 1);
4072 if (t > last_trans(s1)) {
4073 if (add_new_re_trans(s1, s2, re) < 0)
4076 if (t->re == NULL) {
4079 t->re = make_re_binop(UNION, re, t->re);
4086 // FIXME: make sure we don't leak loop
4090 static int convert_strings(struct fa *fa) {
4091 struct state_set *worklist = state_set_init(-1, S_NONE);
4094 E(worklist == NULL);
4096 /* Abuse hash to count indegree, reachable to mark visited states */
4097 list_for_each(s, fa->initial) {
4102 list_for_each(s, fa->initial) {
4103 for_each_trans(t, s) {
4108 for (struct state *s = fa->initial;
4110 s = state_set_pop(worklist)) {
4111 for (int i=0; i < s->tused; i++) {
4112 struct trans *t = s->trans + i;
4113 struct state *to = t->to;
4114 while (to->hash == 1 && to->tused == 1 && ! to->accept) {
4115 if (t->re == NULL) {
4116 t->re = to->trans->re;
4117 to->trans->re = NULL;
4119 t->re = make_re_binop(CONCAT, t->re, to->trans->re);
4123 t->to = to->trans->to;
4127 for (int j=0; j < s->tused; j++) {
4128 if (j != i && s->trans[j].to == to) {
4129 /* Combine transitions i and j; remove trans j */
4130 t->re = make_re_binop(UNION, t->re, s->trans[j].re);
4133 memmove(s->trans + j, s->trans + j + 1,
4134 sizeof(s->trans[j]) * (s->tused - j - 1));
4145 if (! to->reachable) {
4147 F(state_set_push(worklist, to));
4152 for (struct state *s = fa->initial; s->next != NULL; ) {
4153 if (s->next->hash == 0 && s->next->tused == 0) {
4154 struct state *del = s->next;
4155 s->next = del->next;
4165 state_set_free(worklist);
4169 /* Convert an FA to a regular expression.
4170 * The strategy is the following:
4171 * (1) For all states S1 and S2, convert the transitions between them
4172 * into one transition whose regexp is a CSET
4173 * (2) Add a new initial state INI and a new final state FIN to the automaton,
4174 * a transition from INI to the old initial state of FA, and a transition
4175 * from all accepting states of FA to FIN; the regexp on those transitions
4176 * matches only the empty word
4177 * (3) Eliminate states S (except for INI and FIN) one by one:
4178 * Let LOOP the regexp for the transition S -> S if it exists, epsilon
4180 * For all S1, S2 different from S with S1 -> S -> S2
4181 * Let R1 the regexp of S1 -> S
4182 * R2 the regexp of S -> S2
4183 * R3 the regexp S1 -> S2 (or epsilon if no such transition)
4184 * set the regexp on the transition S1 -> S2 to
4185 * R1 . (LOOP)* . R2 | R3
4186 * (4) The regexp for the whole FA can now be found as the regexp of
4187 * the transition INI -> FIN
4188 * (5) Convert that STRUCT RE to a string with RE_AS_STRING
4190 int fa_as_regexp(struct fa *fa, char **regexp, size_t *regexp_len) {
4192 struct state *fin = NULL, *ini = NULL;
4193 struct re *eps = NULL;
4201 eps = make_re(EPSILON);
4205 fin = add_state(fa,1);
4211 list_for_each(s, fa->initial) {
4212 r = convert_trans_to_re(s);
4215 if (s->accept && s != fin) {
4216 r = add_new_re_trans(s, fin, ref(eps));
4223 ini = add_state(fa, 0);
4227 r = add_new_re_trans(ini, fa->initial, ref(eps));
4230 set_initial(fa, ini);
4232 convert_strings(fa);
4234 list_for_each(s, fa->initial->next) {
4238 struct re *loop = eps;
4239 for_each_trans(t, s) {
4243 list_for_each(s1, fa->initial) {
4246 for (int t1 = 0; t1 < s1->tused; t1++) {
4247 if (s1->trans[t1].to == s) {
4248 for (int t = 0; t < s->tused; t++) {
4249 if (s->trans[t].to == s)
4251 r = re_collapse_trans(s1, s->trans[t].to,
4265 for_each_trans(t, fa->initial) {
4269 if (re_as_string(t->re, &str) < 0)
4272 *regexp_len = str.len;
4276 list_for_each(s, fa->initial) {
4277 for_each_trans(t, s) {
4290 static int re_restrict_alphabet(struct re *re, uchar from, uchar to) {
4297 r1 = re_restrict_alphabet(re->exp1, from, to);
4298 r2 = re_restrict_alphabet(re->exp2, from, to);
4299 result = (r1 != 0) ? r1 : r2;
4304 bitset_negate(re->cset, UCHAR_NUM);
4306 for (int i=from; i <= to; i++)
4307 bitset_clr(re->cset, i);
4310 if (from <= re->c && re->c <= to)
4314 result = re_restrict_alphabet(re->exp, from, to);
4326 int fa_restrict_alphabet(const char *regexp, size_t regexp_len,
4327 char **newregexp, size_t *newregexp_len,
4328 char from, char to) {
4330 struct re *re = NULL;
4331 struct re_parse parse;
4337 parse.rend = regexp + regexp_len;
4338 parse.error = REG_NOERROR;
4339 re = parse_regexp(&parse);
4340 if (parse.error != REG_NOERROR)
4343 result = re_restrict_alphabet(re, from, to);
4350 result = re_as_string(re, &str);
4351 *newregexp = str.rx;
4352 *newregexp_len = str.len;
4358 int fa_expand_char_ranges(const char *regexp, size_t regexp_len,
4359 char **newregexp, size_t *newregexp_len) {
4361 struct re *re = NULL;
4362 struct re_parse parse;
4368 parse.rend = regexp + regexp_len;
4369 parse.error = REG_NOERROR;
4370 parse.no_ranges = 1;
4371 re = parse_regexp(&parse);
4372 if (parse.error != REG_NOERROR)
4376 result = re_as_string(re, &str);
4377 *newregexp = str.rx;
4378 *newregexp_len = str.len;
4383 /* Expand regexp so that it is case-insensitive in a case-sensitive match.
4385 * Return 1 when a change was made, -1 when an allocation failed, and 0
4386 * when no change was made.
4388 static int re_case_expand(struct re *re) {
4389 int result = 0, r1, r2;
4394 r1 = re_case_expand(re->exp1);
4395 r2 = re_case_expand(re->exp2);
4396 result = (r1 != 0) ? r1 : r2;
4399 for (int c = 'A'; c <= 'Z'; c++)
4400 if (bitset_get(re->cset, c)) {
4402 bitset_set(re->cset, tolower(c));
4404 for (int c = 'a'; c <= 'z'; c++)
4405 if (bitset_get(re->cset, c)) {
4407 bitset_set(re->cset, toupper(c));
4411 if (isalpha(re->c)) {
4416 re->cset = bitset_init(UCHAR_NUM);
4417 if (re->cset == NULL)
4419 bitset_set(re->cset, tolower(c));
4420 bitset_set(re->cset, toupper(c));
4425 result = re_case_expand(re->exp);
4437 int fa_expand_nocase(const char *regexp, size_t regexp_len,
4438 char **newregexp, size_t *newregexp_len) {
4440 struct re *re = NULL;
4441 struct re_parse parse;
4447 parse.rend = regexp + regexp_len;
4448 parse.error = REG_NOERROR;
4449 re = parse_regexp(&parse);
4450 if (parse.error != REG_NOERROR)
4453 r = re_case_expand(re);
4461 result = re_as_string(re, &str);
4462 *newregexp = str.rx;
4463 *newregexp_len = str.len;
4465 *newregexp = strndup(regexp, regexp_len);
4466 *newregexp_len = regexp_len;
4467 result = (*newregexp == NULL) ? REG_ESPACE : REG_NOERROR;
4473 static void print_char(FILE *out, uchar c) {
4474 /* We escape '/' as '\\/' since dot chokes on bare slashes in labels;
4475 Also, a space ' ' is shown as '\s' */
4476 static const char *const escape_from = " \n\t\v\b\r\f\a/\0";
4477 static const char *const escape_to = "sntvbrfa/0";
4478 char *p = strchr(escape_from, c);
4480 int i = p - escape_from;
4481 fprintf(out, "\\\\%c", escape_to[i]);
4482 } else if (! isprint(c)) {
4483 fprintf(out, "\\\\0%03o", (unsigned char) c);
4484 } else if (c == '"') {
4485 fprintf(out, "\\\"");
4491 void fa_dot(FILE *out, struct fa *fa) {
4492 fprintf(out, "digraph {\n rankdir=LR;");
4493 list_for_each(s, fa->initial) {
4495 fprintf(out, "\"%p\" [shape=doublecircle];\n", s);
4497 fprintf(out, "\"%p\" [shape=circle];\n", s);
4500 fprintf(out, "%s -> \"%p\";\n", fa->deterministic ? "dfa" : "nfa",
4505 list_for_each(s, fa->initial) {
4506 for_each_trans(t, s) {
4507 fprintf(out, "\"%p\" -> \"%p\" [ label = \"", s, t->to);
4509 re_as_string(t->re, &str);
4510 for (int i=0; i < str.len; i++) {
4511 print_char(out, str.rx[i]);
4513 release_re_str(&str);
4515 print_char(out, t->min);
4516 if (t->min != t->max) {
4518 print_char(out, t->max);
4521 fprintf(out, "\" ];\n");
4524 fprintf(out, "}\n");
4527 int fa_json(FILE *out, struct fa *fa) {
4528 hash_val_t *list_hashes = NULL;
4529 int list_size = 100;
4535 fprintf(out,"{\n\t\"final\": [");
4537 F(ALLOC_N(list_hashes, list_size));
4539 list_for_each(s, fa->initial) {
4540 if (num_states == list_size - 1){
4541 list_size += list_size;
4542 F(REALLOC_N(list_hashes, list_size));
4545 list_hashes[num_states] = s->hash;
4546 // We use the hashes to map states to Z_{num_states}
4547 s->hash = num_states++;
4550 fprintf(out,"%ld", s->hash);
4553 fprintf(out, ", %ld", s->hash);
4558 fprintf(out, "],\n\t\"deterministic\": %d,\n\t\"transitions\": [\n",
4559 fa->deterministic ? 1 : 0);
4562 list_for_each(s, fa->initial) {
4563 for_each_trans(t, s) {
4565 fprintf(out, ",\n");
4567 fprintf(out, "\t\t{ \"from\": %ld, \"to\": %ld, \"on\": \"",
4568 s->hash, t->to->hash);
4569 print_char(out, t->min);
4570 if (t->min != t->max) {
4572 print_char(out, t->max);
4574 fprintf(out, "\" }");
4578 fprintf(out,"\n\t]\n}");
4582 // Restoring hash values to leave the FA structure untouched. That is
4583 // only needed if we actually copied hashes, indicated by num_states
4585 if (num_states > 0) {
4587 list_for_each(s, fa->initial) {
4588 s->hash = list_hashes[it++];
4595 bool fa_is_deterministic(struct fa *fa) {
4596 return fa->deterministic;
4599 struct state *fa_state_initial(struct fa *fa) {
4603 bool fa_state_is_accepting(struct state *st) {
4607 struct state* fa_state_next(struct state *st) {
4611 size_t fa_state_num_trans(struct state *st) {
4615 int fa_state_trans(struct state *st, size_t i,
4616 struct state **to, unsigned char *min, unsigned char *max) {
4620 (*to) = st->trans[i].to;
4621 (*min) = st->trans[i].min;
4622 (*max) = st->trans[i].max;
4628 * indent-tabs-mode: nil