2 * fa.h: finite automata
4 * Copyright (C) 2007-2015 David Lutterkort
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * Author: David Lutterkort <dlutter@redhat.com>
29 /* The type for a finite automaton. */
32 /* Denote some basic automata, used by fa_is_basic and fa_make_basic */
34 FA_EMPTY, /* Accepts the empty language, i.e. no strings */
35 FA_EPSILON, /* Accepts only the empty word */
36 FA_TOTAL /* Accepts all words */
39 /* Choice of minimization algorithm to use; either Hopcroft's O(n log(n))
40 * algorithm or Brzozowski's reverse-determinize-reverse-determinize
41 * algorithm. While the latter has exponential complexity in theory, it
42 * works quite well for some cases.
44 enum fa_minimization_algorithms {
49 /* Which minimization algorithm to use in FA_MINIMIZE. The library
50 * minimizes internally at certain points, too.
52 * Defaults to FA_MIN_HOPCROFT
54 extern int fa_minimization_algorithm;
56 /* Unless otherwise mentioned, automata passed into routines are never
57 * modified. It is the responsibility of the caller to free automata
58 * returned by any of these routines when they are no longer needed.
62 * Compile the regular expression RE of length SIZE into an automaton. The
63 * return value is the same as the return value for the POSIX function
64 * regcomp. The syntax for regular expressions is extended POSIX syntax,
65 * with the difference that '.' does not match newlines.
67 * On success, FA points to the newly allocated automaton constructed for
68 * RE, and the function returns REG_NOERROR. Otherwise, FA is NULL, and the
69 * return value indicates the error.
71 * The FA is case sensitive. Call FA_NOCASE to switch it to
74 int fa_compile(const char *re, size_t size, struct fa **fa);
76 /* Make a new automaton that accepts one of the basic languages defined in
79 struct fa *fa_make_basic(unsigned int basic);
81 /* Return 1 if FA accepts the basic language BASIC, which must be one of
82 * the constants from enum FA_BASIC.
84 int fa_is_basic(struct fa *fa, unsigned int basic);
86 /* Minimize FA using the currently-set fa_minimization_algorithm.
87 * As a side effect, the automaton will also be deterministic after being
88 * minimized. Modifies the automaton in place.
90 int fa_minimize(struct fa *fa);
92 /* Return a finite automaton that accepts the concatenation of the
93 * languages for FA1 and FA2, i.e. L(FA1).L(FA2)
95 struct fa *fa_concat(struct fa *fa1, struct fa *fa2);
97 /* Return a finite automaton that accepts the union of the languages that
98 * FA1 and FA2 accept (the '|' operator in regular expressions).
100 struct fa *fa_union(struct fa *fa1, struct fa *fa2);
102 /* Return a finite automaton that accepts the intersection of the languages
105 struct fa *fa_intersect(struct fa *fa1, struct fa *fa2);
107 /* Return a finite automaton that accepts the complement of the language of
108 * FA, i.e. the set of all words not accepted by FA
110 struct fa *fa_complement(struct fa *fa);
112 /* Return a finite automaton that accepts the set difference of the
113 * languages of FA1 and FA2, i.e. L(FA1)\L(FA2)
115 struct fa *fa_minus(struct fa *fa1, struct fa *fa2);
117 /* Return a finite automaton that accepts a repetition of the language that
118 * FA accepts. If MAX == -1, the returned automaton accepts arbitrarily
119 * long repetitions. MIN must be 0 or bigger, and unless MAX == -1, MIN
120 * must be less or equal to MAX. If MIN is greater than 0, the returned
121 * automaton accepts only words that have at least MIN repetitions of words
124 * The following common regexp repetitios are achieved by the following
125 * calls (using a lose notation equating automata and their languages):
127 * - FA* = FA_ITER(FA, 0, -1)
128 * - FA+ = FA_ITER(FA, 1, -1)
129 * - FA? = FA_ITER(FA, 0, 1)
130 * - FA{n,m} = FA_ITER(FA, n, m) with 0 <= n and m = -1 or n <= m
132 struct fa *fa_iter(struct fa *fa, int min, int max);
134 /* If successful, returns 1 if the language of FA1 is contained in the language
135 * of FA2, 0 otherwise. Returns a negative number if an error occurred.
137 int fa_contains(struct fa *fa1, struct fa *fa2);
139 /* If successful, returns 1 if the language of FA1 equals the language of FA2,
140 * 0 otherwise. Returns a negative number if an error occurred.
142 int fa_equals(struct fa *fa1, struct fa *fa2);
144 /* Free all memory used by FA */
145 void fa_free(struct fa *fa);
147 /* Print FA to OUT as a graphviz dot file */
148 void fa_dot(FILE *out, struct fa *fa);
150 /* Return a finite automaton that accepts the overlap of the languages of
151 * FA1 and FA2. The overlap of two languages is the set of strings that can
152 * be split in more than one way into a left part accepted by FA1 and a
153 * right part accepted by FA2.
155 struct fa *fa_overlap(struct fa *fa1, struct fa *fa2);
157 /* Produce an example for the language of FA. The example is not
158 * necessarily the shortest possible. The implementation works very hard to
159 * have printable characters (preferrably alphanumeric) in the example, and
160 * to avoid just an empty word.
162 * *EXAMPLE will be the example, which may be NULL. If it is non-NULL,
163 * EXAMPLE_LEN will hold the length of the example.
165 * Return 0 on success, and a negative numer on error. On error, *EXAMPLE
168 * If *EXAMPLE is set, it is the caller's responsibility to free the string
171 int fa_example(struct fa *fa, char **example, size_t *example_len);
173 /* Produce an example of an ambiguous word for the concatenation of the
174 * languages of FA1 and FA2. The return value is such a word (which must be
175 * freed by the caller) if it exists. If none exists, NULL is returned.
177 * The returned word is of the form UPV and PV and V are set to the first
178 * character of P and V in the returned word. The word UPV has the property
179 * that U and UP are accepted by FA1 and that PV and V are accepted by FA2.
181 * Neither the language of FA1 or of FA2 may contain words with the
182 * characters '\001' and '\002', as they are used during construction of
183 * the ambiguous word.
185 * UPV_LEN will be set to the length of the entire string UPV
187 * Returns 0 on success, and a negative number on failure. On failure, UPV,
188 * PV, and V will be NULL
190 int fa_ambig_example(struct fa *fa1, struct fa *fa2,
191 char **upv, size_t *upv_len,
192 char **pv, char **v);
194 /* Convert the finite automaton FA into a regular expression and set REGEXP
195 * to point to that. When REGEXP is compiled into another automaton, it is
196 * guaranteed that that automaton and FA accept the same language.
198 * The code tries to be semi-clever about keeping the generated regular
199 * expression short; to guarantee reasonably short regexps, the automaton
200 * should be minimized before passing it to this routine.
202 * On success, REGEXP_LEN is set to the length of REGEXP
204 * Return 0 on success, and a negative number on failure. The only reason
205 * for FA_AS_REGEXP to fail is running out of memory.
207 int fa_as_regexp(struct fa *fa, char **regexp, size_t *regexp_len);
209 /* Given the regular expression REGEXP construct a new regular expression
210 * NEWREGEXP that does not match strings containing any of the characters
211 * in the range FROM to TO, with the endpoints included.
213 * The new regular expression is constructed by removing the range FROM to
214 * TO from all character sets in REGEXP; if any of the characters [FROM,
215 * TO] appear outside a character set in REGEXP, return -2.
217 * Return 0 if NEWREGEXP was constructed successfully, -1 if an internal
218 * error happened (e.g., an allocation failed) and -2 if NEWREGEXP can not
219 * be constructed because any character in the range [FROM, TO] appears
220 * outside of a character set.
222 * Return a positive value if REGEXP is not syntactically valid; the value
223 * returned is one of the REG_ERRCODE_T POSIX error codes. Return 0 on
224 * success and -1 if an allocation fails.
226 int fa_restrict_alphabet(const char *regexp, size_t regexp_len,
227 char **newregexp, size_t *newregexp_len,
230 /* Convert REGEXP into one that does not use ranges inside character
233 * Return a positive value if REGEXP is not syntactically valid; the value
234 * returned is one of the REG_ERRCODE_T POSIX error codes. Return 0 on
235 * success and -1 if an allocation fails.
237 int fa_expand_char_ranges(const char *regexp, size_t regexp_len,
238 char **newregexp, size_t *newregexp_len);
240 /* Modify FA so that it matches ignoring case.
242 * Returns 0 on success, and -1 if an allocation fails. On failure, the
243 * automaton is not guaranteed to represent anything sensible.
245 int fa_nocase(struct fa *fa);
247 /* Return 1 if FA matches ignoring case, 0 if matches are case sensitive */
248 int fa_is_nocase(struct fa *fa);
250 /* Assume REGEXP is a case-insensitive regular expression, and convert it
251 * to one that matches the same strings when used case sensitively. All
252 * occurrences of individual letters c in the regular expression will be
253 * replaced by character sets [cC], and lower/upper case characters are
254 * added to character sets as needed.
256 * Return a positive value if REGEXP is not syntactically valid; the value
257 * returned is one of the REG_ERRCODE_T POSIX error codes. Return 0 on
258 * success and -1 if an allocation fails.
260 int fa_expand_nocase(const char *regexp, size_t regexp_len,
261 char **newregexp, size_t *newregexp_len);
263 /* Generate up to LIMIT words from the language of FA, which is assumed to
264 * be finite. The words are returned in WORDS, which is allocated by this
265 * function and must be freed by the caller.
267 * If FA accepts the empty word, the empty string will be included in
270 * Return the number of generated words on success, -1 if we run out of
271 * memory, and -2 if FA has more than LIMIT words.
273 int fa_enumerate(struct fa *fa, int limit, char ***words);
280 * indent-tabs-mode: nil