1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2012 Free Software
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
20 /* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
29 #include <sys/types.h>
40 #define STREQ(a, b) (strcmp (a, b) == 0)
42 /* ISASCIIDIGIT differs from isdigit, as follows:
43 - Its arg may be any int or unsigned int; it need not be an unsigned char.
44 - It's guaranteed to evaluate its argument exactly once.
45 - It's typically faster.
46 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
47 only '0' through '9' are digits. Prefer ISASCIIDIGIT to isdigit unless
48 it's important to use the locale's definition of `digit' even when the
49 host does not conform to Posix. */
50 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
52 /* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */
54 #define _(str) gettext (str)
56 #include "mbsupport.h" /* defines MBS_SUPPORT to 1 or 0, as appropriate */
58 /* We can handle multibyte strings. */
62 #if HAVE_LANGINFO_CODESET
63 # include <langinfo.h>
81 return (c == ' ' || c == '\t');
85 /* HPUX, define those as macros in sys/param.h */
93 /* Number of bits in an unsigned char. */
98 /* First integer value that is greater than any character code. */
99 #define NOTCHAR (1 << CHARBITS)
101 /* INTBITS need not be exact, just a lower bound. */
103 # define INTBITS (CHARBITS * sizeof (int))
106 /* Number of ints required to hold a bit for every character. */
107 #define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS)
109 /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
110 typedef int charclass[CHARCLASS_INTS];
112 /* Convert a possibly-signed character to an unsigned character. This is
113 a bit safer than casting to unsigned char, since it catches some type
114 errors that the cast doesn't. */
115 static inline unsigned char to_uchar (char ch) { return ch; }
117 /* Contexts tell us whether a character is a newline or a word constituent.
118 Word-constituent characters are those that satisfy iswalnum(), plus '_'.
120 A state also stores a context value, which is nonzero if its
121 predecessors always matches a newline or a word constituent.
122 The definition of a state's context is a bit unclear, but will be
123 modified soon anyway. */
127 #define CTX_NEWLINE 4
130 /* Sometimes characters can only be matched depending on the surrounding
131 context. Such context decisions depend on what the previous character
132 was, and the value of the current (lookahead) character. Context
133 dependent constraints are encoded as 8 bit integers. Each bit that
134 is set indicates that the constraint succeeds in the corresponding
137 bit 7 - previous and current are newlines
138 bit 6 - previous was newline, current isn't
139 bit 5 - previous wasn't newline, current is
140 bit 4 - neither previous nor current is a newline
141 bit 3 - previous and current are word-constituents
142 bit 2 - previous was word-constituent, current isn't
143 bit 1 - previous wasn't word-constituent, current is
144 bit 0 - neither previous nor current is word-constituent
146 The macro SUCCEEDS_IN_CONTEXT determines whether a given constraint
147 succeeds in a particular context. Prev is the context value for
148 the previous character, curr is the context value for the lookahead
150 #define MATCHES_NEWLINE_CONTEXT(constraint, prev, curr) \
152 1 << (((prev & CTX_NEWLINE) ? 2 : 0) + ((curr & CTX_NEWLINE) ? 1 : 0) + 4))
153 #define MATCHES_LETTER_CONTEXT(constraint, prev, curr) \
155 1 << (((prev & CTX_LETTER) ? 2 : 0) + ((curr & CTX_LETTER) ? 1 : 0)))
156 #define SUCCEEDS_IN_CONTEXT(constraint, prev, curr) \
157 (MATCHES_NEWLINE_CONTEXT(constraint, prev, curr) \
158 && MATCHES_LETTER_CONTEXT(constraint, prev, curr))
160 /* The following macros give information about what a constraint depends on. */
161 #define PREV_NEWLINE_DEPENDENT(constraint) \
162 (((constraint) & 0xc0) >> 2 != ((constraint) & 0x30))
163 #define PREV_LETTER_DEPENDENT(constraint) \
164 (((constraint) & 0x0c) >> 2 != ((constraint) & 0x03))
166 /* Tokens that match the empty string subject to some constraint actually
167 work by applying that constraint to determine what may follow them,
168 taking into account what has gone before. The following values are
169 the constraints corresponding to the special tokens previously defined. */
170 #define NO_CONSTRAINT 0xff
171 #define BEGLINE_CONSTRAINT 0xcf
172 #define ENDLINE_CONSTRAINT 0xaf
173 #define BEGWORD_CONSTRAINT 0xf2
174 #define ENDWORD_CONSTRAINT 0xf4
175 #define LIMWORD_CONSTRAINT 0xf6
176 #define NOTLIMWORD_CONSTRAINT 0xf9
178 /* The regexp is parsed into an array of tokens in postfix form. Some tokens
179 are operators and others are terminal symbols. Most (but not all) of these
180 codes are returned by the lexical analyzer. */
183 END = -1, /* END is a terminal symbol that matches the
184 end of input; any value of END or less in
185 the parse tree is such a symbol. Accepting
186 states of the DFA are those that would have
187 a transition on END. */
189 /* Ordinary character values are terminal symbols that match themselves. */
191 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
194 BACKREF, /* BACKREF is generated by \<digit>; it
195 is not completely handled. If the scanner
196 detects a transition on backref, it returns
197 a kind of "semi-success" indicating that
198 the match will have to be verified with
199 a backtracking matcher. */
201 BEGLINE, /* BEGLINE is a terminal symbol that matches
202 the empty string if it is at the beginning
205 ENDLINE, /* ENDLINE is a terminal symbol that matches
206 the empty string if it is at the end of
209 BEGWORD, /* BEGWORD is a terminal symbol that matches
210 the empty string if it is at the beginning
213 ENDWORD, /* ENDWORD is a terminal symbol that matches
214 the empty string if it is at the end of
217 LIMWORD, /* LIMWORD is a terminal symbol that matches
218 the empty string if it is at the beginning
219 or the end of a word. */
221 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
222 matches the empty string if it is not at
223 the beginning or end of a word. */
225 QMARK, /* QMARK is an operator of one argument that
226 matches zero or one occurences of its
229 STAR, /* STAR is an operator of one argument that
230 matches the Kleene closure (zero or more
231 occurrences) of its argument. */
233 PLUS, /* PLUS is an operator of one argument that
234 matches the positive closure (one or more
235 occurrences) of its argument. */
237 REPMN, /* REPMN is a lexical token corresponding
238 to the {m,n} construct. REPMN never
239 appears in the compiled token vector. */
241 CAT, /* CAT is an operator of two arguments that
242 matches the concatenation of its
243 arguments. CAT is never returned by the
246 OR, /* OR is an operator of two arguments that
247 matches either of its arguments. */
249 LPAREN, /* LPAREN never appears in the parse tree,
250 it is only a lexeme. */
252 RPAREN, /* RPAREN never appears in the parse tree. */
254 ANYCHAR, /* ANYCHAR is a terminal symbol that matches
255 any multibyte (or single byte) characters.
256 It is used only if MB_CUR_MAX > 1. */
258 MBCSET, /* MBCSET is similar to CSET, but for
259 multibyte characters. */
261 WCHAR, /* Only returned by lex. wctok contains
262 the wide character representation. */
264 CSET /* CSET and (and any value greater) is a
265 terminal symbol that matches any of a
266 class of characters. */
270 /* States of the recognizer correspond to sets of positions in the parse
271 tree, together with the constraints under which they may be matched.
272 So a position is encoded as an index into the parse tree together with
276 unsigned int index; /* Index into the parse array. */
277 unsigned int constraint; /* Constraint for matching this position. */
280 /* Sets of positions are stored as arrays. */
283 position *elems; /* Elements of this position set. */
284 size_t nelem; /* Number of elements in this set. */
285 size_t alloc; /* Number of elements allocated in ELEMS. */
288 /* Sets of leaves are also stored as arrays. */
291 unsigned int *elems; /* Elements of this position set. */
292 size_t nelem; /* Number of elements in this set. */
295 /* A state of the dfa consists of a set of positions, some flags,
296 and the token value of the lowest-numbered position of the state that
297 contains an END token. */
300 int hash; /* Hash of the positions of this state. */
301 position_set elems; /* Positions this state could match. */
302 unsigned char context; /* Context from previous state. */
303 char backref; /* True if this state matches a \<digit>. */
304 unsigned char constraint; /* Constraint for this state to accept. */
305 int first_end; /* Token value of the first END in elems. */
306 position_set mbps; /* Positions which can match multibyte
307 characters. e.g. period.
308 These staff are used only if
312 /* A bracket operator.
313 e.g. [a-c], [[:alpha:]], etc. */
314 struct mb_char_classes
318 wchar_t *chars; /* Normal characters. */
320 wctype_t *ch_classes; /* Character classes. */
322 wchar_t *range_sts; /* Range characters (start of the range). */
323 wchar_t *range_ends; /* Range characters (end of the range). */
325 char **equivs; /* Equivalent classes. */
328 int ncoll_elems; /* Collating elements. */
331 /* A compiled regular expression. */
334 /* Fields filled by the scanner. */
335 charclass *charclasses; /* Array of character sets for CSET tokens. */
336 int cindex; /* Index for adding new charclasses. */
337 int calloc; /* Number of charclasses currently allocated. */
339 /* Fields filled by the parser. */
340 token *tokens; /* Postfix parse array. */
341 int tindex; /* Index for adding new tokens. */
342 int talloc; /* Number of tokens currently allocated. */
343 int depth; /* Depth required of an evaluation stack
344 used for depth-first traversal of the
346 int nleaves; /* Number of leaves on the parse tree. */
347 int nregexps; /* Count of parallel regexps being built
349 unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */
350 int utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
352 /* The following are used only if MB_CUR_MAX > 1. */
354 /* The value of multibyte_prop[i] is defined by following rule.
355 if tokens[i] < NOTCHAR
356 bit 0 : tokens[i] is the first byte of a character, including
357 single-byte characters.
358 bit 1 : tokens[i] is the last byte of a character, including
359 single-byte characters.
361 if tokens[i] = MBCSET
362 ("the index of mbcsets correspnd to this operator" << 2) + 3
366 = 'single_byte_a', 'multi_byte_A', single_byte_b'
367 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
374 /* Array of the bracket expression in the DFA. */
375 struct mb_char_classes *mbcsets;
379 /* Fields filled by the state builder. */
380 dfa_state *states; /* States of the dfa. */
381 int sindex; /* Index for adding new states. */
382 int salloc; /* Number of states currently allocated. */
384 /* Fields filled by the parse tree->NFA conversion. */
385 position_set *follows; /* Array of follow sets, indexed by position
386 index. The follow of a position is the set
387 of positions containing characters that
388 could conceivably follow a character
389 matching the given position in a string
390 matching the regexp. Allocated to the
391 maximum possible position index. */
392 int searchflag; /* True if we are supposed to build a searching
393 as opposed to an exact matcher. A searching
394 matcher finds the first and shortest string
395 matching a regexp anywhere in the buffer,
396 whereas an exact matcher finds the longest
397 string matching, but anchored to the
398 beginning of the buffer. */
400 /* Fields filled by dfaexec. */
401 int tralloc; /* Number of transition tables that have
403 int trcount; /* Number of transition tables that have
404 actually been built. */
405 int **trans; /* Transition tables for states that can
406 never accept. If the transitions for a
407 state have not yet been computed, or the
408 state could possibly accept, its entry in
409 this table is NULL. */
410 int **realtrans; /* Trans always points to realtrans + 1; this
411 is so trans[-1] can contain NULL. */
412 int **fails; /* Transition tables after failing to accept
413 on a state that potentially could do so. */
414 int *success; /* Table of acceptance conditions used in
415 dfaexec and computed in build_state. */
416 int *newlines; /* Transitions on newlines. The entry for a
417 newline in any transition table is always
418 -1 so we can count lines without wasting
419 too many cycles. The transition for a
420 newline is stored separately and handled
421 as a special case. Newline is also used
422 as a sentinel at the end of the buffer. */
423 struct dfamust *musts; /* List of strings, at least one of which
424 is known to appear in any r.e. matching
428 /* Some macros for user access to dfa internals. */
430 /* ACCEPTING returns true if s could possibly be an accepting state of r. */
431 #define ACCEPTING(s, r) ((r).states[s].constraint)
433 /* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the
434 specified context. */
435 #define ACCEPTS_IN_CONTEXT(prev, curr, state, dfa) \
436 SUCCEEDS_IN_CONTEXT((dfa).states[state].constraint, prev, curr)
438 static void dfamust (struct dfa *dfa);
439 static void regexp (void);
441 /* These two macros are identical to the ones in gnulib's xalloc.h,
442 except that they not to case the result to "(t *)", and thus may
443 be used via type-free CALLOC and MALLOC macros. */
447 /* Allocate memory for N elements of type T, with error checking. */
448 /* extern t *XNMALLOC (size_t n, typename t); */
449 # define XNMALLOC(n, t) \
450 (sizeof (t) == 1 ? xmalloc (n) : xnmalloc (n, sizeof (t)))
452 /* Allocate memory for N elements of type T, with error checking,
454 /* extern t *XCALLOC (size_t n, typename t); */
455 # define XCALLOC(n, t) \
456 (sizeof (t) == 1 ? xzalloc (n) : xcalloc (n, sizeof (t)))
458 #define CALLOC(p, n) do { (p) = XCALLOC (n, *(p)); } while (0)
459 #define MALLOC(p, n) do { (p) = XNMALLOC (n, *(p)); } while (0)
460 #define REALLOC(p, n) do {(p) = xnrealloc (p, n, sizeof (*(p))); } while (0)
462 /* Reallocate an array of type *P if N_ALLOC is <= N_REQUIRED. */
463 #define REALLOC_IF_NECESSARY(p, n_alloc, n_required) \
466 if ((n_alloc) <= (n_required)) \
468 size_t new_n_alloc = (n_required) + !(p); \
469 (p) = x2nrealloc (p, &new_n_alloc, sizeof (*(p))); \
470 (n_alloc) = new_n_alloc; \
484 fprintf(stderr, "END");
485 else if (t < NOTCHAR)
486 fprintf(stderr, "%c", t);
491 case EMPTY: s = "EMPTY"; break;
492 case BACKREF: s = "BACKREF"; break;
493 case BEGLINE: s = "BEGLINE"; break;
494 case ENDLINE: s = "ENDLINE"; break;
495 case BEGWORD: s = "BEGWORD"; break;
496 case ENDWORD: s = "ENDWORD"; break;
497 case LIMWORD: s = "LIMWORD"; break;
498 case NOTLIMWORD: s = "NOTLIMWORD"; break;
499 case QMARK: s = "QMARK"; break;
500 case STAR: s = "STAR"; break;
501 case PLUS: s = "PLUS"; break;
502 case CAT: s = "CAT"; break;
503 case OR: s = "OR"; break;
504 case LPAREN: s = "LPAREN"; break;
505 case RPAREN: s = "RPAREN"; break;
506 case ANYCHAR: s = "ANYCHAR"; break;
507 case MBCSET: s = "MBCSET"; break;
508 default: s = "CSET"; break;
510 fprintf(stderr, "%s", s);
515 /* Stuff pertaining to charclasses. */
518 tstbit (unsigned int b, charclass const c)
520 return c[b / INTBITS] & 1 << b % INTBITS;
524 setbit (unsigned int b, charclass c)
526 c[b / INTBITS] |= 1 << b % INTBITS;
530 clrbit (unsigned int b, charclass c)
532 c[b / INTBITS] &= ~(1 << b % INTBITS);
536 copyset (charclass const src, charclass dst)
538 memcpy (dst, src, sizeof (charclass));
542 zeroset (charclass s)
544 memset (s, 0, sizeof (charclass));
552 for (i = 0; i < CHARCLASS_INTS; ++i)
557 equal (charclass const s1, charclass const s2)
559 return memcmp (s1, s2, sizeof (charclass)) == 0;
562 /* A pointer to the current dfa is kept here during parsing. */
563 static struct dfa *dfa;
565 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
567 charclass_index (charclass const s)
571 for (i = 0; i < dfa->cindex; ++i)
572 if (equal(s, dfa->charclasses[i]))
574 REALLOC_IF_NECESSARY(dfa->charclasses, dfa->calloc, dfa->cindex + 1);
576 copyset(s, dfa->charclasses[i]);
580 /* Syntax bits controlling the behavior of the lexical analyzer. */
581 static reg_syntax_t syntax_bits, syntax_bits_set;
583 /* Flag for case-folding letters into sets. */
584 static int case_fold;
586 /* End-of-line byte in data. */
587 static unsigned char eolbyte;
589 /* Cache of char-context values. */
590 static int sbit[NOTCHAR];
592 /* Set of characters considered letters. */
593 static charclass letters;
595 /* Set of characters that are newline. */
596 static charclass newline;
598 /* Add this to the test for whether a byte is word-constituent, since on
599 BSD-based systems, many values in the 128..255 range are classified as
600 alphabetic, while on glibc-based systems, they are not. */
602 # define is_valid_unibyte_character(c) 1
604 # define is_valid_unibyte_character(c) (! (MBS_SUPPORT && btowc (c) == WEOF))
607 /* Return non-zero if C is a 'word-constituent' byte; zero otherwise. */
608 #define IS_WORD_CONSTITUENT(C) \
609 (is_valid_unibyte_character (C) && (isalnum (C) || (C) == '_'))
612 char_context (unsigned char c)
614 if (c == eolbyte || c == 0)
616 if (IS_WORD_CONSTITUENT (c))
622 wchar_context(wint_t wc)
624 if (wc == (wchar_t)eolbyte || wc == 0)
626 if (wc == L'_' || iswalnum (wc))
631 /* Entry point to set syntax options. */
633 dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
642 for (i = 0; i < NOTCHAR; ++i)
644 sbit[i] = char_context (i);
657 /* Set a bit in the charclass for the given wchar_t. Do nothing if WC
658 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
659 this may happen when folding case in weird Turkish locales where
660 dotless i/dotted I are not included in the chosen character set.
661 Return whether a bit was set in the charclass. */
664 setbit_wc (wint_t wc, charclass c)
674 /* Set a bit in the charclass for the given single byte character,
675 if it is valid in the current character set. */
677 setbit_c (int b, charclass c)
679 /* Do nothing if b is invalid in this character set. */
680 if (MB_CUR_MAX > 1 && btowc (b) == WEOF)
685 # define setbit_c setbit
687 setbit_wc (wint_t wc, charclass c)
695 /* Like setbit_c, but if case is folded, set both cases of a letter. For
696 MB_CUR_MAX > 1, the resulting charset is only used as an optimization,
697 and the caller takes care of setting the appropriate field of struct
700 setbit_case_fold_c (int b, charclass c)
704 wint_t wc = btowc (b);
708 if (case_fold && iswalpha (wc))
709 setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c);
714 if (case_fold && isalpha (b))
715 setbit_c (isupper (b) ? tolower (b) : toupper (b), c);
721 /* UTF-8 encoding allows some optimizations that we can't otherwise
722 assume in a multibyte encoding. */
726 static int utf8 = -1;
729 #if defined HAVE_LANGINFO_CODESET && MBS_SUPPORT
730 utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8"));
739 /* Lexical analyzer. All the dross that deals with the obnoxious
740 GNU Regex syntax bits is located here. The poor, suffering
741 reader is referred to the GNU Regex documentation for the
742 meaning of the @#%!@#%^!@ syntax bits. */
744 static char const *lexptr; /* Pointer to next input character. */
745 static int lexleft; /* Number of characters remaining. */
746 static token lasttok; /* Previous token returned; initially END. */
747 static int laststart; /* True if we're separated from beginning or (, |
748 only by zero-width characters. */
749 static int parens; /* Count of outstanding left parens. */
750 static int minrep, maxrep; /* Repeat counts for {m,n}. */
752 static int cur_mb_len = 1; /* Length of the multibyte representation of
754 /* These variables are used only if (MB_CUR_MAX > 1). */
755 static mbstate_t mbs; /* Mbstate for mbrlen(). */
756 static wchar_t wctok; /* Wide character representation of the current
757 multibyte character. */
758 static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
759 Each element store the amount of remain
760 byte of corresponding multibyte character
761 in the input string. A element's value
762 is 0 if corresponding character is a
763 single byte chracter.
764 e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
765 mblen_buf : 0, 3, 2, 1
767 static wchar_t *inputwcs; /* Wide character representation of input
769 The length of this array is same as
770 the length of input string(char array).
771 inputstring[i] is a single-byte char,
772 or 1st byte of a multibyte char.
773 And inputwcs[i] is the codepoint. */
774 static unsigned char const *buf_begin; /* reference to begin in dfaexec(). */
775 static unsigned char const *buf_end; /* reference to end in dfaexec(). */
779 /* Note that characters become unsigned here. */
780 # define FETCH_WC(c, wc, eoferr) \
787 return lasttok = END; \
792 cur_mb_len = mbrtowc(&_wc, lexptr, lexleft, &mbs); \
793 if (cur_mb_len <= 0) \
797 (wc) = (c) = to_uchar (*lexptr++); \
801 lexptr += cur_mb_len; \
802 lexleft -= cur_mb_len; \
809 # define FETCH(c, eoferr) \
812 FETCH_WC(c, wc, eoferr); \
816 /* Note that characters become unsigned here. */
817 # define FETCH(c, eoferr) \
824 return lasttok = END; \
826 (c) = to_uchar (*lexptr++); \
830 # define FETCH_WC(c, unused, eoferr) FETCH (c, eoferr)
832 #endif /* MBS_SUPPORT */
834 typedef int predicate (int);
836 /* The following list maps the names of the Posix named character classes
837 to predicate functions that determine whether a given character is in
838 the class. The leading [ has already been eaten by the lexical analyzer. */
842 bool single_byte_only;
845 static const struct dfa_ctype prednames[] = {
846 { "alpha", isalpha, false },
847 { "upper", isupper, false },
848 { "lower", islower, false },
849 { "digit", isdigit, true },
850 { "xdigit", isxdigit, true },
851 { "space", isspace, false },
852 { "punct", ispunct, false },
853 { "alnum", isalnum, false },
854 { "print", isprint, false },
855 { "graph", isgraph, false },
856 { "cntrl", iscntrl, false },
857 { "blank", is_blank, false },
858 { NULL, NULL, false }
861 static const struct dfa_ctype * _GL_ATTRIBUTE_PURE
862 find_pred (const char *str)
865 for (i = 0; prednames[i].name; ++i)
866 if (STREQ (str, prednames[i].name))
869 return &prednames[i];
872 /* Multibyte character handling sub-routine for lex.
873 This function parse a bracket expression and build a struct
876 parse_bracket_exp (void)
882 /* Used to warn about [:space:].
883 Bit 0 = first character is a colon.
884 Bit 1 = last character is a colon.
885 Bit 2 = includes any other character but a colon.
886 Bit 3 = includes ranges, char/equiv classes or collation elements. */
887 int colon_warning_state;
893 /* Work area to build a mb_char_classes. */
894 struct mb_char_classes *work_mbc;
895 int chars_al, range_sts_al, range_ends_al, ch_classes_al,
896 equivs_al, coll_elems_al;
899 range_sts_al = range_ends_al = 0;
900 ch_classes_al = equivs_al = coll_elems_al = 0;
903 REALLOC_IF_NECESSARY(dfa->mbcsets, dfa->mbcsets_alloc, dfa->nmbcsets + 1);
905 /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
906 We will update dfa->multibyte_prop[] in addtok(), because we can't
907 decide the index in dfa->tokens[]. */
909 /* Initialize work area. */
910 work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
911 memset (work_mbc, 0, sizeof *work_mbc);
916 memset (ccl, 0, sizeof ccl);
917 FETCH_WC (c, wc, _("unbalanced ["));
920 FETCH_WC (c, wc, _("unbalanced ["));
926 colon_warning_state = (c == ':');
929 c1 = EOF; /* mark c1 is not initialized". */
930 colon_warning_state &= ~2;
932 /* Note that if we're looking at some other [:...:] construct,
933 we just treat it as a bunch of ordinary characters. We can do
934 this because we assume regex has checked for syntax errors before
935 dfa is ever called. */
936 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
938 #define BRACKET_BUFFER_SIZE 128
939 char str[BRACKET_BUFFER_SIZE];
940 FETCH_WC (c1, wc1, _("unbalanced ["));
942 /* If pattern contains `[[:', `[[.', or `[[='. */
944 /* TODO: handle `[[.' and `[[=' also for MB_CUR_MAX == 1. */
945 || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '='))
951 FETCH_WC (c, wc, _("unbalanced ["));
952 if ((c == c1 && *lexptr == ']') || lexleft == 0)
954 if (len < BRACKET_BUFFER_SIZE)
957 /* This is in any case an invalid class name. */
963 FETCH_WC (c, wc, _("unbalanced ["));
965 /* build character class. */
968 = (case_fold && (STREQ (str, "upper")
969 || STREQ (str, "lower"))
972 const struct dfa_ctype *pred = find_pred (class);
974 dfaerror(_("invalid character class"));
976 if (MB_CUR_MAX > 1 && !pred->single_byte_only)
978 /* Store the character class as wctype_t. */
979 wctype_t wt = wctype (class);
981 REALLOC_IF_NECESSARY(work_mbc->ch_classes,
983 work_mbc->nch_classes + 1);
984 work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
987 for (c2 = 0; c2 < NOTCHAR; ++c2)
989 setbit_case_fold_c (c2, ccl);
992 else if (MBS_SUPPORT && (c1 == '=' || c1 == '.'))
995 MALLOC(elem, len + 1);
996 strncpy(elem, str, len + 1);
999 /* build equivalent class. */
1001 REALLOC_IF_NECESSARY(work_mbc->equivs,
1003 work_mbc->nequivs + 1);
1004 work_mbc->equivs[work_mbc->nequivs++] = elem;
1008 /* build collating element. */
1010 REALLOC_IF_NECESSARY(work_mbc->coll_elems,
1012 work_mbc->ncoll_elems + 1);
1013 work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
1016 colon_warning_state |= 8;
1018 /* Fetch new lookahead character. */
1019 FETCH_WC (c1, wc1, _("unbalanced ["));
1023 /* We treat '[' as a normal character here. c/c1/wc/wc1
1024 are already set up. */
1027 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1028 FETCH_WC(c, wc, _("unbalanced ["));
1031 FETCH_WC(c1, wc1, _("unbalanced ["));
1034 /* build range characters. */
1036 FETCH_WC(c2, wc2, _("unbalanced ["));
1039 /* In the case [x-], the - is an ordinary hyphen,
1040 which is left in c1, the lookahead character. */
1041 lexptr -= cur_mb_len;
1042 lexleft += cur_mb_len;
1046 if (c1 == '-' && c2 != ']')
1049 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1050 FETCH_WC(c2, wc2, _("unbalanced ["));
1054 /* When case folding map a range, say [m-z] (or even [M-z])
1055 to the pair of ranges, [m-z] [M-Z]. */
1056 REALLOC_IF_NECESSARY(work_mbc->range_sts,
1057 range_sts_al, work_mbc->nranges + 1);
1058 REALLOC_IF_NECESSARY(work_mbc->range_ends,
1059 range_ends_al, work_mbc->nranges + 1);
1060 work_mbc->range_sts[work_mbc->nranges] =
1061 case_fold ? towlower(wc) : (wchar_t)wc;
1062 work_mbc->range_ends[work_mbc->nranges++] =
1063 case_fold ? towlower(wc2) : (wchar_t)wc2;
1066 if (case_fold && (iswalpha(wc) || iswalpha(wc2)))
1068 REALLOC_IF_NECESSARY(work_mbc->range_sts,
1069 range_sts_al, work_mbc->nranges + 1);
1070 work_mbc->range_sts[work_mbc->nranges] = towupper(wc);
1071 REALLOC_IF_NECESSARY(work_mbc->range_ends,
1072 range_ends_al, work_mbc->nranges + 1);
1073 work_mbc->range_ends[work_mbc->nranges++] = towupper(wc2);
1085 for (c = c1; c <= c2; c++)
1086 setbit_case_fold_c (c, ccl);
1089 colon_warning_state |= 8;
1090 FETCH_WC(c1, wc1, _("unbalanced ["));
1094 colon_warning_state |= (c == ':') ? 2 : 4;
1096 if (MB_CUR_MAX == 1)
1098 setbit_case_fold_c (c, ccl);
1102 if (case_fold && iswalpha(wc))
1105 if (!setbit_wc (wc, ccl))
1107 REALLOC_IF_NECESSARY(work_mbc->chars, chars_al,
1108 work_mbc->nchars + 1);
1109 work_mbc->chars[work_mbc->nchars++] = wc;
1117 if (!setbit_wc (wc, ccl))
1119 REALLOC_IF_NECESSARY(work_mbc->chars, chars_al,
1120 work_mbc->nchars + 1);
1121 work_mbc->chars[work_mbc->nchars++] = wc;
1124 while ((wc = wc1, (c = c1) != ']'));
1126 if (colon_warning_state == 7)
1127 dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1131 static charclass zeroclass;
1132 work_mbc->invert = invert;
1133 work_mbc->cset = equal(ccl, zeroclass) ? -1 : charclass_index(ccl);
1139 assert(MB_CUR_MAX == 1);
1141 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1142 clrbit(eolbyte, ccl);
1145 return CSET + charclass_index(ccl);
1156 /* Basic plan: We fetch a character. If it's a backslash,
1157 we set the backslash flag and go through the loop again.
1158 On the plus side, this avoids having a duplicate of the
1159 main switch inside the backslash case. On the minus side,
1160 it means that just about every case begins with
1161 "if (backslash) ...". */
1162 for (i = 0; i < 2; ++i)
1166 FETCH_WC (c, wctok, NULL);
1179 dfaerror(_("unfinished \\ escape"));
1186 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1188 || lasttok == LPAREN
1190 return lasttok = BEGLINE;
1196 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1198 || (syntax_bits & RE_NO_BK_PARENS
1199 ? lexleft > 0 && *lexptr == ')'
1200 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
1201 || (syntax_bits & RE_NO_BK_VBAR
1202 ? lexleft > 0 && *lexptr == '|'
1203 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
1204 || ((syntax_bits & RE_NEWLINE_ALT)
1205 && lexleft > 0 && *lexptr == '\n'))
1206 return lasttok = ENDLINE;
1218 if (backslash && !(syntax_bits & RE_NO_BK_REFS))
1221 return lasttok = BACKREF;
1226 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1227 return lasttok = BEGLINE; /* FIXME: should be beginning of string */
1231 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1232 return lasttok = ENDLINE; /* FIXME: should be end of string */
1236 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1237 return lasttok = BEGWORD;
1241 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1242 return lasttok = ENDWORD;
1246 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1247 return lasttok = LIMWORD;
1251 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
1252 return lasttok = NOTLIMWORD;
1256 if (syntax_bits & RE_LIMITED_OPS)
1258 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
1260 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1262 return lasttok = QMARK;
1267 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1269 return lasttok = STAR;
1272 if (syntax_bits & RE_LIMITED_OPS)
1274 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
1276 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1278 return lasttok = PLUS;
1281 if (!(syntax_bits & RE_INTERVALS))
1283 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
1285 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
1288 if (syntax_bits & RE_NO_BK_BRACES)
1290 /* Scan ahead for a valid interval; if it's not valid,
1291 treat it as a literal '{'. */
1292 int lo = -1, hi = -1;
1293 char const *p = lexptr;
1294 char const *lim = p + lexleft;
1295 for (; p != lim && ISASCIIDIGIT (*p); p++)
1296 lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
1297 if (p != lim && *p == ',')
1298 while (++p != lim && ISASCIIDIGIT (*p))
1299 hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
1302 if (p == lim || *p != '}'
1303 || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
1310 {M,} - minimum count, maximum is infinity
1311 {M,N} - M through N */
1312 FETCH(c, _("unfinished repeat count"));
1313 if (ISASCIIDIGIT (c))
1318 FETCH(c, _("unfinished repeat count"));
1319 if (! ISASCIIDIGIT (c))
1321 minrep = 10 * minrep + c - '0';
1325 dfaerror(_("malformed repeat count"));
1328 FETCH (c, _("unfinished repeat count"));
1329 if (! ISASCIIDIGIT (c))
1336 FETCH (c, _("unfinished repeat count"));
1337 if (! ISASCIIDIGIT (c))
1339 maxrep = 10 * maxrep + c - '0';
1341 if (0 <= maxrep && maxrep < minrep)
1342 dfaerror (_("malformed repeat count"));
1347 if (!(syntax_bits & RE_NO_BK_BRACES))
1350 dfaerror(_("malformed repeat count"));
1351 FETCH(c, _("unfinished repeat count"));
1354 dfaerror(_("malformed repeat count"));
1356 return lasttok = REPMN;
1359 if (syntax_bits & RE_LIMITED_OPS)
1361 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
1364 return lasttok = OR;
1367 if (syntax_bits & RE_LIMITED_OPS
1369 || !(syntax_bits & RE_NEWLINE_ALT))
1372 return lasttok = OR;
1375 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1379 return lasttok = LPAREN;
1382 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1384 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1388 return lasttok = RPAREN;
1395 /* In multibyte environment period must match with a single
1396 character not a byte. So we use ANYCHAR. */
1398 return lasttok = ANYCHAR;
1402 if (!(syntax_bits & RE_DOT_NEWLINE))
1403 clrbit(eolbyte, ccl);
1404 if (syntax_bits & RE_DOT_NOT_NULL)
1407 return lasttok = CSET + charclass_index(ccl);
1411 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
1414 for (c2 = 0; c2 < NOTCHAR; ++c2)
1420 return lasttok = CSET + charclass_index(ccl);
1424 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
1427 for (c2 = 0; c2 < NOTCHAR; ++c2)
1428 if (IS_WORD_CONSTITUENT(c2))
1433 return lasttok = CSET + charclass_index(ccl);
1439 return lasttok = parse_bracket_exp();
1444 /* For multibyte character sets, folding is done in atom. Always
1447 return lasttok = WCHAR;
1449 if (case_fold && isalpha(c))
1452 setbit_case_fold_c (c, ccl);
1453 return lasttok = CSET + charclass_index(ccl);
1460 /* The above loop should consume at most a backslash
1461 and some other character. */
1463 return END; /* keeps pedantic compilers happy. */
1466 /* Recursive descent parser for regular expressions. */
1468 static token tok; /* Lookahead token. */
1469 static int depth; /* Current depth of a hypothetical stack
1470 holding deferred productions. This is
1471 used to determine the depth that will be
1472 required of the real stack later on in
1476 addtok_mb (token t, int mbprop)
1480 REALLOC_IF_NECESSARY(dfa->multibyte_prop, dfa->nmultibyte_prop,
1482 dfa->multibyte_prop[dfa->tindex] = mbprop;
1485 REALLOC_IF_NECESSARY(dfa->tokens, dfa->talloc, dfa->tindex + 1);
1486 dfa->tokens[dfa->tindex++] = t;
1506 if (depth > dfa->depth)
1510 static void addtok_wc (wint_t wc);
1512 /* Add the given token to the parse tree, maintaining the depth count and
1513 updating the maximum depth if necessary. */
1517 if (MB_CUR_MAX > 1 && t == MBCSET)
1519 bool need_or = false;
1520 struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
1522 /* Extract wide characters into alternations for better performance.
1523 This does not require UTF-8. */
1524 if (!work_mbc->invert)
1527 for (i = 0; i < work_mbc->nchars; i++)
1529 addtok_wc (work_mbc->chars[i]);
1534 work_mbc->nchars = 0;
1537 /* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET. */
1538 if (work_mbc->invert
1539 || (!using_utf8() && work_mbc->cset != -1)
1540 || work_mbc->nchars != 0
1541 || work_mbc->nch_classes != 0
1542 || work_mbc->nranges != 0
1543 || work_mbc->nequivs != 0
1544 || work_mbc->ncoll_elems != 0)
1546 addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
1552 /* Characters have been handled above, so it is possible
1553 that the mbcset is empty now. Do nothing in that case. */
1554 if (work_mbc->cset != -1)
1556 assert (using_utf8 ());
1557 addtok (CSET + work_mbc->cset);
1570 /* We treat a multibyte character as a single atom, so that DFA
1571 can treat a multibyte character as a single expression.
1573 e.g. We construct following tree from "<mb1><mb2>".
1574 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1575 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1577 addtok_wc (wint_t wc)
1579 unsigned char buf[MB_LEN_MAX];
1582 memset (&s, 0, sizeof s);
1583 cur_mb_len = wcrtomb ((char *) buf, wc, &s);
1585 /* This is merely stop-gap. When cur_mb_len is 0 or negative,
1586 buf[0] is undefined, yet skipping the addtok_mb call altogether
1587 can result in heap corruption. */
1588 if (cur_mb_len <= 0)
1591 addtok_mb(buf[0], cur_mb_len == 1 ? 3 : 1);
1592 for (i = 1; i < cur_mb_len; i++)
1594 addtok_mb(buf[i], i == cur_mb_len - 1 ? 2 : 0);
1599 static void addtok_wc (wint_t wc) {}
1603 add_utf8_anychar (void)
1606 static const charclass utf8_classes[5] = {
1607 { 0, 0, 0, 0, ~0, ~0, 0, 0 }, /* 80-bf: non-lead bytes */
1608 { ~0, ~0, ~0, ~0, 0, 0, 0, 0 }, /* 00-7f: 1-byte sequence */
1609 { 0, 0, 0, 0, 0, 0, 0xfffffffcU, 0 }, /* c2-df: 2-byte sequence */
1610 { 0, 0, 0, 0, 0, 0, 0, 0xffff }, /* e0-ef: 3-byte sequence */
1611 { 0, 0, 0, 0, 0, 0, 0, 0xff0000 } /* f0-f7: 4-byte sequence */
1613 const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
1616 /* Define the five character classes that are needed below. */
1617 if (dfa->utf8_anychar_classes[0] == 0)
1618 for (i = 0; i < n; i++)
1621 copyset (utf8_classes[i], c);
1624 if (!(syntax_bits & RE_DOT_NEWLINE))
1625 clrbit (eolbyte, c);
1626 if (syntax_bits & RE_DOT_NOT_NULL)
1629 dfa->utf8_anychar_classes[i] = CSET + charclass_index(c);
1632 /* A valid UTF-8 character is
1635 |[0xc2-0xdf][0x80-0xbf]
1636 |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
1637 |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
1639 which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x00-0x7f]
1640 and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse
1641 Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */
1642 for (i = 1; i < n; i++)
1643 addtok (dfa->utf8_anychar_classes[i]);
1646 addtok (dfa->utf8_anychar_classes[0]);
1653 /* The grammar understood by the parser is as follows.
1672 <multibyte character>
1683 LPAREN regexp RPAREN
1686 The parser builds a parse tree in postfix form in an array of tokens. */
1695 else if (MBS_SUPPORT && tok == WCHAR)
1697 addtok_wc (case_fold ? towlower(wctok) : wctok);
1699 if (case_fold && iswalpha(wctok))
1701 addtok_wc (towupper(wctok));
1708 else if (MBS_SUPPORT && tok == ANYCHAR && using_utf8())
1710 /* For UTF-8 expand the period to a series of CSETs that define a valid
1711 UTF-8 character. This avoids using the slow multibyte path. I'm
1712 pretty sure it would be both profitable and correct to do it for
1713 any encoding; however, the optimization must be done manually as
1714 it is done above in add_utf8_anychar. So, let's start with
1715 UTF-8: it is the most used, and the structure of the encoding
1716 makes the correctness more obvious. */
1720 else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1721 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1723 || tok == ANYCHAR || tok == MBCSET
1724 #endif /* MBS_SUPPORT */
1725 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1730 else if (tok == LPAREN)
1735 dfaerror(_("unbalanced ("));
1742 /* Return the number of tokens in the given subexpression. */
1743 static int _GL_ATTRIBUTE_PURE
1744 nsubtoks (int tindex)
1748 switch (dfa->tokens[tindex - 1])
1755 return 1 + nsubtoks(tindex - 1);
1758 ntoks1 = nsubtoks(tindex - 1);
1759 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
1763 /* Copy the given subexpression to the top of the tree. */
1765 copytoks (int tindex, int ntokens)
1769 for (i = 0; i < ntokens; ++i)
1771 addtok(dfa->tokens[tindex + i]);
1772 /* Update index into multibyte csets. */
1773 if (MB_CUR_MAX > 1 && dfa->tokens[tindex + i] == MBCSET)
1774 dfa->multibyte_prop[dfa->tindex - 1] = dfa->multibyte_prop[tindex + i];
1781 int tindex, ntokens, i;
1784 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1785 if (tok == REPMN && (minrep || maxrep))
1787 ntokens = nsubtoks(dfa->tindex);
1788 tindex = dfa->tindex - ntokens;
1793 for (i = 1; i < minrep; ++i)
1795 copytoks(tindex, ntokens);
1798 for (; i < maxrep; ++i)
1800 copytoks(tindex, ntokens);
1806 else if (tok == REPMN)
1808 dfa->tindex -= nsubtoks(dfa->tindex);
1823 while (tok != RPAREN && tok != OR && tok >= 0)
1842 /* Main entry point for the parser. S is a string to be parsed, len is the
1843 length of the string, so s can include NUL characters. D is a pointer to
1844 the struct dfa to parse into. */
1846 dfaparse (char const *s, size_t len, struct dfa *d)
1857 memset(&mbs, 0, sizeof mbs);
1860 if (! syntax_bits_set)
1861 dfaerror(_("no syntax specified"));
1869 dfaerror(_("unbalanced )"));
1871 addtok(END - d->nregexps);
1880 /* Some primitives for operating on sets of positions. */
1882 /* Copy one set to another; the destination must be large enough. */
1884 copy (position_set const *src, position_set *dst)
1886 REALLOC_IF_NECESSARY(dst->elems, dst->alloc, src->nelem);
1887 memcpy(dst->elems, src->elems, sizeof(dst->elems[0]) * src->nelem);
1888 dst->nelem = src->nelem;
1892 alloc_position_set (position_set *s, size_t size)
1894 MALLOC(s->elems, size);
1899 /* Insert position P in set S. S is maintained in sorted order on
1900 decreasing index. If there is already an entry in S with P.index
1901 then merge (logically-OR) P's constraints into the one in S.
1902 S->elems must point to an array large enough to hold the resulting set. */
1904 insert (position p, position_set *s)
1906 int count = s->nelem;
1907 int lo = 0, hi = count;
1911 int mid = ((unsigned) lo + (unsigned) hi) >> 1;
1912 if (s->elems[mid].index > p.index)
1918 if (lo < count && p.index == s->elems[lo].index)
1920 s->elems[lo].constraint |= p.constraint;
1924 REALLOC_IF_NECESSARY(s->elems, s->alloc, count + 1);
1925 for (i = count; i > lo; i--)
1926 s->elems[i] = s->elems[i - 1];
1931 /* Merge two sets of positions into a third. The result is exactly as if
1932 the positions of both sets were inserted into an initially empty set. */
1934 merge (position_set const *s1, position_set const *s2, position_set *m)
1938 REALLOC_IF_NECESSARY(m->elems, m->alloc, s1->nelem + s2->nelem);
1940 while (i < s1->nelem && j < s2->nelem)
1941 if (s1->elems[i].index > s2->elems[j].index)
1942 m->elems[m->nelem++] = s1->elems[i++];
1943 else if (s1->elems[i].index < s2->elems[j].index)
1944 m->elems[m->nelem++] = s2->elems[j++];
1947 m->elems[m->nelem] = s1->elems[i++];
1948 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1950 while (i < s1->nelem)
1951 m->elems[m->nelem++] = s1->elems[i++];
1952 while (j < s2->nelem)
1953 m->elems[m->nelem++] = s2->elems[j++];
1956 /* Delete a position from a set. */
1958 delete (position p, position_set *s)
1962 for (i = 0; i < s->nelem; ++i)
1963 if (p.index == s->elems[i].index)
1966 for (--s->nelem; i < s->nelem; ++i)
1967 s->elems[i] = s->elems[i + 1];
1970 /* Find the index of the state corresponding to the given position set with
1971 the given preceding context, or create a new state if there is no such
1972 state. Context tells whether we got here on a newline or letter. */
1974 state_index (struct dfa *d, position_set const *s, int context)
1980 context &= ~CTX_NONE;
1981 for (i = 0; i < s->nelem; ++i)
1982 hash ^= s->elems[i].index + s->elems[i].constraint;
1984 /* Try to find a state that exactly matches the proposed one. */
1985 for (i = 0; i < d->sindex; ++i)
1987 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1988 || context != d->states[i].context)
1990 for (j = 0; j < s->nelem; ++j)
1991 if (s->elems[j].constraint
1992 != d->states[i].elems.elems[j].constraint
1993 || s->elems[j].index != d->states[i].elems.elems[j].index)
1999 /* We'll have to create a new state. */
2000 REALLOC_IF_NECESSARY(d->states, d->salloc, d->sindex + 1);
2001 d->states[i].hash = hash;
2002 alloc_position_set(&d->states[i].elems, s->nelem);
2003 copy(s, &d->states[i].elems);
2004 d->states[i].context = context;
2005 d->states[i].backref = 0;
2006 d->states[i].constraint = 0;
2007 d->states[i].first_end = 0;
2010 d->states[i].mbps.nelem = 0;
2011 d->states[i].mbps.elems = NULL;
2013 for (j = 0; j < s->nelem; ++j)
2014 if (d->tokens[s->elems[j].index] < 0)
2016 constraint = s->elems[j].constraint;
2017 if (SUCCEEDS_IN_CONTEXT(constraint, context, CTX_NONE)
2018 || SUCCEEDS_IN_CONTEXT(constraint, context, CTX_NEWLINE)
2019 || SUCCEEDS_IN_CONTEXT(constraint, context, CTX_LETTER))
2020 d->states[i].constraint |= constraint;
2021 if (! d->states[i].first_end)
2022 d->states[i].first_end = d->tokens[s->elems[j].index];
2024 else if (d->tokens[s->elems[j].index] == BACKREF)
2026 d->states[i].constraint = NO_CONSTRAINT;
2027 d->states[i].backref = 1;
2035 /* Find the epsilon closure of a set of positions. If any position of the set
2036 contains a symbol that matches the empty string in some context, replace
2037 that position with the elements of its follow labeled with an appropriate
2038 constraint. Repeat exhaustively until no funny positions are left.
2039 S->elems must be large enough to hold the result. */
2041 epsclosure (position_set *s, struct dfa const *d)
2044 char *visited; /* array of booleans, enough to use char, not int */
2047 CALLOC(visited, d->tindex);
2049 for (i = 0; i < s->nelem; ++i)
2050 if (d->tokens[s->elems[i].index] >= NOTCHAR
2051 && d->tokens[s->elems[i].index] != BACKREF
2053 && d->tokens[s->elems[i].index] != ANYCHAR
2054 && d->tokens[s->elems[i].index] != MBCSET
2056 && d->tokens[s->elems[i].index] < CSET)
2059 p.constraint = old.constraint;
2060 delete(s->elems[i], s);
2061 if (visited[old.index])
2066 visited[old.index] = 1;
2067 switch (d->tokens[old.index])
2070 p.constraint &= BEGLINE_CONSTRAINT;
2073 p.constraint &= ENDLINE_CONSTRAINT;
2076 p.constraint &= BEGWORD_CONSTRAINT;
2079 p.constraint &= ENDWORD_CONSTRAINT;
2082 p.constraint &= LIMWORD_CONSTRAINT;
2085 p.constraint &= NOTLIMWORD_CONSTRAINT;
2090 for (j = 0; j < d->follows[old.index].nelem; ++j)
2092 p.index = d->follows[old.index].elems[j].index;
2095 /* Force rescan to start at the beginning. */
2102 /* Returns the set of contexts for which there is at least one
2103 character included in C. */
2106 charclass_context(charclass c)
2111 if (tstbit(eolbyte, c))
2112 context |= CTX_NEWLINE;
2114 for (j = 0; j < CHARCLASS_INTS; ++j)
2116 if (c[j] & letters[j])
2117 context |= CTX_LETTER;
2118 if (c[j] & ~(letters[j] | newline[j]))
2119 context |= CTX_NONE;
2125 /* Returns the subset of POSSIBLE_CONTEXTS on which the position set S
2126 depends. Each context in the set of returned contexts (let's call it
2127 SC) may have a different follow set than other contexts in SC, and
2128 also different from the follow set of the complement set. However,
2129 all contexts in the complement set will have the same follow set. */
2131 static int _GL_ATTRIBUTE_PURE
2132 state_separate_contexts (position_set *s, int possible_contexts)
2134 int separate_context = 0;
2137 for (j = 0; j < s->nelem; ++j)
2139 if ((possible_contexts & CTX_NEWLINE)
2140 && PREV_NEWLINE_DEPENDENT(s->elems[j].constraint))
2141 separate_context |= CTX_NEWLINE;
2142 if ((possible_contexts & CTX_LETTER)
2143 && PREV_LETTER_DEPENDENT(s->elems[j].constraint))
2144 separate_context |= CTX_LETTER;
2147 return separate_context;
2151 /* Perform bottom-up analysis on the parse tree, computing various functions.
2152 Note that at this point, we're pretending constructs like \< are real
2153 characters rather than constraints on what can follow them.
2155 Nullable: A node is nullable if it is at the root of a regexp that can
2156 match the empty string.
2157 * EMPTY leaves are nullable.
2158 * No other leaf is nullable.
2159 * A QMARK or STAR node is nullable.
2160 * A PLUS node is nullable if its argument is nullable.
2161 * A CAT node is nullable if both its arguments are nullable.
2162 * An OR node is nullable if either argument is nullable.
2164 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2165 that could correspond to the first character of a string matching the
2166 regexp rooted at the given node.
2167 * EMPTY leaves have empty firstpos.
2168 * The firstpos of a nonempty leaf is that leaf itself.
2169 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2171 * The firstpos of a CAT node is the firstpos of the left argument, union
2172 the firstpos of the right if the left argument is nullable.
2173 * The firstpos of an OR node is the union of firstpos of each argument.
2175 Lastpos: The lastpos of a node is the set of positions that could
2176 correspond to the last character of a string matching the regexp at
2178 * EMPTY leaves have empty lastpos.
2179 * The lastpos of a nonempty leaf is that leaf itself.
2180 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2182 * The lastpos of a CAT node is the lastpos of its right argument, union
2183 the lastpos of the left if the right argument is nullable.
2184 * The lastpos of an OR node is the union of the lastpos of each argument.
2186 Follow: The follow of a position is the set of positions that could
2187 correspond to the character following a character matching the node in
2188 a string matching the regexp. At this point we consider special symbols
2189 that match the empty string in some context to be just normal characters.
2190 Later, if we find that a special symbol is in a follow set, we will
2191 replace it with the elements of its follow, labeled with an appropriate
2193 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2194 the follow of every node in the lastpos.
2195 * Every node in the firstpos of the second argument of a CAT node is in
2196 the follow of every node in the lastpos of the first argument.
2198 Because of the postfix representation of the parse tree, the depth-first
2199 analysis is conveniently done by a linear scan with the aid of a stack.
2200 Sets are stored as arrays of the elements, obeying a stack-like allocation
2201 scheme; the number of elements in each set deeper in the stack can be
2202 used to determine the address of a particular set's array. */
2204 dfaanalyze (struct dfa *d, int searchflag)
2206 int *nullable; /* Nullable stack. */
2207 int *nfirstpos; /* Element count stack for firstpos sets. */
2208 position *firstpos; /* Array where firstpos elements are stored. */
2209 int *nlastpos; /* Element count stack for lastpos sets. */
2210 position *lastpos; /* Array where lastpos elements are stored. */
2211 position_set tmp; /* Temporary set for merging sets. */
2212 position_set merged; /* Result of merging sets. */
2213 int separate_contexts; /* Context wanted by some position. */
2215 int *o_nfirst, *o_nlast;
2216 position *o_firstpos, *o_lastpos;
2221 fprintf(stderr, "dfaanalyze:\n");
2222 for (i = 0; i < d->tindex; ++i)
2224 fprintf(stderr, " %d:", i);
2225 prtok(d->tokens[i]);
2230 d->searchflag = searchflag;
2232 MALLOC(nullable, d->depth);
2233 o_nullable = nullable;
2234 MALLOC(nfirstpos, d->depth);
2235 o_nfirst = nfirstpos;
2236 MALLOC(firstpos, d->nleaves);
2237 o_firstpos = firstpos, firstpos += d->nleaves;
2238 MALLOC(nlastpos, d->depth);
2240 MALLOC(lastpos, d->nleaves);
2241 o_lastpos = lastpos, lastpos += d->nleaves;
2242 alloc_position_set(&merged, d->nleaves);
2244 CALLOC(d->follows, d->tindex);
2246 for (i = 0; i < d->tindex; ++i)
2248 switch (d->tokens[i])
2251 /* The empty set is nullable. */
2254 /* The firstpos and lastpos of the empty leaf are both empty. */
2255 *nfirstpos++ = *nlastpos++ = 0;
2260 /* Every element in the firstpos of the argument is in the follow
2261 of every element in the lastpos. */
2262 tmp.nelem = nfirstpos[-1];
2263 tmp.elems = firstpos;
2265 for (j = 0; j < nlastpos[-1]; ++j)
2267 merge(&tmp, &d->follows[pos[j].index], &merged);
2268 copy(&merged, &d->follows[pos[j].index]);
2272 /* A QMARK or STAR node is automatically nullable. */
2273 if (d->tokens[i] != PLUS)
2278 /* Every element in the firstpos of the second argument is in the
2279 follow of every element in the lastpos of the first argument. */
2280 tmp.nelem = nfirstpos[-1];
2281 tmp.elems = firstpos;
2282 pos = lastpos + nlastpos[-1];
2283 for (j = 0; j < nlastpos[-2]; ++j)
2285 merge(&tmp, &d->follows[pos[j].index], &merged);
2286 copy(&merged, &d->follows[pos[j].index]);
2289 /* The firstpos of a CAT node is the firstpos of the first argument,
2290 union that of the second argument if the first is nullable. */
2292 nfirstpos[-2] += nfirstpos[-1];
2294 firstpos += nfirstpos[-1];
2297 /* The lastpos of a CAT node is the lastpos of the second argument,
2298 union that of the first argument if the second is nullable. */
2300 nlastpos[-2] += nlastpos[-1];
2303 pos = lastpos + nlastpos[-2];
2304 for (j = nlastpos[-1] - 1; j >= 0; --j)
2305 pos[j] = lastpos[j];
2306 lastpos += nlastpos[-2];
2307 nlastpos[-2] = nlastpos[-1];
2311 /* A CAT node is nullable if both arguments are nullable. */
2312 nullable[-2] = nullable[-1] && nullable[-2];
2317 /* The firstpos is the union of the firstpos of each argument. */
2318 nfirstpos[-2] += nfirstpos[-1];
2321 /* The lastpos is the union of the lastpos of each argument. */
2322 nlastpos[-2] += nlastpos[-1];
2325 /* An OR node is nullable if either argument is nullable. */
2326 nullable[-2] = nullable[-1] || nullable[-2];
2331 /* Anything else is a nonempty position. (Note that special
2332 constructs like \< are treated as nonempty strings here;
2333 an "epsilon closure" effectively makes them nullable later.
2334 Backreferences have to get a real position so we can detect
2335 transitions on them later. But they are nullable. */
2336 *nullable++ = d->tokens[i] == BACKREF;
2338 /* This position is in its own firstpos and lastpos. */
2339 *nfirstpos++ = *nlastpos++ = 1;
2340 --firstpos, --lastpos;
2341 firstpos->index = lastpos->index = i;
2342 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2344 /* Allocate the follow set for this position. */
2345 alloc_position_set(&d->follows[i], 1);
2349 /* ... balance the above nonsyntactic #ifdef goo... */
2350 fprintf(stderr, "node %d:", i);
2351 prtok(d->tokens[i]);
2353 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
2354 fprintf(stderr, " firstpos:");
2355 for (j = nfirstpos[-1] - 1; j >= 0; --j)
2357 fprintf(stderr, " %d:", firstpos[j].index);
2358 prtok(d->tokens[firstpos[j].index]);
2360 fprintf(stderr, "\n lastpos:");
2361 for (j = nlastpos[-1] - 1; j >= 0; --j)
2363 fprintf(stderr, " %d:", lastpos[j].index);
2364 prtok(d->tokens[lastpos[j].index]);
2370 /* For each follow set that is the follow set of a real position, replace
2371 it with its epsilon closure. */
2372 for (i = 0; i < d->tindex; ++i)
2373 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
2375 || d->tokens[i] == ANYCHAR
2376 || d->tokens[i] == MBCSET
2378 || d->tokens[i] >= CSET)
2381 fprintf(stderr, "follows(%d:", i);
2382 prtok(d->tokens[i]);
2383 fprintf(stderr, "):");
2384 for (j = d->follows[i].nelem - 1; j >= 0; --j)
2386 fprintf(stderr, " %d:", d->follows[i].elems[j].index);
2387 prtok(d->tokens[d->follows[i].elems[j].index]);
2391 copy(&d->follows[i], &merged);
2392 epsclosure(&merged, d);
2393 copy(&merged, &d->follows[i]);
2396 /* Get the epsilon closure of the firstpos of the regexp. The result will
2397 be the set of positions of state 0. */
2399 for (i = 0; i < nfirstpos[-1]; ++i)
2400 insert(firstpos[i], &merged);
2401 epsclosure(&merged, d);
2403 /* Build the initial state. */
2406 MALLOC(d->states, d->salloc);
2408 separate_contexts = state_separate_contexts(&merged, CTX_NEWLINE);
2409 state_index(d, &merged, separate_contexts);
2420 /* Find, for each character, the transition out of state s of d, and store
2421 it in the appropriate slot of trans.
2423 We divide the positions of s into groups (positions can appear in more
2424 than one group). Each group is labeled with a set of characters that
2425 every position in the group matches (taking into account, if necessary,
2426 preceding context information of s). For each group, find the union
2427 of the its elements' follows. This set is the set of positions of the
2428 new state. For each character in the group's label, set the transition
2429 on this character to be to a state corresponding to the set's positions,
2430 and its associated backward context information, if necessary.
2432 If we are building a searching matcher, we include the positions of state
2435 The collection of groups is constructed by building an equivalence-class
2436 partition of the positions of s.
2438 For each position, find the set of characters C that it matches. Eliminate
2439 any characters from C that fail on grounds of backward context.
2441 Search through the groups, looking for a group whose label L has nonempty
2442 intersection with C. If L - C is nonempty, create a new group labeled
2443 L - C and having the same positions as the current group, and set L to
2444 the intersection of L and C. Insert the position in this group, set
2445 C = C - L, and resume scanning.
2447 If after comparing with every group there are characters remaining in C,
2448 create a new group labeled with the characters of C and insert this
2449 position in that group. */
2451 dfastate (int s, struct dfa *d, int trans[])
2453 leaf_set *grps; /* As many as will ever be needed. */
2454 charclass *labels; /* Labels corresponding to the groups. */
2455 int ngrps = 0; /* Number of groups actually used. */
2456 position pos; /* Current position being considered. */
2457 charclass matches; /* Set of matching characters. */
2458 int matchesf; /* True if matches is nonempty. */
2459 charclass intersect; /* Intersection with some label set. */
2460 int intersectf; /* True if intersect is nonempty. */
2461 charclass leftovers; /* Stuff in the label that didn't match. */
2462 int leftoversf; /* True if leftovers is nonempty. */
2463 position_set follows; /* Union of the follows of some group. */
2464 position_set tmp; /* Temporary space for merging sets. */
2465 int possible_contexts; /* Contexts that this group can match. */
2466 int separate_contexts; /* Context that new state wants to know. */
2467 int state; /* New state. */
2468 int state_newline; /* New state on a newline transition. */
2469 int state_letter; /* New state on a letter transition. */
2470 int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */
2473 MALLOC (grps, NOTCHAR);
2474 MALLOC (labels, NOTCHAR);
2478 for (i = 0; i < d->states[s].elems.nelem; ++i)
2480 pos = d->states[s].elems.elems[i];
2481 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2482 setbit(d->tokens[pos.index], matches);
2483 else if (d->tokens[pos.index] >= CSET)
2484 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
2485 else if (MBS_SUPPORT
2486 && (d->tokens[pos.index] == ANYCHAR
2487 || d->tokens[pos.index] == MBCSET))
2488 /* MB_CUR_MAX > 1 */
2490 /* ANYCHAR and MBCSET must match with a single character, so we
2491 must put it to d->states[s].mbps, which contains the positions
2492 which can match with a single character not a byte. */
2493 if (d->states[s].mbps.nelem == 0)
2494 alloc_position_set(&d->states[s].mbps, 1);
2495 insert(pos, &(d->states[s].mbps));
2501 /* Some characters may need to be eliminated from matches because
2502 they fail in the current context. */
2503 if (pos.constraint != 0xFF)
2505 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2506 d->states[s].context & CTX_NEWLINE,
2508 clrbit(eolbyte, matches);
2509 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2510 d->states[s].context & CTX_NEWLINE, 0))
2511 for (j = 0; j < CHARCLASS_INTS; ++j)
2512 matches[j] &= newline[j];
2513 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2514 d->states[s].context & CTX_LETTER,
2516 for (j = 0; j < CHARCLASS_INTS; ++j)
2517 matches[j] &= ~letters[j];
2518 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2519 d->states[s].context & CTX_LETTER, 0))
2520 for (j = 0; j < CHARCLASS_INTS; ++j)
2521 matches[j] &= letters[j];
2523 /* If there are no characters left, there's no point in going on. */
2524 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2526 if (j == CHARCLASS_INTS)
2530 for (j = 0; j < ngrps; ++j)
2532 /* If matches contains a single character only, and the current
2533 group's label doesn't contain that character, go on to the
2535 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2536 && !tstbit(d->tokens[pos.index], labels[j]))
2539 /* Check if this group's label has a nonempty intersection with
2542 for (k = 0; k < CHARCLASS_INTS; ++k)
2543 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2547 /* It does; now find the set differences both ways. */
2548 leftoversf = matchesf = 0;
2549 for (k = 0; k < CHARCLASS_INTS; ++k)
2551 /* Even an optimizing compiler can't know this for sure. */
2552 int match = matches[k], label = labels[j][k];
2554 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2555 (matches[k] = match & ~label) ? (matchesf = 1) : 0;
2558 /* If there were leftovers, create a new group labeled with them. */
2561 copyset(leftovers, labels[ngrps]);
2562 copyset(intersect, labels[j]);
2563 MALLOC(grps[ngrps].elems, d->nleaves);
2564 memcpy(grps[ngrps].elems, grps[j].elems,
2565 sizeof (grps[j].elems[0]) * grps[j].nelem);
2566 grps[ngrps].nelem = grps[j].nelem;
2570 /* Put the position in the current group. The constraint is
2572 grps[j].elems[grps[j].nelem++] = pos.index;
2574 /* If every character matching the current position has been
2575 accounted for, we're done. */
2580 /* If we've passed the last group, and there are still characters
2581 unaccounted for, then we'll have to create a new group. */
2584 copyset(matches, labels[ngrps]);
2586 MALLOC(grps[ngrps].elems, d->nleaves);
2587 grps[ngrps].nelem = 1;
2588 grps[ngrps].elems[0] = pos.index;
2593 alloc_position_set(&follows, d->nleaves);
2594 alloc_position_set(&tmp, d->nleaves);
2596 /* If we are a searching matcher, the default transition is to a state
2597 containing the positions of state 0, otherwise the default transition
2598 is to fail miserably. */
2601 /* Find the state(s) corresponding to the positions of state 0. */
2602 copy(&d->states[0].elems, &follows);
2603 separate_contexts = state_separate_contexts(&follows, CTX_ANY);
2604 state = state_index(d, &follows, 0);
2605 if (separate_contexts & CTX_NEWLINE)
2606 state_newline = state_index(d, &follows, CTX_NEWLINE);
2608 state_newline = state;
2609 if (separate_contexts & CTX_LETTER)
2610 state_letter = state_index(d, &follows, CTX_LETTER);
2612 state_letter = state;
2614 for (i = 0; i < NOTCHAR; ++i)
2615 trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
2616 trans[eolbyte] = state_newline;
2619 for (i = 0; i < NOTCHAR; ++i)
2622 for (i = 0; i < ngrps; ++i)
2626 /* Find the union of the follows of the positions of the group.
2627 This is a hideously inefficient loop. Fix it someday. */
2628 for (j = 0; j < grps[i].nelem; ++j)
2629 for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
2630 insert(d->follows[grps[i].elems[j]].elems[k], &follows);
2632 if (d->mb_cur_max > 1)
2634 /* If a token in follows.elems is not 1st byte of a multibyte
2635 character, or the states of follows must accept the bytes
2636 which are not 1st byte of the multibyte character.
2637 Then, if a state of follows encounter a byte, it must not be
2638 a 1st byte of a multibyte character nor single byte character.
2639 We cansel to add state[0].follows to next state, because
2640 state[0] must accept 1st-byte
2642 For example, we assume <sb a> is a certain single byte
2643 character, <mb A> is a certain multibyte character, and the
2644 codepoint of <sb a> equals the 2nd byte of the codepoint of
2646 When state[0] accepts <sb a>, state[i] transit to state[i+1]
2647 by accepting accepts 1st byte of <mb A>, and state[i+1]
2648 accepts 2nd byte of <mb A>, if state[i+1] encounter the
2649 codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2650 <mb A>, so we cannot add state[0]. */
2652 next_isnt_1st_byte = 0;
2653 for (j = 0; j < follows.nelem; ++j)
2655 if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2657 next_isnt_1st_byte = 1;
2663 /* If we are building a searching matcher, throw in the positions
2664 of state 0 as well. */
2667 || (d->mb_cur_max == 1 || !next_isnt_1st_byte)))
2668 for (j = 0; j < d->states[0].elems.nelem; ++j)
2669 insert(d->states[0].elems.elems[j], &follows);
2671 /* Find out if the new state will want any context information. */
2672 possible_contexts = charclass_context(labels[i]);
2673 separate_contexts = state_separate_contexts(&follows, possible_contexts);
2675 /* Find the state(s) corresponding to the union of the follows. */
2676 state = state_index(d, &follows, 0);
2677 if (separate_contexts & CTX_NEWLINE)
2678 state_newline = state_index(d, &follows, CTX_NEWLINE);
2680 state_newline = state;
2681 if (separate_contexts & CTX_LETTER)
2682 state_letter = state_index(d, &follows, CTX_LETTER);
2684 state_letter = state;
2686 /* Set the transitions for each character in the current label. */
2687 for (j = 0; j < CHARCLASS_INTS; ++j)
2688 for (k = 0; k < INTBITS; ++k)
2689 if (labels[i][j] & 1 << k)
2691 int c = j * INTBITS + k;
2694 trans[c] = state_newline;
2695 else if (IS_WORD_CONSTITUENT(c))
2696 trans[c] = state_letter;
2697 else if (c < NOTCHAR)
2702 for (i = 0; i < ngrps; ++i)
2703 free(grps[i].elems);
2704 free(follows.elems);
2710 /* Some routines for manipulating a compiled dfa's transition tables.
2711 Each state may or may not have a transition table; if it does, and it
2712 is a non-accepting state, then d->trans[state] points to its table.
2713 If it is an accepting state then d->fails[state] points to its table.
2714 If it has no table at all, then d->trans[state] is NULL.
2715 TODO: Improve this comment, get rid of the unnecessary redundancy. */
2718 build_state (int s, struct dfa *d)
2720 int *trans; /* The new transition table. */
2723 /* Set an upper limit on the number of transition tables that will ever
2724 exist at once. 1024 is arbitrary. The idea is that the frequently
2725 used transition tables will be quickly rebuilt, whereas the ones that
2726 were only needed once or twice will be cleared away. */
2727 if (d->trcount >= 1024)
2729 for (i = 0; i < d->tralloc; ++i)
2733 d->trans[i] = d->fails[i] = NULL;
2740 /* Set up the success bits for this state. */
2742 if (ACCEPTS_IN_CONTEXT(d->states[s].context, CTX_NEWLINE, s, *d))
2743 d->success[s] |= CTX_NEWLINE;
2744 if (ACCEPTS_IN_CONTEXT(d->states[s].context, CTX_LETTER, s, *d))
2745 d->success[s] |= CTX_LETTER;
2746 if (ACCEPTS_IN_CONTEXT(d->states[s].context, CTX_NONE, s, *d))
2747 d->success[s] |= CTX_NONE;
2749 MALLOC(trans, NOTCHAR);
2750 dfastate(s, d, trans);
2752 /* Now go through the new transition table, and make sure that the trans
2753 and fail arrays are allocated large enough to hold a pointer for the
2754 largest state mentioned in the table. */
2755 for (i = 0; i < NOTCHAR; ++i)
2756 if (trans[i] >= d->tralloc)
2758 int oldalloc = d->tralloc;
2760 while (trans[i] >= d->tralloc)
2762 REALLOC(d->realtrans, d->tralloc + 1);
2763 d->trans = d->realtrans + 1;
2764 REALLOC(d->fails, d->tralloc);
2765 REALLOC(d->success, d->tralloc);
2766 REALLOC(d->newlines, d->tralloc);
2767 while (oldalloc < d->tralloc)
2769 d->trans[oldalloc] = NULL;
2770 d->fails[oldalloc++] = NULL;
2774 /* Keep the newline transition in a special place so we can use it as
2776 d->newlines[s] = trans[eolbyte];
2777 trans[eolbyte] = -1;
2779 if (ACCEPTING(s, *d))
2780 d->fails[s] = trans;
2782 d->trans[s] = trans;
2786 build_state_zero (struct dfa *d)
2790 CALLOC(d->realtrans, d->tralloc + 1);
2791 d->trans = d->realtrans + 1;
2792 CALLOC(d->fails, d->tralloc);
2793 MALLOC(d->success, d->tralloc);
2794 MALLOC(d->newlines, d->tralloc);
2798 /* Multibyte character handling sub-routines for dfaexec. */
2800 /* Initial state may encounter the byte which is not a single byte character
2801 nor 1st byte of a multibyte character. But it is incorrect for initial
2802 state to accept such a byte.
2803 For example, in sjis encoding the regular expression like "\\" accepts
2804 the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2805 0x815c. Then Initial state must skip the bytes which are not a single byte
2806 character nor 1st byte of a multibyte character. */
2807 #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
2810 while (inputwcs[p - buf_begin] == 0 \
2811 && mblen_buf[p - buf_begin] > 0 \
2812 && (unsigned char const *) p < buf_end) \
2814 if ((char *) p >= end) \
2824 realloc_trans_if_necessary(struct dfa *d, int new_state)
2826 /* Make sure that the trans and fail arrays are allocated large enough
2827 to hold a pointer for the new state. */
2828 if (new_state >= d->tralloc)
2830 int oldalloc = d->tralloc;
2832 while (new_state >= d->tralloc)
2834 REALLOC(d->realtrans, d->tralloc + 1);
2835 d->trans = d->realtrans + 1;
2836 REALLOC(d->fails, d->tralloc);
2837 REALLOC(d->success, d->tralloc);
2838 REALLOC(d->newlines, d->tralloc);
2839 while (oldalloc < d->tralloc)
2841 d->trans[oldalloc] = NULL;
2842 d->fails[oldalloc++] = NULL;
2847 /* Return values of transit_state_singlebyte(), and
2848 transit_state_consume_1char. */
2851 TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */
2852 TRANSIT_STATE_DONE, /* State transition has finished. */
2853 TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */
2854 } status_transit_state;
2856 /* Consume a single byte and transit state from 's' to '*next_state'.
2857 This function is almost same as the state transition routin in dfaexec().
2858 But state transition is done just once, otherwise matching succeed or
2859 reach the end of the buffer. */
2860 static status_transit_state
2861 transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
2867 status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2869 while (rval == TRANSIT_STATE_IN_PROGRESS)
2871 if ((t = d->trans[works]) != NULL)
2874 rval = TRANSIT_STATE_DONE;
2882 /* At the moment, it must not happen. */
2887 else if (d->fails[works])
2889 works = d->fails[works][*p];
2890 rval = TRANSIT_STATE_DONE;
2894 build_state(works, d);
2897 *next_state = works;
2901 /* Match a "." against the current context. buf_begin[IDX] is the
2902 current position. Return the length of the match, in bytes.
2903 POS is the position of the ".". */
2905 match_anychar (struct dfa *d, int s, position pos, int idx)
2912 mbclen = (mblen_buf[idx] == 0)? 1 : mblen_buf[idx];
2914 /* Check syntax bits. */
2915 if (wc == (wchar_t)eolbyte)
2917 if (!(syntax_bits & RE_DOT_NEWLINE))
2920 else if (wc == (wchar_t)'\0')
2922 if (syntax_bits & RE_DOT_NOT_NULL)
2926 context = wchar_context(wc);
2927 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].context, context))
2933 /* Match a bracket expression against the current context.
2934 buf_begin[IDX] is the current position.
2935 Return the length of the match, in bytes.
2936 POS is the position of the bracket expression. */
2938 match_mb_charset (struct dfa *d, int s, position pos, int idx)
2941 int match; /* Flag which represent that matching succeed. */
2942 int match_len; /* Length of the character (or collating element)
2943 with which this operator match. */
2944 int op_len; /* Length of the operator. */
2947 /* Pointer to the structure to which we are currently refering. */
2948 struct mb_char_classes *work_mbc;
2951 wchar_t wc; /* Current refering character. */
2955 /* Check syntax bits. */
2956 if (wc == (wchar_t)eolbyte)
2958 if (!(syntax_bits & RE_DOT_NEWLINE))
2961 else if (wc == (wchar_t)'\0')
2963 if (syntax_bits & RE_DOT_NOT_NULL)
2967 context = wchar_context(wc);
2968 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].context, context))
2971 /* Assign the current refering operator to work_mbc. */
2972 work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2973 match = !work_mbc->invert;
2974 match_len = (mblen_buf[idx] == 0)? 1 : mblen_buf[idx];
2976 /* Match in range 0-255? */
2977 if (wc < NOTCHAR && work_mbc->cset != -1
2978 && tstbit((unsigned char)wc, d->charclasses[work_mbc->cset]))
2979 goto charset_matched;
2981 /* match with a character class? */
2982 for (i = 0; i<work_mbc->nch_classes; i++)
2984 if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
2985 goto charset_matched;
2988 strncpy(buffer, (char const *) buf_begin + idx, match_len);
2989 buffer[match_len] = '\0';
2991 /* match with an equivalent class? */
2992 for (i = 0; i<work_mbc->nequivs; i++)
2994 op_len = strlen(work_mbc->equivs[i]);
2995 strncpy(buffer, (char const *) buf_begin + idx, op_len);
2996 buffer[op_len] = '\0';
2997 if (strcoll(work_mbc->equivs[i], buffer) == 0)
3000 goto charset_matched;
3004 /* match with a collating element? */
3005 for (i = 0; i<work_mbc->ncoll_elems; i++)
3007 op_len = strlen(work_mbc->coll_elems[i]);
3008 strncpy(buffer, (char const *) buf_begin + idx, op_len);
3009 buffer[op_len] = '\0';
3011 if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
3014 goto charset_matched;
3018 /* match with a range? */
3019 for (i = 0; i<work_mbc->nranges; i++)
3021 if (work_mbc->range_sts[i] <= wc &&
3022 wc <= work_mbc->range_ends[i])
3023 goto charset_matched;
3026 /* match with a character? */
3027 for (i = 0; i<work_mbc->nchars; i++)
3029 if (wc == work_mbc->chars[i])
3030 goto charset_matched;
3036 return match ? match_len : 0;
3039 /* Check each of `d->states[s].mbps.elem' can match or not. Then return the
3040 array which corresponds to `d->states[s].mbps.elem' and each element of
3041 the array contains the amount of the bytes with which the element can
3043 `idx' is the index from the buf_begin, and it is the current position
3045 Caller MUST free the array which this function return. */
3047 check_matching_with_multibyte_ops (struct dfa *d, int s, int idx)
3052 MALLOC(rarray, d->states[s].mbps.nelem);
3053 for (i = 0; i < d->states[s].mbps.nelem; ++i)
3055 position pos = d->states[s].mbps.elems[i];
3056 switch(d->tokens[pos.index])
3059 rarray[i] = match_anychar(d, s, pos, idx);
3062 rarray[i] = match_mb_charset(d, s, pos, idx);
3065 break; /* cannot happen. */
3071 /* Consume a single character and enumerate all of the positions which can
3072 be next position from the state `s'.
3073 `match_lens' is the input. It can be NULL, but it can also be the output
3074 of check_matching_with_multibyte_ops() for optimization.
3075 `mbclen' and `pps' are the output. `mbclen' is the length of the
3076 character consumed, and `pps' is the set this function enumerate. */
3077 static status_transit_state
3078 transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp,
3079 int *match_lens, int *mbclen, position_set *pps)
3084 status_transit_state rs = TRANSIT_STATE_DONE;
3086 /* Calculate the length of the (single/multi byte) character
3087 to which p points. */
3088 *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
3089 : mblen_buf[*pp - buf_begin];
3091 /* Calculate the state which can be reached from the state `s' by
3092 consuming `*mbclen' single bytes from the buffer. */
3094 for (i = 0; i < *mbclen; i++)
3097 rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
3099 /* Copy the positions contained by `s1' to the set `pps'. */
3100 copy(&(d->states[s1].elems), pps);
3102 /* Check (inputed)match_lens, and initialize if it is NULL. */
3103 if (match_lens == NULL && d->states[s].mbps.nelem != 0)
3104 work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
3106 work_mbls = match_lens;
3108 /* Add all of the positions which can be reached from `s' by consuming
3109 a single character. */
3110 for (i = 0; i < d->states[s].mbps.nelem ; i++)
3112 if (work_mbls[i] == *mbclen)
3113 for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
3115 insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
3119 if (match_lens == NULL && work_mbls != NULL)
3124 /* Transit state from s, then return new state and update the pointer of the
3125 buffer. This function is for some operator which can match with a multi-
3126 byte character or a collating element (which may be multi characters). */
3128 transit_state (struct dfa *d, int s, unsigned char const **pp)
3131 int mbclen; /* The length of current input multibyte character. */
3134 int *match_lens = NULL;
3135 int nelem = d->states[s].mbps.nelem; /* Just a alias. */
3136 position_set follows;
3137 unsigned char const *p1 = *pp;
3141 /* This state has (a) multibyte operator(s).
3142 We check whether each of them can match or not. */
3144 /* Note: caller must free the return value of this function. */
3145 match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
3147 for (i = 0; i < nelem; i++)
3148 /* Search the operator which match the longest string,
3151 if (match_lens[i] > maxlen)
3152 maxlen = match_lens[i];
3156 if (nelem == 0 || maxlen == 0)
3157 /* This state has no multibyte operator which can match.
3158 We need to check only one single byte character. */
3160 status_transit_state rs;
3161 rs = transit_state_singlebyte(d, s, *pp, &s1);
3163 /* We must update the pointer if state transition succeeded. */
3164 if (rs == TRANSIT_STATE_DONE)
3171 /* This state has some operators which can match a multibyte character. */
3172 alloc_position_set(&follows, d->nleaves);
3174 /* `maxlen' may be longer than the length of a character, because it may
3175 not be a character but a (multi character) collating element.
3176 We enumerate all of the positions which `s' can reach by consuming
3178 transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
3180 wc = inputwcs[*pp - mbclen - buf_begin];
3181 s1 = state_index(d, &follows, wchar_context (wc));
3182 realloc_trans_if_necessary(d, s1);
3184 while (*pp - p1 < maxlen)
3186 transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
3188 for (i = 0; i < nelem ; i++)
3190 if (match_lens[i] == *pp - p1)
3192 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
3193 insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
3197 wc = inputwcs[*pp - mbclen - buf_begin];
3198 s1 = state_index(d, &follows, wchar_context (wc));
3199 realloc_trans_if_necessary(d, s1);
3202 free(follows.elems);
3207 /* Initialize mblen_buf and inputwcs with data from the next line. */
3210 prepare_wc_buf (const char *begin, const char *end)
3213 unsigned char eol = eolbyte;
3214 size_t remain_bytes, i;
3216 buf_begin = (unsigned char *) begin;
3219 for (i = 0; i < end - begin + 1; i++)
3221 if (remain_bytes == 0)
3224 = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
3225 if (remain_bytes < 1
3226 || remain_bytes == (size_t) -1
3227 || remain_bytes == (size_t) -2
3228 || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
3231 inputwcs[i] = (wchar_t)begin[i];
3233 if (begin[i] == eol)
3238 mblen_buf[i] = remain_bytes;
3244 mblen_buf[i] = remain_bytes;
3250 buf_end = (unsigned char *) (begin + i);
3252 inputwcs[i] = 0; /* sentinel */
3253 #endif /* MBS_SUPPORT */
3256 /* Search through a buffer looking for a match to the given struct dfa.
3257 Find the first occurrence of a string matching the regexp in the
3258 buffer, and the shortest possible version thereof. Return a pointer to
3259 the first character after the match, or NULL if none is found. BEGIN
3260 points to the beginning of the buffer, and END points to the first byte
3261 after its end. Note however that we store a sentinel byte (usually
3262 newline) in *END, so the actual buffer must be one byte longer.
3263 When ALLOW_NL is nonzero, newlines may appear in the matching string.
3264 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3265 Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we
3266 encountered a back-reference (1) or not (0). The caller may use this
3267 to decide whether to fall back on a backtracking matcher. */
3269 dfaexec (struct dfa *d, char const *begin, char *end,
3270 int allow_nl, int *count, int *backref)
3272 int s, s1; /* Current state. */
3273 unsigned char const *p; /* Current input character. */
3274 int **trans, *t; /* Copy of d->trans so it can be optimized
3276 unsigned char eol = eolbyte; /* Likewise for eolbyte. */
3277 unsigned char saved_end;
3280 build_state_zero(d);
3283 p = (unsigned char const *) begin;
3285 saved_end = *(unsigned char *) end;
3288 if (d->mb_cur_max > 1)
3290 MALLOC(mblen_buf, end - begin + 2);
3291 MALLOC(inputwcs, end - begin + 2);
3292 memset(&mbs, 0, sizeof(mbstate_t));
3293 prepare_wc_buf ((const char *) p, end);
3298 if (d->mb_cur_max > 1)
3299 while ((t = trans[s]) != NULL)
3304 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
3306 if (d->states[s].mbps.nelem == 0)
3312 /* Falling back to the glibc matcher in this case gives
3313 better performance (up to 25% better on [a-z], for
3314 example) and enables support for collating symbols and
3315 equivalence classes. */
3325 /* Can match with a multibyte character (and multi character
3326 collating element). Transition table might be updated. */
3327 s = transit_state(d, s, &p);
3332 while ((t = trans[s]) != NULL)
3335 if ((t = trans[s1]) == NULL)
3337 int tmp = s; s = s1; s1 = tmp; /* swap */
3344 if (s >= 0 && (char *) p <= end && d->fails[s])
3346 if (d->success[s] & sbit[*p])
3349 *backref = (d->states[s].backref != 0);
3350 if (d->mb_cur_max > 1)
3360 if (d->mb_cur_max > 1)
3362 /* Can match with a multibyte character (and multicharacter
3363 collating element). Transition table might be updated. */
3364 s = transit_state(d, s, &p);
3368 s = d->fails[s][*p++];
3372 /* If the previous character was a newline, count it. */
3373 if ((char *) p <= end && p[-1] == eol)
3378 if (d->mb_cur_max > 1)
3379 prepare_wc_buf ((const char *) p, end);
3382 /* Check if we've run off the end of the buffer. */
3383 if ((char *) p > end)
3385 if (d->mb_cur_max > 1)
3401 if (p[-1] == eol && allow_nl)
3403 s = d->newlines[s1];
3412 free_mbdata (struct dfa *d)
3416 free(d->multibyte_prop);
3417 d->multibyte_prop = NULL;
3419 for (i = 0; i < d->nmbcsets; ++i)
3422 struct mb_char_classes *p = &(d->mbcsets[i]);
3424 free(p->ch_classes);
3426 free(p->range_ends);
3428 for (j = 0; j < p->nequivs; ++j)
3432 for (j = 0; j < p->ncoll_elems; ++j)
3433 free(p->coll_elems[j]);
3434 free(p->coll_elems);
3442 /* Initialize the components of a dfa that the other routines don't
3443 initialize for themselves. */
3445 dfainit (struct dfa *d)
3447 memset (d, 0, sizeof *d);
3450 MALLOC(d->charclasses, d->calloc);
3453 MALLOC(d->tokens, d->talloc);
3455 d->mb_cur_max = MB_CUR_MAX;
3457 if (d->mb_cur_max > 1)
3459 d->nmultibyte_prop = 1;
3460 MALLOC(d->multibyte_prop, d->nmultibyte_prop);
3461 d->mbcsets_alloc = 1;
3462 MALLOC(d->mbcsets, d->mbcsets_alloc);
3467 dfaoptimize (struct dfa *d)
3471 if (!MBS_SUPPORT || !using_utf8())
3474 for (i = 0; i < d->tindex; ++i)
3476 switch(d->tokens[i])
3482 /* Requires multi-byte algorithm. */
3493 /* Parse and analyze a single string of the given length. */
3495 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
3498 dfaparse(s, len, d);
3501 dfaanalyze(d, searchflag);
3504 /* Free the storage held by the components of a dfa. */
3506 dfafree (struct dfa *d)
3509 struct dfamust *dm, *ndm;
3511 free(d->charclasses);
3514 if (d->mb_cur_max > 1)
3517 for (i = 0; i < d->sindex; ++i) {
3518 free(d->states[i].elems.elems);
3520 free(d->states[i].mbps.elems);
3523 for (i = 0; i < d->tindex; ++i)
3524 free(d->follows[i].elems);
3526 for (i = 0; i < d->tralloc; ++i)
3535 for (dm = d->musts; dm; dm = ndm)
3543 /* Having found the postfix representation of the regular expression,
3544 try to find a long sequence of characters that must appear in any line
3546 Finding a "longest" sequence is beyond the scope here;
3547 we take an easy way out and hope for the best.
3548 (Take "(ab|a)b"--please.)
3550 We do a bottom-up calculation of sequences of characters that must appear
3551 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3553 sequences that must appear at the left of the match ("left")
3554 sequences that must appear at the right of the match ("right")
3555 lists of sequences that must appear somewhere in the match ("in")
3556 sequences that must constitute the match ("is")
3558 When we get to the root of the tree, we use one of the longest of its
3559 calculated "in" sequences as our answer. The sequence we find is returned in
3560 d->must (where "d" is the single argument passed to "dfamust");
3561 the length of the sequence is returned in d->mustn.
3563 The sequences calculated for the various types of node (in pseudo ANSI c)
3564 are shown below. "p" is the operand of unary operators (and the left-hand
3565 operand of binary operators); "q" is the right-hand operand of binary
3568 "ZERO" means "a zero-length sequence" below.
3570 Type left right is in
3571 ---- ---- ----- -- --
3572 char c # c # c # c # c
3574 ANYCHAR ZERO ZERO ZERO ZERO
3576 MBCSET ZERO ZERO ZERO ZERO
3578 CSET ZERO ZERO ZERO ZERO
3580 STAR ZERO ZERO ZERO ZERO
3582 QMARK ZERO ZERO ZERO ZERO
3584 PLUS p->left p->right ZERO p->in
3586 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3587 p->left : q->right : q->is!=ZERO) ? q->in plus
3588 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3591 OR longest common longest common (do p->is and substrings common to
3592 leading trailing q->is have same p->in and q->in
3593 (sub)sequence (sub)sequence length and
3594 of p->left of p->right content) ?
3595 and q->left and q->right p->is : NULL
3597 If there's anything else we recognize in the tree, all four sequences get set
3598 to zero-length sequences. If there's something we don't recognize in the tree,
3599 we just return a zero-length sequence.
3601 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3604 And. . .is it here or someplace that we might ponder "optimizations" such as
3605 egrep 'psi|epsilon' -> egrep 'psi'
3606 egrep 'pepsi|epsilon' -> egrep 'epsi'
3607 (Yes, we now find "epsi" as a "string
3608 that must occur", but we might also
3609 simplify the *entire* r.e. being sought)
3610 grep '[c]' -> grep 'c'
3611 grep '(ab|a)b' -> grep 'ab'
3612 grep 'ab*' -> grep 'a'
3613 grep 'a*b' -> grep 'b'
3615 There are several issues:
3617 Is optimization easy (enough)?
3619 Does optimization actually accomplish anything,
3620 or is the automaton you get from "psi|epsilon" (for example)
3621 the same as the one you get from "psi" (for example)?
3623 Are optimizable r.e.'s likely to be used in real-life situations
3624 (something like 'ab*' is probably unlikely; something like is
3625 'psi|epsilon' is likelier)? */
3628 icatalloc (char *old, char const *new)
3631 size_t oldsize = old == NULL ? 0 : strlen (old);
3632 size_t newsize = new == NULL ? 0 : strlen (new);
3635 result = xrealloc (old, oldsize + newsize + 1);
3636 strcpy (result + oldsize, new);
3641 icpyalloc (char const *string)
3643 return icatalloc (NULL, string);
3646 static char * _GL_ATTRIBUTE_PURE
3647 istrstr (char const *lookin, char const *lookfor)
3652 len = strlen(lookfor);
3653 for (cp = lookin; *cp != '\0'; ++cp)
3654 if (strncmp(cp, lookfor, len) == 0)
3660 freelist (char **cpp)
3666 for (i = 0; cpp[i] != NULL; ++i)
3674 enlist (char **cpp, char *new, size_t len)
3680 if ((new = icpyalloc(new)) == NULL)
3686 /* Is there already something in the list that's new (or longer)? */
3687 for (i = 0; cpp[i] != NULL; ++i)
3688 if (istrstr(cpp[i], new) != NULL)
3693 /* Eliminate any obsoleted strings. */
3695 while (cpp[j] != NULL)
3696 if (istrstr(new, cpp[j]) == NULL)
3706 /* Add the new string. */
3707 REALLOC(cpp, i + 2);
3713 /* Given pointers to two strings, return a pointer to an allocated
3714 list of their distinct common substrings. Return NULL if something
3717 comsubs (char *left, char const *right)
3724 if (left == NULL || right == NULL)
3726 cpp = malloc(sizeof *cpp);
3730 for (lcp = left; *lcp != '\0'; ++lcp)
3733 rcp = strchr (right, *lcp);
3736 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3740 rcp = strchr (rcp + 1, *lcp);
3745 char **p = enlist (cpp, lcp, len);
3759 addlists (char **old, char **new)
3763 if (old == NULL || new == NULL)
3765 for (i = 0; new[i] != NULL; ++i)
3767 old = enlist(old, new[i], strlen(new[i]));
3774 /* Given two lists of substrings, return a new list giving substrings
3777 inboth (char **left, char **right)
3783 if (left == NULL || right == NULL)
3785 both = malloc(sizeof *both);
3789 for (lnum = 0; left[lnum] != NULL; ++lnum)
3791 for (rnum = 0; right[rnum] != NULL; ++rnum)
3793 temp = comsubs(left[lnum], right[rnum]);
3799 both = addlists(both, temp);
3818 resetmust (must *mp)
3820 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3825 dfamust (struct dfa *d)
3836 static char empty_string[] = "";
3838 result = empty_string;
3840 MALLOC (musts, d->tindex + 1);
3842 for (i = 0; i <= d->tindex; ++i)
3844 for (i = 0; i <= d->tindex; ++i)
3846 mp[i].in = xmalloc(sizeof *mp[i].in);
3847 mp[i].left = xmalloc(2);
3848 mp[i].right = xmalloc(2);
3849 mp[i].is = xmalloc(2);
3850 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3854 fprintf(stderr, "dfamust:\n");
3855 for (i = 0; i < d->tindex; ++i)
3857 fprintf(stderr, " %d:", i);
3858 prtok(d->tokens[i]);
3862 for (ri = 0; ri < d->tindex; ++ri)
3864 switch (t = d->tokens[ri])
3868 assert (!"neither LPAREN nor RPAREN may appear here");
3881 assert (musts < mp);
3886 assert (&musts[2] <= mp);
3895 /* Guaranteed to be. Unlikely, but. . . */
3896 if (!STREQ (lmp->is, rmp->is))
3898 /* Left side--easy */
3900 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3902 lmp->left[i] = '\0';
3904 ln = strlen(lmp->right);
3905 rn = strlen(rmp->right);
3909 for (i = 0; i < n; ++i)
3910 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3912 for (j = 0; j < i; ++j)
3913 lmp->right[j] = lmp->right[(ln - i) + j];
3914 lmp->right[j] = '\0';
3915 new = inboth(lmp->in, rmp->in);
3924 assert (musts < mp);
3929 assert (mp == &musts[1]);
3930 for (i = 0; musts[0].in[i] != NULL; ++i)
3931 if (strlen(musts[0].in[i]) > strlen(result))
3932 result = musts[0].in[i];
3933 if (STREQ (result, musts[0].is))
3937 assert (&musts[2] <= mp);
3944 /* In. Everything in left, plus everything in
3945 right, plus catenation of
3946 left's right and right's left. */
3947 lmp->in = addlists(lmp->in, rmp->in);
3948 if (lmp->in == NULL)
3950 if (lmp->right[0] != '\0' &&
3951 rmp->left[0] != '\0')
3955 tp = icpyalloc(lmp->right);
3956 tp = icatalloc(tp, rmp->left);
3957 lmp->in = enlist(lmp->in, tp, strlen(tp));
3959 if (lmp->in == NULL)
3963 if (lmp->is[0] != '\0')
3965 lmp->left = icatalloc(lmp->left,
3967 if (lmp->left == NULL)
3971 if (rmp->is[0] == '\0')
3972 lmp->right[0] = '\0';
3973 lmp->right = icatalloc(lmp->right, rmp->right);
3974 if (lmp->right == NULL)
3976 /* Guaranteed to be */
3977 if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
3979 lmp->is = icatalloc(lmp->is, rmp->is);
3980 if (lmp->is == NULL)
3990 assert (!"oops! t >= END");
3994 /* not on *my* shift */
4008 /* plain character */
4010 mp->is[0] = mp->left[0] = mp->right[0] = t;
4011 mp->is[1] = mp->left[1] = mp->right[1] = '\0';
4012 mp->in = enlist(mp->in, mp->is, (size_t)1);
4019 fprintf(stderr, " node: %d:", ri);
4020 prtok(d->tokens[ri]);
4021 fprintf(stderr, "\n in:");
4022 for (i = 0; mp->in[i]; ++i)
4023 fprintf(stderr, " \"%s\"", mp->in[i]);
4024 fprintf(stderr, "\n is: \"%s\"\n", mp->is);
4025 fprintf(stderr, " left: \"%s\"\n", mp->left);
4026 fprintf(stderr, " right: \"%s\"\n", mp->right);
4035 MALLOC(dm->must, strlen(result) + 1);
4036 strcpy(dm->must, result);
4037 dm->next = d->musts;
4041 for (i = 0; i <= d->tindex; ++i)
4055 return xmalloc (sizeof (struct dfa));
4058 struct dfamust * _GL_ATTRIBUTE_PURE
4059 dfamusts (struct dfa const *d)
4064 /* vim:set shiftwidth=2: */