1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2016 Free Software
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
20 /* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
35 #define STREQ(a, b) (strcmp (a, b) == 0)
37 /* ISASCIIDIGIT differs from isdigit, as follows:
38 - Its arg may be any int or unsigned int; it need not be an unsigned char.
39 - It's guaranteed to evaluate its argument exactly once.
40 - It's typically faster.
41 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
42 only '0' through '9' are digits. Prefer ISASCIIDIGIT to isdigit unless
43 it's important to use the locale's definition of "digit" even when the
44 host does not conform to Posix. */
45 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
48 #define _(str) gettext (str)
53 #include "localeinfo.h"
55 /* HPUX defines these as macros in sys/param.h. */
63 /* First integer value that is greater than any character code. */
64 enum { NOTCHAR = 1 << CHAR_BIT };
66 /* This represents part of a character class. It must be unsigned and
67 at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
68 typedef unsigned long int charclass_word;
70 /* CHARCLASS_WORD_BITS is the number of bits used in a charclass word.
71 CHARCLASS_PAIR (LO, HI) is part of a charclass initializer, and
72 represents 64 bits' worth of a charclass, where LO and HI are the
73 low and high-order 32 bits of the 64-bit quantity. */
74 #if ULONG_MAX >> 31 >> 31 < 3
75 enum { CHARCLASS_WORD_BITS = 32 };
76 # define CHARCLASS_PAIR(lo, hi) lo, hi
78 enum { CHARCLASS_WORD_BITS = 64 };
79 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
82 /* An initializer for a charclass whose 32-bit words are A through H. */
83 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
85 CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
86 CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
89 /* The maximum useful value of a charclass_word; all used bits are 1. */
90 static charclass_word const CHARCLASS_WORD_MASK
91 = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
93 /* Number of words required to hold a bit for every character. */
96 CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
99 /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
100 typedef charclass_word charclass[CHARCLASS_WORDS];
102 /* Convert a possibly-signed character to an unsigned character. This is
103 a bit safer than casting to unsigned char, since it catches some type
104 errors that the cast doesn't. */
111 /* Contexts tell us whether a character is a newline or a word constituent.
112 Word-constituent characters are those that satisfy iswalnum, plus '_'.
113 Each character has a single CTX_* value; bitmasks of CTX_* values denote
114 a particular character class.
116 A state also stores a context value, which is a bitmask of CTX_* values.
117 A state's context represents a set of characters that the state's
118 predecessors must match. For example, a state whose context does not
119 include CTX_LETTER will never have transitions where the previous
120 character is a word constituent. A state whose context is CTX_ANY
121 might have transitions from any character. */
125 #define CTX_NEWLINE 4
128 /* Sometimes characters can only be matched depending on the surrounding
129 context. Such context decisions depend on what the previous character
130 was, and the value of the current (lookahead) character. Context
131 dependent constraints are encoded as 12-bit integers. Each bit that
132 is set indicates that the constraint succeeds in the corresponding
135 bit 8-11 - valid contexts when next character is CTX_NEWLINE
136 bit 4-7 - valid contexts when next character is CTX_LETTER
137 bit 0-3 - valid contexts when next character is CTX_NONE
139 The macro SUCCEEDS_IN_CONTEXT determines whether a given constraint
140 succeeds in a particular context. Prev is a bitmask of possible
141 context values for the previous character, curr is the (single-bit)
142 context value for the lookahead character. */
143 #define NEWLINE_CONSTRAINT(constraint) (((constraint) >> 8) & 0xf)
144 #define LETTER_CONSTRAINT(constraint) (((constraint) >> 4) & 0xf)
145 #define OTHER_CONSTRAINT(constraint) ((constraint) & 0xf)
147 #define SUCCEEDS_IN_CONTEXT(constraint, prev, curr) \
148 ((((curr) & CTX_NONE ? OTHER_CONSTRAINT (constraint) : 0) \
149 | ((curr) & CTX_LETTER ? LETTER_CONSTRAINT (constraint) : 0) \
150 | ((curr) & CTX_NEWLINE ? NEWLINE_CONSTRAINT (constraint) : 0)) \
153 /* The following macros describe what a constraint depends on. */
154 #define PREV_NEWLINE_CONSTRAINT(constraint) (((constraint) >> 2) & 0x111)
155 #define PREV_LETTER_CONSTRAINT(constraint) (((constraint) >> 1) & 0x111)
156 #define PREV_OTHER_CONSTRAINT(constraint) ((constraint) & 0x111)
158 #define PREV_NEWLINE_DEPENDENT(constraint) \
159 (PREV_NEWLINE_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint))
160 #define PREV_LETTER_DEPENDENT(constraint) \
161 (PREV_LETTER_CONSTRAINT (constraint) != PREV_OTHER_CONSTRAINT (constraint))
163 /* Tokens that match the empty string subject to some constraint actually
164 work by applying that constraint to determine what may follow them,
165 taking into account what has gone before. The following values are
166 the constraints corresponding to the special tokens previously defined. */
167 #define NO_CONSTRAINT 0x777
168 #define BEGLINE_CONSTRAINT 0x444
169 #define ENDLINE_CONSTRAINT 0x700
170 #define BEGWORD_CONSTRAINT 0x050
171 #define ENDWORD_CONSTRAINT 0x202
172 #define LIMWORD_CONSTRAINT 0x252
173 #define NOTLIMWORD_CONSTRAINT 0x525
175 /* The regexp is parsed into an array of tokens in postfix form. Some tokens
176 are operators and others are terminal symbols. Most (but not all) of these
177 codes are returned by the lexical analyzer. */
179 typedef ptrdiff_t token;
181 /* States are indexed by state_num values. These are normally
182 nonnegative but -1 is used as a special value. */
183 typedef ptrdiff_t state_num;
185 /* Predefined token values. */
188 END = -1, /* END is a terminal symbol that matches the
189 end of input; any value of END or less in
190 the parse tree is such a symbol. Accepting
191 states of the DFA are those that would have
192 a transition on END. */
194 /* Ordinary character values are terminal symbols that match themselves. */
196 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
199 BACKREF, /* BACKREF is generated by \<digit>
200 or by any other construct that
201 is not completely handled. If the scanner
202 detects a transition on backref, it returns
203 a kind of "semi-success" indicating that
204 the match will have to be verified with
205 a backtracking matcher. */
207 BEGLINE, /* BEGLINE is a terminal symbol that matches
208 the empty string at the beginning of a
211 ENDLINE, /* ENDLINE is a terminal symbol that matches
212 the empty string at the end of a line. */
214 BEGWORD, /* BEGWORD is a terminal symbol that matches
215 the empty string at the beginning of a
218 ENDWORD, /* ENDWORD is a terminal symbol that matches
219 the empty string at the end of a word. */
221 LIMWORD, /* LIMWORD is a terminal symbol that matches
222 the empty string at the beginning or the
225 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
226 matches the empty string not at
227 the beginning or end of a word. */
229 QMARK, /* QMARK is an operator of one argument that
230 matches zero or one occurrences of its
233 STAR, /* STAR is an operator of one argument that
234 matches the Kleene closure (zero or more
235 occurrences) of its argument. */
237 PLUS, /* PLUS is an operator of one argument that
238 matches the positive closure (one or more
239 occurrences) of its argument. */
241 REPMN, /* REPMN is a lexical token corresponding
242 to the {m,n} construct. REPMN never
243 appears in the compiled token vector. */
245 CAT, /* CAT is an operator of two arguments that
246 matches the concatenation of its
247 arguments. CAT is never returned by the
250 OR, /* OR is an operator of two arguments that
251 matches either of its arguments. */
253 LPAREN, /* LPAREN never appears in the parse tree,
254 it is only a lexeme. */
256 RPAREN, /* RPAREN never appears in the parse tree. */
258 ANYCHAR, /* ANYCHAR is a terminal symbol that matches
259 a valid multibyte (or single byte) character.
260 It is used only if MB_CUR_MAX > 1. */
262 MBCSET, /* MBCSET is similar to CSET, but for
263 multibyte characters. */
265 WCHAR, /* Only returned by lex. wctok contains
266 the wide character representation. */
268 CSET /* CSET and (and any value greater) is a
269 terminal symbol that matches any of a
270 class of characters. */
274 /* States of the recognizer correspond to sets of positions in the parse
275 tree, together with the constraints under which they may be matched.
276 So a position is encoded as an index into the parse tree together with
280 size_t index; /* Index into the parse array. */
281 unsigned int constraint; /* Constraint for matching this position. */
284 /* Sets of positions are stored as arrays. */
287 position *elems; /* Elements of this position set. */
288 size_t nelem; /* Number of elements in this set. */
289 size_t alloc; /* Number of elements allocated in ELEMS. */
292 /* Sets of leaves are also stored as arrays. */
295 size_t *elems; /* Elements of this position set. */
296 size_t nelem; /* Number of elements in this set. */
299 /* A state of the dfa consists of a set of positions, some flags,
300 and the token value of the lowest-numbered position of the state that
301 contains an END token. */
304 size_t hash; /* Hash of the positions of this state. */
305 position_set elems; /* Positions this state could match. */
306 unsigned char context; /* Context from previous state. */
307 unsigned short constraint; /* Constraint for this state to accept. */
308 token first_end; /* Token value of the first END in elems. */
309 position_set mbps; /* Positions which can match multibyte
310 characters or the follows, e.g., period.
311 Used only if MB_CUR_MAX > 1. */
312 state_num mb_trindex; /* Index of this state in MB_TRANS, or
313 negative if the state does not have
317 /* Maximum for any transition table count that exceeds min_trcount. */
318 enum { MAX_TRCOUNT = 1024 };
320 /* A bracket operator.
321 e.g., [a-c], [[:alpha:]], etc. */
322 struct mb_char_classes
326 wchar_t *chars; /* Normal characters. */
332 /* Syntax bits controlling the behavior of the lexical analyzer. */
333 reg_syntax_t syntax_bits;
334 bool syntax_bits_set;
336 /* Flag for case-folding letters into sets. */
339 /* True if ^ and $ match only the start and end of data, and do not match
340 end-of-line within data. */
343 /* End-of-line byte in data. */
344 unsigned char eolbyte;
346 /* Cache of char-context values. */
349 /* If never_trail[B], the byte B cannot be a non-initial byte in a
350 multibyte character. */
351 bool never_trail[NOTCHAR];
353 /* Set of characters considered letters. */
356 /* Set of characters that are newline. */
360 /* Lexical analyzer. All the dross that deals with the obnoxious
361 GNU Regex syntax bits is located here. The poor, suffering
362 reader is referred to the GNU Regex documentation for the
363 meaning of the @#%!@#%^!@ syntax bits. */
366 char const *ptr; /* Pointer to next input character. */
367 size_t left; /* Number of characters remaining. */
368 token lasttok; /* Previous token returned; initially END. */
369 size_t parens; /* Count of outstanding left parens. */
370 int minrep, maxrep; /* Repeat counts for {m,n}. */
372 /* Wide character representation of the current multibyte character,
373 or WEOF if there was an encoding error. Used only if
377 /* Length of the multibyte representation of wctok. */
380 /* We're separated from beginning or (, | only by zero-width characters. */
384 /* Recursive descent parser for regular expressions. */
388 token tok; /* Lookahead token. */
389 size_t depth; /* Current depth of a hypothetical stack
390 holding deferred productions. This is
391 used to determine the depth that will be
392 required of the real stack later on in
396 /* A compiled regular expression. */
399 /* Syntax configuration */
400 struct regex_syntax syntax;
402 /* Fields filled by the scanner. */
403 charclass *charclasses; /* Array of character sets for CSET tokens. */
404 size_t cindex; /* Index for adding new charclasses. */
405 size_t calloc; /* Number of charclasses allocated. */
406 size_t canychar; /* Index of anychar class, or (size_t) -1. */
409 struct lexer_state lex;
412 struct parser_state parse;
414 /* Fields filled by the parser. */
415 token *tokens; /* Postfix parse array. */
416 size_t tindex; /* Index for adding new tokens. */
417 size_t talloc; /* Number of tokens currently allocated. */
418 size_t depth; /* Depth required of an evaluation stack
419 used for depth-first traversal of the
421 size_t nleaves; /* Number of leaves on the parse tree. */
422 size_t nregexps; /* Count of parallel regexps being built
424 bool fast; /* The DFA is fast. */
425 token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
426 mbstate_t mbs; /* Multibyte conversion state. */
428 /* The following are valid only if MB_CUR_MAX > 1. */
430 /* The value of multibyte_prop[i] is defined by following rule.
431 if tokens[i] < NOTCHAR
432 bit 0 : tokens[i] is the first byte of a character, including
433 single-byte characters.
434 bit 1 : tokens[i] is the last byte of a character, including
435 single-byte characters.
437 if tokens[i] = MBCSET
438 ("the index of mbcsets corresponding to this operator" << 2) + 3
442 = 'single_byte_a', 'multi_byte_A', single_byte_b'
443 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
449 /* Array of the bracket expression in the DFA. */
450 struct mb_char_classes *mbcsets;
452 size_t mbcsets_alloc;
454 /* Fields filled by the superset. */
455 struct dfa *superset; /* Hint of the dfa. */
457 /* Fields filled by the state builder. */
458 dfa_state *states; /* States of the dfa. */
459 state_num sindex; /* Index for adding new states. */
460 size_t salloc; /* Number of states currently allocated. */
462 /* Fields filled by the parse tree->NFA conversion. */
463 position_set *follows; /* Array of follow sets, indexed by position
464 index. The follow of a position is the set
465 of positions containing characters that
466 could conceivably follow a character
467 matching the given position in a string
468 matching the regexp. Allocated to the
469 maximum possible position index. */
470 bool searchflag; /* We are supposed to build a searching
471 as opposed to an exact matcher. A searching
472 matcher finds the first and shortest string
473 matching a regexp anywhere in the buffer,
474 whereas an exact matcher finds the longest
475 string matching, but anchored to the
476 beginning of the buffer. */
478 /* Fields filled by dfaexec. */
479 state_num tralloc; /* Number of transition tables that have
480 slots so far, not counting trans[-1]. */
481 int trcount; /* Number of transition tables that have
482 actually been built. */
483 int min_trcount; /* Minimum of number of transition tables.
484 Always keep the number, even after freeing
485 the transition tables. It is also the
486 number of initial states. */
487 state_num **trans; /* Transition tables for states that can
488 never accept. If the transitions for a
489 state have not yet been computed, or the
490 state could possibly accept, its entry in
491 this table is NULL. This points to one
492 past the start of the allocated array,
493 and trans[-1] is always NULL. */
494 state_num **fails; /* Transition tables after failing to accept
495 on a state that potentially could do so. */
496 int *success; /* Table of acceptance conditions used in
497 dfaexec and computed in build_state. */
498 state_num *newlines; /* Transitions on newlines. The entry for a
499 newline in any transition table is always
500 -1 so we can count lines without wasting
501 too many cycles. The transition for a
502 newline is stored separately and handled
503 as a special case. Newline is also used
504 as a sentinel at the end of the buffer. */
505 state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
506 context in multibyte locales, in which we
507 do not distinguish between their contexts,
508 as not supported word. */
509 position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
510 state_num **mb_trans; /* Transition tables for states with ANYCHAR. */
511 state_num mb_trcount; /* Number of transition tables for states with
512 ANYCHAR that have actually been built. */
514 /* Information derived from the locale. This is at the end so that
515 a quick memset need not clear it specially. */
517 /* dfaexec implementation. */
518 char *(*dfaexec) (struct dfa *, char const *, char *,
519 bool, size_t *, bool *);
521 /* The locale is simple, like the C locale. These locales can be
522 processed more efficiently, e.g., the relationship between lower-
523 and upper-case letters is 1-1. */
526 /* Other cached information derived from the locale. */
527 struct localeinfo localeinfo;
530 /* Some macros for user access to dfa internals. */
532 /* S could possibly be an accepting state of R. */
533 #define ACCEPTING(s, r) ((r).states[s].constraint)
535 /* STATE accepts in the specified context. */
536 #define ACCEPTS_IN_CONTEXT(prev, curr, state, dfa) \
537 SUCCEEDS_IN_CONTEXT ((dfa).states[state].constraint, prev, curr)
539 static void regexp (struct dfa *dfa);
541 /* Store into *PWC the result of converting the leading bytes of the
542 multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
543 and updating the conversion state in *D. On conversion error,
544 convert just a single byte, to WEOF. Return the number of bytes
547 This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
549 * PWC points to wint_t, not to wchar_t.
550 * The last arg is a dfa *D instead of merely a multibyte conversion
552 * N must be at least 1.
553 * S[N - 1] must be a sentinel byte.
554 * Shift encodings are not supported.
555 * The return value is always in the range 1..N.
556 * D->mbs is always valid afterwards.
557 * *PWC is always set to something. */
559 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
561 unsigned char uc = s[0];
562 wint_t wc = d->localeinfo.sbctowc[uc];
567 size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
568 if (0 < nbytes && nbytes < (size_t) -2)
573 memset (&d->mbs, 0, sizeof d->mbs);
588 fprintf (stderr, "END");
589 else if (t < NOTCHAR)
592 fprintf (stderr, "0x%02x", ch);
653 fprintf (stderr, "%s", s);
658 /* Stuff pertaining to charclasses. */
661 tstbit (unsigned int b, charclass const c)
663 return c[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
667 setbit (unsigned int b, charclass c)
669 c[b / CHARCLASS_WORD_BITS] |= (charclass_word) 1 << b % CHARCLASS_WORD_BITS;
673 clrbit (unsigned int b, charclass c)
675 c[b / CHARCLASS_WORD_BITS] &= ~((charclass_word) 1
676 << b % CHARCLASS_WORD_BITS);
680 copyset (charclass const src, charclass dst)
682 memcpy (dst, src, sizeof (charclass));
686 zeroset (charclass s)
688 memset (s, 0, sizeof (charclass));
696 for (i = 0; i < CHARCLASS_WORDS; ++i)
697 s[i] = CHARCLASS_WORD_MASK & ~s[i];
701 equal (charclass const s1, charclass const s2)
703 charclass_word w = 0;
705 for (i = 0; i < CHARCLASS_WORDS; i++)
711 emptyset (charclass const s)
713 charclass_word w = 0;
715 for (i = 0; i < CHARCLASS_WORDS; i++)
720 /* Ensure that the array addressed by PTR holds at least NITEMS +
721 (PTR || !NITEMS) items. Either return PTR, or reallocate the array
722 and return its new address. Although PTR may be null, the returned
725 The array holds *NALLOC items; *NALLOC is updated on reallocation.
726 ITEMSIZE is the size of one item. Avoid O(N**2) behavior on arrays
729 maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize)
731 if (nitems < *nalloc)
734 return x2nrealloc (ptr, nalloc, itemsize);
737 /* In DFA D, find the index of charclass S, or allocate a new one. */
739 charclass_index (struct dfa *d, charclass const s)
743 for (i = 0; i < d->cindex; ++i)
744 if (equal (s, d->charclasses[i]))
746 d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
747 sizeof *d->charclasses);
749 copyset (s, d->charclasses[i]);
754 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
756 return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
760 char_context (struct dfa const *dfa, unsigned char c)
762 if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
764 if (unibyte_word_constituent (dfa, c))
769 /* Set a bit in the charclass for the given wchar_t. Do nothing if WC
770 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
771 this may happen when folding case in weird Turkish locales where
772 dotless i/dotted I are not included in the chosen character set.
773 Return whether a bit was set in the charclass. */
775 setbit_wc (wint_t wc, charclass c)
785 /* Set a bit for B and its case variants in the charclass C.
786 MB_CUR_MAX must be 1. */
788 setbit_case_fold_c (int b, charclass c)
790 int ub = toupper (b);
792 for (i = 0; i < NOTCHAR; i++)
793 if (toupper (i) == ub)
797 /* Return true if the locale compatible with the C locale. */
800 using_simple_locale (bool multibyte)
802 /* The native character set is known to be compatible with
803 the C locale. The following test isn't perfect, but it's good
804 enough in practice, as only ASCII and EBCDIC are in common use
805 and this test correctly accepts ASCII and rejects EBCDIC. */
806 enum { native_c_charset =
807 ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
808 && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
809 && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
810 && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
811 && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
812 && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
813 && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
814 && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
815 && '}' == 125 && '~' == 126)
818 if (native_c_charset && !multibyte)
822 /* Treat C and POSIX locales as being compatible. Also, treat
823 errors as compatible, as these are invariably from stubs. */
824 char const *loc = setlocale (LC_ALL, NULL);
825 return !loc || STREQ (loc, "C") || STREQ (loc, "POSIX");
829 /* Fetch the next lexical input character. Set C (of type int) to the
830 next input byte, except set C to EOF if the input is a multibyte
831 character of length greater than 1. Set WC (of type wint_t) to the
832 value of the input if it is a valid multibyte character (possibly
833 of length 1); otherwise set WC to WEOF. If there is no more input,
834 report EOFERR if EOFERR is not null, and return lasttok = END
836 # define FETCH_WC(dfa, c, wc, eoferr) \
838 if (! (dfa)->lex.left) \
843 return (dfa)->lex.lasttok = END; \
848 size_t nbytes = mbs_to_wchar (&_wc, (dfa)->lex.ptr, \
849 (dfa)->lex.left, dfa); \
850 (dfa)->lex.cur_mb_len = nbytes; \
852 (c) = nbytes == 1 ? to_uchar ((dfa)->lex.ptr[0]) : EOF; \
853 (dfa)->lex.ptr += nbytes; \
854 (dfa)->lex.left -= nbytes; \
859 # define MIN(a,b) ((a) < (b) ? (a) : (b))
862 typedef int predicate (int);
864 /* The following list maps the names of the Posix named character classes
865 to predicate functions that determine whether a given character is in
866 the class. The leading [ has already been eaten by the lexical
872 bool single_byte_only;
875 static const struct dfa_ctype prednames[] = {
876 {"alpha", isalpha, false},
877 {"upper", isupper, false},
878 {"lower", islower, false},
879 {"digit", isdigit, true},
880 {"xdigit", isxdigit, false},
881 {"space", isspace, false},
882 {"punct", ispunct, false},
883 {"alnum", isalnum, false},
884 {"print", isprint, false},
885 {"graph", isgraph, false},
886 {"cntrl", iscntrl, false},
887 {"blank", isblank, false},
891 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
892 find_pred (const char *str)
895 for (i = 0; prednames[i].name; ++i)
896 if (STREQ (str, prednames[i].name))
897 return &prednames[i];
901 /* Multibyte character handling sub-routine for lex.
902 Parse a bracket expression and build a struct mb_char_classes. */
904 parse_bracket_exp (struct dfa *dfa)
910 /* This is a bracket expression that dfaexec is known to
911 process correctly. */
912 bool known_bracket_exp = true;
914 /* Used to warn about [:space:].
915 Bit 0 = first character is a colon.
916 Bit 1 = last character is a colon.
917 Bit 2 = includes any other character but a colon.
918 Bit 3 = includes ranges, char/equiv classes or collation elements. */
919 int colon_warning_state;
925 /* Work area to build a mb_char_classes. */
926 struct mb_char_classes *work_mbc;
930 if (dfa->localeinfo.multibyte)
932 dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets,
934 sizeof *dfa->mbcsets);
936 /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
937 We will update dfa->multibyte_prop[] in addtok, because we can't
938 decide the index in dfa->tokens[]. */
940 /* Initialize work area. */
941 work_mbc = &dfa->mbcsets[dfa->nmbcsets++];
942 memset (work_mbc, 0, sizeof *work_mbc);
947 memset (ccl, 0, sizeof ccl);
948 FETCH_WC (dfa, c, wc, _("unbalanced ["));
951 FETCH_WC (dfa, c, wc, _("unbalanced ["));
953 known_bracket_exp = dfa->simple_locale;
958 colon_warning_state = (c == ':');
961 c1 = NOTCHAR; /* Mark c1 as not initialized. */
962 colon_warning_state &= ~2;
964 /* Note that if we're looking at some other [:...:] construct,
965 we just treat it as a bunch of ordinary characters. We can do
966 this because we assume regex has checked for syntax errors before
967 dfa is ever called. */
970 FETCH_WC (dfa, c1, wc1, _("unbalanced ["));
972 if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
973 || c1 == '.' || c1 == '=')
975 enum { MAX_BRACKET_STRING_LEN = 32 };
976 char str[MAX_BRACKET_STRING_LEN + 1];
980 FETCH_WC (dfa, c, wc, _("unbalanced ["));
981 if (dfa->lex.left == 0
982 || (c == c1 && dfa->lex.ptr[0] == ']'))
984 if (len < MAX_BRACKET_STRING_LEN)
987 /* This is in any case an invalid class name. */
993 FETCH_WC (dfa, c, wc, _("unbalanced ["));
995 /* Build character class. POSIX allows character
996 classes to match multicharacter collating elements,
997 but the regex code does not support that, so do not
998 worry about that possibility. */
1001 = (dfa->syntax.case_fold && (STREQ (str, "upper")
1002 || STREQ (str, "lower"))
1004 const struct dfa_ctype *pred = find_pred (class);
1006 dfaerror (_("invalid character class"));
1008 if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1009 known_bracket_exp = false;
1011 for (c2 = 0; c2 < NOTCHAR; ++c2)
1012 if (pred->func (c2))
1016 known_bracket_exp = false;
1018 colon_warning_state |= 8;
1020 /* Fetch new lookahead character. */
1021 FETCH_WC (dfa, c1, wc1, _("unbalanced ["));
1025 /* We treat '[' as a normal character here. c/c1/wc/wc1
1026 are already set up. */
1029 if (c == '\\' && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1030 FETCH_WC (dfa, c, wc, _("unbalanced ["));
1033 FETCH_WC (dfa, c1, wc1, _("unbalanced ["));
1036 /* build range characters. */
1038 FETCH_WC (dfa, c2, wc2, _("unbalanced ["));
1040 /* A bracket expression like [a-[.aa.]] matches an unknown set.
1041 Treat it like [-a[.aa.]] while parsing it, and
1042 remember that the set is unknown. */
1043 if (c2 == '[' && dfa->lex.ptr[0] == '.')
1045 known_bracket_exp = false;
1051 /* In the case [x-], the - is an ordinary hyphen,
1052 which is left in c1, the lookahead character. */
1053 dfa->lex.ptr -= dfa->lex.cur_mb_len;
1054 dfa->lex.left += dfa->lex.cur_mb_len;
1058 if (c2 == '\\' && (dfa->syntax.syntax_bits
1059 & RE_BACKSLASH_ESCAPE_IN_LISTS))
1060 FETCH_WC (dfa, c2, wc2, _("unbalanced ["));
1062 colon_warning_state |= 8;
1063 FETCH_WC (dfa, c1, wc1, _("unbalanced ["));
1065 /* Treat [x-y] as a range if x != y. */
1066 if (wc != wc2 || wc == WEOF)
1068 if (dfa->localeinfo.multibyte)
1069 known_bracket_exp = false;
1070 else if (dfa->simple_locale)
1073 for (ci = c; ci <= c2; ci++)
1075 if (dfa->syntax.case_fold)
1077 int uc = toupper (c);
1078 int uc2 = toupper (c2);
1079 for (ci = 0; ci < NOTCHAR; ci++)
1081 int uci = toupper (ci);
1082 if (uc <= uci && uci <= uc2)
1088 known_bracket_exp = false;
1095 colon_warning_state |= (c == ':') ? 2 : 4;
1097 if (!dfa->localeinfo.multibyte)
1099 if (dfa->syntax.case_fold)
1100 setbit_case_fold_c (c, ccl);
1107 known_bracket_exp = false;
1110 wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1112 unsigned int n = (dfa->syntax.case_fold
1113 ? case_folded_counterparts (wc, folded + 1) + 1
1116 for (i = 0; i < n; i++)
1117 if (!setbit_wc (folded[i], ccl))
1120 = maybe_realloc (work_mbc->chars, work_mbc->nchars,
1121 &chars_al, sizeof *work_mbc->chars);
1122 work_mbc->chars[work_mbc->nchars++] = folded[i];
1126 while ((wc = wc1, (c = c1) != ']'));
1128 if (colon_warning_state == 7)
1129 dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1131 if (! known_bracket_exp)
1134 if (dfa->localeinfo.multibyte)
1136 work_mbc->invert = invert;
1137 work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (dfa, ccl);
1143 assert (!dfa->localeinfo.multibyte);
1145 if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1149 return CSET + charclass_index (dfa, ccl);
1159 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1161 ls->ptr = dfa->lex.ptr;
1162 ls->left = dfa->lex.left;
1164 dfa->lex.left = strlen (s);
1168 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1170 dfa->lex.ptr = ls->ptr;
1171 dfa->lex.left = ls->left;
1175 lex (struct dfa *dfa)
1178 bool backslash = false;
1182 /* Basic plan: We fetch a character. If it's a backslash,
1183 we set the backslash flag and go through the loop again.
1184 On the plus side, this avoids having a duplicate of the
1185 main switch inside the backslash case. On the minus side,
1186 it means that just about every case begins with
1187 "if (backslash) ...". */
1188 for (i = 0; i < 2; ++i)
1190 FETCH_WC (dfa, c, dfa->lex.wctok, NULL);
1197 if (dfa->lex.left == 0)
1198 dfaerror (_("unfinished \\ escape"));
1205 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1206 || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1207 || dfa->lex.lasttok == OR)
1208 return dfa->lex.lasttok = BEGLINE;
1214 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1215 || dfa->lex.left == 0
1217 > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1218 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1219 & (dfa->lex.ptr[0] == '\\')]
1222 > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1223 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1224 & (dfa->lex.ptr[0] == '\\')]
1226 || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1227 && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1228 return dfa->lex.lasttok = ENDLINE;
1240 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1242 dfa->lex.laststart = false;
1243 return dfa->lex.lasttok = BACKREF;
1248 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1250 /* FIXME: should be beginning of string */
1251 return dfa->lex.lasttok = BEGLINE;
1256 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1258 /* FIXME: should be end of string */
1259 return dfa->lex.lasttok = ENDLINE;
1264 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1265 return dfa->lex.lasttok = BEGWORD;
1269 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1270 return dfa->lex.lasttok = ENDWORD;
1274 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1275 return dfa->lex.lasttok = LIMWORD;
1279 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1280 return dfa->lex.lasttok = NOTLIMWORD;
1284 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1286 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1288 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1289 && dfa->lex.laststart)
1291 return dfa->lex.lasttok = QMARK;
1296 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1297 && dfa->lex.laststart)
1299 return dfa->lex.lasttok = STAR;
1302 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1304 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1306 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1307 && dfa->lex.laststart)
1309 return dfa->lex.lasttok = PLUS;
1312 if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1314 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1316 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1317 && dfa->lex.laststart)
1322 {M,} - minimum count, maximum is infinity
1324 {,} - 0 to infinity (same as '*')
1325 {M,N} - M through N */
1327 char const *p = dfa->lex.ptr;
1328 char const *lim = p + dfa->lex.left;
1329 dfa->lex.minrep = dfa->lex.maxrep = -1;
1330 for (; p != lim && ISASCIIDIGIT (*p); p++)
1331 dfa->lex.minrep = (dfa->lex.minrep < 0
1333 : MIN (RE_DUP_MAX + 1,
1334 dfa->lex.minrep * 10 + *p - '0'));
1338 dfa->lex.maxrep = dfa->lex.minrep;
1341 if (dfa->lex.minrep < 0)
1342 dfa->lex.minrep = 0;
1343 while (++p != lim && ISASCIIDIGIT (*p))
1345 = (dfa->lex.maxrep < 0
1347 : MIN (RE_DUP_MAX + 1,
1348 dfa->lex.maxrep * 10 + *p - '0'));
1351 if (! ((! backslash || (p != lim && *p++ == '\\'))
1352 && p != lim && *p++ == '}'
1353 && 0 <= dfa->lex.minrep
1354 && (dfa->lex.maxrep < 0
1355 || dfa->lex.minrep <= dfa->lex.maxrep)))
1357 if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1359 dfaerror (_("invalid content of \\{\\}"));
1361 if (RE_DUP_MAX < dfa->lex.maxrep)
1362 dfaerror (_("regular expression too big"));
1364 dfa->lex.left = lim - p;
1366 dfa->lex.laststart = false;
1367 return dfa->lex.lasttok = REPMN;
1370 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1372 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1374 dfa->lex.laststart = true;
1375 return dfa->lex.lasttok = OR;
1378 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1379 || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1381 dfa->lex.laststart = true;
1382 return dfa->lex.lasttok = OR;
1385 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1388 dfa->lex.laststart = true;
1389 return dfa->lex.lasttok = LPAREN;
1392 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1394 if (dfa->lex.parens == 0
1395 && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1398 dfa->lex.laststart = false;
1399 return dfa->lex.lasttok = RPAREN;
1404 if (dfa->canychar == (size_t) -1)
1408 if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1410 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1412 if (dfa->localeinfo.multibyte)
1413 for (c2 = 0; c2 < NOTCHAR; c2++)
1414 if (dfa->localeinfo.sbctowc[c2] == WEOF)
1416 dfa->canychar = charclass_index (dfa, ccl);
1418 dfa->lex.laststart = false;
1419 return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1421 : CSET + dfa->canychar);
1425 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1427 if (!dfa->localeinfo.multibyte)
1430 for (c2 = 0; c2 < NOTCHAR; ++c2)
1435 dfa->lex.laststart = false;
1436 return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
1439 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1440 add_utf8_anychar, makes sense. */
1442 /* \s and \S are documented to be equivalent to [[:space:]] and
1443 [^[:space:]] respectively, so tell the lexer to process those
1444 strings, each minus its "already processed" '['. */
1447 push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1448 dfa->lex.lasttok = parse_bracket_exp (dfa);
1449 pop_lex_state (dfa, &ls);
1452 dfa->lex.laststart = false;
1453 return dfa->lex.lasttok;
1457 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1460 if (!dfa->localeinfo.multibyte)
1463 for (c2 = 0; c2 < NOTCHAR; ++c2)
1464 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1468 dfa->lex.laststart = false;
1469 return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
1472 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1473 add_utf8_anychar, makes sense. */
1475 /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1476 [^_[:alnum:]] respectively, so tell the lexer to process those
1477 strings, each minus its "already processed" '['. */
1480 push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1481 dfa->lex.lasttok = parse_bracket_exp (dfa);
1482 pop_lex_state (dfa, &ls);
1485 dfa->lex.laststart = false;
1486 return dfa->lex.lasttok;
1491 dfa->lex.laststart = false;
1492 return dfa->lex.lasttok = parse_bracket_exp (dfa);
1496 dfa->lex.laststart = false;
1497 /* For multibyte character sets, folding is done in atom. Always
1499 if (dfa->localeinfo.multibyte)
1500 return dfa->lex.lasttok = WCHAR;
1502 if (dfa->syntax.case_fold && isalpha (c))
1505 setbit_case_fold_c (c, ccl);
1506 return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
1509 return dfa->lex.lasttok = c;
1513 /* The above loop should consume at most a backslash
1514 and some other character. */
1516 return END; /* keeps pedantic compilers happy. */
1520 addtok_mb (struct dfa *dfa, token t, int mbprop)
1522 if (dfa->talloc == dfa->tindex)
1524 dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc,
1525 sizeof *dfa->tokens);
1526 if (dfa->localeinfo.multibyte)
1527 dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
1528 sizeof *dfa->multibyte_prop);
1530 if (dfa->localeinfo.multibyte)
1531 dfa->multibyte_prop[dfa->tindex] = mbprop;
1532 dfa->tokens[dfa->tindex++] = t;
1556 if (dfa->parse.depth > dfa->depth)
1557 dfa->depth = dfa->parse.depth;
1560 static void addtok_wc (struct dfa *dfa, wint_t wc);
1562 /* Add the given token to the parse tree, maintaining the depth count and
1563 updating the maximum depth if necessary. */
1565 addtok (struct dfa *dfa, token t)
1567 if (dfa->localeinfo.multibyte && t == MBCSET)
1569 bool need_or = false;
1570 struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
1573 /* Extract wide characters into alternations for better performance.
1574 This does not require UTF-8. */
1575 for (i = 0; i < work_mbc->nchars; i++)
1577 addtok_wc (dfa, work_mbc->chars[i]);
1582 work_mbc->nchars = 0;
1584 /* Characters have been handled above, so it is possible
1585 that the mbcset is empty now. Do nothing in that case. */
1586 if (work_mbc->cset != -1)
1588 addtok (dfa, CSET + work_mbc->cset);
1595 addtok_mb (dfa, t, 3);
1599 /* We treat a multibyte character as a single atom, so that DFA
1600 can treat a multibyte character as a single expression.
1602 e.g., we construct the following tree from "<mb1><mb2>".
1603 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1604 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1606 addtok_wc (struct dfa *dfa, wint_t wc)
1608 unsigned char buf[MB_LEN_MAX];
1609 mbstate_t s = { 0 };
1611 size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1613 if (stored_bytes != (size_t) -1)
1614 dfa->lex.cur_mb_len = stored_bytes;
1617 /* This is merely stop-gap. buf[0] is undefined, yet skipping
1618 the addtok_mb call altogether can corrupt the heap. */
1619 dfa->lex.cur_mb_len = 1;
1623 addtok_mb (dfa, buf[0], dfa->lex.cur_mb_len == 1 ? 3 : 1);
1624 for (i = 1; i < dfa->lex.cur_mb_len; i++)
1626 addtok_mb (dfa, buf[i], i == dfa->lex.cur_mb_len - 1 ? 2 : 0);
1632 add_utf8_anychar (struct dfa *dfa)
1634 static charclass const utf8_classes[5] = {
1635 /* 80-bf: non-leading bytes. */
1636 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1638 /* 00-7f: 1-byte sequence. */
1639 CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1641 /* c2-df: 2-byte sequence. */
1642 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1644 /* e0-ef: 3-byte sequence. */
1645 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xffff),
1647 /* f0-f7: 4-byte sequence. */
1648 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xff0000)
1650 const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
1653 /* Define the five character classes that are needed below. */
1654 if (dfa->utf8_anychar_classes[0] == 0)
1655 for (i = 0; i < n; i++)
1658 copyset (utf8_classes[i], c);
1661 if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1663 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1666 dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, c);
1669 /* A valid UTF-8 character is
1672 |[0xc2-0xdf][0x80-0xbf]
1673 |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
1674 |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
1676 which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x00-0x7f]
1677 and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse
1678 Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */
1679 for (i = 1; i < n; i++)
1680 addtok (dfa, dfa->utf8_anychar_classes[i]);
1683 addtok (dfa, dfa->utf8_anychar_classes[0]);
1689 /* The grammar understood by the parser is as follows.
1708 <multibyte character>
1719 LPAREN regexp RPAREN
1722 The parser builds a parse tree in postfix form in an array of tokens. */
1725 atom (struct dfa *dfa)
1727 if (dfa->parse.tok == WCHAR)
1729 if (dfa->lex.wctok == WEOF)
1730 addtok (dfa, BACKREF);
1733 addtok_wc (dfa, dfa->lex.wctok);
1735 if (dfa->syntax.case_fold)
1737 wchar_t folded[CASE_FOLDED_BUFSIZE];
1738 unsigned int i, n = case_folded_counterparts (dfa->lex.wctok,
1740 for (i = 0; i < n; i++)
1742 addtok_wc (dfa, folded[i]);
1748 dfa->parse.tok = lex (dfa);
1750 else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1752 /* For UTF-8 expand the period to a series of CSETs that define a valid
1753 UTF-8 character. This avoids using the slow multibyte path. I'm
1754 pretty sure it would be both profitable and correct to do it for
1755 any encoding; however, the optimization must be done manually as
1756 it is done above in add_utf8_anychar. So, let's start with
1757 UTF-8: it is the most used, and the structure of the encoding
1758 makes the correctness more obvious. */
1759 add_utf8_anychar (dfa);
1760 dfa->parse.tok = lex (dfa);
1762 else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1763 || dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF
1764 || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1765 || dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR
1766 || dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD
1767 || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD)
1769 addtok (dfa, dfa->parse.tok);
1770 dfa->parse.tok = lex (dfa);
1772 else if (dfa->parse.tok == LPAREN)
1774 dfa->parse.tok = lex (dfa);
1776 if (dfa->parse.tok != RPAREN)
1777 dfaerror (_("unbalanced ("));
1778 dfa->parse.tok = lex (dfa);
1781 addtok (dfa, EMPTY);
1784 /* Return the number of tokens in the given subexpression. */
1785 static size_t _GL_ATTRIBUTE_PURE
1786 nsubtoks (struct dfa const *dfa, size_t tindex)
1790 switch (dfa->tokens[tindex - 1])
1797 return 1 + nsubtoks (dfa, tindex - 1);
1800 ntoks1 = nsubtoks (dfa, tindex - 1);
1801 return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1805 /* Copy the given subexpression to the top of the tree. */
1807 copytoks (struct dfa *dfa, size_t tindex, size_t ntokens)
1811 if (dfa->localeinfo.multibyte)
1812 for (i = 0; i < ntokens; ++i)
1813 addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);
1815 for (i = 0; i < ntokens; ++i)
1816 addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1820 closure (struct dfa *dfa)
1823 size_t tindex, ntokens;
1826 while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1827 || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1828 if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1830 ntokens = nsubtoks (dfa, dfa->tindex);
1831 tindex = dfa->tindex - ntokens;
1832 if (dfa->lex.maxrep < 0)
1834 if (dfa->lex.minrep == 0)
1835 addtok (dfa, QMARK);
1836 for (i = 1; i < dfa->lex.minrep; i++)
1838 copytoks (dfa, tindex, ntokens);
1841 for (; i < dfa->lex.maxrep; i++)
1843 copytoks (dfa, tindex, ntokens);
1844 addtok (dfa, QMARK);
1847 dfa->parse.tok = lex (dfa);
1849 else if (dfa->parse.tok == REPMN)
1851 dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1852 dfa->parse.tok = lex (dfa);
1857 addtok (dfa, dfa->parse.tok);
1858 dfa->parse.tok = lex (dfa);
1863 branch (struct dfa* dfa)
1866 while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
1867 && dfa->parse.tok >= 0)
1875 regexp (struct dfa *dfa)
1878 while (dfa->parse.tok == OR)
1880 dfa->parse.tok = lex (dfa);
1886 /* Main entry point for the parser. S is a string to be parsed, len is the
1887 length of the string, so s can include NUL characters. D is a pointer to
1888 the struct dfa to parse into. */
1890 dfaparse (char const *s, size_t len, struct dfa *d)
1894 d->lex.lasttok = END;
1895 d->lex.laststart = true;
1897 if (d->localeinfo.multibyte)
1899 d->lex.cur_mb_len = 0;
1900 memset (&d->mbs, 0, sizeof d->mbs);
1903 if (!d->syntax.syntax_bits_set)
1904 dfaerror (_("no syntax specified"));
1906 d->parse.tok = lex (d);
1907 d->parse.depth = d->depth;
1911 if (d->parse.tok != END)
1912 dfaerror (_("unbalanced )"));
1914 addtok (d, END - d->nregexps);
1923 /* Some primitives for operating on sets of positions. */
1925 /* Copy one set to another. */
1927 copy (position_set const *src, position_set *dst)
1929 if (dst->alloc < src->nelem)
1932 dst->alloc = src->nelem;
1933 dst->elems = x2nrealloc (NULL, &dst->alloc, sizeof *dst->elems);
1935 memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
1936 dst->nelem = src->nelem;
1940 alloc_position_set (position_set *s, size_t size)
1942 s->elems = xnmalloc (size, sizeof *s->elems);
1947 /* Insert position P in set S. S is maintained in sorted order on
1948 decreasing index. If there is already an entry in S with P.index
1949 then merge (logically-OR) P's constraints into the one in S.
1950 S->elems must point to an array large enough to hold the resulting set. */
1952 insert (position p, position_set *s)
1954 size_t count = s->nelem;
1955 size_t lo = 0, hi = count;
1959 size_t mid = (lo + hi) >> 1;
1960 if (s->elems[mid].index > p.index)
1966 if (lo < count && p.index == s->elems[lo].index)
1968 s->elems[lo].constraint |= p.constraint;
1972 s->elems = maybe_realloc (s->elems, count, &s->alloc, sizeof *s->elems);
1973 for (i = count; i > lo; i--)
1974 s->elems[i] = s->elems[i - 1];
1979 /* Merge two sets of positions into a third. The result is exactly as if
1980 the positions of both sets were inserted into an initially empty set. */
1982 merge (position_set const *s1, position_set const *s2, position_set *m)
1984 size_t i = 0, j = 0;
1986 if (m->alloc < s1->nelem + s2->nelem)
1989 m->elems = maybe_realloc (NULL, s1->nelem + s2->nelem, &m->alloc,
1993 while (i < s1->nelem && j < s2->nelem)
1994 if (s1->elems[i].index > s2->elems[j].index)
1995 m->elems[m->nelem++] = s1->elems[i++];
1996 else if (s1->elems[i].index < s2->elems[j].index)
1997 m->elems[m->nelem++] = s2->elems[j++];
2000 m->elems[m->nelem] = s1->elems[i++];
2001 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
2003 while (i < s1->nelem)
2004 m->elems[m->nelem++] = s1->elems[i++];
2005 while (j < s2->nelem)
2006 m->elems[m->nelem++] = s2->elems[j++];
2009 /* Delete a position from a set. */
2011 delete (position p, position_set *s)
2015 for (i = 0; i < s->nelem; ++i)
2016 if (p.index == s->elems[i].index)
2019 for (--s->nelem; i < s->nelem; ++i)
2020 s->elems[i] = s->elems[i + 1];
2023 /* Find the index of the state corresponding to the given position set with
2024 the given preceding context, or create a new state if there is no such
2025 state. Context tells whether we got here on a newline or letter. */
2027 state_index (struct dfa *d, position_set const *s, int context)
2032 token first_end = 0;
2034 for (i = 0; i < s->nelem; ++i)
2035 hash ^= s->elems[i].index + s->elems[i].constraint;
2037 /* Try to find a state that exactly matches the proposed one. */
2038 for (i = 0; i < d->sindex; ++i)
2040 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2041 || context != d->states[i].context)
2043 for (j = 0; j < s->nelem; ++j)
2044 if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2045 || s->elems[j].index != d->states[i].elems.elems[j].index)
2052 fprintf (stderr, "new state %zd\n nextpos:", i);
2053 for (j = 0; j < s->nelem; ++j)
2055 fprintf (stderr, " %zu:", s->elems[j].index);
2056 prtok (d->tokens[s->elems[j].index]);
2058 fprintf (stderr, "\n context:");
2059 if (context ^ CTX_ANY)
2061 if (context & CTX_NONE)
2062 fprintf (stderr, " CTX_NONE");
2063 if (context & CTX_LETTER)
2064 fprintf (stderr, " CTX_LETTER");
2065 if (context & CTX_NEWLINE)
2066 fprintf (stderr, " CTX_NEWLINE");
2069 fprintf (stderr, " CTX_ANY");
2070 fprintf (stderr, "\n");
2073 for (j = 0; j < s->nelem; ++j)
2075 int c = s->elems[j].constraint;
2076 if (d->tokens[s->elems[j].index] < 0)
2078 if (SUCCEEDS_IN_CONTEXT (c, context, CTX_ANY))
2081 first_end = d->tokens[s->elems[j].index];
2083 else if (d->tokens[s->elems[j].index] == BACKREF)
2084 constraint = NO_CONSTRAINT;
2088 /* Create a new state. */
2089 d->states = maybe_realloc (d->states, d->sindex, &d->salloc,
2091 d->states[i].hash = hash;
2092 alloc_position_set (&d->states[i].elems, s->nelem);
2093 copy (s, &d->states[i].elems);
2094 d->states[i].context = context;
2095 d->states[i].constraint = constraint;
2096 d->states[i].first_end = first_end;
2097 d->states[i].mbps.nelem = 0;
2098 d->states[i].mbps.elems = NULL;
2099 d->states[i].mb_trindex = -1;
2106 /* Find the epsilon closure of a set of positions. If any position of the set
2107 contains a symbol that matches the empty string in some context, replace
2108 that position with the elements of its follow labeled with an appropriate
2109 constraint. Repeat exhaustively until no funny positions are left.
2110 S->elems must be large enough to hold the result. */
2112 epsclosure (position_set *s, struct dfa const *d, char *visited)
2116 bool initialized = false;
2118 for (i = 0; i < s->nelem; ++i)
2119 if (d->tokens[s->elems[i].index] >= NOTCHAR
2120 && d->tokens[s->elems[i].index] != BACKREF
2121 && d->tokens[s->elems[i].index] != ANYCHAR
2122 && d->tokens[s->elems[i].index] != MBCSET
2123 && d->tokens[s->elems[i].index] < CSET)
2127 memset (visited, 0, d->tindex * sizeof (*visited));
2131 p.constraint = old.constraint;
2132 delete (s->elems[i], s);
2133 if (visited[old.index])
2138 visited[old.index] = 1;
2139 switch (d->tokens[old.index])
2142 p.constraint &= BEGLINE_CONSTRAINT;
2145 p.constraint &= ENDLINE_CONSTRAINT;
2148 p.constraint &= BEGWORD_CONSTRAINT;
2151 p.constraint &= ENDWORD_CONSTRAINT;
2154 p.constraint &= LIMWORD_CONSTRAINT;
2157 p.constraint &= NOTLIMWORD_CONSTRAINT;
2162 for (j = 0; j < d->follows[old.index].nelem; ++j)
2164 p.index = d->follows[old.index].elems[j].index;
2167 /* Force rescan to start at the beginning. */
2172 /* Returns the set of contexts for which there is at least one
2173 character included in C. */
2176 charclass_context (struct dfa const *dfa, charclass c)
2181 for (j = 0; j < CHARCLASS_WORDS; ++j)
2183 if (c[j] & dfa->syntax.newline[j])
2184 context |= CTX_NEWLINE;
2185 if (c[j] & dfa->syntax.letters[j])
2186 context |= CTX_LETTER;
2187 if (c[j] & ~(dfa->syntax.letters[j] | dfa->syntax.newline[j]))
2188 context |= CTX_NONE;
2194 /* Returns the contexts on which the position set S depends. Each context
2195 in the set of returned contexts (let's call it SC) may have a different
2196 follow set than other contexts in SC, and also different from the
2197 follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2198 in the complement set will have the same follow set. */
2200 static int _GL_ATTRIBUTE_PURE
2201 state_separate_contexts (position_set const *s)
2203 int separate_contexts = 0;
2206 for (j = 0; j < s->nelem; ++j)
2208 if (PREV_NEWLINE_DEPENDENT (s->elems[j].constraint))
2209 separate_contexts |= CTX_NEWLINE;
2210 if (PREV_LETTER_DEPENDENT (s->elems[j].constraint))
2211 separate_contexts |= CTX_LETTER;
2214 return separate_contexts;
2218 /* Perform bottom-up analysis on the parse tree, computing various functions.
2219 Note that at this point, we're pretending constructs like \< are real
2220 characters rather than constraints on what can follow them.
2222 Nullable: A node is nullable if it is at the root of a regexp that can
2223 match the empty string.
2224 * EMPTY leaves are nullable.
2225 * No other leaf is nullable.
2226 * A QMARK or STAR node is nullable.
2227 * A PLUS node is nullable if its argument is nullable.
2228 * A CAT node is nullable if both its arguments are nullable.
2229 * An OR node is nullable if either argument is nullable.
2231 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2232 that could correspond to the first character of a string matching the
2233 regexp rooted at the given node.
2234 * EMPTY leaves have empty firstpos.
2235 * The firstpos of a nonempty leaf is that leaf itself.
2236 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2238 * The firstpos of a CAT node is the firstpos of the left argument, union
2239 the firstpos of the right if the left argument is nullable.
2240 * The firstpos of an OR node is the union of firstpos of each argument.
2242 Lastpos: The lastpos of a node is the set of positions that could
2243 correspond to the last character of a string matching the regexp at
2245 * EMPTY leaves have empty lastpos.
2246 * The lastpos of a nonempty leaf is that leaf itself.
2247 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2249 * The lastpos of a CAT node is the lastpos of its right argument, union
2250 the lastpos of the left if the right argument is nullable.
2251 * The lastpos of an OR node is the union of the lastpos of each argument.
2253 Follow: The follow of a position is the set of positions that could
2254 correspond to the character following a character matching the node in
2255 a string matching the regexp. At this point we consider special symbols
2256 that match the empty string in some context to be just normal characters.
2257 Later, if we find that a special symbol is in a follow set, we will
2258 replace it with the elements of its follow, labeled with an appropriate
2260 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2261 the follow of every node in the lastpos.
2262 * Every node in the firstpos of the second argument of a CAT node is in
2263 the follow of every node in the lastpos of the first argument.
2265 Because of the postfix representation of the parse tree, the depth-first
2266 analysis is conveniently done by a linear scan with the aid of a stack.
2267 Sets are stored as arrays of the elements, obeying a stack-like allocation
2268 scheme; the number of elements in each set deeper in the stack can be
2269 used to determine the address of a particular set's array. */
2271 dfaanalyze (struct dfa *d, bool searchflag)
2273 /* Array allocated to hold position sets. */
2274 position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2275 /* Firstpos and lastpos elements. */
2276 position *firstpos = posalloc + d->nleaves;
2277 position *lastpos = firstpos + d->nleaves;
2279 /* Stack for element counts and nullable flags. */
2282 /* Whether the entry is nullable. */
2285 /* Counts of firstpos and lastpos sets. */
2288 } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2290 position_set tmp; /* Temporary set for merging sets. */
2291 position_set merged; /* Result of merging sets. */
2292 int separate_contexts; /* Context wanted by some position. */
2295 char *visited = xnmalloc (d->tindex, sizeof *visited);
2298 fprintf (stderr, "dfaanalyze:\n");
2299 for (i = 0; i < d->tindex; ++i)
2301 fprintf (stderr, " %zu:", i);
2302 prtok (d->tokens[i]);
2304 putc ('\n', stderr);
2307 d->searchflag = searchflag;
2308 alloc_position_set (&merged, d->nleaves);
2309 d->follows = xcalloc (d->tindex, sizeof *d->follows);
2311 for (i = 0; i < d->tindex; ++i)
2313 switch (d->tokens[i])
2316 /* The empty set is nullable. */
2317 stk->nullable = true;
2319 /* The firstpos and lastpos of the empty leaf are both empty. */
2320 stk->nfirstpos = stk->nlastpos = 0;
2326 /* Every element in the firstpos of the argument is in the follow
2327 of every element in the lastpos. */
2328 tmp.nelem = stk[-1].nfirstpos;
2329 tmp.elems = firstpos;
2331 for (j = 0; j < stk[-1].nlastpos; ++j)
2333 merge (&tmp, &d->follows[pos[j].index], &merged);
2334 copy (&merged, &d->follows[pos[j].index]);
2339 /* A QMARK or STAR node is automatically nullable. */
2340 if (d->tokens[i] != PLUS)
2341 stk[-1].nullable = true;
2345 /* Every element in the firstpos of the second argument is in the
2346 follow of every element in the lastpos of the first argument. */
2347 tmp.nelem = stk[-1].nfirstpos;
2348 tmp.elems = firstpos;
2349 pos = lastpos + stk[-1].nlastpos;
2350 for (j = 0; j < stk[-2].nlastpos; ++j)
2352 merge (&tmp, &d->follows[pos[j].index], &merged);
2353 copy (&merged, &d->follows[pos[j].index]);
2356 /* The firstpos of a CAT node is the firstpos of the first argument,
2357 union that of the second argument if the first is nullable. */
2358 if (stk[-2].nullable)
2359 stk[-2].nfirstpos += stk[-1].nfirstpos;
2361 firstpos += stk[-1].nfirstpos;
2363 /* The lastpos of a CAT node is the lastpos of the second argument,
2364 union that of the first argument if the second is nullable. */
2365 if (stk[-1].nullable)
2366 stk[-2].nlastpos += stk[-1].nlastpos;
2369 pos = lastpos + stk[-2].nlastpos;
2370 for (j = stk[-1].nlastpos; j-- > 0;)
2371 pos[j] = lastpos[j];
2372 lastpos += stk[-2].nlastpos;
2373 stk[-2].nlastpos = stk[-1].nlastpos;
2376 /* A CAT node is nullable if both arguments are nullable. */
2377 stk[-2].nullable &= stk[-1].nullable;
2382 /* The firstpos is the union of the firstpos of each argument. */
2383 stk[-2].nfirstpos += stk[-1].nfirstpos;
2385 /* The lastpos is the union of the lastpos of each argument. */
2386 stk[-2].nlastpos += stk[-1].nlastpos;
2388 /* An OR node is nullable if either argument is nullable. */
2389 stk[-2].nullable |= stk[-1].nullable;
2394 /* Anything else is a nonempty position. (Note that special
2395 constructs like \< are treated as nonempty strings here;
2396 an "epsilon closure" effectively makes them nullable later.
2397 Backreferences have to get a real position so we can detect
2398 transitions on them later. But they are nullable. */
2399 stk->nullable = d->tokens[i] == BACKREF;
2401 /* This position is in its own firstpos and lastpos. */
2402 stk->nfirstpos = stk->nlastpos = 1;
2405 --firstpos, --lastpos;
2406 firstpos->index = lastpos->index = i;
2407 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2409 /* Allocate the follow set for this position. */
2410 alloc_position_set (&d->follows[i], 1);
2414 /* ... balance the above nonsyntactic #ifdef goo... */
2415 fprintf (stderr, "node %zu:", i);
2416 prtok (d->tokens[i]);
2417 putc ('\n', stderr);
2419 stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2420 fprintf (stderr, " firstpos:");
2421 for (j = stk[-1].nfirstpos; j-- > 0;)
2423 fprintf (stderr, " %zu:", firstpos[j].index);
2424 prtok (d->tokens[firstpos[j].index]);
2426 fprintf (stderr, "\n lastpos:");
2427 for (j = stk[-1].nlastpos; j-- > 0;)
2429 fprintf (stderr, " %zu:", lastpos[j].index);
2430 prtok (d->tokens[lastpos[j].index]);
2432 putc ('\n', stderr);
2436 /* For each follow set that is the follow set of a real position, replace
2437 it with its epsilon closure. */
2438 for (i = 0; i < d->tindex; ++i)
2439 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
2440 || d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET
2441 || d->tokens[i] >= CSET)
2444 fprintf (stderr, "follows(%zu:", i);
2445 prtok (d->tokens[i]);
2446 fprintf (stderr, "):");
2447 for (j = d->follows[i].nelem; j-- > 0;)
2449 fprintf (stderr, " %zu:", d->follows[i].elems[j].index);
2450 prtok (d->tokens[d->follows[i].elems[j].index]);
2452 putc ('\n', stderr);
2454 copy (&d->follows[i], &merged);
2455 epsclosure (&merged, d, visited);
2456 copy (&merged, &d->follows[i]);
2459 /* Get the epsilon closure of the firstpos of the regexp. The result will
2460 be the set of positions of state 0. */
2462 for (i = 0; i < stk[-1].nfirstpos; ++i)
2463 insert (firstpos[i], &merged);
2464 epsclosure (&merged, d, visited);
2466 /* Build the initial state. */
2467 separate_contexts = state_separate_contexts (&merged);
2468 if (separate_contexts & CTX_NEWLINE)
2469 state_index (d, &merged, CTX_NEWLINE);
2470 d->initstate_notbol = d->min_trcount
2471 = state_index (d, &merged, separate_contexts ^ CTX_ANY);
2472 if (separate_contexts & CTX_LETTER)
2473 d->min_trcount = state_index (d, &merged, CTX_LETTER);
2478 free (merged.elems);
2483 /* Find, for each character, the transition out of state s of d, and store
2484 it in the appropriate slot of trans.
2486 We divide the positions of s into groups (positions can appear in more
2487 than one group). Each group is labeled with a set of characters that
2488 every position in the group matches (taking into account, if necessary,
2489 preceding context information of s). For each group, find the union
2490 of the its elements' follows. This set is the set of positions of the
2491 new state. For each character in the group's label, set the transition
2492 on this character to be to a state corresponding to the set's positions,
2493 and its associated backward context information, if necessary.
2495 If we are building a searching matcher, we include the positions of state
2498 The collection of groups is constructed by building an equivalence-class
2499 partition of the positions of s.
2501 For each position, find the set of characters C that it matches. Eliminate
2502 any characters from C that fail on grounds of backward context.
2504 Search through the groups, looking for a group whose label L has nonempty
2505 intersection with C. If L - C is nonempty, create a new group labeled
2506 L - C and having the same positions as the current group, and set L to
2507 the intersection of L and C. Insert the position in this group, set
2508 C = C - L, and resume scanning.
2510 If after comparing with every group there are characters remaining in C,
2511 create a new group labeled with the characters of C and insert this
2512 position in that group. */
2514 dfastate (state_num s, struct dfa *d, state_num trans[])
2516 leaf_set grps[NOTCHAR]; /* As many as will ever be needed. */
2517 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
2518 size_t ngrps = 0; /* Number of groups actually used. */
2519 position pos; /* Current position being considered. */
2520 charclass matches; /* Set of matching characters. */
2521 charclass_word matchesf; /* Nonzero if matches is nonempty. */
2522 charclass intersect; /* Intersection with some label set. */
2523 charclass_word intersectf; /* Nonzero if intersect is nonempty. */
2524 charclass leftovers; /* Stuff in the label that didn't match. */
2525 charclass_word leftoversf; /* Nonzero if leftovers is nonempty. */
2526 position_set follows; /* Union of the follows of some group. */
2527 position_set tmp; /* Temporary space for merging sets. */
2528 int possible_contexts; /* Contexts that this group can match. */
2529 int separate_contexts; /* Context that new state wants to know. */
2530 state_num state; /* New state. */
2531 state_num state_newline; /* New state on a newline transition. */
2532 state_num state_letter; /* New state on a letter transition. */
2533 bool next_isnt_1st_byte = false; /* We can't add state0. */
2537 fprintf (stderr, "build state %td\n", s);
2542 for (i = 0; i < d->states[s].elems.nelem; ++i)
2544 pos = d->states[s].elems.elems[i];
2545 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2546 setbit (d->tokens[pos.index], matches);
2547 else if (d->tokens[pos.index] >= CSET)
2548 copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
2549 else if (d->tokens[pos.index] == ANYCHAR)
2551 copyset (d->charclasses[d->canychar], matches);
2553 /* ANYCHAR must match with a single character, so we must put
2554 it to D->states[s].mbps which contains the positions which
2555 can match with a single character not a byte. If all
2556 positions which has ANYCHAR does not depend on context of
2557 next character, we put the follows instead of it to
2558 D->states[s].mbps to optimize. */
2559 if (SUCCEEDS_IN_CONTEXT (pos.constraint, d->states[s].context,
2562 if (d->states[s].mbps.nelem == 0)
2563 alloc_position_set (&d->states[s].mbps,
2564 d->follows[pos.index].nelem);
2565 for (j = 0; j < d->follows[pos.index].nelem; j++)
2566 insert (d->follows[pos.index].elems[j], &d->states[s].mbps);
2572 /* Some characters may need to be eliminated from matches because
2573 they fail in the current context. */
2574 if (pos.constraint != NO_CONSTRAINT)
2576 if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2577 d->states[s].context, CTX_NEWLINE))
2578 for (j = 0; j < CHARCLASS_WORDS; ++j)
2579 matches[j] &= ~d->syntax.newline[j];
2580 if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2581 d->states[s].context, CTX_LETTER))
2582 for (j = 0; j < CHARCLASS_WORDS; ++j)
2583 matches[j] &= ~d->syntax.letters[j];
2584 if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
2585 d->states[s].context, CTX_NONE))
2586 for (j = 0; j < CHARCLASS_WORDS; ++j)
2587 matches[j] &= d->syntax.letters[j] | d->syntax.newline[j];
2589 /* If there are no characters left, there's no point in going on. */
2590 for (j = 0; j < CHARCLASS_WORDS && !matches[j]; ++j)
2592 if (j == CHARCLASS_WORDS)
2597 fprintf (stderr, " nextpos %zu:", pos.index);
2598 prtok (d->tokens[pos.index]);
2599 fprintf (stderr, " of");
2600 for (j = 0; j < NOTCHAR; j++)
2601 if (tstbit (j, matches))
2602 fprintf (stderr, " 0x%02zx", j);
2603 fprintf (stderr, "\n");
2606 for (j = 0; j < ngrps; ++j)
2608 /* If matches contains a single character only, and the current
2609 group's label doesn't contain that character, go on to the
2611 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2612 && !tstbit (d->tokens[pos.index], labels[j]))
2615 /* Check if this group's label has a nonempty intersection with
2618 for (k = 0; k < CHARCLASS_WORDS; ++k)
2619 intersectf |= intersect[k] = matches[k] & labels[j][k];
2623 /* It does; now find the set differences both ways. */
2624 leftoversf = matchesf = 0;
2625 for (k = 0; k < CHARCLASS_WORDS; ++k)
2627 /* Even an optimizing compiler can't know this for sure. */
2628 charclass_word match = matches[k], label = labels[j][k];
2630 leftoversf |= leftovers[k] = label & ~match;
2631 matchesf |= matches[k] = match & ~label;
2634 /* If there were leftovers, create a new group labeled with them. */
2637 copyset (leftovers, labels[ngrps]);
2638 copyset (intersect, labels[j]);
2639 grps[ngrps].elems = xnmalloc (d->nleaves,
2640 sizeof *grps[ngrps].elems);
2641 memcpy (grps[ngrps].elems, grps[j].elems,
2642 sizeof (grps[j].elems[0]) * grps[j].nelem);
2643 grps[ngrps].nelem = grps[j].nelem;
2647 /* Put the position in the current group. The constraint is
2649 grps[j].elems[grps[j].nelem++] = pos.index;
2651 /* If every character matching the current position has been
2652 accounted for, we're done. */
2657 /* If we've passed the last group, and there are still characters
2658 unaccounted for, then we'll have to create a new group. */
2661 copyset (matches, labels[ngrps]);
2663 grps[ngrps].elems = xnmalloc (d->nleaves, sizeof *grps[ngrps].elems);
2664 grps[ngrps].nelem = 1;
2665 grps[ngrps].elems[0] = pos.index;
2670 alloc_position_set (&follows, d->nleaves);
2671 alloc_position_set (&tmp, d->nleaves);
2673 /* If we are a searching matcher, the default transition is to a state
2674 containing the positions of state 0, otherwise the default transition
2675 is to fail miserably. */
2681 state_letter = d->min_trcount - 1;
2682 state = d->initstate_notbol;
2684 for (c = 0; c < NOTCHAR; ++c)
2686 switch (d->syntax.sbit[c])
2689 trans[c] = state_newline;
2692 trans[c] = state_letter;
2701 for (i = 0; i < NOTCHAR; ++i)
2704 for (i = 0; i < ngrps; ++i)
2708 /* Find the union of the follows of the positions of the group.
2709 This is a hideously inefficient loop. Fix it someday. */
2710 for (j = 0; j < grps[i].nelem; ++j)
2711 for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
2712 insert (d->follows[grps[i].elems[j]].elems[k], &follows);
2714 if (d->localeinfo.multibyte)
2716 /* If a token in follows.elems is not 1st byte of a multibyte
2717 character, or the states of follows must accept the bytes
2718 which are not 1st byte of the multibyte character.
2719 Then, if a state of follows encounter a byte, it must not be
2720 a 1st byte of a multibyte character nor single byte character.
2721 We cansel to add state[0].follows to next state, because
2722 state[0] must accept 1st-byte
2724 For example, we assume <sb a> is a certain single byte
2725 character, <mb A> is a certain multibyte character, and the
2726 codepoint of <sb a> equals the 2nd byte of the codepoint of
2728 When state[0] accepts <sb a>, state[i] transit to state[i+1]
2729 by accepting accepts 1st byte of <mb A>, and state[i+1]
2730 accepts 2nd byte of <mb A>, if state[i+1] encounter the
2731 codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2732 <mb A>, so we cannot add state[0]. */
2734 next_isnt_1st_byte = false;
2735 for (j = 0; j < follows.nelem; ++j)
2737 if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2739 next_isnt_1st_byte = true;
2745 /* If we are building a searching matcher, throw in the positions
2746 of state 0 as well. */
2747 if (d->searchflag && (!d->localeinfo.multibyte || !next_isnt_1st_byte))
2749 merge (&d->states[0].elems, &follows, &tmp);
2750 copy (&tmp, &follows);
2753 /* Find out if the new state will want any context information. */
2754 possible_contexts = charclass_context (d, labels[i]);
2755 separate_contexts = state_separate_contexts (&follows);
2757 /* Find the state(s) corresponding to the union of the follows. */
2758 if (possible_contexts & ~separate_contexts)
2759 state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
2762 if (separate_contexts & possible_contexts & CTX_NEWLINE)
2763 state_newline = state_index (d, &follows, CTX_NEWLINE);
2765 state_newline = state;
2766 if (separate_contexts & possible_contexts & CTX_LETTER)
2767 state_letter = state_index (d, &follows, CTX_LETTER);
2769 state_letter = state;
2772 fprintf (stderr, "group %zu\n nextpos:", i);
2773 for (j = 0; j < grps[i].nelem; ++j)
2775 fprintf (stderr, " %zu:", grps[i].elems[j]);
2776 prtok (d->tokens[grps[i].elems[j]]);
2778 fprintf (stderr, "\n follows:");
2779 for (j = 0; j < follows.nelem; ++j)
2781 fprintf (stderr, " %zu:", follows.elems[j].index);
2782 prtok (d->tokens[follows.elems[j].index]);
2784 fprintf (stderr, "\n states:");
2785 if (possible_contexts & CTX_NEWLINE)
2786 fprintf (stderr, " CTX_NEWLINE:%td", state_newline);
2787 if (possible_contexts & CTX_LETTER)
2788 fprintf (stderr, " CTX_LETTER:%td", state_letter);
2789 if (possible_contexts & CTX_NONE)
2790 fprintf (stderr, " CTX_NONE:%td", state);
2791 fprintf (stderr, "\n");
2794 /* Set the transitions for each character in the current label. */
2795 for (j = 0; j < CHARCLASS_WORDS; ++j)
2796 for (k = 0; k < CHARCLASS_WORD_BITS; ++k)
2797 if (labels[i][j] >> k & 1)
2799 int c = j * CHARCLASS_WORD_BITS + k;
2801 switch (d->syntax.sbit[c])
2804 trans[c] = state_newline;
2807 trans[c] = state_letter;
2817 fprintf (stderr, "trans table %td", s);
2818 for (i = 0; i < NOTCHAR; ++i)
2821 fprintf (stderr, "\n");
2822 fprintf (stderr, " %2td", trans[i]);
2824 fprintf (stderr, "\n");
2827 for (i = 0; i < ngrps; ++i)
2828 free (grps[i].elems);
2829 free (follows.elems);
2833 /* Make sure D's state arrays are large enough to hold NEW_STATE. */
2835 realloc_trans_if_necessary (struct dfa *d, state_num new_state)
2837 state_num oldalloc = d->tralloc;
2838 if (oldalloc <= new_state)
2840 state_num **realtrans = d->trans ? d->trans - 1 : NULL;
2841 size_t newalloc, newalloc1;
2842 newalloc1 = new_state + 1;
2843 realtrans = x2nrealloc (realtrans, &newalloc1, sizeof *realtrans);
2844 realtrans[0] = NULL;
2845 d->trans = realtrans + 1;
2846 d->tralloc = newalloc = newalloc1 - 1;
2847 d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
2848 d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
2849 d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
2850 if (d->localeinfo.multibyte)
2852 realtrans = d->mb_trans ? d->mb_trans - 1 : NULL;
2853 realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
2855 realtrans[0] = NULL;
2856 d->mb_trans = realtrans + 1;
2858 for (; oldalloc < newalloc; oldalloc++)
2860 d->trans[oldalloc] = NULL;
2861 d->fails[oldalloc] = NULL;
2862 if (d->localeinfo.multibyte)
2863 d->mb_trans[oldalloc] = NULL;
2868 /* Some routines for manipulating a compiled dfa's transition tables.
2869 Each state may or may not have a transition table; if it does, and it
2870 is a non-accepting state, then d->trans[state] points to its table.
2871 If it is an accepting state then d->fails[state] points to its table.
2872 If it has no table at all, then d->trans[state] is NULL.
2873 TODO: Improve this comment, get rid of the unnecessary redundancy. */
2876 build_state (state_num s, struct dfa *d)
2878 state_num *trans; /* The new transition table. */
2879 state_num i, maxstate;
2881 /* Set an upper limit on the number of transition tables that will ever
2882 exist at once. MAX_TRCOUNT is arbitrary. The idea is that the frequently
2883 used transition tables will be quickly rebuilt, whereas the ones that
2884 were only needed once or twice will be cleared away. However, do not
2885 clear the initial D->min_trcount states, since they are always used. */
2886 if (MAX_TRCOUNT <= d->trcount)
2888 for (i = d->min_trcount; i < d->tralloc; ++i)
2892 d->trans[i] = d->fails[i] = NULL;
2894 d->trcount = d->min_trcount;
2896 if (d->localeinfo.multibyte)
2898 for (i = d->min_trcount; i < d->tralloc; i++)
2900 free (d->mb_trans[i]);
2901 d->mb_trans[i] = NULL;
2903 free (d->mb_trans[-1]);
2904 d->mb_trans[-1] = NULL;
2910 /* Set up the success bits for this state. */
2912 if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s, *d))
2913 d->success[s] |= CTX_NEWLINE;
2914 if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_LETTER, s, *d))
2915 d->success[s] |= CTX_LETTER;
2916 if (ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NONE, s, *d))
2917 d->success[s] |= CTX_NONE;
2919 trans = xmalloc (NOTCHAR * sizeof *trans);
2920 dfastate (s, d, trans);
2922 /* Now go through the new transition table, and make sure that the trans
2923 and fail arrays are allocated large enough to hold a pointer for the
2924 largest state mentioned in the table. */
2926 for (i = 0; i < NOTCHAR; ++i)
2927 if (maxstate < trans[i])
2928 maxstate = trans[i];
2929 realloc_trans_if_necessary (d, maxstate);
2931 /* Keep the newline transition in a special place so we can use it as
2933 d->newlines[s] = trans[d->syntax.eolbyte];
2934 trans[d->syntax.eolbyte] = -1;
2936 if (ACCEPTING (s, *d))
2937 d->fails[s] = trans;
2939 d->trans[s] = trans;
2942 /* Multibyte character handling sub-routines for dfaexec. */
2944 /* Consume a single byte and transit state from 's' to '*next_state'.
2945 This function is almost same as the state transition routin in dfaexec.
2946 But state transition is done just once, otherwise matching succeed or
2947 reach the end of the buffer. */
2949 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
2955 else if (d->fails[s])
2972 /* Transit state from s, then return new state and update the pointer of
2973 the buffer. This function is for a period operator which can match a
2974 multi-byte character. */
2976 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
2977 unsigned char const *end)
2981 int separate_contexts;
2984 int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
2986 /* This state has some operators which can match a multibyte character. */
2987 d->mb_follows.nelem = 0;
2989 /* Calculate the state which can be reached from the state 's' by
2990 consuming 'mbclen' single bytes from the buffer. */
2992 for (i = 0; i < mbclen && 0 <= s; i++)
2993 s = transit_state_singlebyte (d, s, pp);
2998 /* It is an invalid character, so ANYCHAR is not accepted. */
3002 /* If all positions which have ANYCHAR do not depend on the context
3003 of the next character, calculate the next state with
3004 pre-calculated follows and cache the result. */
3005 if (d->states[s1].mb_trindex < 0)
3007 if (MAX_TRCOUNT <= d->mb_trcount)
3010 for (s3 = -1; s3 < d->tralloc; s3++)
3012 free (d->mb_trans[s3]);
3013 d->mb_trans[s3] = NULL;
3016 for (i = 0; i < d->sindex; i++)
3017 d->states[i].mb_trindex = -1;
3020 d->states[s1].mb_trindex = d->mb_trcount++;
3023 if (! d->mb_trans[s])
3025 enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3026 enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3027 d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3028 for (i = 0; i < MAX_TRCOUNT; i++)
3029 d->mb_trans[s][i] = -1;
3031 else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3032 return d->mb_trans[s][d->states[s1].mb_trindex];
3035 copy (&d->states[s1].mbps, &d->mb_follows);
3037 merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3039 separate_contexts = state_separate_contexts (&d->mb_follows);
3040 s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3041 realloc_trans_if_necessary (d, s2);
3043 d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3048 /* The initial state may encounter a byte which is not a single byte character
3049 nor the first byte of a multibyte character. But it is incorrect for the
3050 initial state to accept such a byte. For example, in Shift JIS the regular
3051 expression "\\" accepts the codepoint 0x5c, but should not accept the second
3052 byte of the codepoint 0x815c. Then the initial state must skip the bytes
3053 that are not a single byte character nor the first byte of a multibyte
3056 Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3057 or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3058 the result is greater than P, set *WCP to the final wide character
3059 processed, or to WEOF if no wide character is processed. Otherwise,
3060 if WCP is non-NULL, *WCP may or may not be updated.
3062 Both P and MBP must be no larger than END. */
3063 static unsigned char const *
3064 skip_remains_mb (struct dfa *d, unsigned char const *p,
3065 unsigned char const *mbp, char const *end)
3068 if (d->syntax.never_trail[*p])
3071 mbp += mbs_to_wchar (&wc, (char const *) mbp,
3072 end - (char const *) mbp, d);
3076 /* Search through a buffer looking for a match to the struct dfa *D.
3077 Find the first occurrence of a string matching the regexp in the
3078 buffer, and the shortest possible version thereof. Return a pointer to
3079 the first character after the match, or NULL if none is found. BEGIN
3080 points to the beginning of the buffer, and END points to the first byte
3081 after its end. Note however that we store a sentinel byte (usually
3082 newline) in *END, so the actual buffer must be one byte longer.
3083 When ALLOW_NL, newlines may appear in the matching string.
3084 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3085 If MULTIBYTE, the input consists of multibyte characters and/or
3086 encoding-error bytes. Otherwise, it consists of single-byte characters.
3087 Here is the list of features that make this DFA matcher punt:
3088 - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3089 - [^...] in non-simple locale
3090 - [[=foo=]] or [[.foo.]]
3091 - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3092 - back-reference: (.)\1
3093 - word-delimiter in multibyte locale: \<, \>, \b, \B
3094 See using_simple_locale for the definition of "simple locale". */
3096 static inline char *
3097 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3098 size_t *count, bool multibyte)
3100 state_num s, s1; /* Current state. */
3101 unsigned char const *p, *mbp; /* Current input character. */
3102 state_num **trans, *t; /* Copy of d->trans so it can be optimized
3104 unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
3105 unsigned char saved_end;
3110 realloc_trans_if_necessary (d, 1);
3115 p = mbp = (unsigned char const *) begin;
3117 saved_end = *(unsigned char *) end;
3122 memset (&d->mbs, 0, sizeof d->mbs);
3123 if (d->mb_follows.alloc == 0)
3124 alloc_position_set (&d->mb_follows, d->nleaves);
3129 while ((t = trans[s]) != NULL)
3131 if (s < d->min_trcount)
3133 if (!multibyte || d->states[s].mbps.nelem == 0)
3139 p = mbp = skip_remains_mb (d, p, mbp, end);
3146 if (d->states[s].mbps.nelem == 0
3147 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3149 /* If an input character does not match ANYCHAR, do it
3150 like a single-byte character. */
3155 s = transit_state (d, s, &p, (unsigned char *) end);
3168 s1 = tmp; /* swap */
3171 if (s < d->min_trcount)
3182 if ((char *) p > end || p[-1] != eol || d->newlines[s1] < 0)
3188 /* The previous character was a newline, count it, and skip
3189 checking of multibyte character boundary until here. */
3193 s = (allow_nl ? d->newlines[s1]
3194 : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3195 : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3196 : d->initstate_notbol);
3198 else if (d->fails[s])
3200 if ((d->success[s] & d->syntax.sbit[*p])
3201 || ((char *) p == end
3202 && ACCEPTS_IN_CONTEXT (d->states[s].context, CTX_NEWLINE, s,
3206 if (multibyte && s < d->min_trcount)
3207 p = mbp = skip_remains_mb (d, p, mbp, end);
3210 if (!multibyte || d->states[s].mbps.nelem == 0
3211 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3213 /* If a input character does not match ANYCHAR, do it
3214 like a single-byte character. */
3215 s = d->fails[s][*p++];
3219 s = transit_state (d, s, &p, (unsigned char *) end);
3238 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3239 This is for performance, as dfaexec_main is an inline function. */
3242 dfaexec_mb (struct dfa *d, char const *begin, char *end,
3243 bool allow_nl, size_t *count, bool *backref)
3245 return dfaexec_main (d, begin, end, allow_nl, count, true);
3249 dfaexec_sb (struct dfa *d, char const *begin, char *end,
3250 bool allow_nl, size_t *count, bool *backref)
3252 return dfaexec_main (d, begin, end, allow_nl, count, false);
3255 /* Always set *BACKREF and return BEGIN. Use this wrapper for
3256 any regexp that uses a construct not supported by this code. */
3258 dfaexec_noop (struct dfa *d, char const *begin, char *end,
3259 bool allow_nl, size_t *count, bool *backref)
3262 return (char *) begin;
3265 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3266 but faster and set *BACKREF if the DFA code does not support this
3270 dfaexec (struct dfa *d, char const *begin, char *end,
3271 bool allow_nl, size_t *count, bool *backref)
3273 return d->dfaexec (d, begin, end, allow_nl, count, backref);
3277 dfasuperset (struct dfa const *d)
3283 dfaisfast (struct dfa const *d)
3289 free_mbdata (struct dfa *d)
3293 free (d->multibyte_prop);
3295 for (i = 0; i < d->nmbcsets; ++i)
3296 free (d->mbcsets[i].chars);
3299 free (d->mb_follows.elems);
3304 for (s = -1; s < d->tralloc; s++)
3305 free (d->mb_trans[s]);
3306 free (d->mb_trans - 1);
3310 /* Return true if every construct in D is supported by this DFA matcher. */
3311 static bool _GL_ATTRIBUTE_PURE
3312 dfa_supported (struct dfa const *d)
3315 for (i = 0; i < d->tindex; i++)
3317 switch (d->tokens[i])
3323 if (!d->localeinfo.multibyte)
3336 dfaoptimize (struct dfa *d)
3339 bool have_backref = false;
3341 if (!d->localeinfo.using_utf8)
3344 for (i = 0; i < d->tindex; ++i)
3346 switch (d->tokens[i])
3352 have_backref = true;
3355 /* Requires multi-byte algorithm. */
3362 if (!have_backref && d->superset)
3364 /* The superset DFA is not likely to be much faster, so remove it. */
3365 dfafree (d->superset);
3371 d->localeinfo.multibyte = false;
3372 d->dfaexec = dfaexec_sb;
3377 dfassbuild (struct dfa *d)
3381 bool have_achar = false;
3382 bool have_nchar = false;
3383 struct dfa *sup = dfaalloc ();
3386 sup->localeinfo.multibyte = false;
3387 sup->dfaexec = dfaexec_sb;
3388 sup->multibyte_prop = NULL;
3389 sup->mbcsets = NULL;
3390 sup->superset = NULL;
3393 sup->follows = NULL;
3397 sup->success = NULL;
3398 sup->newlines = NULL;
3400 sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3403 memcpy (sup->charclasses, d->charclasses,
3404 d->cindex * sizeof *sup->charclasses);
3407 sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3408 sup->talloc = d->tindex * 2;
3410 for (i = j = 0; i < d->tindex; i++)
3412 switch (d->tokens[i])
3419 sup->tokens[j++] = CSET + charclass_index (sup, ccl);
3420 sup->tokens[j++] = STAR;
3421 if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3422 || d->tokens[i + 1] == PLUS)
3430 if (d->localeinfo.multibyte)
3432 /* These constraints aren't supported in a multibyte locale.
3433 Ignore them in the superset DFA. */
3434 sup->tokens[j++] = EMPTY;
3439 sup->tokens[j++] = d->tokens[i];
3440 if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3441 || d->tokens[i] >= CSET)
3448 if (have_nchar && (have_achar || d->localeinfo.multibyte))
3457 /* Parse and analyze a single string of the given length. */
3459 dfacomp (char const *s, size_t len, struct dfa *d, bool searchflag)
3461 dfaparse (s, len, d);
3464 if (dfa_supported (d))
3467 dfaanalyze (d, searchflag);
3471 d->dfaexec = dfaexec_noop;
3477 dfaanalyze (d->superset, searchflag);
3481 /* Free the storage held by the components of a dfa. */
3483 dfafree (struct dfa *d)
3487 free (d->charclasses);
3490 if (d->localeinfo.multibyte)
3493 for (i = 0; i < d->sindex; ++i)
3495 free (d->states[i].elems.elems);
3496 free (d->states[i].mbps.elems);
3502 for (i = 0; i < d->tindex; ++i)
3503 free (d->follows[i].elems);
3509 for (i = 0; i < d->tralloc; ++i)
3515 free (d->trans - 1);
3522 dfafree (d->superset);
3525 /* Having found the postfix representation of the regular expression,
3526 try to find a long sequence of characters that must appear in any line
3528 Finding a "longest" sequence is beyond the scope here;
3529 we take an easy way out and hope for the best.
3530 (Take "(ab|a)b"--please.)
3532 We do a bottom-up calculation of sequences of characters that must appear
3533 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3535 sequences that must appear at the left of the match ("left")
3536 sequences that must appear at the right of the match ("right")
3537 lists of sequences that must appear somewhere in the match ("in")
3538 sequences that must constitute the match ("is")
3540 When we get to the root of the tree, we use one of the longest of its
3541 calculated "in" sequences as our answer.
3543 The sequences calculated for the various types of node (in pseudo ANSI c)
3544 are shown below. "p" is the operand of unary operators (and the left-hand
3545 operand of binary operators); "q" is the right-hand operand of binary
3548 "ZERO" means "a zero-length sequence" below.
3550 Type left right is in
3551 ---- ---- ----- -- --
3552 char c # c # c # c # c
3554 ANYCHAR ZERO ZERO ZERO ZERO
3556 MBCSET ZERO ZERO ZERO ZERO
3558 CSET ZERO ZERO ZERO ZERO
3560 STAR ZERO ZERO ZERO ZERO
3562 QMARK ZERO ZERO ZERO ZERO
3564 PLUS p->left p->right ZERO p->in
3566 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3567 p->left : q->right : q->is!=ZERO) ? q->in plus
3568 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3571 OR longest common longest common (do p->is and substrings common
3572 leading trailing to q->is have same p->in and
3573 (sub)sequence (sub)sequence q->in length and content) ?
3574 of p->left of p->right
3575 and q->left and q->right p->is : NULL
3577 If there's anything else we recognize in the tree, all four sequences get set
3578 to zero-length sequences. If there's something we don't recognize in the
3579 tree, we just return a zero-length sequence.
3581 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3584 And ... is it here or someplace that we might ponder "optimizations" such as
3585 egrep 'psi|epsilon' -> egrep 'psi'
3586 egrep 'pepsi|epsilon' -> egrep 'epsi'
3587 (Yes, we now find "epsi" as a "string
3588 that must occur", but we might also
3589 simplify the *entire* r.e. being sought)
3590 grep '[c]' -> grep 'c'
3591 grep '(ab|a)b' -> grep 'ab'
3592 grep 'ab*' -> grep 'a'
3593 grep 'a*b' -> grep 'b'
3595 There are several issues:
3597 Is optimization easy (enough)?
3599 Does optimization actually accomplish anything,
3600 or is the automaton you get from "psi|epsilon" (for example)
3601 the same as the one you get from "psi" (for example)?
3603 Are optimizable r.e.'s likely to be used in real-life situations
3604 (something like 'ab*' is probably unlikely; something like is
3605 'psi|epsilon' is likelier)? */
3608 icatalloc (char *old, char const *new)
3612 size_t newsize = strlen (new);
3615 oldsize = strlen (old);
3616 result = xrealloc (old, oldsize + newsize + 1);
3617 memcpy (result + oldsize, new, newsize + 1);
3622 freelist (char **cpp)
3629 enlist (char **cpp, char *new, size_t len)
3632 new = memcpy (xmalloc (len + 1), new, len);
3634 /* Is there already something in the list that's new (or longer)? */
3635 for (i = 0; cpp[i] != NULL; ++i)
3636 if (strstr (cpp[i], new) != NULL)
3641 /* Eliminate any obsoleted strings. */
3643 while (cpp[j] != NULL)
3644 if (strstr (new, cpp[j]) == NULL)
3654 /* Add the new string. */
3655 cpp = xnrealloc (cpp, i + 2, sizeof *cpp);
3661 /* Given pointers to two strings, return a pointer to an allocated
3662 list of their distinct common substrings. */
3664 comsubs (char *left, char const *right)
3666 char **cpp = xzalloc (sizeof *cpp);
3669 for (lcp = left; *lcp != '\0'; ++lcp)
3672 char *rcp = strchr (right, *lcp);
3676 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3680 rcp = strchr (rcp + 1, *lcp);
3683 cpp = enlist (cpp, lcp, len);
3689 addlists (char **old, char **new)
3692 old = enlist (old, *new, strlen (*new));
3696 /* Given two lists of substrings, return a new list giving substrings
3699 inboth (char **left, char **right)
3701 char **both = xzalloc (sizeof *both);
3704 for (lnum = 0; left[lnum] != NULL; ++lnum)
3706 for (rnum = 0; right[rnum] != NULL; ++rnum)
3708 char **temp = comsubs (left[lnum], right[rnum]);
3709 both = addlists (both, temp);
3717 typedef struct must must;
3731 allocmust (must *mp, size_t size)
3733 must *new_mp = xmalloc (sizeof *new_mp);
3734 new_mp->in = xzalloc (sizeof *new_mp->in);
3735 new_mp->left = xzalloc (size);
3736 new_mp->right = xzalloc (size);
3737 new_mp->is = xzalloc (size);
3738 new_mp->begline = false;
3739 new_mp->endline = false;
3745 resetmust (must *mp)
3749 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3750 mp->begline = false;
3751 mp->endline = false;
3766 dfamust (struct dfa const *d)
3769 char const *result = "";
3772 bool begline = false;
3773 bool endline = false;
3775 bool need_begline = false;
3776 bool need_endline = false;
3777 bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1;
3780 for (ri = 0; ri < d->tindex; ++ri)
3782 token t = d->tokens[ri];
3786 mp = allocmust (mp, 2);
3788 need_begline = true;
3791 mp = allocmust (mp, 2);
3793 need_endline = true;
3797 assert (!"neither LPAREN nor RPAREN may appear here");
3807 mp = allocmust (mp, 2);
3819 must *lmp = mp = mp->prev;
3820 size_t j, ln, rn, n;
3822 /* Guaranteed to be. Unlikely, but ... */
3823 if (STREQ (lmp->is, rmp->is))
3825 lmp->begline &= rmp->begline;
3826 lmp->endline &= rmp->endline;
3831 lmp->begline = false;
3832 lmp->endline = false;
3834 /* Left side--easy */
3836 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3838 lmp->left[i] = '\0';
3840 ln = strlen (lmp->right);
3841 rn = strlen (rmp->right);
3845 for (i = 0; i < n; ++i)
3846 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3848 for (j = 0; j < i; ++j)
3849 lmp->right[j] = lmp->right[(ln - i) + j];
3850 lmp->right[j] = '\0';
3851 new = inboth (lmp->in, rmp->in);
3865 for (i = 0; mp->in[i] != NULL; ++i)
3866 if (strlen (mp->in[i]) > strlen (result))
3868 if (STREQ (result, mp->is))
3870 if ((!need_begline || mp->begline) && (!need_endline
3873 begline = mp->begline;
3874 endline = mp->endline;
3881 must *lmp = mp = mp->prev;
3883 /* In. Everything in left, plus everything in
3884 right, plus concatenation of
3885 left's right and right's left. */
3886 lmp->in = addlists (lmp->in, rmp->in);
3887 if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
3889 size_t lrlen = strlen (lmp->right);
3890 size_t rllen = strlen (rmp->left);
3891 char *tp = xmalloc (lrlen + rllen);
3892 memcpy (tp, lmp->right, lrlen);
3893 memcpy (tp + lrlen, rmp->left, rllen);
3894 lmp->in = enlist (lmp->in, tp, lrlen + rllen);
3898 if (lmp->is[0] != '\0')
3899 lmp->left = icatalloc (lmp->left, rmp->left);
3901 if (rmp->is[0] == '\0')
3902 lmp->right[0] = '\0';
3903 lmp->right = icatalloc (lmp->right, rmp->right);
3904 /* Guaranteed to be */
3905 if ((lmp->is[0] != '\0' || lmp->begline)
3906 && (rmp->is[0] != '\0' || rmp->endline))
3908 lmp->is = icatalloc (lmp->is, rmp->is);
3909 lmp->endline = rmp->endline;
3914 lmp->begline = false;
3915 lmp->endline = false;
3922 /* Not on *my* shift. */
3928 /* If T is a singleton, or if case-folding in a unibyte
3929 locale and T's members all case-fold to the same char,
3930 convert T to one of its members. Otherwise, do
3931 nothing further with T. */
3932 charclass *ccl = &d->charclasses[t - CSET];
3934 for (j = 0; j < NOTCHAR; j++)
3935 if (tstbit (j, *ccl))
3937 if (! (j < NOTCHAR))
3939 mp = allocmust (mp, 2);
3943 while (++j < NOTCHAR)
3944 if (tstbit (j, *ccl)
3945 && ! (case_fold_unibyte
3946 && toupper (j) == toupper (t)))
3950 mp = allocmust (mp, 2);
3956 if (d->tokens[ri + 1] == CAT)
3958 for (; rj < d->tindex - 1; rj += 2)
3960 if ((rj != ri && (d->tokens[rj] <= 0
3961 || NOTCHAR <= d->tokens[rj]))
3962 || d->tokens[rj + 1] != CAT)
3966 mp = allocmust (mp, ((rj - ri) >> 1) + 1);
3967 mp->is[0] = mp->left[0] = mp->right[0]
3968 = case_fold_unibyte ? toupper (t) : t;
3970 for (i = 1; ri + 2 < rj; i++)
3974 mp->is[i] = mp->left[i] = mp->right[i]
3975 = case_fold_unibyte ? toupper (t) : t;
3977 mp->is[i] = mp->left[i] = mp->right[i] = '\0';
3978 mp->in = enlist (mp->in, mp->is, i);
3987 dm = xmalloc (sizeof *dm);
3989 dm->begline = begline;
3990 dm->endline = endline;
3991 dm->must = xstrdup (result);
3996 must *prev = mp->prev;
4005 dfamustfree (struct dfamust *dm)
4014 return xmalloc (sizeof (struct dfa));
4017 /* Initialize DFA. */
4019 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4020 reg_syntax_t bits, int dfaopts)
4023 memset (dfa, 0, offsetof (struct dfa, dfaexec));
4024 dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4025 dfa->simple_locale = using_simple_locale (linfo->multibyte);
4026 dfa->localeinfo = *linfo;
4028 dfa->fast = !dfa->localeinfo.multibyte;
4031 dfa->lex.cur_mb_len = 1;
4032 dfa->syntax.syntax_bits_set = true;
4033 dfa->syntax.case_fold = (dfaopts & DFA_CASE_FOLD) != 0;
4034 dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4035 dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4036 dfa->syntax.syntax_bits = bits;
4038 for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
4040 unsigned char uc = i;
4042 dfa->syntax.sbit[uc] = char_context (dfa, uc);
4043 switch (dfa->syntax.sbit[uc])
4046 setbit (uc, dfa->syntax.letters);
4049 setbit (uc, dfa->syntax.newline);
4053 /* POSIX requires that the five bytes in "\n\r./" (including the
4054 terminating NUL) cannot occur inside a multibyte character. */
4055 dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4056 ? (uc & 0xc0) != 0x80
4057 : strchr ("\n\r./", uc) != NULL);
4061 /* vim:set shiftwidth=2: */