1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
49 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static int fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_token_t *token, int token_len,
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 int *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
96 int *char_class_alloc,
97 const char *class_name,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 const char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 RE_TRANSLATE_TYPE trans,
109 const char *class_name,
111 int non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 const char __re_error_msgid[] attribute_hidden =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 const size_t __re_error_msgid_idx[] attribute_hidden =
203 /* Entry points for GNU code. */
208 /* For ZOS USS we must define btowc */
217 memset(& mbs, 0, sizeof(mbs));
221 mbrtowc (wtmp, tmp, 1, & mbs);
226 /* re_compile_pattern is the GNU regular expression compiler: it
227 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
228 Returns 0 if the pattern was valid, otherwise an error string.
230 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
231 are set in BUFP on entry. */
234 re_compile_pattern (pattern, length, bufp)
237 struct re_pattern_buffer *bufp;
241 /* And GNU code determines whether or not to get register information
242 by passing null for the REGS argument to re_match, etc., not by
243 setting no_sub, unless RE_NO_SUB is set. */
244 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
246 /* Match anchors at newline. */
247 bufp->newline_anchor = 1;
249 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
253 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
256 weak_alias (__re_compile_pattern, re_compile_pattern)
259 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
260 also be assigned to arbitrarily: each pattern buffer stores its own
261 syntax, so it can be changed between regex compilations. */
262 /* This has no initializer because initialized variables in Emacs
263 become read-only after dumping. */
264 reg_syntax_t re_syntax_options;
267 /* Specify the precise syntax of regexps for compilation. This provides
268 for compatibility for various utilities which historically have
269 different, incompatible syntaxes.
271 The argument SYNTAX is a bit mask comprised of the various bits
272 defined in regex.h. We return the old syntax. */
275 re_set_syntax (syntax)
278 reg_syntax_t ret = re_syntax_options;
280 re_syntax_options = syntax;
284 weak_alias (__re_set_syntax, re_set_syntax)
288 re_compile_fastmap (bufp)
289 struct re_pattern_buffer *bufp;
291 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
292 char *fastmap = bufp->fastmap;
294 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
295 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
296 if (dfa->init_state != dfa->init_state_word)
297 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
298 if (dfa->init_state != dfa->init_state_nl)
299 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
300 if (dfa->init_state != dfa->init_state_begbuf)
301 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
302 bufp->fastmap_accurate = 1;
306 weak_alias (__re_compile_fastmap, re_compile_fastmap)
310 __attribute ((always_inline))
311 re_set_fastmap (char *fastmap, int icase, int ch)
315 fastmap[tolower (ch)] = 1;
318 /* Helper function for re_compile_fastmap.
319 Compile fastmap for the initial_state INIT_STATE. */
322 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
325 volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
327 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
328 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
330 int node = init_state->nodes.elems[node_cnt];
331 re_token_type_t type = dfa->nodes[node].type;
333 if (type == CHARACTER)
335 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
336 #ifdef RE_ENABLE_I18N
337 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
339 unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
344 *p++ = dfa->nodes[node].opr.c;
345 while (++node < dfa->nodes_len
346 && dfa->nodes[node].type == CHARACTER
347 && dfa->nodes[node].mb_partial)
348 *p++ = dfa->nodes[node].opr.c;
349 memset (&state, '\0', sizeof (state));
350 if (__mbrtowc (&wc, (const char *) buf, p - buf,
352 && (__wcrtomb ((char *) buf, towlower (wc), &state)
354 re_set_fastmap (fastmap, 0, buf[0]);
359 else if (type == SIMPLE_BRACKET)
362 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
365 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
366 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
367 if (w & ((bitset_word_t) 1 << j))
368 re_set_fastmap (fastmap, icase, ch);
371 #ifdef RE_ENABLE_I18N
372 else if (type == COMPLEX_BRACKET)
374 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
378 /* See if we have to try all bytes which start multiple collation
380 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
381 collation element, and don't catch 'b' since 'b' is
382 the only collation element which starts from 'b' (and
383 it is caught by SIMPLE_BRACKET). */
384 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
385 && (cset->ncoll_syms || cset->nranges))
387 const int32_t *table = (const int32_t *)
388 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
389 for (i = 0; i < SBC_MAX; ++i)
391 re_set_fastmap (fastmap, icase, i);
395 /* See if we have to start the match at all multibyte characters,
396 i.e. where we would not find an invalid sequence. This only
397 applies to multibyte character sets; for single byte character
398 sets, the SIMPLE_BRACKET again suffices. */
399 if (dfa->mb_cur_max > 1
400 && (cset->nchar_classes || cset->non_match || cset->nranges
402 || cset->nequiv_classes
410 memset (&mbs, 0, sizeof (mbs));
411 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
412 re_set_fastmap (fastmap, false, (int) c);
419 /* ... Else catch all bytes which can start the mbchars. */
420 for (i = 0; i < cset->nmbchars; ++i)
424 memset (&state, '\0', sizeof (state));
425 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
426 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
427 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
429 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
431 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
436 #endif /* RE_ENABLE_I18N */
437 else if (type == OP_PERIOD
438 #ifdef RE_ENABLE_I18N
439 || type == OP_UTF8_PERIOD
440 #endif /* RE_ENABLE_I18N */
441 || type == END_OF_RE)
443 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
444 if (type == END_OF_RE)
445 bufp->can_be_null = 1;
451 /* Entry point for POSIX code. */
452 /* regcomp takes a regular expression as a string and compiles it.
454 PREG is a regex_t *. We do not expect any fields to be initialized,
455 since POSIX says we shouldn't. Thus, we set
457 `buffer' to the compiled pattern;
458 `used' to the length of the compiled pattern;
459 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
460 REG_EXTENDED bit in CFLAGS is set; otherwise, to
461 RE_SYNTAX_POSIX_BASIC;
462 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
463 `fastmap' to an allocated space for the fastmap;
464 `fastmap_accurate' to zero;
465 `re_nsub' to the number of subexpressions in PATTERN.
467 PATTERN is the address of the pattern string.
469 CFLAGS is a series of bits which affect compilation.
471 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
472 use POSIX basic syntax.
474 If REG_NEWLINE is set, then . and [^...] don't match newline.
475 Also, regexec will try a match beginning after every newline.
477 If REG_ICASE is set, then we considers upper- and lowercase
478 versions of letters to be equivalent when matching.
480 If REG_NOSUB is set, then when PREG is passed to regexec, that
481 routine will report only success or failure, and nothing about the
484 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
485 the return codes and their meanings.) */
488 regcomp (preg, pattern, cflags)
489 regex_t *__restrict preg;
490 const char *__restrict pattern;
494 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
495 : RE_SYNTAX_POSIX_BASIC);
501 /* Try to allocate space for the fastmap. */
502 preg->fastmap = re_malloc (char, SBC_MAX);
503 if (BE (preg->fastmap == NULL, 0))
506 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
508 /* If REG_NEWLINE is set, newlines are treated differently. */
509 if (cflags & REG_NEWLINE)
510 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
511 syntax &= ~RE_DOT_NEWLINE;
512 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
513 /* It also changes the matching behavior. */
514 preg->newline_anchor = 1;
517 preg->newline_anchor = 0;
518 preg->no_sub = !!(cflags & REG_NOSUB);
519 preg->translate = NULL;
521 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
523 /* POSIX doesn't distinguish between an unmatched open-group and an
524 unmatched close-group: both are REG_EPAREN. */
525 if (ret == REG_ERPAREN)
528 /* We have already checked preg->fastmap != NULL. */
529 if (BE (ret == REG_NOERROR, 1))
530 /* Compute the fastmap now, since regexec cannot modify the pattern
531 buffer. This function never fails in this implementation. */
532 (void) re_compile_fastmap (preg);
535 /* Some error occurred while compiling the expression. */
536 re_free (preg->fastmap);
537 preg->fastmap = NULL;
543 weak_alias (__regcomp, regcomp)
546 /* Returns a message corresponding to an error code, ERRCODE, returned
547 from either regcomp or regexec. We don't use PREG here. */
550 regerror (errcode, preg, errbuf, errbuf_size)
552 const regex_t *__restrict preg;
553 char *__restrict errbuf;
560 || errcode >= (int) (sizeof (__re_error_msgid_idx)
561 / sizeof (__re_error_msgid_idx[0])), 0))
562 /* Only error codes returned by the rest of the code should be passed
563 to this routine. If we are given anything else, or if other regex
564 code generates an invalid error code, then the program has a bug.
565 Dump core so we can fix it. */
568 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
570 msg_size = strlen (msg) + 1; /* Includes the null. */
572 if (BE (errbuf_size != 0, 1))
574 if (BE (msg_size > errbuf_size, 0))
576 memcpy (errbuf, msg, errbuf_size - 1);
577 errbuf[errbuf_size - 1] = 0;
580 memcpy (errbuf, msg, msg_size);
586 weak_alias (__regerror, regerror)
590 #ifdef RE_ENABLE_I18N
591 /* This static array is used for the map to single-byte characters when
592 UTF-8 is used. Otherwise we would allocate memory just to initialize
593 it the same all the time. UTF-8 is the preferred encoding so this is
594 a worthwhile optimization. */
596 static const bitset_t utf8_sb_map = {
597 /* Set the first 128 bits. */
598 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
600 #else /* ! (__GNUC__ >= 3) */
601 static bitset_t utf8_sb_map;
602 #endif /* __GNUC__ >= 3 */
603 #endif /* RE_ENABLE_I18N */
607 free_dfa_content (re_dfa_t *dfa)
612 for (i = 0; i < dfa->nodes_len; ++i)
613 free_token (dfa->nodes + i);
614 re_free (dfa->nexts);
615 for (i = 0; i < dfa->nodes_len; ++i)
617 if (dfa->eclosures != NULL)
618 re_node_set_free (dfa->eclosures + i);
619 if (dfa->inveclosures != NULL)
620 re_node_set_free (dfa->inveclosures + i);
621 if (dfa->edests != NULL)
622 re_node_set_free (dfa->edests + i);
624 re_free (dfa->edests);
625 re_free (dfa->eclosures);
626 re_free (dfa->inveclosures);
627 re_free (dfa->nodes);
629 if (dfa->state_table)
630 for (i = 0; i <= dfa->state_hash_mask; ++i)
632 struct re_state_table_entry *entry = dfa->state_table + i;
633 for (j = 0; j < entry->num; ++j)
635 re_dfastate_t *state = entry->array[j];
638 re_free (entry->array);
640 re_free (dfa->state_table);
641 #ifdef RE_ENABLE_I18N
642 if (dfa->sb_char != utf8_sb_map)
643 re_free (dfa->sb_char);
645 re_free (dfa->subexp_map);
647 re_free (dfa->re_str);
654 /* Free dynamically allocated space used by PREG. */
660 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
661 if (BE (dfa != NULL, 1))
662 free_dfa_content (dfa);
666 re_free (preg->fastmap);
667 preg->fastmap = NULL;
669 re_free (preg->translate);
670 preg->translate = NULL;
673 weak_alias (__regfree, regfree)
676 /* Entry points compatible with 4.2 BSD regex library. We don't define
677 them unless specifically requested. */
679 #if defined _REGEX_RE_COMP || defined _LIBC
681 /* BSD has one and only one pattern buffer. */
682 static struct re_pattern_buffer re_comp_buf;
686 /* Make these definitions weak in libc, so POSIX programs can redefine
687 these names if they don't use our functions, and still use
688 regcomp/regexec above without link errors. */
699 if (!re_comp_buf.buffer)
700 return gettext ("No previous regular expression");
704 if (re_comp_buf.buffer)
706 fastmap = re_comp_buf.fastmap;
707 re_comp_buf.fastmap = NULL;
708 __regfree (&re_comp_buf);
709 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
710 re_comp_buf.fastmap = fastmap;
713 if (re_comp_buf.fastmap == NULL)
715 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
716 if (re_comp_buf.fastmap == NULL)
717 return (char *) gettext (__re_error_msgid
718 + __re_error_msgid_idx[(int) REG_ESPACE]);
721 /* Since `re_exec' always passes NULL for the `regs' argument, we
722 don't need to initialize the pattern buffer fields which affect it. */
724 /* Match anchors at newlines. */
725 re_comp_buf.newline_anchor = 1;
727 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
732 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
733 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
737 libc_freeres_fn (free_mem)
739 __regfree (&re_comp_buf);
743 #endif /* _REGEX_RE_COMP */
745 /* Internal entry point.
746 Compile the regular expression PATTERN, whose length is LENGTH.
747 SYNTAX indicate regular expression's syntax. */
750 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
753 reg_errcode_t err = REG_NOERROR;
757 /* Initialize the pattern buffer. */
758 preg->fastmap_accurate = 0;
759 preg->syntax = syntax;
760 preg->not_bol = preg->not_eol = 0;
763 preg->can_be_null = 0;
764 preg->regs_allocated = REGS_UNALLOCATED;
766 /* Initialize the dfa. */
767 dfa = (re_dfa_t *) preg->buffer;
768 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
770 /* If zero allocated, but buffer is non-null, try to realloc
771 enough space. This loses if buffer's address is bogus, but
772 that is the user's responsibility. If ->buffer is NULL this
773 is a simple allocation. */
774 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
777 preg->allocated = sizeof (re_dfa_t);
778 preg->buffer = (unsigned char *) dfa;
780 preg->used = sizeof (re_dfa_t);
782 err = init_dfa (dfa, length);
783 if (BE (err != REG_NOERROR, 0))
785 free_dfa_content (dfa);
791 /* Note: length+1 will not overflow since it is checked in init_dfa. */
792 dfa->re_str = re_malloc (char, length + 1);
793 strncpy (dfa->re_str, pattern, length + 1);
796 __libc_lock_init (dfa->lock);
798 err = re_string_construct (®exp, pattern, length, preg->translate,
799 syntax & RE_ICASE, dfa);
800 if (BE (err != REG_NOERROR, 0))
802 re_compile_internal_free_return:
803 free_workarea_compile (preg);
804 re_string_destruct (®exp);
805 free_dfa_content (dfa);
811 /* Parse the regular expression, and build a structure tree. */
813 dfa->str_tree = parse (®exp, preg, syntax, &err);
814 if (BE (dfa->str_tree == NULL, 0))
815 goto re_compile_internal_free_return;
817 /* Analyze the tree and create the nfa. */
818 err = analyze (preg);
819 if (BE (err != REG_NOERROR, 0))
820 goto re_compile_internal_free_return;
822 #ifdef RE_ENABLE_I18N
823 /* If possible, do searching in single byte encoding to speed things up. */
824 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
828 /* Then create the initial state of the dfa. */
829 err = create_initial_state (dfa);
831 /* Release work areas. */
832 free_workarea_compile (preg);
833 re_string_destruct (®exp);
835 if (BE (err != REG_NOERROR, 0))
837 free_dfa_content (dfa);
845 /* Initialize DFA. We use the length of the regular expression PAT_LEN
846 as the initial length of some arrays. */
849 init_dfa (re_dfa_t *dfa, size_t pat_len)
851 unsigned int table_size;
856 memset (dfa, '\0', sizeof (re_dfa_t));
858 /* Force allocation of str_tree_storage the first time. */
859 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
861 /* Avoid overflows. */
862 if (pat_len == SIZE_MAX)
865 dfa->nodes_alloc = pat_len + 1;
866 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
868 /* table_size = 2 ^ ceil(log pat_len) */
869 for (table_size = 1; ; table_size <<= 1)
870 if (table_size > pat_len)
873 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
874 dfa->state_hash_mask = table_size - 1;
876 dfa->mb_cur_max = MB_CUR_MAX;
878 if (dfa->mb_cur_max == 6
879 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
881 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
884 # ifdef HAVE_LANGINFO_CODESET
885 codeset_name = nl_langinfo (CODESET);
887 codeset_name = getenv ("LC_ALL");
888 if (codeset_name == NULL || codeset_name[0] == '\0')
889 codeset_name = getenv ("LC_CTYPE");
890 if (codeset_name == NULL || codeset_name[0] == '\0')
891 codeset_name = getenv ("LANG");
892 if (codeset_name == NULL)
894 else if (strchr (codeset_name, '.') != NULL)
895 codeset_name = strchr (codeset_name, '.') + 1;
898 /* strcasecmp isn't a standard interface. brute force check */
900 if (strcasecmp (codeset_name, "UTF-8") == 0
901 || strcasecmp (codeset_name, "UTF8") == 0)
904 if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u')
905 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
906 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
907 && (codeset_name[3] == '-'
908 ? codeset_name[4] == '8' && codeset_name[5] == '\0'
909 : codeset_name[3] == '8' && codeset_name[4] == '\0'))
913 /* We check exhaustively in the loop below if this charset is a
914 superset of ASCII. */
915 dfa->map_notascii = 0;
918 #ifdef RE_ENABLE_I18N
919 if (dfa->mb_cur_max > 1)
923 #if !defined(__GNUC__) || __GNUC__ < 3
924 static short utf8_sb_map_inited = 0;
926 if (! utf8_sb_map_inited)
930 utf8_sb_map_inited = 0;
931 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
932 utf8_sb_map[i] = BITSET_WORD_MAX;
935 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
941 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
942 if (BE (dfa->sb_char == NULL, 0))
945 /* Set the bits corresponding to single byte chars. */
946 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
947 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
949 wint_t wch = __btowc (ch);
951 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
953 if (isascii (ch) && wch != ch)
954 dfa->map_notascii = 1;
961 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
966 /* Initialize WORD_CHAR table, which indicate which character is
967 "word". In this case "word" means that it is the word construction
968 character used by some operators like "\<", "\>", etc. */
972 init_word_char (re_dfa_t *dfa)
975 dfa->word_ops_used = 1;
977 if (BE (dfa->map_notascii == 0, 1))
979 if (sizeof (dfa->word_char[0]) == 8)
981 dfa->word_char[0] = UINT64_C (0x03ff000000000000);
982 dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe);
985 else if (sizeof (dfa->word_char[0]) == 4)
987 dfa->word_char[0] = UINT32_C (0x00000000);
988 dfa->word_char[1] = UINT32_C (0x03ff0000);
989 dfa->word_char[2] = UINT32_C (0x87fffffe);
990 dfa->word_char[3] = UINT32_C (0x07fffffe);
997 if (BE (dfa->is_utf8, 1))
999 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
1005 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
1006 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
1007 if (isalnum (ch) || ch == '_')
1008 dfa->word_char[i] |= (bitset_word_t) 1 << j;
1011 /* Free the work area which are only used while compiling. */
1014 free_workarea_compile (regex_t *preg)
1016 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1017 bin_tree_storage_t *storage, *next;
1018 for (storage = dfa->str_tree_storage; storage; storage = next)
1020 next = storage->next;
1023 dfa->str_tree_storage = NULL;
1024 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1025 dfa->str_tree = NULL;
1026 re_free (dfa->org_indices);
1027 dfa->org_indices = NULL;
1030 /* Create initial states for all contexts. */
1032 static reg_errcode_t
1033 create_initial_state (re_dfa_t *dfa)
1037 re_node_set init_nodes;
1039 /* Initial states have the epsilon closure of the node which is
1040 the first node of the regular expression. */
1041 first = dfa->str_tree->first->node_idx;
1042 dfa->init_node = first;
1043 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1044 if (BE (err != REG_NOERROR, 0))
1047 /* The back-references which are in initial states can epsilon transit,
1048 since in this case all of the subexpressions can be null.
1049 Then we add epsilon closures of the nodes which are the next nodes of
1050 the back-references. */
1051 if (dfa->nbackref > 0)
1052 for (i = 0; i < init_nodes.nelem; ++i)
1054 int node_idx = init_nodes.elems[i];
1055 re_token_type_t type = dfa->nodes[node_idx].type;
1058 if (type != OP_BACK_REF)
1060 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1062 re_token_t *clexp_node;
1063 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1064 if (clexp_node->type == OP_CLOSE_SUBEXP
1065 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1068 if (clexp_idx == init_nodes.nelem)
1071 if (type == OP_BACK_REF)
1073 int dest_idx = dfa->edests[node_idx].elems[0];
1074 if (!re_node_set_contains (&init_nodes, dest_idx))
1076 reg_errcode_t err = re_node_set_merge (&init_nodes,
1079 if (err != REG_NOERROR)
1086 /* It must be the first time to invoke acquire_state. */
1087 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1088 /* We don't check ERR here, since the initial state must not be NULL. */
1089 if (BE (dfa->init_state == NULL, 0))
1091 if (dfa->init_state->has_constraint)
1093 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1095 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1097 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1101 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1102 || dfa->init_state_begbuf == NULL, 0))
1106 dfa->init_state_word = dfa->init_state_nl
1107 = dfa->init_state_begbuf = dfa->init_state;
1109 re_node_set_free (&init_nodes);
1113 #ifdef RE_ENABLE_I18N
1114 /* If it is possible to do searching in single byte encoding instead of UTF-8
1115 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1116 DFA nodes where needed. */
1119 optimize_utf8 (re_dfa_t *dfa)
1121 int node, i, mb_chars = 0, has_period = 0;
1123 for (node = 0; node < dfa->nodes_len; ++node)
1124 switch (dfa->nodes[node].type)
1127 if (dfa->nodes[node].opr.c >= 0x80)
1131 switch (dfa->nodes[node].opr.ctx_type)
1139 /* Word anchors etc. cannot be handled. It's okay to test
1140 opr.ctx_type since constraints (for all DFA nodes) are
1141 created by ORing one or more opr.ctx_type values. */
1151 case OP_DUP_ASTERISK:
1152 case OP_OPEN_SUBEXP:
1153 case OP_CLOSE_SUBEXP:
1155 case COMPLEX_BRACKET:
1157 case SIMPLE_BRACKET:
1158 /* Just double check. The non-ASCII range starts at 0x80. */
1159 assert (0x80 % BITSET_WORD_BITS == 0);
1160 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1161 if (dfa->nodes[node].opr.sbcset[i])
1168 if (mb_chars || has_period)
1169 for (node = 0; node < dfa->nodes_len; ++node)
1171 if (dfa->nodes[node].type == CHARACTER
1172 && dfa->nodes[node].opr.c >= 0x80)
1173 dfa->nodes[node].mb_partial = 0;
1174 else if (dfa->nodes[node].type == OP_PERIOD)
1175 dfa->nodes[node].type = OP_UTF8_PERIOD;
1178 /* The search can be in single byte locale. */
1179 dfa->mb_cur_max = 1;
1181 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1185 /* Analyze the structure tree, and calculate "first", "next", "edest",
1186 "eclosure", and "inveclosure". */
1188 static reg_errcode_t
1189 analyze (regex_t *preg)
1191 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1194 /* Allocate arrays. */
1195 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1196 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1197 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1198 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1199 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1200 || dfa->eclosures == NULL, 0))
1203 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1204 if (dfa->subexp_map != NULL)
1207 for (i = 0; i < preg->re_nsub; i++)
1208 dfa->subexp_map[i] = i;
1209 preorder (dfa->str_tree, optimize_subexps, dfa);
1210 for (i = 0; i < preg->re_nsub; i++)
1211 if (dfa->subexp_map[i] != i)
1213 if (i == preg->re_nsub)
1215 free (dfa->subexp_map);
1216 dfa->subexp_map = NULL;
1220 ret = postorder (dfa->str_tree, lower_subexps, preg);
1221 if (BE (ret != REG_NOERROR, 0))
1223 ret = postorder (dfa->str_tree, calc_first, dfa);
1224 if (BE (ret != REG_NOERROR, 0))
1226 preorder (dfa->str_tree, calc_next, dfa);
1227 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1228 if (BE (ret != REG_NOERROR, 0))
1230 ret = calc_eclosure (dfa);
1231 if (BE (ret != REG_NOERROR, 0))
1234 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1235 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1236 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1239 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1240 if (BE (dfa->inveclosures == NULL, 0))
1242 ret = calc_inveclosure (dfa);
1248 /* Our parse trees are very unbalanced, so we cannot use a stack to
1249 implement parse tree visits. Instead, we use parent pointers and
1250 some hairy code in these two functions. */
1251 static reg_errcode_t
1252 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1255 bin_tree_t *node, *prev;
1257 for (node = root; ; )
1259 /* Descend down the tree, preferably to the left (or to the right
1260 if that's the only child). */
1261 while (node->left || node->right)
1269 reg_errcode_t err = fn (extra, node);
1270 if (BE (err != REG_NOERROR, 0))
1272 if (node->parent == NULL)
1275 node = node->parent;
1277 /* Go up while we have a node that is reached from the right. */
1278 while (node->right == prev || node->right == NULL);
1283 static reg_errcode_t
1284 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1289 for (node = root; ; )
1291 reg_errcode_t err = fn (extra, node);
1292 if (BE (err != REG_NOERROR, 0))
1295 /* Go to the left node, or up and to the right. */
1300 bin_tree_t *prev = NULL;
1301 while (node->right == prev || node->right == NULL)
1304 node = node->parent;
1313 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1314 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1315 backreferences as well. Requires a preorder visit. */
1316 static reg_errcode_t
1317 optimize_subexps (void *extra, bin_tree_t *node)
1319 re_dfa_t *dfa = (re_dfa_t *) extra;
1321 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1323 int idx = node->token.opr.idx;
1324 node->token.opr.idx = dfa->subexp_map[idx];
1325 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1328 else if (node->token.type == SUBEXP
1329 && node->left && node->left->token.type == SUBEXP)
1331 int other_idx = node->left->token.opr.idx;
1333 node->left = node->left->left;
1335 node->left->parent = node;
1337 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1338 if (other_idx < BITSET_WORD_BITS)
1339 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1345 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1346 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1347 static reg_errcode_t
1348 lower_subexps (void *extra, bin_tree_t *node)
1350 regex_t *preg = (regex_t *) extra;
1351 reg_errcode_t err = REG_NOERROR;
1353 if (node->left && node->left->token.type == SUBEXP)
1355 node->left = lower_subexp (&err, preg, node->left);
1357 node->left->parent = node;
1359 if (node->right && node->right->token.type == SUBEXP)
1361 node->right = lower_subexp (&err, preg, node->right);
1363 node->right->parent = node;
1370 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1372 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1373 bin_tree_t *body = node->left;
1374 bin_tree_t *op, *cls, *tree1, *tree;
1377 /* We do not optimize empty subexpressions, because otherwise we may
1378 have bad CONCAT nodes with NULL children. This is obviously not
1379 very common, so we do not lose much. An example that triggers
1380 this case is the sed "script" /\(\)/x. */
1381 && node->left != NULL
1382 && (node->token.opr.idx >= BITSET_WORD_BITS
1383 || !(dfa->used_bkref_map
1384 & ((bitset_word_t) 1 << node->token.opr.idx))))
1387 /* Convert the SUBEXP node to the concatenation of an
1388 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1389 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1390 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1391 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1392 tree = create_tree (dfa, op, tree1, CONCAT);
1393 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1399 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1400 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1404 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1405 nodes. Requires a postorder visit. */
1406 static reg_errcode_t
1407 calc_first (void *extra, bin_tree_t *node)
1409 re_dfa_t *dfa = (re_dfa_t *) extra;
1410 if (node->token.type == CONCAT)
1412 node->first = node->left->first;
1413 node->node_idx = node->left->node_idx;
1418 node->node_idx = re_dfa_add_node (dfa, node->token);
1419 if (BE (node->node_idx == -1, 0))
1421 if (node->token.type == ANCHOR)
1422 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1427 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1428 static reg_errcode_t
1429 calc_next (void *extra, bin_tree_t *node)
1431 switch (node->token.type)
1433 case OP_DUP_ASTERISK:
1434 node->left->next = node;
1437 node->left->next = node->right->first;
1438 node->right->next = node->next;
1442 node->left->next = node->next;
1444 node->right->next = node->next;
1450 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1451 static reg_errcode_t
1452 link_nfa_nodes (void *extra, bin_tree_t *node)
1454 re_dfa_t *dfa = (re_dfa_t *) extra;
1455 int idx = node->node_idx;
1456 reg_errcode_t err = REG_NOERROR;
1458 switch (node->token.type)
1464 assert (node->next == NULL);
1467 case OP_DUP_ASTERISK:
1471 dfa->has_plural_match = 1;
1472 if (node->left != NULL)
1473 left = node->left->first->node_idx;
1475 left = node->next->node_idx;
1476 if (node->right != NULL)
1477 right = node->right->first->node_idx;
1479 right = node->next->node_idx;
1481 assert (right > -1);
1482 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1487 case OP_OPEN_SUBEXP:
1488 case OP_CLOSE_SUBEXP:
1489 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1493 dfa->nexts[idx] = node->next->node_idx;
1494 if (node->token.type == OP_BACK_REF)
1495 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1499 assert (!IS_EPSILON_NODE (node->token.type));
1500 dfa->nexts[idx] = node->next->node_idx;
1507 /* Duplicate the epsilon closure of the node ROOT_NODE.
1508 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1509 to their own constraint. */
1511 static reg_errcode_t
1513 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1514 int root_node, unsigned int init_constraint)
1516 int org_node, clone_node, ret;
1517 unsigned int constraint = init_constraint;
1518 for (org_node = top_org_node, clone_node = top_clone_node;;)
1520 int org_dest, clone_dest;
1521 if (dfa->nodes[org_node].type == OP_BACK_REF)
1523 /* If the back reference epsilon-transit, its destination must
1524 also have the constraint. Then duplicate the epsilon closure
1525 of the destination of the back reference, and store it in
1526 edests of the back reference. */
1527 org_dest = dfa->nexts[org_node];
1528 re_node_set_empty (dfa->edests + clone_node);
1529 clone_dest = duplicate_node (dfa, org_dest, constraint);
1530 if (BE (clone_dest == -1, 0))
1532 dfa->nexts[clone_node] = dfa->nexts[org_node];
1533 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1534 if (BE (ret < 0, 0))
1537 else if (dfa->edests[org_node].nelem == 0)
1539 /* In case of the node can't epsilon-transit, don't duplicate the
1540 destination and store the original destination as the
1541 destination of the node. */
1542 dfa->nexts[clone_node] = dfa->nexts[org_node];
1545 else if (dfa->edests[org_node].nelem == 1)
1547 /* In case of the node can epsilon-transit, and it has only one
1549 org_dest = dfa->edests[org_node].elems[0];
1550 re_node_set_empty (dfa->edests + clone_node);
1551 /* If the node is root_node itself, it means the epsilon clsoure
1552 has a loop. Then tie it to the destination of the root_node. */
1553 if (org_node == root_node && clone_node != org_node)
1555 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1556 if (BE (ret < 0, 0))
1560 /* In case of the node has another constraint, add it. */
1561 constraint |= dfa->nodes[org_node].constraint;
1562 clone_dest = duplicate_node (dfa, org_dest, constraint);
1563 if (BE (clone_dest == -1, 0))
1565 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1566 if (BE (ret < 0, 0))
1569 else /* dfa->edests[org_node].nelem == 2 */
1571 /* In case of the node can epsilon-transit, and it has two
1572 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1573 org_dest = dfa->edests[org_node].elems[0];
1574 re_node_set_empty (dfa->edests + clone_node);
1575 /* Search for a duplicated node which satisfies the constraint. */
1576 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1577 if (clone_dest == -1)
1579 /* There is no such duplicated node, create a new one. */
1581 clone_dest = duplicate_node (dfa, org_dest, constraint);
1582 if (BE (clone_dest == -1, 0))
1584 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1585 if (BE (ret < 0, 0))
1587 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1588 root_node, constraint);
1589 if (BE (err != REG_NOERROR, 0))
1594 /* There is a duplicated node which satisfies the constraint,
1595 use it to avoid infinite loop. */
1596 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1597 if (BE (ret < 0, 0))
1601 org_dest = dfa->edests[org_node].elems[1];
1602 clone_dest = duplicate_node (dfa, org_dest, constraint);
1603 if (BE (clone_dest == -1, 0))
1605 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1606 if (BE (ret < 0, 0))
1609 org_node = org_dest;
1610 clone_node = clone_dest;
1615 /* Search for a node which is duplicated from the node ORG_NODE, and
1616 satisfies the constraint CONSTRAINT. */
1619 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1620 unsigned int constraint)
1623 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1625 if (org_node == dfa->org_indices[idx]
1626 && constraint == dfa->nodes[idx].constraint)
1627 return idx; /* Found. */
1629 return -1; /* Not found. */
1632 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1633 Return the index of the new node, or -1 if insufficient storage is
1637 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1639 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1640 if (BE (dup_idx != -1, 1))
1642 dfa->nodes[dup_idx].constraint = constraint;
1643 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1644 dfa->nodes[dup_idx].duplicated = 1;
1646 /* Store the index of the original node. */
1647 dfa->org_indices[dup_idx] = org_idx;
1652 static reg_errcode_t
1653 calc_inveclosure (re_dfa_t *dfa)
1656 for (idx = 0; idx < dfa->nodes_len; ++idx)
1657 re_node_set_init_empty (dfa->inveclosures + idx);
1659 for (src = 0; src < dfa->nodes_len; ++src)
1661 int *elems = dfa->eclosures[src].elems;
1662 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1664 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1665 if (BE (ret == -1, 0))
1673 /* Calculate "eclosure" for all the node in DFA. */
1675 static reg_errcode_t
1676 calc_eclosure (re_dfa_t *dfa)
1678 int node_idx, incomplete;
1680 assert (dfa->nodes_len > 0);
1683 /* For each nodes, calculate epsilon closure. */
1684 for (node_idx = 0; ; ++node_idx)
1687 re_node_set eclosure_elem;
1688 if (node_idx == dfa->nodes_len)
1697 assert (dfa->eclosures[node_idx].nelem != -1);
1700 /* If we have already calculated, skip it. */
1701 if (dfa->eclosures[node_idx].nelem != 0)
1703 /* Calculate epsilon closure of `node_idx'. */
1704 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1705 if (BE (err != REG_NOERROR, 0))
1708 if (dfa->eclosures[node_idx].nelem == 0)
1711 re_node_set_free (&eclosure_elem);
1717 /* Calculate epsilon closure of NODE. */
1719 static reg_errcode_t
1720 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1724 re_node_set eclosure;
1727 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1728 if (BE (err != REG_NOERROR, 0))
1731 /* This indicates that we are calculating this node now.
1732 We reference this value to avoid infinite loop. */
1733 dfa->eclosures[node].nelem = -1;
1735 /* If the current node has constraints, duplicate all nodes
1736 since they must inherit the constraints. */
1737 if (dfa->nodes[node].constraint
1738 && dfa->edests[node].nelem
1739 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1741 err = duplicate_node_closure (dfa, node, node, node,
1742 dfa->nodes[node].constraint);
1743 if (BE (err != REG_NOERROR, 0))
1747 /* Expand each epsilon destination nodes. */
1748 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1749 for (i = 0; i < dfa->edests[node].nelem; ++i)
1751 re_node_set eclosure_elem;
1752 int edest = dfa->edests[node].elems[i];
1753 /* If calculating the epsilon closure of `edest' is in progress,
1754 return intermediate result. */
1755 if (dfa->eclosures[edest].nelem == -1)
1760 /* If we haven't calculated the epsilon closure of `edest' yet,
1761 calculate now. Otherwise use calculated epsilon closure. */
1762 if (dfa->eclosures[edest].nelem == 0)
1764 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1765 if (BE (err != REG_NOERROR, 0))
1769 eclosure_elem = dfa->eclosures[edest];
1770 /* Merge the epsilon closure of `edest'. */
1771 err = re_node_set_merge (&eclosure, &eclosure_elem);
1772 if (BE (err != REG_NOERROR, 0))
1774 /* If the epsilon closure of `edest' is incomplete,
1775 the epsilon closure of this node is also incomplete. */
1776 if (dfa->eclosures[edest].nelem == 0)
1779 re_node_set_free (&eclosure_elem);
1783 /* An epsilon closure includes itself. */
1784 ret = re_node_set_insert (&eclosure, node);
1785 if (BE (ret < 0, 0))
1787 if (incomplete && !root)
1788 dfa->eclosures[node].nelem = 0;
1790 dfa->eclosures[node] = eclosure;
1791 *new_set = eclosure;
1795 /* Functions for token which are used in the parser. */
1797 /* Fetch a token from INPUT.
1798 We must not use this function inside bracket expressions. */
1802 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1804 re_string_skip_bytes (input, peek_token (result, input, syntax));
1807 /* Peek a token from INPUT, and return the length of the token.
1808 We must not use this function inside bracket expressions. */
1812 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1816 if (re_string_eoi (input))
1818 token->type = END_OF_RE;
1822 c = re_string_peek_byte (input, 0);
1825 token->word_char = 0;
1826 #ifdef RE_ENABLE_I18N
1827 token->mb_partial = 0;
1828 if (input->mb_cur_max > 1 &&
1829 !re_string_first_byte (input, re_string_cur_idx (input)))
1831 token->type = CHARACTER;
1832 token->mb_partial = 1;
1839 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1841 token->type = BACK_SLASH;
1845 c2 = re_string_peek_byte_case (input, 1);
1847 token->type = CHARACTER;
1848 #ifdef RE_ENABLE_I18N
1849 if (input->mb_cur_max > 1)
1851 wint_t wc = re_string_wchar_at (input,
1852 re_string_cur_idx (input) + 1);
1853 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1857 token->word_char = IS_WORD_CHAR (c2) != 0;
1862 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1863 token->type = OP_ALT;
1865 case '1': case '2': case '3': case '4': case '5':
1866 case '6': case '7': case '8': case '9':
1867 if (!(syntax & RE_NO_BK_REFS))
1869 token->type = OP_BACK_REF;
1870 token->opr.idx = c2 - '1';
1874 if (!(syntax & RE_NO_GNU_OPS))
1876 token->type = ANCHOR;
1877 token->opr.ctx_type = WORD_FIRST;
1881 if (!(syntax & RE_NO_GNU_OPS))
1883 token->type = ANCHOR;
1884 token->opr.ctx_type = WORD_LAST;
1888 if (!(syntax & RE_NO_GNU_OPS))
1890 token->type = ANCHOR;
1891 token->opr.ctx_type = WORD_DELIM;
1895 if (!(syntax & RE_NO_GNU_OPS))
1897 token->type = ANCHOR;
1898 token->opr.ctx_type = NOT_WORD_DELIM;
1902 if (!(syntax & RE_NO_GNU_OPS))
1903 token->type = OP_WORD;
1906 if (!(syntax & RE_NO_GNU_OPS))
1907 token->type = OP_NOTWORD;
1910 if (!(syntax & RE_NO_GNU_OPS))
1911 token->type = OP_SPACE;
1914 if (!(syntax & RE_NO_GNU_OPS))
1915 token->type = OP_NOTSPACE;
1918 if (!(syntax & RE_NO_GNU_OPS))
1920 token->type = ANCHOR;
1921 token->opr.ctx_type = BUF_FIRST;
1925 if (!(syntax & RE_NO_GNU_OPS))
1927 token->type = ANCHOR;
1928 token->opr.ctx_type = BUF_LAST;
1932 if (!(syntax & RE_NO_BK_PARENS))
1933 token->type = OP_OPEN_SUBEXP;
1936 if (!(syntax & RE_NO_BK_PARENS))
1937 token->type = OP_CLOSE_SUBEXP;
1940 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1941 token->type = OP_DUP_PLUS;
1944 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1945 token->type = OP_DUP_QUESTION;
1948 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1949 token->type = OP_OPEN_DUP_NUM;
1952 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1953 token->type = OP_CLOSE_DUP_NUM;
1961 token->type = CHARACTER;
1962 #ifdef RE_ENABLE_I18N
1963 if (input->mb_cur_max > 1)
1965 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1966 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1970 token->word_char = IS_WORD_CHAR (token->opr.c);
1975 if (syntax & RE_NEWLINE_ALT)
1976 token->type = OP_ALT;
1979 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1980 token->type = OP_ALT;
1983 token->type = OP_DUP_ASTERISK;
1986 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1987 token->type = OP_DUP_PLUS;
1990 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1991 token->type = OP_DUP_QUESTION;
1994 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1995 token->type = OP_OPEN_DUP_NUM;
1998 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1999 token->type = OP_CLOSE_DUP_NUM;
2002 if (syntax & RE_NO_BK_PARENS)
2003 token->type = OP_OPEN_SUBEXP;
2006 if (syntax & RE_NO_BK_PARENS)
2007 token->type = OP_CLOSE_SUBEXP;
2010 token->type = OP_OPEN_BRACKET;
2013 token->type = OP_PERIOD;
2016 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2017 re_string_cur_idx (input) != 0)
2019 char prev = re_string_peek_byte (input, -1);
2020 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2023 token->type = ANCHOR;
2024 token->opr.ctx_type = LINE_FIRST;
2027 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2028 re_string_cur_idx (input) + 1 != re_string_length (input))
2031 re_string_skip_bytes (input, 1);
2032 peek_token (&next, input, syntax);
2033 re_string_skip_bytes (input, -1);
2034 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2037 token->type = ANCHOR;
2038 token->opr.ctx_type = LINE_LAST;
2046 /* Peek a token from INPUT, and return the length of the token.
2047 We must not use this function out of bracket expressions. */
2051 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2054 if (re_string_eoi (input))
2056 token->type = END_OF_RE;
2059 c = re_string_peek_byte (input, 0);
2062 #ifdef RE_ENABLE_I18N
2063 if (input->mb_cur_max > 1 &&
2064 !re_string_first_byte (input, re_string_cur_idx (input)))
2066 token->type = CHARACTER;
2069 #endif /* RE_ENABLE_I18N */
2071 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2072 && re_string_cur_idx (input) + 1 < re_string_length (input))
2074 /* In this case, '\' escape a character. */
2076 re_string_skip_bytes (input, 1);
2077 c2 = re_string_peek_byte (input, 0);
2079 token->type = CHARACTER;
2082 if (c == '[') /* '[' is a special char in a bracket exps. */
2086 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2087 c2 = re_string_peek_byte (input, 1);
2095 token->type = OP_OPEN_COLL_ELEM;
2098 token->type = OP_OPEN_EQUIV_CLASS;
2101 if (syntax & RE_CHAR_CLASSES)
2103 token->type = OP_OPEN_CHAR_CLASS;
2106 /* else fall through. */
2108 token->type = CHARACTER;
2118 token->type = OP_CHARSET_RANGE;
2121 token->type = OP_CLOSE_BRACKET;
2124 token->type = OP_NON_MATCH_LIST;
2127 token->type = CHARACTER;
2132 /* Functions for parser. */
2134 /* Entry point of the parser.
2135 Parse the regular expression REGEXP and return the structure tree.
2136 If an error is occured, ERR is set by error code, and return NULL.
2137 This function build the following tree, from regular expression <reg_exp>:
2143 CAT means concatenation.
2144 EOR means end of regular expression. */
2147 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2150 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2151 bin_tree_t *tree, *eor, *root;
2152 re_token_t current_token;
2153 dfa->syntax = syntax;
2154 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2155 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2156 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2158 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2160 root = create_tree (dfa, tree, eor, CONCAT);
2163 if (BE (eor == NULL || root == NULL, 0))
2171 /* This function build the following tree, from regular expression
2172 <branch1>|<branch2>:
2178 ALT means alternative, which represents the operator `|'. */
2181 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2182 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2184 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2185 bin_tree_t *tree, *branch = NULL;
2186 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2187 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2190 while (token->type == OP_ALT)
2192 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2193 if (token->type != OP_ALT && token->type != END_OF_RE
2194 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2196 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2197 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2202 tree = create_tree (dfa, tree, branch, OP_ALT);
2203 if (BE (tree == NULL, 0))
2212 /* This function build the following tree, from regular expression
2219 CAT means concatenation. */
2222 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2223 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2225 bin_tree_t *tree, *exp;
2226 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2227 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2228 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2231 while (token->type != OP_ALT && token->type != END_OF_RE
2232 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2234 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2235 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2238 postorder (tree, free_tree, NULL);
2241 if (tree != NULL && exp != NULL)
2243 bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2244 if (newtree == NULL)
2246 postorder (exp, free_tree, NULL);
2247 postorder (tree, free_tree, NULL);
2253 else if (tree == NULL)
2255 /* Otherwise exp == NULL, we don't need to create new tree. */
2260 /* This function build the following tree, from regular expression a*:
2267 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2268 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2270 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2272 switch (token->type)
2275 tree = create_token_tree (dfa, NULL, NULL, token);
2276 if (BE (tree == NULL, 0))
2281 #ifdef RE_ENABLE_I18N
2282 if (dfa->mb_cur_max > 1)
2284 while (!re_string_eoi (regexp)
2285 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2287 bin_tree_t *mbc_remain;
2288 fetch_token (token, regexp, syntax);
2289 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2290 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2291 if (BE (mbc_remain == NULL || tree == NULL, 0))
2300 case OP_OPEN_SUBEXP:
2301 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2302 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2305 case OP_OPEN_BRACKET:
2306 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2307 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2311 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2316 dfa->used_bkref_map |= 1 << token->opr.idx;
2317 tree = create_token_tree (dfa, NULL, NULL, token);
2318 if (BE (tree == NULL, 0))
2324 dfa->has_mb_node = 1;
2326 case OP_OPEN_DUP_NUM:
2327 if (syntax & RE_CONTEXT_INVALID_DUP)
2333 case OP_DUP_ASTERISK:
2335 case OP_DUP_QUESTION:
2336 if (syntax & RE_CONTEXT_INVALID_OPS)
2341 else if (syntax & RE_CONTEXT_INDEP_OPS)
2343 fetch_token (token, regexp, syntax);
2344 return parse_expression (regexp, preg, token, syntax, nest, err);
2346 /* else fall through */
2347 case OP_CLOSE_SUBEXP:
2348 if ((token->type == OP_CLOSE_SUBEXP) &&
2349 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2354 /* else fall through */
2355 case OP_CLOSE_DUP_NUM:
2356 /* We treat it as a normal character. */
2358 /* Then we can these characters as normal characters. */
2359 token->type = CHARACTER;
2360 /* mb_partial and word_char bits should be initialized already
2362 tree = create_token_tree (dfa, NULL, NULL, token);
2363 if (BE (tree == NULL, 0))
2370 if ((token->opr.ctx_type
2371 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2372 && dfa->word_ops_used == 0)
2373 init_word_char (dfa);
2374 if (token->opr.ctx_type == WORD_DELIM
2375 || token->opr.ctx_type == NOT_WORD_DELIM)
2377 bin_tree_t *tree_first, *tree_last;
2378 if (token->opr.ctx_type == WORD_DELIM)
2380 token->opr.ctx_type = WORD_FIRST;
2381 tree_first = create_token_tree (dfa, NULL, NULL, token);
2382 token->opr.ctx_type = WORD_LAST;
2386 token->opr.ctx_type = INSIDE_WORD;
2387 tree_first = create_token_tree (dfa, NULL, NULL, token);
2388 token->opr.ctx_type = INSIDE_NOTWORD;
2390 tree_last = create_token_tree (dfa, NULL, NULL, token);
2391 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2392 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2400 tree = create_token_tree (dfa, NULL, NULL, token);
2401 if (BE (tree == NULL, 0))
2407 /* We must return here, since ANCHORs can't be followed
2408 by repetition operators.
2409 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2410 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2411 fetch_token (token, regexp, syntax);
2414 tree = create_token_tree (dfa, NULL, NULL, token);
2415 if (BE (tree == NULL, 0))
2420 if (dfa->mb_cur_max > 1)
2421 dfa->has_mb_node = 1;
2425 tree = build_charclass_op (dfa, regexp->trans,
2428 token->type == OP_NOTWORD, err);
2429 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2434 tree = build_charclass_op (dfa, regexp->trans,
2437 token->type == OP_NOTSPACE, err);
2438 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2448 /* Must not happen? */
2454 fetch_token (token, regexp, syntax);
2456 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2457 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2459 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2460 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2462 /* In BRE consecutive duplications are not allowed. */
2463 if ((syntax & RE_CONTEXT_INVALID_DUP)
2464 && (token->type == OP_DUP_ASTERISK
2465 || token->type == OP_OPEN_DUP_NUM))
2475 /* This function build the following tree, from regular expression
2483 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2484 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2486 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2489 cur_nsub = preg->re_nsub++;
2491 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2493 /* The subexpression may be a null string. */
2494 if (token->type == OP_CLOSE_SUBEXP)
2498 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2499 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2502 postorder (tree, free_tree, NULL);
2505 if (BE (*err != REG_NOERROR, 0))
2509 if (cur_nsub <= '9' - '1')
2510 dfa->completed_bkref_map |= 1 << cur_nsub;
2512 tree = create_tree (dfa, tree, NULL, SUBEXP);
2513 if (BE (tree == NULL, 0))
2518 tree->token.opr.idx = cur_nsub;
2522 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2525 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2526 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2528 bin_tree_t *tree = NULL, *old_tree = NULL;
2529 int i, start, end, start_idx = re_string_cur_idx (regexp);
2530 #ifndef RE_TOKEN_INIT_BUG
2531 re_token_t start_token = *token;
2533 re_token_t start_token;
2535 memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2538 if (token->type == OP_OPEN_DUP_NUM)
2541 start = fetch_number (regexp, token, syntax);
2544 if (token->type == CHARACTER && token->opr.c == ',')
2545 start = 0; /* We treat "{,m}" as "{0,m}". */
2548 *err = REG_BADBR; /* <re>{} is invalid. */
2552 if (BE (start != -2, 1))
2554 /* We treat "{n}" as "{n,n}". */
2555 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2556 : ((token->type == CHARACTER && token->opr.c == ',')
2557 ? fetch_number (regexp, token, syntax) : -2));
2559 if (BE (start == -2 || end == -2, 0))
2561 /* Invalid sequence. */
2562 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2564 if (token->type == END_OF_RE)
2572 /* If the syntax bit is set, rollback. */
2573 re_string_set_index (regexp, start_idx);
2574 *token = start_token;
2575 token->type = CHARACTER;
2576 /* mb_partial and word_char bits should be already initialized by
2581 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2583 /* First number greater than second. */
2590 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2591 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2594 fetch_token (token, regexp, syntax);
2596 if (BE (elem == NULL, 0))
2598 if (BE (start == 0 && end == 0, 0))
2600 postorder (elem, free_tree, NULL);
2604 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2605 if (BE (start > 0, 0))
2608 for (i = 2; i <= start; ++i)
2610 elem = duplicate_tree (elem, dfa);
2611 tree = create_tree (dfa, tree, elem, CONCAT);
2612 if (BE (elem == NULL || tree == NULL, 0))
2613 goto parse_dup_op_espace;
2619 /* Duplicate ELEM before it is marked optional. */
2620 elem = duplicate_tree (elem, dfa);
2626 if (elem->token.type == SUBEXP)
2627 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2629 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2630 if (BE (tree == NULL, 0))
2631 goto parse_dup_op_espace;
2633 /* This loop is actually executed only when end != -1,
2634 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2635 already created the start+1-th copy. */
2636 for (i = start + 2; i <= end; ++i)
2638 elem = duplicate_tree (elem, dfa);
2639 tree = create_tree (dfa, tree, elem, CONCAT);
2640 if (BE (elem == NULL || tree == NULL, 0))
2641 goto parse_dup_op_espace;
2643 tree = create_tree (dfa, tree, NULL, OP_ALT);
2644 if (BE (tree == NULL, 0))
2645 goto parse_dup_op_espace;
2649 tree = create_tree (dfa, old_tree, tree, CONCAT);
2653 parse_dup_op_espace:
2658 /* Size of the names for collating symbol/equivalence_class/character_class.
2659 I'm not sure, but maybe enough. */
2660 #define BRACKET_NAME_BUF_SIZE 32
2663 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2664 Build the range expression which starts from START_ELEM, and ends
2665 at END_ELEM. The result are written to MBCSET and SBCSET.
2666 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2667 mbcset->range_ends, is a pointer argument sinse we may
2670 static reg_errcode_t
2672 # ifdef RE_ENABLE_I18N
2673 build_range_exp (reg_syntax_t syntax, bitset_t sbcset, re_charset_t *mbcset,
2674 int *range_alloc, bracket_elem_t *start_elem,
2675 bracket_elem_t *end_elem)
2676 # else /* not RE_ENABLE_I18N */
2677 build_range_exp (reg_syntax_t syntax, bitset_t sbcset,
2678 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2679 # endif /* not RE_ENABLE_I18N */
2681 unsigned int start_ch, end_ch;
2683 /* Equivalence Classes and Character Classes can't be a range start/end. */
2684 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2685 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2689 /* We can handle no multi character collating elements without libc
2691 if (BE ((start_elem->type == COLL_SYM
2692 && strlen ((char *) start_elem->opr.name) > 1)
2693 || (end_elem->type == COLL_SYM
2694 && strlen ((char *) end_elem->opr.name) > 1), 0))
2695 return REG_ECOLLATE;
2697 # ifdef RE_ENABLE_I18N
2703 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2704 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2706 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2707 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2711 * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2712 * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2713 * unsigned, so we don't have sign extension problems.
2715 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2716 ? start_ch : start_elem->opr.wch);
2717 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2718 ? end_ch : end_elem->opr.wch);
2720 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2721 ? __btowc (start_ch) : start_elem->opr.wch);
2722 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2723 ? __btowc (end_ch) : end_elem->opr.wch);
2725 if (start_wc == WEOF || end_wc == WEOF)
2726 return REG_ECOLLATE;
2727 else if ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc)
2730 /* Got valid collation sequence values, add them as a new entry.
2731 However, for !_LIBC we have no collation elements: if the
2732 character set is single byte, the single byte character set
2733 that we build below suffices. parse_bracket_exp passes
2734 no MBCSET if dfa->mb_cur_max == 1. */
2737 /* Check the space of the arrays. */
2738 if (BE (*range_alloc == mbcset->nranges, 0))
2740 /* There is not enough space, need realloc. */
2741 wchar_t *new_array_start, *new_array_end;
2744 /* +1 in case of mbcset->nranges is 0. */
2745 new_nranges = 2 * mbcset->nranges + 1;
2746 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2747 are NULL if *range_alloc == 0. */
2748 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2750 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2753 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2756 mbcset->range_starts = new_array_start;
2757 mbcset->range_ends = new_array_end;
2758 *range_alloc = new_nranges;
2761 mbcset->range_starts[mbcset->nranges] = start_wc;
2762 mbcset->range_ends[mbcset->nranges++] = end_wc;
2765 /* Build the table for single byte characters. */
2766 for (wc = 0; wc < SBC_MAX; ++wc)
2768 if (start_wc <= wc && wc <= end_wc)
2769 bitset_set (sbcset, wc);
2772 # else /* not RE_ENABLE_I18N */
2775 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2776 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2778 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2779 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2781 if (start_ch > end_ch)
2783 /* Build the table for single byte characters. */
2784 for (ch = 0; ch < SBC_MAX; ++ch)
2785 if (start_ch <= ch && ch <= end_ch)
2786 bitset_set (sbcset, ch);
2788 # endif /* not RE_ENABLE_I18N */
2791 #endif /* not _LIBC */
2794 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2795 Build the collating element which is represented by NAME.
2796 The result are written to MBCSET and SBCSET.
2797 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2798 pointer argument since we may update it. */
2800 static reg_errcode_t
2802 # ifdef RE_ENABLE_I18N
2803 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2804 int *coll_sym_alloc, const unsigned char *name)
2805 # else /* not RE_ENABLE_I18N */
2806 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2807 # endif /* not RE_ENABLE_I18N */
2809 size_t name_len = strlen ((const char *) name);
2810 if (BE (name_len != 1, 0))
2811 return REG_ECOLLATE;
2814 bitset_set (sbcset, name[0]);
2818 #endif /* not _LIBC */
2820 /* This function parse bracket expression like "[abc]", "[a-c]",
2824 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2825 reg_syntax_t syntax, reg_errcode_t *err)
2828 const unsigned char *collseqmb;
2829 const char *collseqwc;
2832 const int32_t *symb_table;
2833 const unsigned char *extra;
2835 /* Local function for parse_bracket_exp used in _LIBC environement.
2836 Seek the collating symbol entry correspondings to NAME.
2837 Return the index of the symbol in the SYMB_TABLE. */
2840 __attribute ((always_inline))
2841 seek_collating_symbol_entry (name, name_len)
2842 const unsigned char *name;
2845 int32_t hash = elem_hash ((const char *) name, name_len);
2846 int32_t elem = hash % table_size;
2847 if (symb_table[2 * elem] != 0)
2849 int32_t second = hash % (table_size - 2) + 1;
2853 /* First compare the hashing value. */
2854 if (symb_table[2 * elem] == hash
2855 /* Compare the length of the name. */
2856 && name_len == extra[symb_table[2 * elem + 1]]
2857 /* Compare the name. */
2858 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2861 /* Yep, this is the entry. */
2868 while (symb_table[2 * elem] != 0);
2873 /* Local function for parse_bracket_exp used in _LIBC environment.
2874 Look up the collation sequence value of BR_ELEM.
2875 Return the value if succeeded, UINT_MAX otherwise. */
2877 auto inline unsigned int
2878 __attribute ((always_inline))
2879 lookup_collation_sequence_value (br_elem)
2880 bracket_elem_t *br_elem;
2882 if (br_elem->type == SB_CHAR)
2885 if (MB_CUR_MAX == 1)
2888 return collseqmb[br_elem->opr.ch];
2891 wint_t wc = __btowc (br_elem->opr.ch);
2892 return __collseq_table_lookup (collseqwc, wc);
2895 else if (br_elem->type == MB_CHAR)
2898 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2900 else if (br_elem->type == COLL_SYM)
2902 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2906 elem = seek_collating_symbol_entry (br_elem->opr.name,
2908 if (symb_table[2 * elem] != 0)
2910 /* We found the entry. */
2911 idx = symb_table[2 * elem + 1];
2912 /* Skip the name of collating element name. */
2913 idx += 1 + extra[idx];
2914 /* Skip the byte sequence of the collating element. */
2915 idx += 1 + extra[idx];
2916 /* Adjust for the alignment. */
2917 idx = (idx + 3) & ~3;
2918 /* Skip the multibyte collation sequence value. */
2919 idx += sizeof (unsigned int);
2920 /* Skip the wide char sequence of the collating element. */
2921 idx += sizeof (unsigned int) *
2922 (1 + *(unsigned int *) (extra + idx));
2923 /* Return the collation sequence value. */
2924 return *(unsigned int *) (extra + idx);
2926 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2928 /* No valid character. Match it as a single byte
2930 return collseqmb[br_elem->opr.name[0]];
2933 else if (sym_name_len == 1)
2934 return collseqmb[br_elem->opr.name[0]];
2939 /* Local function for parse_bracket_exp used in _LIBC environement.
2940 Build the range expression which starts from START_ELEM, and ends
2941 at END_ELEM. The result are written to MBCSET and SBCSET.
2942 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2943 mbcset->range_ends, is a pointer argument sinse we may
2946 auto inline reg_errcode_t
2947 __attribute ((always_inline))
2948 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2949 re_charset_t *mbcset;
2952 bracket_elem_t *start_elem, *end_elem;
2955 uint32_t start_collseq;
2956 uint32_t end_collseq;
2958 /* Equivalence Classes and Character Classes can't be a range
2960 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2961 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2965 start_collseq = lookup_collation_sequence_value (start_elem);
2966 end_collseq = lookup_collation_sequence_value (end_elem);
2967 /* Check start/end collation sequence values. */
2968 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2969 return REG_ECOLLATE;
2970 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2973 /* Got valid collation sequence values, add them as a new entry.
2974 However, if we have no collation elements, and the character set
2975 is single byte, the single byte character set that we
2976 build below suffices. */
2977 if (nrules > 0 || dfa->mb_cur_max > 1)
2979 /* Check the space of the arrays. */
2980 if (BE (*range_alloc == mbcset->nranges, 0))
2982 /* There is not enough space, need realloc. */
2983 uint32_t *new_array_start;
2984 uint32_t *new_array_end;
2987 /* +1 in case of mbcset->nranges is 0. */
2988 new_nranges = 2 * mbcset->nranges + 1;
2989 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2991 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2994 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2997 mbcset->range_starts = new_array_start;
2998 mbcset->range_ends = new_array_end;
2999 *range_alloc = new_nranges;
3002 mbcset->range_starts[mbcset->nranges] = start_collseq;
3003 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3006 /* Build the table for single byte characters. */
3007 for (ch = 0; ch < SBC_MAX; ch++)
3009 uint32_t ch_collseq;
3011 if (MB_CUR_MAX == 1)
3014 ch_collseq = collseqmb[ch];
3016 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3017 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3018 bitset_set (sbcset, ch);
3023 /* Local function for parse_bracket_exp used in _LIBC environement.
3024 Build the collating element which is represented by NAME.
3025 The result are written to MBCSET and SBCSET.
3026 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3027 pointer argument sinse we may update it. */
3029 auto inline reg_errcode_t
3030 __attribute ((always_inline))
3031 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3032 re_charset_t *mbcset;
3033 int *coll_sym_alloc;
3035 const unsigned char *name;
3038 size_t name_len = strlen ((const char *) name);
3041 elem = seek_collating_symbol_entry (name, name_len);
3042 if (symb_table[2 * elem] != 0)
3044 /* We found the entry. */
3045 idx = symb_table[2 * elem + 1];
3046 /* Skip the name of collating element name. */
3047 idx += 1 + extra[idx];
3049 else if (symb_table[2 * elem] == 0 && name_len == 1)
3051 /* No valid character, treat it as a normal
3053 bitset_set (sbcset, name[0]);
3057 return REG_ECOLLATE;
3059 /* Got valid collation sequence, add it as a new entry. */
3060 /* Check the space of the arrays. */
3061 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3063 /* Not enough, realloc it. */
3064 /* +1 in case of mbcset->ncoll_syms is 0. */
3065 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3066 /* Use realloc since mbcset->coll_syms is NULL
3068 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3069 new_coll_sym_alloc);
3070 if (BE (new_coll_syms == NULL, 0))
3072 mbcset->coll_syms = new_coll_syms;
3073 *coll_sym_alloc = new_coll_sym_alloc;
3075 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3080 if (BE (name_len != 1, 0))
3081 return REG_ECOLLATE;
3084 bitset_set (sbcset, name[0]);
3091 re_token_t br_token;
3092 re_bitset_ptr_t sbcset;
3093 #ifdef RE_ENABLE_I18N
3094 re_charset_t *mbcset;
3095 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3096 int equiv_class_alloc = 0, char_class_alloc = 0;
3097 #endif /* not RE_ENABLE_I18N */
3099 bin_tree_t *work_tree;
3101 int first_round = 1;
3103 collseqmb = (const unsigned char *)
3104 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3105 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3111 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3112 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3113 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3114 _NL_COLLATE_SYMB_TABLEMB);
3115 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3116 _NL_COLLATE_SYMB_EXTRAMB);
3119 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3120 #ifdef RE_ENABLE_I18N
3121 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3122 #endif /* RE_ENABLE_I18N */
3123 #ifdef RE_ENABLE_I18N
3124 if (BE (sbcset == NULL || mbcset == NULL, 0))
3126 if (BE (sbcset == NULL, 0))
3127 #endif /* RE_ENABLE_I18N */
3130 #ifdef RE_ENABLE_I18N
3137 token_len = peek_token_bracket (token, regexp, syntax);
3138 if (BE (token->type == END_OF_RE, 0))
3141 goto parse_bracket_exp_free_return;
3143 if (token->type == OP_NON_MATCH_LIST)
3145 #ifdef RE_ENABLE_I18N
3146 mbcset->non_match = 1;
3147 #endif /* not RE_ENABLE_I18N */
3149 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3150 bitset_set (sbcset, '\n');
3151 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3152 token_len = peek_token_bracket (token, regexp, syntax);
3153 if (BE (token->type == END_OF_RE, 0))
3156 goto parse_bracket_exp_free_return;
3160 /* We treat the first ']' as a normal character. */
3161 if (token->type == OP_CLOSE_BRACKET)
3162 token->type = CHARACTER;
3166 bracket_elem_t start_elem, end_elem;
3167 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3168 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3170 int token_len2 = 0, is_range_exp = 0;
3173 start_elem.opr.name = start_name_buf;
3174 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3175 syntax, first_round);
3176 if (BE (ret != REG_NOERROR, 0))
3179 goto parse_bracket_exp_free_return;
3183 /* Get information about the next token. We need it in any case. */
3184 token_len = peek_token_bracket (token, regexp, syntax);
3186 /* Do not check for ranges if we know they are not allowed. */
3187 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3189 if (BE (token->type == END_OF_RE, 0))
3192 goto parse_bracket_exp_free_return;
3194 if (token->type == OP_CHARSET_RANGE)
3196 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3197 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3198 if (BE (token2.type == END_OF_RE, 0))
3201 goto parse_bracket_exp_free_return;
3203 if (token2.type == OP_CLOSE_BRACKET)
3205 /* We treat the last '-' as a normal character. */
3206 re_string_skip_bytes (regexp, -token_len);
3207 token->type = CHARACTER;
3214 if (is_range_exp == 1)
3216 end_elem.opr.name = end_name_buf;
3217 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3219 if (BE (ret != REG_NOERROR, 0))
3222 goto parse_bracket_exp_free_return;
3225 token_len = peek_token_bracket (token, regexp, syntax);
3228 *err = build_range_exp (syntax, sbcset, mbcset, &range_alloc,
3229 &start_elem, &end_elem);
3231 # ifdef RE_ENABLE_I18N
3232 *err = build_range_exp (syntax, sbcset,
3233 dfa->mb_cur_max > 1 ? mbcset : NULL,
3234 &range_alloc, &start_elem, &end_elem);
3236 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3238 #endif /* RE_ENABLE_I18N */
3239 if (BE (*err != REG_NOERROR, 0))
3240 goto parse_bracket_exp_free_return;
3244 switch (start_elem.type)
3247 bitset_set (sbcset, start_elem.opr.ch);
3249 #ifdef RE_ENABLE_I18N
3251 /* Check whether the array has enough space. */
3252 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3254 wchar_t *new_mbchars;
3255 /* Not enough, realloc it. */
3256 /* +1 in case of mbcset->nmbchars is 0. */
3257 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3258 /* Use realloc since array is NULL if *alloc == 0. */
3259 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3261 if (BE (new_mbchars == NULL, 0))
3262 goto parse_bracket_exp_espace;
3263 mbcset->mbchars = new_mbchars;
3265 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3267 #endif /* RE_ENABLE_I18N */
3269 *err = build_equiv_class (sbcset,
3270 #ifdef RE_ENABLE_I18N
3271 mbcset, &equiv_class_alloc,
3272 #endif /* RE_ENABLE_I18N */
3273 start_elem.opr.name);
3274 if (BE (*err != REG_NOERROR, 0))
3275 goto parse_bracket_exp_free_return;
3278 *err = build_collating_symbol (sbcset,
3279 #ifdef RE_ENABLE_I18N
3280 mbcset, &coll_sym_alloc,
3281 #endif /* RE_ENABLE_I18N */
3282 start_elem.opr.name);
3283 if (BE (*err != REG_NOERROR, 0))
3284 goto parse_bracket_exp_free_return;
3287 *err = build_charclass (regexp->trans, sbcset,
3288 #ifdef RE_ENABLE_I18N
3289 mbcset, &char_class_alloc,
3290 #endif /* RE_ENABLE_I18N */
3291 (const char *) start_elem.opr.name, syntax);
3292 if (BE (*err != REG_NOERROR, 0))
3293 goto parse_bracket_exp_free_return;
3300 if (BE (token->type == END_OF_RE, 0))
3303 goto parse_bracket_exp_free_return;
3305 if (token->type == OP_CLOSE_BRACKET)
3309 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3311 /* If it is non-matching list. */
3313 bitset_not (sbcset);
3315 #ifdef RE_ENABLE_I18N
3316 /* Ensure only single byte characters are set. */
3317 if (dfa->mb_cur_max > 1)
3318 bitset_mask (sbcset, dfa->sb_char);
3320 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3321 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3322 || mbcset->non_match)))
3324 bin_tree_t *mbc_tree;
3326 /* Build a tree for complex bracket. */
3327 dfa->has_mb_node = 1;
3328 br_token.type = COMPLEX_BRACKET;
3329 br_token.opr.mbcset = mbcset;
3330 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3331 if (BE (mbc_tree == NULL, 0))
3332 goto parse_bracket_exp_espace;
3333 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3334 if (sbcset[sbc_idx])
3336 /* If there are no bits set in sbcset, there is no point
3337 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3338 if (sbc_idx < BITSET_WORDS)
3340 /* Build a tree for simple bracket. */
3341 br_token.type = SIMPLE_BRACKET;
3342 br_token.opr.sbcset = sbcset;
3343 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3344 if (BE (work_tree == NULL, 0))
3345 goto parse_bracket_exp_espace;
3347 /* Then join them by ALT node. */
3348 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3349 if (BE (work_tree == NULL, 0))
3350 goto parse_bracket_exp_espace;
3355 work_tree = mbc_tree;
3359 #endif /* not RE_ENABLE_I18N */
3361 #ifdef RE_ENABLE_I18N
3362 free_charset (mbcset);
3364 /* Build a tree for simple bracket. */
3365 br_token.type = SIMPLE_BRACKET;
3366 br_token.opr.sbcset = sbcset;
3367 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3368 if (BE (work_tree == NULL, 0))
3369 goto parse_bracket_exp_espace;
3373 parse_bracket_exp_espace:
3375 parse_bracket_exp_free_return:
3377 #ifdef RE_ENABLE_I18N
3378 free_charset (mbcset);
3379 #endif /* RE_ENABLE_I18N */
3383 /* Parse an element in the bracket expression. */
3385 static reg_errcode_t
3386 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3387 re_token_t *token, int token_len, re_dfa_t *dfa,
3388 reg_syntax_t syntax, int accept_hyphen)
3390 #ifdef RE_ENABLE_I18N
3392 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3393 if (cur_char_size > 1)
3395 elem->type = MB_CHAR;
3396 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3397 re_string_skip_bytes (regexp, cur_char_size);
3400 #endif /* RE_ENABLE_I18N */
3401 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3402 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3403 || token->type == OP_OPEN_EQUIV_CLASS)
3404 return parse_bracket_symbol (elem, regexp, token);
3405 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3407 /* A '-' must only appear as anything but a range indicator before
3408 the closing bracket. Everything else is an error. */
3410 (void) peek_token_bracket (&token2, regexp, syntax);
3411 if (token2.type != OP_CLOSE_BRACKET)
3412 /* The actual error value is not standardized since this whole
3413 case is undefined. But ERANGE makes good sense. */
3416 elem->type = SB_CHAR;
3417 elem->opr.ch = token->opr.c;
3421 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3422 such as [:<character_class>:], [.<collating_element>.], and
3423 [=<equivalent_class>=]. */
3425 static reg_errcode_t
3426 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3429 unsigned char ch, delim = token->opr.c;
3431 if (re_string_eoi(regexp))
3435 if (i >= BRACKET_NAME_BUF_SIZE)
3437 if (token->type == OP_OPEN_CHAR_CLASS)
3438 ch = re_string_fetch_byte_case (regexp);
3440 ch = re_string_fetch_byte (regexp);
3441 if (re_string_eoi(regexp))
3443 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3445 elem->opr.name[i] = ch;
3447 re_string_skip_bytes (regexp, 1);
3448 elem->opr.name[i] = '\0';
3449 switch (token->type)
3451 case OP_OPEN_COLL_ELEM:
3452 elem->type = COLL_SYM;
3454 case OP_OPEN_EQUIV_CLASS:
3455 elem->type = EQUIV_CLASS;
3457 case OP_OPEN_CHAR_CLASS:
3458 elem->type = CHAR_CLASS;
3466 /* Helper function for parse_bracket_exp.
3467 Build the equivalence class which is represented by NAME.
3468 The result are written to MBCSET and SBCSET.
3469 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3470 is a pointer argument sinse we may update it. */
3472 static reg_errcode_t
3473 #ifdef RE_ENABLE_I18N
3474 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3475 int *equiv_class_alloc, const unsigned char *name)
3476 #else /* not RE_ENABLE_I18N */
3477 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3478 #endif /* not RE_ENABLE_I18N */
3481 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3484 const int32_t *table, *indirect;
3485 const unsigned char *weights, *extra, *cp;
3486 unsigned char char_buf[2];
3490 /* This #include defines a local function! */
3491 # include <locale/weight.h>
3492 /* Calculate the index for equivalence class. */
3494 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3495 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3496 _NL_COLLATE_WEIGHTMB);
3497 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3498 _NL_COLLATE_EXTRAMB);
3499 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3500 _NL_COLLATE_INDIRECTMB);
3501 idx1 = findidx (&cp, -1);
3502 if (BE (idx1 == 0 || *cp != '\0', 0))
3503 /* This isn't a valid character. */
3504 return REG_ECOLLATE;
3506 /* Build single byte matcing table for this equivalence class. */
3507 char_buf[1] = (unsigned char) '\0';
3508 len = weights[idx1 & 0xffffff];
3509 for (ch = 0; ch < SBC_MAX; ++ch)
3513 idx2 = findidx (&cp, 1);
3518 /* This isn't a valid character. */
3520 /* Compare only if the length matches and the collation rule
3521 index is the same. */
3522 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3526 while (cnt <= len &&
3527 weights[(idx1 & 0xffffff) + 1 + cnt]
3528 == weights[(idx2 & 0xffffff) + 1 + cnt])
3532 bitset_set (sbcset, ch);
3535 /* Check whether the array has enough space. */
3536 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3538 /* Not enough, realloc it. */
3539 /* +1 in case of mbcset->nequiv_classes is 0. */
3540 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3541 /* Use realloc since the array is NULL if *alloc == 0. */
3542 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3544 new_equiv_class_alloc);
3545 if (BE (new_equiv_classes == NULL, 0))
3547 mbcset->equiv_classes = new_equiv_classes;
3548 *equiv_class_alloc = new_equiv_class_alloc;
3550 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3555 if (BE (strlen ((const char *) name) != 1, 0))
3556 return REG_ECOLLATE;
3557 bitset_set (sbcset, *name);
3562 /* Helper function for parse_bracket_exp.
3563 Build the character class which is represented by NAME.
3564 The result are written to MBCSET and SBCSET.
3565 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3566 is a pointer argument sinse we may update it. */
3568 static reg_errcode_t
3569 #ifdef RE_ENABLE_I18N
3570 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3571 re_charset_t *mbcset, int *char_class_alloc,
3572 const char *class_name, reg_syntax_t syntax)
3573 #else /* not RE_ENABLE_I18N */
3574 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3575 const char *class_name, reg_syntax_t syntax)
3576 #endif /* not RE_ENABLE_I18N */
3580 /* In case of REG_ICASE "upper" and "lower" match the both of
3581 upper and lower cases. */
3582 if ((syntax & RE_ICASE)
3583 && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3584 class_name = "alpha";
3586 #ifdef RE_ENABLE_I18N
3587 /* Check the space of the arrays. */
3588 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3590 /* Not enough, realloc it. */
3591 /* +1 in case of mbcset->nchar_classes is 0. */
3592 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3593 /* Use realloc since array is NULL if *alloc == 0. */
3594 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3595 new_char_class_alloc);
3596 if (BE (new_char_classes == NULL, 0))
3598 mbcset->char_classes = new_char_classes;
3599 *char_class_alloc = new_char_class_alloc;
3601 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3602 #endif /* RE_ENABLE_I18N */
3604 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3606 if (BE (trans != NULL, 0)) \
3608 for (i = 0; i < SBC_MAX; ++i) \
3609 if (ctype_func (i)) \
3610 bitset_set (sbcset, trans[i]); \
3614 for (i = 0; i < SBC_MAX; ++i) \
3615 if (ctype_func (i)) \
3616 bitset_set (sbcset, i); \
3620 if (strcmp (class_name, "alnum") == 0)
3621 BUILD_CHARCLASS_LOOP (isalnum);
3622 else if (strcmp (class_name, "cntrl") == 0)
3623 BUILD_CHARCLASS_LOOP (iscntrl);
3624 else if (strcmp (class_name, "lower") == 0)
3625 BUILD_CHARCLASS_LOOP (islower);
3626 else if (strcmp (class_name, "space") == 0)
3627 BUILD_CHARCLASS_LOOP (isspace);
3628 else if (strcmp (class_name, "alpha") == 0)
3629 BUILD_CHARCLASS_LOOP (isalpha);
3630 else if (strcmp (class_name, "digit") == 0)
3631 BUILD_CHARCLASS_LOOP (isdigit);
3632 else if (strcmp (class_name, "print") == 0)
3633 BUILD_CHARCLASS_LOOP (isprint);
3634 else if (strcmp (class_name, "upper") == 0)
3635 BUILD_CHARCLASS_LOOP (isupper);
3636 else if (strcmp (class_name, "blank") == 0)
3638 BUILD_CHARCLASS_LOOP (isblank);
3640 /* see comments above */
3641 BUILD_CHARCLASS_LOOP (is_blank);
3643 else if (strcmp (class_name, "graph") == 0)
3644 BUILD_CHARCLASS_LOOP (isgraph);
3645 else if (strcmp (class_name, "punct") == 0)
3646 BUILD_CHARCLASS_LOOP (ispunct);
3647 else if (strcmp (class_name, "xdigit") == 0)
3648 BUILD_CHARCLASS_LOOP (isxdigit);
3656 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3657 const char *class_name,
3658 const char *extra, int non_match,
3661 re_bitset_ptr_t sbcset;
3662 #ifdef RE_ENABLE_I18N
3663 re_charset_t *mbcset;
3665 #endif /* not RE_ENABLE_I18N */
3667 re_token_t br_token;
3670 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3671 #ifdef RE_ENABLE_I18N
3672 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3673 #endif /* RE_ENABLE_I18N */
3675 #ifdef RE_ENABLE_I18N
3676 if (BE (sbcset == NULL || mbcset == NULL, 0))
3677 #else /* not RE_ENABLE_I18N */
3678 if (BE (sbcset == NULL, 0))
3679 #endif /* not RE_ENABLE_I18N */
3687 #ifdef RE_ENABLE_I18N
3688 mbcset->non_match = 1;
3689 #endif /* not RE_ENABLE_I18N */
3692 /* We don't care the syntax in this case. */
3693 ret = build_charclass (trans, sbcset,
3694 #ifdef RE_ENABLE_I18N
3696 #endif /* RE_ENABLE_I18N */
3699 if (BE (ret != REG_NOERROR, 0))
3702 #ifdef RE_ENABLE_I18N
3703 free_charset (mbcset);
3704 #endif /* RE_ENABLE_I18N */
3708 /* \w match '_' also. */
3709 for (; *extra; extra++)
3710 bitset_set (sbcset, *extra);
3712 /* If it is non-matching list. */
3714 bitset_not (sbcset);
3716 #ifdef RE_ENABLE_I18N
3717 /* Ensure only single byte characters are set. */
3718 if (dfa->mb_cur_max > 1)
3719 bitset_mask (sbcset, dfa->sb_char);
3722 /* Build a tree for simple bracket. */
3723 br_token.type = SIMPLE_BRACKET;
3724 br_token.opr.sbcset = sbcset;
3725 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3726 if (BE (tree == NULL, 0))
3727 goto build_word_op_espace;
3729 #ifdef RE_ENABLE_I18N
3730 if (dfa->mb_cur_max > 1)
3732 bin_tree_t *mbc_tree;
3733 /* Build a tree for complex bracket. */
3734 br_token.type = COMPLEX_BRACKET;
3735 br_token.opr.mbcset = mbcset;
3736 dfa->has_mb_node = 1;
3737 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3738 if (BE (mbc_tree == NULL, 0))
3739 goto build_word_op_espace;
3740 /* Then join them by ALT node. */
3741 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3742 if (BE (mbc_tree != NULL, 1))
3747 free_charset (mbcset);
3750 #else /* not RE_ENABLE_I18N */
3752 #endif /* not RE_ENABLE_I18N */
3754 build_word_op_espace:
3756 #ifdef RE_ENABLE_I18N
3757 free_charset (mbcset);
3758 #endif /* RE_ENABLE_I18N */
3763 /* This is intended for the expressions like "a{1,3}".
3764 Fetch a number from `input', and return the number.
3765 Return -1, if the number field is empty like "{,1}".
3766 Return -2, If an error is occured. */
3769 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3775 fetch_token (token, input, syntax);
3777 if (BE (token->type == END_OF_RE, 0))
3779 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3781 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3782 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3783 num = (num > RE_DUP_MAX) ? -2 : num;
3788 #ifdef RE_ENABLE_I18N
3790 free_charset (re_charset_t *cset)
3792 re_free (cset->mbchars);
3794 re_free (cset->coll_syms);
3795 re_free (cset->equiv_classes);
3796 re_free (cset->range_starts);
3797 re_free (cset->range_ends);
3799 re_free (cset->char_classes);
3802 #endif /* RE_ENABLE_I18N */
3804 /* Functions for binary tree operation. */
3806 /* Create a tree node. */
3809 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3810 re_token_type_t type)
3814 return create_token_tree (dfa, left, right, &t);
3818 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3819 const re_token_t *token)
3822 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3824 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3826 if (storage == NULL)
3828 storage->next = dfa->str_tree_storage;
3829 dfa->str_tree_storage = storage;
3830 dfa->str_tree_storage_idx = 0;
3832 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3834 tree->parent = NULL;
3836 tree->right = right;
3837 tree->token = *token;
3838 tree->token.duplicated = 0;
3839 tree->token.opt_subexp = 0;
3842 tree->node_idx = -1;
3845 left->parent = tree;
3847 right->parent = tree;
3851 /* Mark the tree SRC as an optional subexpression.
3852 To be called from preorder or postorder. */
3854 static reg_errcode_t
3855 mark_opt_subexp (void *extra, bin_tree_t *node)
3857 int idx = (int) (long) extra;
3858 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3859 node->token.opt_subexp = 1;
3864 /* Free the allocated memory inside NODE. */
3867 free_token (re_token_t *node)
3869 #ifdef RE_ENABLE_I18N
3870 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3871 free_charset (node->opr.mbcset);
3873 #endif /* RE_ENABLE_I18N */
3874 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3875 re_free (node->opr.sbcset);
3878 /* Worker function for tree walking. Free the allocated memory inside NODE
3879 and its children. */
3881 static reg_errcode_t
3882 free_tree (void *extra, bin_tree_t *node)
3884 free_token (&node->token);
3889 /* Duplicate the node SRC, and return new node. This is a preorder
3890 visit similar to the one implemented by the generic visitor, but
3891 we need more infrastructure to maintain two parallel trees --- so,
3892 it's easier to duplicate. */
3895 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3897 const bin_tree_t *node;
3898 bin_tree_t *dup_root;
3899 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3901 for (node = root; ; )
3903 /* Create a new tree and link it back to the current parent. */
3904 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3907 (*p_new)->parent = dup_node;
3908 (*p_new)->token.duplicated = 1;
3911 /* Go to the left node, or up and to the right. */
3915 p_new = &dup_node->left;
3919 const bin_tree_t *prev = NULL;
3920 while (node->right == prev || node->right == NULL)
3923 node = node->parent;
3924 dup_node = dup_node->parent;
3929 p_new = &dup_node->right;