1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2013 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
49 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static int fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_token_t *token, int token_len,
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 int *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
96 int *char_class_alloc,
97 const char *class_name,
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 const char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 RE_TRANSLATE_TYPE trans,
109 const char *class_name,
111 int non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 const char __re_error_msgid[] attribute_hidden =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 const size_t __re_error_msgid_idx[] attribute_hidden =
203 /* Entry points for GNU code. */
208 /* For ZOS USS we must define btowc */
217 memset(& mbs, 0, sizeof(mbs));
221 mbrtowc (wtmp, tmp, 1, & mbs);
226 /* re_compile_pattern is the GNU regular expression compiler: it
227 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
228 Returns 0 if the pattern was valid, otherwise an error string.
230 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
231 are set in BUFP on entry. */
234 re_compile_pattern (pattern, length, bufp)
237 struct re_pattern_buffer *bufp;
241 /* And GNU code determines whether or not to get register information
242 by passing null for the REGS argument to re_match, etc., not by
243 setting no_sub, unless RE_NO_SUB is set. */
244 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
246 /* Match anchors at newline. */
247 bufp->newline_anchor = 1;
249 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
253 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
256 weak_alias (__re_compile_pattern, re_compile_pattern)
259 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
260 also be assigned to arbitrarily: each pattern buffer stores its own
261 syntax, so it can be changed between regex compilations. */
262 /* This has no initializer because initialized variables in Emacs
263 become read-only after dumping. */
264 reg_syntax_t re_syntax_options;
267 /* Specify the precise syntax of regexps for compilation. This provides
268 for compatibility for various utilities which historically have
269 different, incompatible syntaxes.
271 The argument SYNTAX is a bit mask comprised of the various bits
272 defined in regex.h. We return the old syntax. */
275 re_set_syntax (syntax)
278 reg_syntax_t ret = re_syntax_options;
280 re_syntax_options = syntax;
284 weak_alias (__re_set_syntax, re_set_syntax)
288 re_compile_fastmap (bufp)
289 struct re_pattern_buffer *bufp;
291 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
292 char *fastmap = bufp->fastmap;
294 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
295 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
296 if (dfa->init_state != dfa->init_state_word)
297 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
298 if (dfa->init_state != dfa->init_state_nl)
299 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
300 if (dfa->init_state != dfa->init_state_begbuf)
301 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
302 bufp->fastmap_accurate = 1;
306 weak_alias (__re_compile_fastmap, re_compile_fastmap)
310 __attribute ((always_inline))
311 re_set_fastmap (char *fastmap, int icase, int ch)
315 fastmap[tolower (ch)] = 1;
318 /* Helper function for re_compile_fastmap.
319 Compile fastmap for the initial_state INIT_STATE. */
322 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
325 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
327 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
328 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
330 int node = init_state->nodes.elems[node_cnt];
331 re_token_type_t type = dfa->nodes[node].type;
333 if (type == CHARACTER)
335 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
336 #ifdef RE_ENABLE_I18N
337 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
339 unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
344 *p++ = dfa->nodes[node].opr.c;
345 while (++node < dfa->nodes_len
346 && dfa->nodes[node].type == CHARACTER
347 && dfa->nodes[node].mb_partial)
348 *p++ = dfa->nodes[node].opr.c;
349 memset (&state, '\0', sizeof (state));
350 if (__mbrtowc (&wc, (const char *) buf, p - buf,
352 && (__wcrtomb ((char *) buf, towlower (wc), &state)
354 re_set_fastmap (fastmap, 0, buf[0]);
359 else if (type == SIMPLE_BRACKET)
362 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
365 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
366 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
367 if (w & ((bitset_word_t) 1 << j))
368 re_set_fastmap (fastmap, icase, ch);
371 #ifdef RE_ENABLE_I18N
372 else if (type == COMPLEX_BRACKET)
374 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
378 /* See if we have to try all bytes which start multiple collation
380 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
381 collation element, and don't catch 'b' since 'b' is
382 the only collation element which starts from 'b' (and
383 it is caught by SIMPLE_BRACKET). */
384 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
385 && (cset->ncoll_syms || cset->nranges))
387 const int32_t *table = (const int32_t *)
388 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
389 for (i = 0; i < SBC_MAX; ++i)
391 re_set_fastmap (fastmap, icase, i);
395 /* See if we have to start the match at all multibyte characters,
396 i.e. where we would not find an invalid sequence. This only
397 applies to multibyte character sets; for single byte character
398 sets, the SIMPLE_BRACKET again suffices. */
399 if (dfa->mb_cur_max > 1
400 && (cset->nchar_classes || cset->non_match || cset->nranges
402 || cset->nequiv_classes
410 memset (&mbs, 0, sizeof (mbs));
411 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
412 re_set_fastmap (fastmap, false, (int) c);
419 /* ... Else catch all bytes which can start the mbchars. */
420 for (i = 0; i < cset->nmbchars; ++i)
424 memset (&state, '\0', sizeof (state));
425 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
426 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
427 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
429 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
431 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
436 #endif /* RE_ENABLE_I18N */
437 else if (type == OP_PERIOD
438 #ifdef RE_ENABLE_I18N
439 || type == OP_UTF8_PERIOD
440 #endif /* RE_ENABLE_I18N */
441 || type == END_OF_RE)
443 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
444 if (type == END_OF_RE)
445 bufp->can_be_null = 1;
451 /* Entry point for POSIX code. */
452 /* regcomp takes a regular expression as a string and compiles it.
454 PREG is a regex_t *. We do not expect any fields to be initialized,
455 since POSIX says we shouldn't. Thus, we set
457 `buffer' to the compiled pattern;
458 `used' to the length of the compiled pattern;
459 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
460 REG_EXTENDED bit in CFLAGS is set; otherwise, to
461 RE_SYNTAX_POSIX_BASIC;
462 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
463 `fastmap' to an allocated space for the fastmap;
464 `fastmap_accurate' to zero;
465 `re_nsub' to the number of subexpressions in PATTERN.
467 PATTERN is the address of the pattern string.
469 CFLAGS is a series of bits which affect compilation.
471 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
472 use POSIX basic syntax.
474 If REG_NEWLINE is set, then . and [^...] don't match newline.
475 Also, regexec will try a match beginning after every newline.
477 If REG_ICASE is set, then we considers upper- and lowercase
478 versions of letters to be equivalent when matching.
480 If REG_NOSUB is set, then when PREG is passed to regexec, that
481 routine will report only success or failure, and nothing about the
484 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
485 the return codes and their meanings.) */
488 regcomp (preg, pattern, cflags)
489 regex_t *__restrict preg;
490 const char *__restrict pattern;
494 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
495 : RE_SYNTAX_POSIX_BASIC);
501 /* Try to allocate space for the fastmap. */
502 preg->fastmap = re_malloc (char, SBC_MAX);
503 if (BE (preg->fastmap == NULL, 0))
506 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
508 /* If REG_NEWLINE is set, newlines are treated differently. */
509 if (cflags & REG_NEWLINE)
510 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
511 syntax &= ~RE_DOT_NEWLINE;
512 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
513 /* It also changes the matching behavior. */
514 preg->newline_anchor = 1;
517 preg->newline_anchor = 0;
518 preg->no_sub = !!(cflags & REG_NOSUB);
519 preg->translate = NULL;
521 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
523 /* POSIX doesn't distinguish between an unmatched open-group and an
524 unmatched close-group: both are REG_EPAREN. */
525 if (ret == REG_ERPAREN)
528 /* We have already checked preg->fastmap != NULL. */
529 if (BE (ret == REG_NOERROR, 1))
530 /* Compute the fastmap now, since regexec cannot modify the pattern
531 buffer. This function never fails in this implementation. */
532 (void) re_compile_fastmap (preg);
535 /* Some error occurred while compiling the expression. */
536 re_free (preg->fastmap);
537 preg->fastmap = NULL;
543 weak_alias (__regcomp, regcomp)
546 /* Returns a message corresponding to an error code, ERRCODE, returned
547 from either regcomp or regexec. We don't use PREG here. */
550 regerror (errcode, preg, errbuf, errbuf_size)
552 const regex_t *__restrict preg;
553 char *__restrict errbuf;
560 || errcode >= (int) (sizeof (__re_error_msgid_idx)
561 / sizeof (__re_error_msgid_idx[0])), 0))
562 /* Only error codes returned by the rest of the code should be passed
563 to this routine. If we are given anything else, or if other regex
564 code generates an invalid error code, then the program has a bug.
565 Dump core so we can fix it. */
568 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
570 msg_size = strlen (msg) + 1; /* Includes the null. */
572 if (BE (errbuf_size != 0, 1))
574 if (BE (msg_size > errbuf_size, 0))
576 #if defined HAVE_MEMPCPY || defined _LIBC
577 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
579 memcpy (errbuf, msg, errbuf_size - 1);
580 errbuf[errbuf_size - 1] = 0;
584 memcpy (errbuf, msg, msg_size);
590 weak_alias (__regerror, regerror)
594 #ifdef RE_ENABLE_I18N
595 /* This static array is used for the map to single-byte characters when
596 UTF-8 is used. Otherwise we would allocate memory just to initialize
597 it the same all the time. UTF-8 is the preferred encoding so this is
598 a worthwhile optimization. */
600 static const bitset_t utf8_sb_map = {
601 /* Set the first 128 bits. */
602 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
604 #else /* ! (__GNUC__ >= 3) */
605 static bitset_t utf8_sb_map;
606 #endif /* __GNUC__ >= 3 */
607 #endif /* RE_ENABLE_I18N */
611 free_dfa_content (re_dfa_t *dfa)
616 for (i = 0; i < dfa->nodes_len; ++i)
617 free_token (dfa->nodes + i);
618 re_free (dfa->nexts);
619 for (i = 0; i < dfa->nodes_len; ++i)
621 if (dfa->eclosures != NULL)
622 re_node_set_free (dfa->eclosures + i);
623 if (dfa->inveclosures != NULL)
624 re_node_set_free (dfa->inveclosures + i);
625 if (dfa->edests != NULL)
626 re_node_set_free (dfa->edests + i);
628 re_free (dfa->edests);
629 re_free (dfa->eclosures);
630 re_free (dfa->inveclosures);
631 re_free (dfa->nodes);
633 if (dfa->state_table)
634 for (i = 0; i <= dfa->state_hash_mask; ++i)
636 struct re_state_table_entry *entry = dfa->state_table + i;
637 for (j = 0; j < entry->num; ++j)
639 re_dfastate_t *state = entry->array[j];
642 re_free (entry->array);
644 re_free (dfa->state_table);
645 #ifdef RE_ENABLE_I18N
646 if (dfa->sb_char != utf8_sb_map)
647 re_free (dfa->sb_char);
649 re_free (dfa->subexp_map);
651 re_free (dfa->re_str);
658 /* Free dynamically allocated space used by PREG. */
664 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
665 if (BE (dfa != NULL, 1))
666 free_dfa_content (dfa);
670 re_free (preg->fastmap);
671 preg->fastmap = NULL;
673 re_free (preg->translate);
674 preg->translate = NULL;
677 weak_alias (__regfree, regfree)
680 /* Entry points compatible with 4.2 BSD regex library. We don't define
681 them unless specifically requested. */
683 #if defined _REGEX_RE_COMP || defined _LIBC
685 /* BSD has one and only one pattern buffer. */
686 static struct re_pattern_buffer re_comp_buf;
690 /* Make these definitions weak in libc, so POSIX programs can redefine
691 these names if they don't use our functions, and still use
692 regcomp/regexec above without link errors. */
703 if (!re_comp_buf.buffer)
704 return gettext ("No previous regular expression");
708 if (re_comp_buf.buffer)
710 fastmap = re_comp_buf.fastmap;
711 re_comp_buf.fastmap = NULL;
712 __regfree (&re_comp_buf);
713 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
714 re_comp_buf.fastmap = fastmap;
717 if (re_comp_buf.fastmap == NULL)
719 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
720 if (re_comp_buf.fastmap == NULL)
721 return (char *) gettext (__re_error_msgid
722 + __re_error_msgid_idx[(int) REG_ESPACE]);
725 /* Since `re_exec' always passes NULL for the `regs' argument, we
726 don't need to initialize the pattern buffer fields which affect it. */
728 /* Match anchors at newlines. */
729 re_comp_buf.newline_anchor = 1;
731 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
736 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
737 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
741 libc_freeres_fn (free_mem)
743 __regfree (&re_comp_buf);
747 #endif /* _REGEX_RE_COMP */
749 /* Internal entry point.
750 Compile the regular expression PATTERN, whose length is LENGTH.
751 SYNTAX indicate regular expression's syntax. */
754 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
757 reg_errcode_t err = REG_NOERROR;
761 /* Initialize the pattern buffer. */
762 preg->fastmap_accurate = 0;
763 preg->syntax = syntax;
764 preg->not_bol = preg->not_eol = 0;
767 preg->can_be_null = 0;
768 preg->regs_allocated = REGS_UNALLOCATED;
770 /* Initialize the dfa. */
771 dfa = (re_dfa_t *) preg->buffer;
772 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
774 /* If zero allocated, but buffer is non-null, try to realloc
775 enough space. This loses if buffer's address is bogus, but
776 that is the user's responsibility. If ->buffer is NULL this
777 is a simple allocation. */
778 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
781 preg->allocated = sizeof (re_dfa_t);
782 preg->buffer = (unsigned char *) dfa;
784 preg->used = sizeof (re_dfa_t);
786 err = init_dfa (dfa, length);
787 if (BE (err != REG_NOERROR, 0))
789 free_dfa_content (dfa);
795 /* Note: length+1 will not overflow since it is checked in init_dfa. */
796 dfa->re_str = re_malloc (char, length + 1);
797 strncpy (dfa->re_str, pattern, length + 1);
800 __libc_lock_init (dfa->lock);
802 err = re_string_construct (®exp, pattern, length, preg->translate,
803 syntax & RE_ICASE, dfa);
804 if (BE (err != REG_NOERROR, 0))
806 re_compile_internal_free_return:
807 free_workarea_compile (preg);
808 re_string_destruct (®exp);
809 free_dfa_content (dfa);
815 /* Parse the regular expression, and build a structure tree. */
817 dfa->str_tree = parse (®exp, preg, syntax, &err);
818 if (BE (dfa->str_tree == NULL, 0))
819 goto re_compile_internal_free_return;
821 /* Analyze the tree and create the nfa. */
822 err = analyze (preg);
823 if (BE (err != REG_NOERROR, 0))
824 goto re_compile_internal_free_return;
826 #ifdef RE_ENABLE_I18N
827 /* If possible, do searching in single byte encoding to speed things up. */
828 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
832 /* Then create the initial state of the dfa. */
833 err = create_initial_state (dfa);
835 /* Release work areas. */
836 free_workarea_compile (preg);
837 re_string_destruct (®exp);
839 if (BE (err != REG_NOERROR, 0))
841 free_dfa_content (dfa);
849 /* Initialize DFA. We use the length of the regular expression PAT_LEN
850 as the initial length of some arrays. */
853 init_dfa (re_dfa_t *dfa, size_t pat_len)
855 unsigned int table_size;
859 #if defined(GAWK) && defined(LIBC_IS_BORKED)
860 /* Needed for brain damaged systems */
861 extern int gawk_mb_cur_max;
864 memset (dfa, '\0', sizeof (re_dfa_t));
866 /* Force allocation of str_tree_storage the first time. */
867 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
869 /* Avoid overflows. */
870 if (pat_len == SIZE_MAX)
873 dfa->nodes_alloc = pat_len + 1;
874 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
876 /* table_size = 2 ^ ceil(log pat_len) */
877 for (table_size = 1; ; table_size <<= 1)
878 if (table_size > pat_len)
881 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
882 dfa->state_hash_mask = table_size - 1;
884 #if defined(GAWK) && defined(LIBC_IS_BORKED)
885 dfa->mb_cur_max = gawk_mb_cur_max;
887 dfa->mb_cur_max = MB_CUR_MAX;
890 if (dfa->mb_cur_max == 6
891 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
893 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
896 # ifdef HAVE_LANGINFO_CODESET
897 codeset_name = nl_langinfo (CODESET);
899 codeset_name = getenv ("LC_ALL");
900 if (codeset_name == NULL || codeset_name[0] == '\0')
901 codeset_name = getenv ("LC_CTYPE");
902 if (codeset_name == NULL || codeset_name[0] == '\0')
903 codeset_name = getenv ("LANG");
904 if (codeset_name == NULL)
906 else if (strchr (codeset_name, '.') != NULL)
907 codeset_name = strchr (codeset_name, '.') + 1;
910 /* strcasecmp isn't a standard interface. brute force check */
912 if (strcasecmp (codeset_name, "UTF-8") == 0
913 || strcasecmp (codeset_name, "UTF8") == 0)
916 if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u')
917 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
918 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
919 && (codeset_name[3] == '-'
920 ? codeset_name[4] == '8' && codeset_name[5] == '\0'
921 : codeset_name[3] == '8' && codeset_name[4] == '\0'))
923 #if defined(GAWK) && defined(LIBC_IS_BORKED)
924 if (gawk_mb_cur_max == 1)
926 #endif /* defined(GAWK) && defined(LIBC_IS_BORKED) */
929 /* We check exhaustively in the loop below if this charset is a
930 superset of ASCII. */
931 dfa->map_notascii = 0;
934 #ifdef RE_ENABLE_I18N
935 if (dfa->mb_cur_max > 1)
939 #if !defined(__GNUC__) || __GNUC__ < 3
940 static short utf8_sb_map_inited = 0;
942 if (! utf8_sb_map_inited)
946 utf8_sb_map_inited = 0;
947 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
948 utf8_sb_map[i] = BITSET_WORD_MAX;
951 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
957 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
958 if (BE (dfa->sb_char == NULL, 0))
961 /* Set the bits corresponding to single byte chars. */
962 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
963 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
965 wint_t wch = __btowc (ch);
967 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
969 if (isascii (ch) && wch != ch)
970 dfa->map_notascii = 1;
977 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
982 /* Initialize WORD_CHAR table, which indicate which character is
983 "word". In this case "word" means that it is the word construction
984 character used by some operators like "\<", "\>", etc. */
988 init_word_char (re_dfa_t *dfa)
991 dfa->word_ops_used = 1;
993 if (BE (dfa->map_notascii == 0, 1))
995 if (sizeof (dfa->word_char[0]) == 8)
997 /* The extra temporaries here avoid "implicitly truncated"
998 warnings in the case when this is dead code, i.e. 32-bit. */
999 const uint64_t wc0 = UINT64_C (0x03ff000000000000);
1000 const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
1001 dfa->word_char[0] = wc0;
1002 dfa->word_char[1] = wc1;
1005 else if (sizeof (dfa->word_char[0]) == 4)
1007 dfa->word_char[0] = UINT32_C (0x00000000);
1008 dfa->word_char[1] = UINT32_C (0x03ff0000);
1009 dfa->word_char[2] = UINT32_C (0x87fffffe);
1010 dfa->word_char[3] = UINT32_C (0x07fffffe);
1017 if (BE (dfa->is_utf8, 1))
1019 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
1025 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
1026 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
1027 if (isalnum (ch) || ch == '_')
1028 dfa->word_char[i] |= (bitset_word_t) 1 << j;
1031 /* Free the work area which are only used while compiling. */
1034 free_workarea_compile (regex_t *preg)
1036 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1037 bin_tree_storage_t *storage, *next;
1038 for (storage = dfa->str_tree_storage; storage; storage = next)
1040 next = storage->next;
1043 dfa->str_tree_storage = NULL;
1044 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1045 dfa->str_tree = NULL;
1046 re_free (dfa->org_indices);
1047 dfa->org_indices = NULL;
1050 /* Create initial states for all contexts. */
1052 static reg_errcode_t
1053 create_initial_state (re_dfa_t *dfa)
1057 re_node_set init_nodes;
1059 /* Initial states have the epsilon closure of the node which is
1060 the first node of the regular expression. */
1061 first = dfa->str_tree->first->node_idx;
1062 dfa->init_node = first;
1063 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1064 if (BE (err != REG_NOERROR, 0))
1067 /* The back-references which are in initial states can epsilon transit,
1068 since in this case all of the subexpressions can be null.
1069 Then we add epsilon closures of the nodes which are the next nodes of
1070 the back-references. */
1071 if (dfa->nbackref > 0)
1072 for (i = 0; i < init_nodes.nelem; ++i)
1074 int node_idx = init_nodes.elems[i];
1075 re_token_type_t type = dfa->nodes[node_idx].type;
1078 if (type != OP_BACK_REF)
1080 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1082 re_token_t *clexp_node;
1083 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1084 if (clexp_node->type == OP_CLOSE_SUBEXP
1085 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1088 if (clexp_idx == init_nodes.nelem)
1091 if (type == OP_BACK_REF)
1093 int dest_idx = dfa->edests[node_idx].elems[0];
1094 if (!re_node_set_contains (&init_nodes, dest_idx))
1096 reg_errcode_t err = re_node_set_merge (&init_nodes,
1099 if (err != REG_NOERROR)
1106 /* It must be the first time to invoke acquire_state. */
1107 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1108 /* We don't check ERR here, since the initial state must not be NULL. */
1109 if (BE (dfa->init_state == NULL, 0))
1111 if (dfa->init_state->has_constraint)
1113 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1115 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1117 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1121 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1122 || dfa->init_state_begbuf == NULL, 0))
1126 dfa->init_state_word = dfa->init_state_nl
1127 = dfa->init_state_begbuf = dfa->init_state;
1129 re_node_set_free (&init_nodes);
1133 #ifdef RE_ENABLE_I18N
1134 /* If it is possible to do searching in single byte encoding instead of UTF-8
1135 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1136 DFA nodes where needed. */
1139 optimize_utf8 (re_dfa_t *dfa)
1141 int node, i, mb_chars = 0, has_period = 0;
1143 for (node = 0; node < dfa->nodes_len; ++node)
1144 switch (dfa->nodes[node].type)
1147 if (dfa->nodes[node].opr.c >= 0x80)
1151 switch (dfa->nodes[node].opr.ctx_type)
1159 /* Word anchors etc. cannot be handled. It's okay to test
1160 opr.ctx_type since constraints (for all DFA nodes) are
1161 created by ORing one or more opr.ctx_type values. */
1171 case OP_DUP_ASTERISK:
1172 case OP_OPEN_SUBEXP:
1173 case OP_CLOSE_SUBEXP:
1175 case COMPLEX_BRACKET:
1177 case SIMPLE_BRACKET:
1178 /* Just double check. The non-ASCII range starts at 0x80. */
1179 assert (0x80 % BITSET_WORD_BITS == 0);
1180 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1181 if (dfa->nodes[node].opr.sbcset[i])
1188 if (mb_chars || has_period)
1189 for (node = 0; node < dfa->nodes_len; ++node)
1191 if (dfa->nodes[node].type == CHARACTER
1192 && dfa->nodes[node].opr.c >= 0x80)
1193 dfa->nodes[node].mb_partial = 0;
1194 else if (dfa->nodes[node].type == OP_PERIOD)
1195 dfa->nodes[node].type = OP_UTF8_PERIOD;
1198 /* The search can be in single byte locale. */
1199 dfa->mb_cur_max = 1;
1201 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1205 /* Analyze the structure tree, and calculate "first", "next", "edest",
1206 "eclosure", and "inveclosure". */
1208 static reg_errcode_t
1209 analyze (regex_t *preg)
1211 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1214 /* Allocate arrays. */
1215 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1216 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1217 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1218 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1219 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1220 || dfa->eclosures == NULL, 0))
1223 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1224 if (dfa->subexp_map != NULL)
1227 for (i = 0; i < preg->re_nsub; i++)
1228 dfa->subexp_map[i] = i;
1229 preorder (dfa->str_tree, optimize_subexps, dfa);
1230 for (i = 0; i < preg->re_nsub; i++)
1231 if (dfa->subexp_map[i] != i)
1233 if (i == preg->re_nsub)
1235 free (dfa->subexp_map);
1236 dfa->subexp_map = NULL;
1240 ret = postorder (dfa->str_tree, lower_subexps, preg);
1241 if (BE (ret != REG_NOERROR, 0))
1243 ret = postorder (dfa->str_tree, calc_first, dfa);
1244 if (BE (ret != REG_NOERROR, 0))
1246 preorder (dfa->str_tree, calc_next, dfa);
1247 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1248 if (BE (ret != REG_NOERROR, 0))
1250 ret = calc_eclosure (dfa);
1251 if (BE (ret != REG_NOERROR, 0))
1254 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1255 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1256 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1259 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1260 if (BE (dfa->inveclosures == NULL, 0))
1262 ret = calc_inveclosure (dfa);
1268 /* Our parse trees are very unbalanced, so we cannot use a stack to
1269 implement parse tree visits. Instead, we use parent pointers and
1270 some hairy code in these two functions. */
1271 static reg_errcode_t
1272 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1275 bin_tree_t *node, *prev;
1277 for (node = root; ; )
1279 /* Descend down the tree, preferably to the left (or to the right
1280 if that's the only child). */
1281 while (node->left || node->right)
1289 reg_errcode_t err = fn (extra, node);
1290 if (BE (err != REG_NOERROR, 0))
1292 if (node->parent == NULL)
1295 node = node->parent;
1297 /* Go up while we have a node that is reached from the right. */
1298 while (node->right == prev || node->right == NULL);
1303 static reg_errcode_t
1304 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1309 for (node = root; ; )
1311 reg_errcode_t err = fn (extra, node);
1312 if (BE (err != REG_NOERROR, 0))
1315 /* Go to the left node, or up and to the right. */
1320 bin_tree_t *prev = NULL;
1321 while (node->right == prev || node->right == NULL)
1324 node = node->parent;
1333 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1334 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1335 backreferences as well. Requires a preorder visit. */
1336 static reg_errcode_t
1337 optimize_subexps (void *extra, bin_tree_t *node)
1339 re_dfa_t *dfa = (re_dfa_t *) extra;
1341 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1343 int idx = node->token.opr.idx;
1344 node->token.opr.idx = dfa->subexp_map[idx];
1345 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1348 else if (node->token.type == SUBEXP
1349 && node->left && node->left->token.type == SUBEXP)
1351 int other_idx = node->left->token.opr.idx;
1353 node->left = node->left->left;
1355 node->left->parent = node;
1357 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1358 if (other_idx < BITSET_WORD_BITS)
1359 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1365 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1366 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1367 static reg_errcode_t
1368 lower_subexps (void *extra, bin_tree_t *node)
1370 regex_t *preg = (regex_t *) extra;
1371 reg_errcode_t err = REG_NOERROR;
1373 if (node->left && node->left->token.type == SUBEXP)
1375 node->left = lower_subexp (&err, preg, node->left);
1377 node->left->parent = node;
1379 if (node->right && node->right->token.type == SUBEXP)
1381 node->right = lower_subexp (&err, preg, node->right);
1383 node->right->parent = node;
1390 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1392 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1393 bin_tree_t *body = node->left;
1394 bin_tree_t *op, *cls, *tree1, *tree;
1397 /* We do not optimize empty subexpressions, because otherwise we may
1398 have bad CONCAT nodes with NULL children. This is obviously not
1399 very common, so we do not lose much. An example that triggers
1400 this case is the sed "script" /\(\)/x. */
1401 && node->left != NULL
1402 && (node->token.opr.idx >= BITSET_WORD_BITS
1403 || !(dfa->used_bkref_map
1404 & ((bitset_word_t) 1 << node->token.opr.idx))))
1407 /* Convert the SUBEXP node to the concatenation of an
1408 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1409 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1410 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1411 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1412 tree = create_tree (dfa, op, tree1, CONCAT);
1413 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1419 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1420 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1424 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1425 nodes. Requires a postorder visit. */
1426 static reg_errcode_t
1427 calc_first (void *extra, bin_tree_t *node)
1429 re_dfa_t *dfa = (re_dfa_t *) extra;
1430 if (node->token.type == CONCAT)
1432 node->first = node->left->first;
1433 node->node_idx = node->left->node_idx;
1438 node->node_idx = re_dfa_add_node (dfa, node->token);
1439 if (BE (node->node_idx == -1, 0))
1441 if (node->token.type == ANCHOR)
1442 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1447 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1448 static reg_errcode_t
1449 calc_next (void *extra, bin_tree_t *node)
1451 switch (node->token.type)
1453 case OP_DUP_ASTERISK:
1454 node->left->next = node;
1457 node->left->next = node->right->first;
1458 node->right->next = node->next;
1462 node->left->next = node->next;
1464 node->right->next = node->next;
1470 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1471 static reg_errcode_t
1472 link_nfa_nodes (void *extra, bin_tree_t *node)
1474 re_dfa_t *dfa = (re_dfa_t *) extra;
1475 int idx = node->node_idx;
1476 reg_errcode_t err = REG_NOERROR;
1478 switch (node->token.type)
1484 assert (node->next == NULL);
1487 case OP_DUP_ASTERISK:
1491 dfa->has_plural_match = 1;
1492 if (node->left != NULL)
1493 left = node->left->first->node_idx;
1495 left = node->next->node_idx;
1496 if (node->right != NULL)
1497 right = node->right->first->node_idx;
1499 right = node->next->node_idx;
1501 assert (right > -1);
1502 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1507 case OP_OPEN_SUBEXP:
1508 case OP_CLOSE_SUBEXP:
1509 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1513 dfa->nexts[idx] = node->next->node_idx;
1514 if (node->token.type == OP_BACK_REF)
1515 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1519 assert (!IS_EPSILON_NODE (node->token.type));
1520 dfa->nexts[idx] = node->next->node_idx;
1527 /* Duplicate the epsilon closure of the node ROOT_NODE.
1528 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1529 to their own constraint. */
1531 static reg_errcode_t
1533 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1534 int root_node, unsigned int init_constraint)
1536 int org_node, clone_node, ret;
1537 unsigned int constraint = init_constraint;
1538 for (org_node = top_org_node, clone_node = top_clone_node;;)
1540 int org_dest, clone_dest;
1541 if (dfa->nodes[org_node].type == OP_BACK_REF)
1543 /* If the back reference epsilon-transit, its destination must
1544 also have the constraint. Then duplicate the epsilon closure
1545 of the destination of the back reference, and store it in
1546 edests of the back reference. */
1547 org_dest = dfa->nexts[org_node];
1548 re_node_set_empty (dfa->edests + clone_node);
1549 clone_dest = duplicate_node (dfa, org_dest, constraint);
1550 if (BE (clone_dest == -1, 0))
1552 dfa->nexts[clone_node] = dfa->nexts[org_node];
1553 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1554 if (BE (ret < 0, 0))
1557 else if (dfa->edests[org_node].nelem == 0)
1559 /* In case of the node can't epsilon-transit, don't duplicate the
1560 destination and store the original destination as the
1561 destination of the node. */
1562 dfa->nexts[clone_node] = dfa->nexts[org_node];
1565 else if (dfa->edests[org_node].nelem == 1)
1567 /* In case of the node can epsilon-transit, and it has only one
1569 org_dest = dfa->edests[org_node].elems[0];
1570 re_node_set_empty (dfa->edests + clone_node);
1571 /* If the node is root_node itself, it means the epsilon clsoure
1572 has a loop. Then tie it to the destination of the root_node. */
1573 if (org_node == root_node && clone_node != org_node)
1575 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1576 if (BE (ret < 0, 0))
1580 /* In case of the node has another constraint, add it. */
1581 constraint |= dfa->nodes[org_node].constraint;
1582 clone_dest = duplicate_node (dfa, org_dest, constraint);
1583 if (BE (clone_dest == -1, 0))
1585 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1586 if (BE (ret < 0, 0))
1589 else /* dfa->edests[org_node].nelem == 2 */
1591 /* In case of the node can epsilon-transit, and it has two
1592 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1593 org_dest = dfa->edests[org_node].elems[0];
1594 re_node_set_empty (dfa->edests + clone_node);
1595 /* Search for a duplicated node which satisfies the constraint. */
1596 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1597 if (clone_dest == -1)
1599 /* There is no such duplicated node, create a new one. */
1601 clone_dest = duplicate_node (dfa, org_dest, constraint);
1602 if (BE (clone_dest == -1, 0))
1604 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1605 if (BE (ret < 0, 0))
1607 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1608 root_node, constraint);
1609 if (BE (err != REG_NOERROR, 0))
1614 /* There is a duplicated node which satisfies the constraint,
1615 use it to avoid infinite loop. */
1616 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1617 if (BE (ret < 0, 0))
1621 org_dest = dfa->edests[org_node].elems[1];
1622 clone_dest = duplicate_node (dfa, org_dest, constraint);
1623 if (BE (clone_dest == -1, 0))
1625 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1626 if (BE (ret < 0, 0))
1629 org_node = org_dest;
1630 clone_node = clone_dest;
1635 /* Search for a node which is duplicated from the node ORG_NODE, and
1636 satisfies the constraint CONSTRAINT. */
1639 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1640 unsigned int constraint)
1643 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1645 if (org_node == dfa->org_indices[idx]
1646 && constraint == dfa->nodes[idx].constraint)
1647 return idx; /* Found. */
1649 return -1; /* Not found. */
1652 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1653 Return the index of the new node, or -1 if insufficient storage is
1657 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1659 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1660 if (BE (dup_idx != -1, 1))
1662 dfa->nodes[dup_idx].constraint = constraint;
1663 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1664 dfa->nodes[dup_idx].duplicated = 1;
1666 /* Store the index of the original node. */
1667 dfa->org_indices[dup_idx] = org_idx;
1672 static reg_errcode_t
1673 calc_inveclosure (re_dfa_t *dfa)
1676 for (idx = 0; idx < dfa->nodes_len; ++idx)
1677 re_node_set_init_empty (dfa->inveclosures + idx);
1679 for (src = 0; src < dfa->nodes_len; ++src)
1681 int *elems = dfa->eclosures[src].elems;
1682 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1684 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1685 if (BE (ret == -1, 0))
1693 /* Calculate "eclosure" for all the node in DFA. */
1695 static reg_errcode_t
1696 calc_eclosure (re_dfa_t *dfa)
1698 int node_idx, incomplete;
1700 assert (dfa->nodes_len > 0);
1703 /* For each nodes, calculate epsilon closure. */
1704 for (node_idx = 0; ; ++node_idx)
1707 re_node_set eclosure_elem;
1708 if (node_idx == dfa->nodes_len)
1717 assert (dfa->eclosures[node_idx].nelem != -1);
1720 /* If we have already calculated, skip it. */
1721 if (dfa->eclosures[node_idx].nelem != 0)
1723 /* Calculate epsilon closure of `node_idx'. */
1724 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1725 if (BE (err != REG_NOERROR, 0))
1728 if (dfa->eclosures[node_idx].nelem == 0)
1731 re_node_set_free (&eclosure_elem);
1737 /* Calculate epsilon closure of NODE. */
1739 static reg_errcode_t
1740 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1744 re_node_set eclosure;
1747 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1748 if (BE (err != REG_NOERROR, 0))
1751 /* This indicates that we are calculating this node now.
1752 We reference this value to avoid infinite loop. */
1753 dfa->eclosures[node].nelem = -1;
1755 /* If the current node has constraints, duplicate all nodes
1756 since they must inherit the constraints. */
1757 if (dfa->nodes[node].constraint
1758 && dfa->edests[node].nelem
1759 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1761 err = duplicate_node_closure (dfa, node, node, node,
1762 dfa->nodes[node].constraint);
1763 if (BE (err != REG_NOERROR, 0))
1767 /* Expand each epsilon destination nodes. */
1768 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1769 for (i = 0; i < dfa->edests[node].nelem; ++i)
1771 re_node_set eclosure_elem;
1772 int edest = dfa->edests[node].elems[i];
1773 /* If calculating the epsilon closure of `edest' is in progress,
1774 return intermediate result. */
1775 if (dfa->eclosures[edest].nelem == -1)
1780 /* If we haven't calculated the epsilon closure of `edest' yet,
1781 calculate now. Otherwise use calculated epsilon closure. */
1782 if (dfa->eclosures[edest].nelem == 0)
1784 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1785 if (BE (err != REG_NOERROR, 0))
1789 eclosure_elem = dfa->eclosures[edest];
1790 /* Merge the epsilon closure of `edest'. */
1791 err = re_node_set_merge (&eclosure, &eclosure_elem);
1792 if (BE (err != REG_NOERROR, 0))
1794 /* If the epsilon closure of `edest' is incomplete,
1795 the epsilon closure of this node is also incomplete. */
1796 if (dfa->eclosures[edest].nelem == 0)
1799 re_node_set_free (&eclosure_elem);
1803 /* An epsilon closure includes itself. */
1804 ret = re_node_set_insert (&eclosure, node);
1805 if (BE (ret < 0, 0))
1807 if (incomplete && !root)
1808 dfa->eclosures[node].nelem = 0;
1810 dfa->eclosures[node] = eclosure;
1811 *new_set = eclosure;
1815 /* Functions for token which are used in the parser. */
1817 /* Fetch a token from INPUT.
1818 We must not use this function inside bracket expressions. */
1822 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1824 re_string_skip_bytes (input, peek_token (result, input, syntax));
1827 /* Peek a token from INPUT, and return the length of the token.
1828 We must not use this function inside bracket expressions. */
1832 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1836 if (re_string_eoi (input))
1838 token->type = END_OF_RE;
1842 c = re_string_peek_byte (input, 0);
1845 token->word_char = 0;
1846 #ifdef RE_ENABLE_I18N
1847 token->mb_partial = 0;
1848 if (input->mb_cur_max > 1 &&
1849 !re_string_first_byte (input, re_string_cur_idx (input)))
1851 token->type = CHARACTER;
1852 token->mb_partial = 1;
1859 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1861 token->type = BACK_SLASH;
1865 c2 = re_string_peek_byte_case (input, 1);
1867 token->type = CHARACTER;
1868 #ifdef RE_ENABLE_I18N
1869 if (input->mb_cur_max > 1)
1871 wint_t wc = re_string_wchar_at (input,
1872 re_string_cur_idx (input) + 1);
1873 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1877 token->word_char = IS_WORD_CHAR (c2) != 0;
1882 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1883 token->type = OP_ALT;
1885 case '1': case '2': case '3': case '4': case '5':
1886 case '6': case '7': case '8': case '9':
1887 if (!(syntax & RE_NO_BK_REFS))
1889 token->type = OP_BACK_REF;
1890 token->opr.idx = c2 - '1';
1894 if (!(syntax & RE_NO_GNU_OPS))
1896 token->type = ANCHOR;
1897 token->opr.ctx_type = WORD_FIRST;
1901 if (!(syntax & RE_NO_GNU_OPS))
1903 token->type = ANCHOR;
1904 token->opr.ctx_type = WORD_LAST;
1908 if (!(syntax & RE_NO_GNU_OPS))
1910 token->type = ANCHOR;
1911 token->opr.ctx_type = WORD_DELIM;
1915 if (!(syntax & RE_NO_GNU_OPS))
1917 token->type = ANCHOR;
1918 token->opr.ctx_type = NOT_WORD_DELIM;
1922 if (!(syntax & RE_NO_GNU_OPS))
1923 token->type = OP_WORD;
1926 if (!(syntax & RE_NO_GNU_OPS))
1927 token->type = OP_NOTWORD;
1930 if (!(syntax & RE_NO_GNU_OPS))
1931 token->type = OP_SPACE;
1934 if (!(syntax & RE_NO_GNU_OPS))
1935 token->type = OP_NOTSPACE;
1938 if (!(syntax & RE_NO_GNU_OPS))
1940 token->type = ANCHOR;
1941 token->opr.ctx_type = BUF_FIRST;
1945 if (!(syntax & RE_NO_GNU_OPS))
1947 token->type = ANCHOR;
1948 token->opr.ctx_type = BUF_LAST;
1952 if (!(syntax & RE_NO_BK_PARENS))
1953 token->type = OP_OPEN_SUBEXP;
1956 if (!(syntax & RE_NO_BK_PARENS))
1957 token->type = OP_CLOSE_SUBEXP;
1960 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1961 token->type = OP_DUP_PLUS;
1964 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1965 token->type = OP_DUP_QUESTION;
1968 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1969 token->type = OP_OPEN_DUP_NUM;
1972 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1973 token->type = OP_CLOSE_DUP_NUM;
1981 token->type = CHARACTER;
1982 #ifdef RE_ENABLE_I18N
1983 if (input->mb_cur_max > 1)
1985 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1986 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1990 token->word_char = IS_WORD_CHAR (token->opr.c);
1995 if (syntax & RE_NEWLINE_ALT)
1996 token->type = OP_ALT;
1999 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
2000 token->type = OP_ALT;
2003 token->type = OP_DUP_ASTERISK;
2006 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
2007 token->type = OP_DUP_PLUS;
2010 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
2011 token->type = OP_DUP_QUESTION;
2014 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2015 token->type = OP_OPEN_DUP_NUM;
2018 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2019 token->type = OP_CLOSE_DUP_NUM;
2022 if (syntax & RE_NO_BK_PARENS)
2023 token->type = OP_OPEN_SUBEXP;
2026 if (syntax & RE_NO_BK_PARENS)
2027 token->type = OP_CLOSE_SUBEXP;
2030 token->type = OP_OPEN_BRACKET;
2033 token->type = OP_PERIOD;
2036 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2037 re_string_cur_idx (input) != 0)
2039 char prev = re_string_peek_byte (input, -1);
2040 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2043 token->type = ANCHOR;
2044 token->opr.ctx_type = LINE_FIRST;
2047 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2048 re_string_cur_idx (input) + 1 != re_string_length (input))
2051 re_string_skip_bytes (input, 1);
2052 peek_token (&next, input, syntax);
2053 re_string_skip_bytes (input, -1);
2054 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2057 token->type = ANCHOR;
2058 token->opr.ctx_type = LINE_LAST;
2066 /* Peek a token from INPUT, and return the length of the token.
2067 We must not use this function out of bracket expressions. */
2071 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2074 if (re_string_eoi (input))
2076 token->type = END_OF_RE;
2079 c = re_string_peek_byte (input, 0);
2082 #ifdef RE_ENABLE_I18N
2083 if (input->mb_cur_max > 1 &&
2084 !re_string_first_byte (input, re_string_cur_idx (input)))
2086 token->type = CHARACTER;
2089 #endif /* RE_ENABLE_I18N */
2091 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2092 && re_string_cur_idx (input) + 1 < re_string_length (input))
2094 /* In this case, '\' escape a character. */
2096 re_string_skip_bytes (input, 1);
2097 c2 = re_string_peek_byte (input, 0);
2099 token->type = CHARACTER;
2102 if (c == '[') /* '[' is a special char in a bracket exps. */
2106 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2107 c2 = re_string_peek_byte (input, 1);
2115 token->type = OP_OPEN_COLL_ELEM;
2118 token->type = OP_OPEN_EQUIV_CLASS;
2121 if (syntax & RE_CHAR_CLASSES)
2123 token->type = OP_OPEN_CHAR_CLASS;
2126 /* else fall through. */
2128 token->type = CHARACTER;
2138 token->type = OP_CHARSET_RANGE;
2141 token->type = OP_CLOSE_BRACKET;
2144 token->type = OP_NON_MATCH_LIST;
2147 token->type = CHARACTER;
2152 /* Functions for parser. */
2154 /* Entry point of the parser.
2155 Parse the regular expression REGEXP and return the structure tree.
2156 If an error is occured, ERR is set by error code, and return NULL.
2157 This function build the following tree, from regular expression <reg_exp>:
2163 CAT means concatenation.
2164 EOR means end of regular expression. */
2167 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2170 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2171 bin_tree_t *tree, *eor, *root;
2172 re_token_t current_token;
2173 dfa->syntax = syntax;
2174 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2175 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2176 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2178 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2180 root = create_tree (dfa, tree, eor, CONCAT);
2183 if (BE (eor == NULL || root == NULL, 0))
2191 /* This function build the following tree, from regular expression
2192 <branch1>|<branch2>:
2198 ALT means alternative, which represents the operator `|'. */
2201 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2202 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2204 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2205 bin_tree_t *tree, *branch = NULL;
2206 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2207 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2210 while (token->type == OP_ALT)
2212 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2213 if (token->type != OP_ALT && token->type != END_OF_RE
2214 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2216 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2217 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2222 tree = create_tree (dfa, tree, branch, OP_ALT);
2223 if (BE (tree == NULL, 0))
2232 /* This function build the following tree, from regular expression
2239 CAT means concatenation. */
2242 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2243 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2245 bin_tree_t *tree, *exp;
2246 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2247 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2248 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2251 while (token->type != OP_ALT && token->type != END_OF_RE
2252 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2254 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2255 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2258 postorder (tree, free_tree, NULL);
2261 if (tree != NULL && exp != NULL)
2263 bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2264 if (newtree == NULL)
2266 postorder (exp, free_tree, NULL);
2267 postorder (tree, free_tree, NULL);
2273 else if (tree == NULL)
2275 /* Otherwise exp == NULL, we don't need to create new tree. */
2280 /* This function build the following tree, from regular expression a*:
2287 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2288 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2290 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2292 switch (token->type)
2295 tree = create_token_tree (dfa, NULL, NULL, token);
2296 if (BE (tree == NULL, 0))
2301 #ifdef RE_ENABLE_I18N
2302 if (dfa->mb_cur_max > 1)
2304 while (!re_string_eoi (regexp)
2305 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2307 bin_tree_t *mbc_remain;
2308 fetch_token (token, regexp, syntax);
2309 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2310 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2311 if (BE (mbc_remain == NULL || tree == NULL, 0))
2320 case OP_OPEN_SUBEXP:
2321 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2322 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2325 case OP_OPEN_BRACKET:
2326 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2327 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2331 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2336 dfa->used_bkref_map |= 1 << token->opr.idx;
2337 tree = create_token_tree (dfa, NULL, NULL, token);
2338 if (BE (tree == NULL, 0))
2344 dfa->has_mb_node = 1;
2346 case OP_OPEN_DUP_NUM:
2347 if (syntax & RE_CONTEXT_INVALID_DUP)
2353 case OP_DUP_ASTERISK:
2355 case OP_DUP_QUESTION:
2356 if (syntax & RE_CONTEXT_INVALID_OPS)
2361 else if (syntax & RE_CONTEXT_INDEP_OPS)
2363 fetch_token (token, regexp, syntax);
2364 return parse_expression (regexp, preg, token, syntax, nest, err);
2366 /* else fall through */
2367 case OP_CLOSE_SUBEXP:
2368 if ((token->type == OP_CLOSE_SUBEXP) &&
2369 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2374 /* else fall through */
2375 case OP_CLOSE_DUP_NUM:
2376 /* We treat it as a normal character. */
2378 /* Then we can these characters as normal characters. */
2379 token->type = CHARACTER;
2380 /* mb_partial and word_char bits should be initialized already
2382 tree = create_token_tree (dfa, NULL, NULL, token);
2383 if (BE (tree == NULL, 0))
2390 if ((token->opr.ctx_type
2391 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2392 && dfa->word_ops_used == 0)
2393 init_word_char (dfa);
2394 if (token->opr.ctx_type == WORD_DELIM
2395 || token->opr.ctx_type == NOT_WORD_DELIM)
2397 bin_tree_t *tree_first, *tree_last;
2398 if (token->opr.ctx_type == WORD_DELIM)
2400 token->opr.ctx_type = WORD_FIRST;
2401 tree_first = create_token_tree (dfa, NULL, NULL, token);
2402 token->opr.ctx_type = WORD_LAST;
2406 token->opr.ctx_type = INSIDE_WORD;
2407 tree_first = create_token_tree (dfa, NULL, NULL, token);
2408 token->opr.ctx_type = INSIDE_NOTWORD;
2410 tree_last = create_token_tree (dfa, NULL, NULL, token);
2411 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2412 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2420 tree = create_token_tree (dfa, NULL, NULL, token);
2421 if (BE (tree == NULL, 0))
2427 /* We must return here, since ANCHORs can't be followed
2428 by repetition operators.
2429 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2430 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2431 fetch_token (token, regexp, syntax);
2434 tree = create_token_tree (dfa, NULL, NULL, token);
2435 if (BE (tree == NULL, 0))
2440 if (dfa->mb_cur_max > 1)
2441 dfa->has_mb_node = 1;
2445 tree = build_charclass_op (dfa, regexp->trans,
2448 token->type == OP_NOTWORD, err);
2449 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2454 tree = build_charclass_op (dfa, regexp->trans,
2457 token->type == OP_NOTSPACE, err);
2458 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2468 /* Must not happen? */
2474 fetch_token (token, regexp, syntax);
2476 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2477 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2479 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2480 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2482 /* In BRE consecutive duplications are not allowed. */
2483 if ((syntax & RE_CONTEXT_INVALID_DUP)
2484 && (token->type == OP_DUP_ASTERISK
2485 || token->type == OP_OPEN_DUP_NUM))
2495 /* This function build the following tree, from regular expression
2503 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2504 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2506 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2509 cur_nsub = preg->re_nsub++;
2511 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2513 /* The subexpression may be a null string. */
2514 if (token->type == OP_CLOSE_SUBEXP)
2518 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2519 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2522 postorder (tree, free_tree, NULL);
2525 if (BE (*err != REG_NOERROR, 0))
2529 if (cur_nsub <= '9' - '1')
2530 dfa->completed_bkref_map |= 1 << cur_nsub;
2532 tree = create_tree (dfa, tree, NULL, SUBEXP);
2533 if (BE (tree == NULL, 0))
2538 tree->token.opr.idx = cur_nsub;
2542 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2545 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2546 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2548 bin_tree_t *tree = NULL, *old_tree = NULL;
2549 int i, start, end, start_idx = re_string_cur_idx (regexp);
2550 re_token_t start_token = *token;
2552 if (token->type == OP_OPEN_DUP_NUM)
2555 start = fetch_number (regexp, token, syntax);
2558 if (token->type == CHARACTER && token->opr.c == ',')
2559 start = 0; /* We treat "{,m}" as "{0,m}". */
2562 *err = REG_BADBR; /* <re>{} is invalid. */
2566 if (BE (start != -2, 1))
2568 /* We treat "{n}" as "{n,n}". */
2569 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2570 : ((token->type == CHARACTER && token->opr.c == ',')
2571 ? fetch_number (regexp, token, syntax) : -2));
2573 if (BE (start == -2 || end == -2, 0))
2575 /* Invalid sequence. */
2576 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2578 if (token->type == END_OF_RE)
2586 /* If the syntax bit is set, rollback. */
2587 re_string_set_index (regexp, start_idx);
2588 *token = start_token;
2589 token->type = CHARACTER;
2590 /* mb_partial and word_char bits should be already initialized by
2595 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2597 /* First number greater than second. */
2604 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2605 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2608 fetch_token (token, regexp, syntax);
2610 if (BE (elem == NULL, 0))
2612 if (BE (start == 0 && end == 0, 0))
2614 postorder (elem, free_tree, NULL);
2618 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2619 if (BE (start > 0, 0))
2622 for (i = 2; i <= start; ++i)
2624 elem = duplicate_tree (elem, dfa);
2625 tree = create_tree (dfa, tree, elem, CONCAT);
2626 if (BE (elem == NULL || tree == NULL, 0))
2627 goto parse_dup_op_espace;
2633 /* Duplicate ELEM before it is marked optional. */
2634 elem = duplicate_tree (elem, dfa);
2640 if (elem->token.type == SUBEXP)
2641 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2643 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2644 if (BE (tree == NULL, 0))
2645 goto parse_dup_op_espace;
2647 /* This loop is actually executed only when end != -1,
2648 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2649 already created the start+1-th copy. */
2650 for (i = start + 2; i <= end; ++i)
2652 elem = duplicate_tree (elem, dfa);
2653 tree = create_tree (dfa, tree, elem, CONCAT);
2654 if (BE (elem == NULL || tree == NULL, 0))
2655 goto parse_dup_op_espace;
2657 tree = create_tree (dfa, tree, NULL, OP_ALT);
2658 if (BE (tree == NULL, 0))
2659 goto parse_dup_op_espace;
2663 tree = create_tree (dfa, old_tree, tree, CONCAT);
2667 parse_dup_op_espace:
2672 /* Size of the names for collating symbol/equivalence_class/character_class.
2673 I'm not sure, but maybe enough. */
2674 #define BRACKET_NAME_BUF_SIZE 32
2677 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2678 Build the range expression which starts from START_ELEM, and ends
2679 at END_ELEM. The result are written to MBCSET and SBCSET.
2680 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2681 mbcset->range_ends, is a pointer argument sinse we may
2684 static reg_errcode_t
2686 # ifdef RE_ENABLE_I18N
2687 build_range_exp (reg_syntax_t syntax, bitset_t sbcset, re_charset_t *mbcset,
2688 int *range_alloc, bracket_elem_t *start_elem,
2689 bracket_elem_t *end_elem)
2690 # else /* not RE_ENABLE_I18N */
2691 build_range_exp (reg_syntax_t syntax, bitset_t sbcset,
2692 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2693 # endif /* not RE_ENABLE_I18N */
2695 unsigned int start_ch, end_ch;
2696 /* Equivalence Classes and Character Classes can't be a range start/end. */
2697 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2698 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2702 /* We can handle no multi character collating elements without libc
2704 if (BE ((start_elem->type == COLL_SYM
2705 && strlen ((char *) start_elem->opr.name) > 1)
2706 || (end_elem->type == COLL_SYM
2707 && strlen ((char *) end_elem->opr.name) > 1), 0))
2708 return REG_ECOLLATE;
2710 # ifdef RE_ENABLE_I18N
2716 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2717 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2719 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2720 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2724 * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2725 * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2726 * unsigned, so we don't have sign extension problems.
2728 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2729 ? start_ch : start_elem->opr.wch);
2730 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2731 ? end_ch : end_elem->opr.wch);
2733 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2734 ? __btowc (start_ch) : start_elem->opr.wch);
2735 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2736 ? __btowc (end_ch) : end_elem->opr.wch);
2738 if (start_wc == WEOF || end_wc == WEOF)
2739 return REG_ECOLLATE;
2740 else if ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc)
2743 /* Got valid collation sequence values, add them as a new entry.
2744 However, for !_LIBC we have no collation elements: if the
2745 character set is single byte, the single byte character set
2746 that we build below suffices. parse_bracket_exp passes
2747 no MBCSET if dfa->mb_cur_max == 1. */
2750 /* Check the space of the arrays. */
2751 if (BE (*range_alloc == mbcset->nranges, 0))
2753 /* There is not enough space, need realloc. */
2754 wchar_t *new_array_start, *new_array_end;
2757 /* +1 in case of mbcset->nranges is 0. */
2758 new_nranges = 2 * mbcset->nranges + 1;
2759 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2760 are NULL if *range_alloc == 0. */
2761 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2763 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2766 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2768 /* if one is not NULL, free it to avoid leaks */
2769 if (new_array_start != NULL)
2770 re_free(new_array_start);
2771 if (new_array_end != NULL)
2772 re_free(new_array_end);
2776 mbcset->range_starts = new_array_start;
2777 mbcset->range_ends = new_array_end;
2778 *range_alloc = new_nranges;
2781 mbcset->range_starts[mbcset->nranges] = start_wc;
2782 mbcset->range_ends[mbcset->nranges++] = end_wc;
2785 /* Build the table for single byte characters. */
2786 for (wc = 0; wc < SBC_MAX; ++wc)
2788 if (start_wc <= wc && wc <= end_wc)
2789 bitset_set (sbcset, wc);
2792 # else /* not RE_ENABLE_I18N */
2795 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2796 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2798 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2799 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2801 if (start_ch > end_ch)
2803 /* Build the table for single byte characters. */
2804 for (ch = 0; ch < SBC_MAX; ++ch)
2805 if (start_ch <= ch && ch <= end_ch)
2806 bitset_set (sbcset, ch);
2808 # endif /* not RE_ENABLE_I18N */
2811 #endif /* not _LIBC */
2814 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2815 Build the collating element which is represented by NAME.
2816 The result are written to MBCSET and SBCSET.
2817 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2818 pointer argument since we may update it. */
2820 static reg_errcode_t
2822 # ifdef RE_ENABLE_I18N
2823 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2824 int *coll_sym_alloc, const unsigned char *name)
2825 # else /* not RE_ENABLE_I18N */
2826 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2827 # endif /* not RE_ENABLE_I18N */
2829 size_t name_len = strlen ((const char *) name);
2830 if (BE (name_len != 1, 0))
2831 return REG_ECOLLATE;
2834 bitset_set (sbcset, name[0]);
2838 #endif /* not _LIBC */
2840 /* This function parse bracket expression like "[abc]", "[a-c]",
2844 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2845 reg_syntax_t syntax, reg_errcode_t *err)
2848 const unsigned char *collseqmb;
2849 const char *collseqwc;
2852 const int32_t *symb_table;
2853 const unsigned char *extra;
2855 /* Local function for parse_bracket_exp used in _LIBC environement.
2856 Seek the collating symbol entry correspondings to NAME.
2857 Return the index of the symbol in the SYMB_TABLE,
2858 or -1 if not found. */
2861 __attribute ((always_inline))
2862 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2866 for (elem = 0; elem < table_size; elem++)
2867 if (symb_table[2 * elem] != 0)
2869 int32_t idx = symb_table[2 * elem + 1];
2870 /* Skip the name of collating element name. */
2871 idx += 1 + extra[idx];
2872 if (/* Compare the length of the name. */
2873 name_len == extra[idx]
2874 /* Compare the name. */
2875 && memcmp (name, &extra[idx + 1], name_len) == 0)
2876 /* Yep, this is the entry. */
2882 /* Local function for parse_bracket_exp used in _LIBC environment.
2883 Look up the collation sequence value of BR_ELEM.
2884 Return the value if succeeded, UINT_MAX otherwise. */
2886 auto inline unsigned int
2887 __attribute ((always_inline))
2888 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2890 if (br_elem->type == SB_CHAR)
2893 if (MB_CUR_MAX == 1)
2896 return collseqmb[br_elem->opr.ch];
2899 wint_t wc = __btowc (br_elem->opr.ch);
2900 return __collseq_table_lookup (collseqwc, wc);
2903 else if (br_elem->type == MB_CHAR)
2906 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2908 else if (br_elem->type == COLL_SYM)
2910 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2914 elem = seek_collating_symbol_entry (br_elem->opr.name,
2918 /* We found the entry. */
2919 idx = symb_table[2 * elem + 1];
2920 /* Skip the name of collating element name. */
2921 idx += 1 + extra[idx];
2922 /* Skip the byte sequence of the collating element. */
2923 idx += 1 + extra[idx];
2924 /* Adjust for the alignment. */
2925 idx = (idx + 3) & ~3;
2926 /* Skip the multibyte collation sequence value. */
2927 idx += sizeof (unsigned int);
2928 /* Skip the wide char sequence of the collating element. */
2929 idx += sizeof (unsigned int) *
2930 (1 + *(unsigned int *) (extra + idx));
2931 /* Return the collation sequence value. */
2932 return *(unsigned int *) (extra + idx);
2934 else if (sym_name_len == 1)
2936 /* No valid character. Match it as a single byte
2938 return collseqmb[br_elem->opr.name[0]];
2941 else if (sym_name_len == 1)
2942 return collseqmb[br_elem->opr.name[0]];
2947 /* Local function for parse_bracket_exp used in _LIBC environement.
2948 Build the range expression which starts from START_ELEM, and ends
2949 at END_ELEM. The result are written to MBCSET and SBCSET.
2950 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2951 mbcset->range_ends, is a pointer argument sinse we may
2954 auto inline reg_errcode_t
2955 __attribute ((always_inline))
2956 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2957 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2960 uint32_t start_collseq;
2961 uint32_t end_collseq;
2963 /* Equivalence Classes and Character Classes can't be a range
2965 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2966 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2970 start_collseq = lookup_collation_sequence_value (start_elem);
2971 end_collseq = lookup_collation_sequence_value (end_elem);
2972 /* Check start/end collation sequence values. */
2973 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2974 return REG_ECOLLATE;
2975 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2978 /* Got valid collation sequence values, add them as a new entry.
2979 However, if we have no collation elements, and the character set
2980 is single byte, the single byte character set that we
2981 build below suffices. */
2982 if (nrules > 0 || dfa->mb_cur_max > 1)
2984 /* Check the space of the arrays. */
2985 if (BE (*range_alloc == mbcset->nranges, 0))
2987 /* There is not enough space, need realloc. */
2988 uint32_t *new_array_start;
2989 uint32_t *new_array_end;
2992 /* +1 in case of mbcset->nranges is 0. */
2993 new_nranges = 2 * mbcset->nranges + 1;
2994 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2996 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2999 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3002 mbcset->range_starts = new_array_start;
3003 mbcset->range_ends = new_array_end;
3004 *range_alloc = new_nranges;
3007 mbcset->range_starts[mbcset->nranges] = start_collseq;
3008 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3011 /* Build the table for single byte characters. */
3012 for (ch = 0; ch < SBC_MAX; ch++)
3014 uint32_t ch_collseq;
3016 if (MB_CUR_MAX == 1)
3019 ch_collseq = collseqmb[ch];
3021 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3022 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3023 bitset_set (sbcset, ch);
3028 /* Local function for parse_bracket_exp used in _LIBC environement.
3029 Build the collating element which is represented by NAME.
3030 The result are written to MBCSET and SBCSET.
3031 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3032 pointer argument sinse we may update it. */
3034 auto inline reg_errcode_t
3035 __attribute ((always_inline))
3036 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3037 int *coll_sym_alloc, const unsigned char *name)
3040 size_t name_len = strlen ((const char *) name);
3043 elem = seek_collating_symbol_entry (name, name_len);
3046 /* We found the entry. */
3047 idx = symb_table[2 * elem + 1];
3048 /* Skip the name of collating element name. */
3049 idx += 1 + extra[idx];
3051 else if (name_len == 1)
3053 /* No valid character, treat it as a normal
3055 bitset_set (sbcset, name[0]);
3059 return REG_ECOLLATE;
3061 /* Got valid collation sequence, add it as a new entry. */
3062 /* Check the space of the arrays. */
3063 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3065 /* Not enough, realloc it. */
3066 /* +1 in case of mbcset->ncoll_syms is 0. */
3067 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3068 /* Use realloc since mbcset->coll_syms is NULL
3070 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3071 new_coll_sym_alloc);
3072 if (BE (new_coll_syms == NULL, 0))
3074 mbcset->coll_syms = new_coll_syms;
3075 *coll_sym_alloc = new_coll_sym_alloc;
3077 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3082 if (BE (name_len != 1, 0))
3083 return REG_ECOLLATE;
3086 bitset_set (sbcset, name[0]);
3093 re_token_t br_token;
3094 re_bitset_ptr_t sbcset;
3095 #ifdef RE_ENABLE_I18N
3096 re_charset_t *mbcset;
3097 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3098 int equiv_class_alloc = 0, char_class_alloc = 0;
3099 #endif /* not RE_ENABLE_I18N */
3101 bin_tree_t *work_tree;
3103 int first_round = 1;
3105 collseqmb = (const unsigned char *)
3106 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3107 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3113 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3114 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3115 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3116 _NL_COLLATE_SYMB_TABLEMB);
3117 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3118 _NL_COLLATE_SYMB_EXTRAMB);
3121 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3122 #ifdef RE_ENABLE_I18N
3123 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3124 #endif /* RE_ENABLE_I18N */
3125 #ifdef RE_ENABLE_I18N
3126 if (BE (sbcset == NULL || mbcset == NULL, 0))
3128 if (BE (sbcset == NULL, 0))
3129 #endif /* RE_ENABLE_I18N */
3131 #ifdef RE_ENABLE_I18N
3139 token_len = peek_token_bracket (token, regexp, syntax);
3140 if (BE (token->type == END_OF_RE, 0))
3143 goto parse_bracket_exp_free_return;
3145 if (token->type == OP_NON_MATCH_LIST)
3147 #ifdef RE_ENABLE_I18N
3148 mbcset->non_match = 1;
3149 #endif /* not RE_ENABLE_I18N */
3151 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3152 bitset_set (sbcset, '\n');
3153 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3154 token_len = peek_token_bracket (token, regexp, syntax);
3155 if (BE (token->type == END_OF_RE, 0))
3158 goto parse_bracket_exp_free_return;
3162 /* We treat the first ']' as a normal character. */
3163 if (token->type == OP_CLOSE_BRACKET)
3164 token->type = CHARACTER;
3168 bracket_elem_t start_elem, end_elem;
3169 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3170 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3172 int token_len2 = 0, is_range_exp = 0;
3175 start_elem.opr.name = start_name_buf;
3176 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3177 syntax, first_round);
3178 if (BE (ret != REG_NOERROR, 0))
3181 goto parse_bracket_exp_free_return;
3185 /* Get information about the next token. We need it in any case. */
3186 token_len = peek_token_bracket (token, regexp, syntax);
3188 /* Do not check for ranges if we know they are not allowed. */
3189 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3191 if (BE (token->type == END_OF_RE, 0))
3194 goto parse_bracket_exp_free_return;
3196 if (token->type == OP_CHARSET_RANGE)
3198 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3199 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3200 if (BE (token2.type == END_OF_RE, 0))
3203 goto parse_bracket_exp_free_return;
3205 if (token2.type == OP_CLOSE_BRACKET)
3207 /* We treat the last '-' as a normal character. */
3208 re_string_skip_bytes (regexp, -token_len);
3209 token->type = CHARACTER;
3216 if (is_range_exp == 1)
3218 end_elem.opr.name = end_name_buf;
3219 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3221 if (BE (ret != REG_NOERROR, 0))
3224 goto parse_bracket_exp_free_return;
3227 token_len = peek_token_bracket (token, regexp, syntax);
3230 *err = build_range_exp (syntax, sbcset, mbcset, &range_alloc,
3231 &start_elem, &end_elem);
3233 # ifdef RE_ENABLE_I18N
3234 *err = build_range_exp (syntax, sbcset,
3235 dfa->mb_cur_max > 1 ? mbcset : NULL,
3236 &range_alloc, &start_elem, &end_elem);
3238 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3240 #endif /* RE_ENABLE_I18N */
3241 if (BE (*err != REG_NOERROR, 0))
3242 goto parse_bracket_exp_free_return;
3246 switch (start_elem.type)
3249 bitset_set (sbcset, start_elem.opr.ch);
3251 #ifdef RE_ENABLE_I18N
3253 /* Check whether the array has enough space. */
3254 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3256 wchar_t *new_mbchars;
3257 /* Not enough, realloc it. */
3258 /* +1 in case of mbcset->nmbchars is 0. */
3259 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3260 /* Use realloc since array is NULL if *alloc == 0. */
3261 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3263 if (BE (new_mbchars == NULL, 0))
3264 goto parse_bracket_exp_espace;
3265 mbcset->mbchars = new_mbchars;
3267 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3269 #endif /* RE_ENABLE_I18N */
3271 *err = build_equiv_class (sbcset,
3272 #ifdef RE_ENABLE_I18N
3273 mbcset, &equiv_class_alloc,
3274 #endif /* RE_ENABLE_I18N */
3275 start_elem.opr.name);
3276 if (BE (*err != REG_NOERROR, 0))
3277 goto parse_bracket_exp_free_return;
3280 *err = build_collating_symbol (sbcset,
3281 #ifdef RE_ENABLE_I18N
3282 mbcset, &coll_sym_alloc,
3283 #endif /* RE_ENABLE_I18N */
3284 start_elem.opr.name);
3285 if (BE (*err != REG_NOERROR, 0))
3286 goto parse_bracket_exp_free_return;
3289 *err = build_charclass (regexp->trans, sbcset,
3290 #ifdef RE_ENABLE_I18N
3291 mbcset, &char_class_alloc,
3292 #endif /* RE_ENABLE_I18N */
3293 (const char *) start_elem.opr.name, syntax);
3294 if (BE (*err != REG_NOERROR, 0))
3295 goto parse_bracket_exp_free_return;
3302 if (BE (token->type == END_OF_RE, 0))
3305 goto parse_bracket_exp_free_return;
3307 if (token->type == OP_CLOSE_BRACKET)
3311 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3313 /* If it is non-matching list. */
3315 bitset_not (sbcset);
3317 #ifdef RE_ENABLE_I18N
3318 /* Ensure only single byte characters are set. */
3319 if (dfa->mb_cur_max > 1)
3320 bitset_mask (sbcset, dfa->sb_char);
3322 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3323 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3324 || mbcset->non_match)))
3326 bin_tree_t *mbc_tree;
3328 /* Build a tree for complex bracket. */
3329 dfa->has_mb_node = 1;
3330 br_token.type = COMPLEX_BRACKET;
3331 br_token.opr.mbcset = mbcset;
3332 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3333 if (BE (mbc_tree == NULL, 0))
3334 goto parse_bracket_exp_espace;
3335 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3336 if (sbcset[sbc_idx])
3338 /* If there are no bits set in sbcset, there is no point
3339 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3340 if (sbc_idx < BITSET_WORDS)
3342 /* Build a tree for simple bracket. */
3343 br_token.type = SIMPLE_BRACKET;
3344 br_token.opr.sbcset = sbcset;
3345 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3346 if (BE (work_tree == NULL, 0))
3347 goto parse_bracket_exp_espace;
3349 /* Then join them by ALT node. */
3350 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3351 if (BE (work_tree == NULL, 0))
3352 goto parse_bracket_exp_espace;
3357 work_tree = mbc_tree;
3361 #endif /* not RE_ENABLE_I18N */
3363 #ifdef RE_ENABLE_I18N
3364 free_charset (mbcset);
3366 /* Build a tree for simple bracket. */
3367 br_token.type = SIMPLE_BRACKET;
3368 br_token.opr.sbcset = sbcset;
3369 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3370 if (BE (work_tree == NULL, 0))
3371 goto parse_bracket_exp_espace;
3375 parse_bracket_exp_espace:
3377 parse_bracket_exp_free_return:
3379 #ifdef RE_ENABLE_I18N
3380 free_charset (mbcset);
3381 #endif /* RE_ENABLE_I18N */
3385 /* Parse an element in the bracket expression. */
3387 static reg_errcode_t
3388 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3389 re_token_t *token, int token_len, re_dfa_t *dfa,
3390 reg_syntax_t syntax, int accept_hyphen)
3392 #ifdef RE_ENABLE_I18N
3394 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3395 if (cur_char_size > 1)
3397 elem->type = MB_CHAR;
3398 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3399 re_string_skip_bytes (regexp, cur_char_size);
3402 #endif /* RE_ENABLE_I18N */
3403 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3404 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3405 || token->type == OP_OPEN_EQUIV_CLASS)
3406 return parse_bracket_symbol (elem, regexp, token);
3407 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3409 /* A '-' must only appear as anything but a range indicator before
3410 the closing bracket. Everything else is an error. */
3412 (void) peek_token_bracket (&token2, regexp, syntax);
3413 if (token2.type != OP_CLOSE_BRACKET)
3414 /* The actual error value is not standardized since this whole
3415 case is undefined. But ERANGE makes good sense. */
3418 elem->type = SB_CHAR;
3419 elem->opr.ch = token->opr.c;
3423 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3424 such as [:<character_class>:], [.<collating_element>.], and
3425 [=<equivalent_class>=]. */
3427 static reg_errcode_t
3428 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3431 unsigned char ch, delim = token->opr.c;
3433 if (re_string_eoi(regexp))
3437 if (i >= BRACKET_NAME_BUF_SIZE)
3439 if (token->type == OP_OPEN_CHAR_CLASS)
3440 ch = re_string_fetch_byte_case (regexp);
3442 ch = re_string_fetch_byte (regexp);
3443 if (re_string_eoi(regexp))
3445 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3447 elem->opr.name[i] = ch;
3449 re_string_skip_bytes (regexp, 1);
3450 elem->opr.name[i] = '\0';
3451 switch (token->type)
3453 case OP_OPEN_COLL_ELEM:
3454 elem->type = COLL_SYM;
3456 case OP_OPEN_EQUIV_CLASS:
3457 elem->type = EQUIV_CLASS;
3459 case OP_OPEN_CHAR_CLASS:
3460 elem->type = CHAR_CLASS;
3468 /* Helper function for parse_bracket_exp.
3469 Build the equivalence class which is represented by NAME.
3470 The result are written to MBCSET and SBCSET.
3471 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3472 is a pointer argument sinse we may update it. */
3474 static reg_errcode_t
3475 #ifdef RE_ENABLE_I18N
3476 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3477 int *equiv_class_alloc, const unsigned char *name)
3478 #else /* not RE_ENABLE_I18N */
3479 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3480 #endif /* not RE_ENABLE_I18N */
3483 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3486 const int32_t *table, *indirect;
3487 const unsigned char *weights, *extra, *cp;
3488 unsigned char char_buf[2];
3492 /* This #include defines a local function! */
3493 # include <locale/weight.h>
3494 /* Calculate the index for equivalence class. */
3496 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3497 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3498 _NL_COLLATE_WEIGHTMB);
3499 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3500 _NL_COLLATE_EXTRAMB);
3501 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3502 _NL_COLLATE_INDIRECTMB);
3503 idx1 = findidx (&cp, -1);
3504 if (BE (idx1 == 0 || *cp != '\0', 0))
3505 /* This isn't a valid character. */
3506 return REG_ECOLLATE;
3508 /* Build single byte matcing table for this equivalence class. */
3509 char_buf[1] = (unsigned char) '\0';
3510 len = weights[idx1 & 0xffffff];
3511 for (ch = 0; ch < SBC_MAX; ++ch)
3515 idx2 = findidx (&cp, 1);
3520 /* This isn't a valid character. */
3522 /* Compare only if the length matches and the collation rule
3523 index is the same. */
3524 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3528 while (cnt <= len &&
3529 weights[(idx1 & 0xffffff) + 1 + cnt]
3530 == weights[(idx2 & 0xffffff) + 1 + cnt])
3534 bitset_set (sbcset, ch);
3537 /* Check whether the array has enough space. */
3538 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3540 /* Not enough, realloc it. */
3541 /* +1 in case of mbcset->nequiv_classes is 0. */
3542 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3543 /* Use realloc since the array is NULL if *alloc == 0. */
3544 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3546 new_equiv_class_alloc);
3547 if (BE (new_equiv_classes == NULL, 0))
3549 mbcset->equiv_classes = new_equiv_classes;
3550 *equiv_class_alloc = new_equiv_class_alloc;
3552 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3557 if (BE (strlen ((const char *) name) != 1, 0))
3558 return REG_ECOLLATE;
3559 bitset_set (sbcset, *name);
3564 /* Helper function for parse_bracket_exp.
3565 Build the character class which is represented by NAME.
3566 The result are written to MBCSET and SBCSET.
3567 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3568 is a pointer argument sinse we may update it. */
3570 static reg_errcode_t
3571 #ifdef RE_ENABLE_I18N
3572 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3573 re_charset_t *mbcset, int *char_class_alloc,
3574 const char *class_name, reg_syntax_t syntax)
3575 #else /* not RE_ENABLE_I18N */
3576 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3577 const char *class_name, reg_syntax_t syntax)
3578 #endif /* not RE_ENABLE_I18N */
3582 /* In case of REG_ICASE "upper" and "lower" match the both of
3583 upper and lower cases. */
3584 if ((syntax & RE_ICASE)
3585 && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3586 class_name = "alpha";
3588 #ifdef RE_ENABLE_I18N
3589 /* Check the space of the arrays. */
3590 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3592 /* Not enough, realloc it. */
3593 /* +1 in case of mbcset->nchar_classes is 0. */
3594 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3595 /* Use realloc since array is NULL if *alloc == 0. */
3596 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3597 new_char_class_alloc);
3598 if (BE (new_char_classes == NULL, 0))
3600 mbcset->char_classes = new_char_classes;
3601 *char_class_alloc = new_char_class_alloc;
3603 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3604 #endif /* RE_ENABLE_I18N */
3606 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3608 if (BE (trans != NULL, 0)) \
3610 for (i = 0; i < SBC_MAX; ++i) \
3611 if (ctype_func (i)) \
3612 bitset_set (sbcset, trans[i]); \
3616 for (i = 0; i < SBC_MAX; ++i) \
3617 if (ctype_func (i)) \
3618 bitset_set (sbcset, i); \
3622 if (strcmp (class_name, "alnum") == 0)
3623 BUILD_CHARCLASS_LOOP (isalnum);
3624 else if (strcmp (class_name, "cntrl") == 0)
3625 BUILD_CHARCLASS_LOOP (iscntrl);
3626 else if (strcmp (class_name, "lower") == 0)
3627 BUILD_CHARCLASS_LOOP (islower);
3628 else if (strcmp (class_name, "space") == 0)
3629 BUILD_CHARCLASS_LOOP (isspace);
3630 else if (strcmp (class_name, "alpha") == 0)
3631 BUILD_CHARCLASS_LOOP (isalpha);
3632 else if (strcmp (class_name, "digit") == 0)
3633 BUILD_CHARCLASS_LOOP (isdigit);
3634 else if (strcmp (class_name, "print") == 0)
3635 BUILD_CHARCLASS_LOOP (isprint);
3636 else if (strcmp (class_name, "upper") == 0)
3637 BUILD_CHARCLASS_LOOP (isupper);
3638 else if (strcmp (class_name, "blank") == 0)
3640 BUILD_CHARCLASS_LOOP (isblank);
3642 /* see comments above */
3643 BUILD_CHARCLASS_LOOP (is_blank);
3645 else if (strcmp (class_name, "graph") == 0)
3646 BUILD_CHARCLASS_LOOP (isgraph);
3647 else if (strcmp (class_name, "punct") == 0)
3648 BUILD_CHARCLASS_LOOP (ispunct);
3649 else if (strcmp (class_name, "xdigit") == 0)
3650 BUILD_CHARCLASS_LOOP (isxdigit);
3658 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3659 const char *class_name,
3660 const char *extra, int non_match,
3663 re_bitset_ptr_t sbcset;
3664 #ifdef RE_ENABLE_I18N
3665 re_charset_t *mbcset;
3667 #endif /* not RE_ENABLE_I18N */
3669 re_token_t br_token;
3672 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3673 #ifdef RE_ENABLE_I18N
3674 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3675 #endif /* RE_ENABLE_I18N */
3677 #ifdef RE_ENABLE_I18N
3678 if (BE (sbcset == NULL || mbcset == NULL, 0))
3679 #else /* not RE_ENABLE_I18N */
3680 if (BE (sbcset == NULL, 0))
3681 #endif /* not RE_ENABLE_I18N */
3683 /* if one is not NULL, free it to avoid leaks */
3686 #ifdef RE_ENABLE_I18N
3696 #ifdef RE_ENABLE_I18N
3697 mbcset->non_match = 1;
3698 #endif /* not RE_ENABLE_I18N */
3701 /* We don't care the syntax in this case. */
3702 ret = build_charclass (trans, sbcset,
3703 #ifdef RE_ENABLE_I18N
3705 #endif /* RE_ENABLE_I18N */
3708 if (BE (ret != REG_NOERROR, 0))
3711 #ifdef RE_ENABLE_I18N
3712 free_charset (mbcset);
3713 #endif /* RE_ENABLE_I18N */
3717 /* \w match '_' also. */
3718 for (; *extra; extra++)
3719 bitset_set (sbcset, *extra);
3721 /* If it is non-matching list. */
3723 bitset_not (sbcset);
3725 #ifdef RE_ENABLE_I18N
3726 /* Ensure only single byte characters are set. */
3727 if (dfa->mb_cur_max > 1)
3728 bitset_mask (sbcset, dfa->sb_char);
3731 /* Build a tree for simple bracket. */
3732 memset(& br_token, 0, sizeof(br_token)); /* silence "not initialized" errors froms static checkers */
3733 br_token.type = SIMPLE_BRACKET;
3734 br_token.opr.sbcset = sbcset;
3735 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3736 if (BE (tree == NULL, 0))
3737 goto build_word_op_espace;
3739 #ifdef RE_ENABLE_I18N
3740 if (dfa->mb_cur_max > 1)
3742 bin_tree_t *mbc_tree;
3743 /* Build a tree for complex bracket. */
3744 br_token.type = COMPLEX_BRACKET;
3745 br_token.opr.mbcset = mbcset;
3746 dfa->has_mb_node = 1;
3747 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3748 if (BE (mbc_tree == NULL, 0))
3749 goto build_word_op_espace;
3750 /* Then join them by ALT node. */
3751 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3752 if (BE (mbc_tree != NULL, 1))
3757 free_charset (mbcset);
3760 #else /* not RE_ENABLE_I18N */
3762 #endif /* not RE_ENABLE_I18N */
3764 build_word_op_espace:
3766 #ifdef RE_ENABLE_I18N
3767 free_charset (mbcset);
3768 #endif /* RE_ENABLE_I18N */
3773 /* This is intended for the expressions like "a{1,3}".
3774 Fetch a number from `input', and return the number.
3775 Return -1, if the number field is empty like "{,1}".
3776 Return -2, If an error is occured. */
3779 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3785 fetch_token (token, input, syntax);
3787 if (BE (token->type == END_OF_RE, 0))
3789 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3791 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3792 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3793 num = (num > RE_DUP_MAX) ? -2 : num;
3798 #ifdef RE_ENABLE_I18N
3800 free_charset (re_charset_t *cset)
3802 re_free (cset->mbchars);
3804 re_free (cset->coll_syms);
3805 re_free (cset->equiv_classes);
3806 re_free (cset->range_starts);
3807 re_free (cset->range_ends);
3809 re_free (cset->char_classes);
3812 #endif /* RE_ENABLE_I18N */
3814 /* Functions for binary tree operation. */
3816 /* Create a tree node. */
3819 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3820 re_token_type_t type)
3823 memset(& t, 0, sizeof(t)); /* silence "not initialized" errors froms static checkers */
3825 return create_token_tree (dfa, left, right, &t);
3829 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3830 const re_token_t *token)
3833 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3835 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3837 if (storage == NULL)
3839 storage->next = dfa->str_tree_storage;
3840 dfa->str_tree_storage = storage;
3841 dfa->str_tree_storage_idx = 0;
3843 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3845 tree->parent = NULL;
3847 tree->right = right;
3848 tree->token = *token;
3849 tree->token.duplicated = 0;
3850 tree->token.opt_subexp = 0;
3853 tree->node_idx = -1;
3856 left->parent = tree;
3858 right->parent = tree;
3862 /* Mark the tree SRC as an optional subexpression.
3863 To be called from preorder or postorder. */
3865 static reg_errcode_t
3866 mark_opt_subexp (void *extra, bin_tree_t *node)
3868 int idx = (int) (long) extra;
3869 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3870 node->token.opt_subexp = 1;
3875 /* Free the allocated memory inside NODE. */
3878 free_token (re_token_t *node)
3880 #ifdef RE_ENABLE_I18N
3881 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3882 free_charset (node->opr.mbcset);
3884 #endif /* RE_ENABLE_I18N */
3885 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3886 re_free (node->opr.sbcset);
3889 /* Worker function for tree walking. Free the allocated memory inside NODE
3890 and its children. */
3892 static reg_errcode_t
3893 free_tree (void *extra, bin_tree_t *node)
3895 free_token (&node->token);
3900 /* Duplicate the node SRC, and return new node. This is a preorder
3901 visit similar to the one implemented by the generic visitor, but
3902 we need more infrastructure to maintain two parallel trees --- so,
3903 it's easier to duplicate. */
3906 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3908 const bin_tree_t *node;
3909 bin_tree_t *dup_root;
3910 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3912 for (node = root; ; )
3914 /* Create a new tree and link it back to the current parent. */
3915 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3918 (*p_new)->parent = dup_node;
3919 (*p_new)->token.duplicated = 1;
3922 /* Go to the left node, or up and to the right. */
3926 p_new = &dup_node->left;
3930 const bin_tree_t *prev = NULL;
3931 while (node->right == prev || node->right == NULL)
3934 node = node->parent;
3935 dup_node = dup_node->parent;
3940 p_new = &dup_node->right;