1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public
8 License as published by the Free Software Foundation; either
9 version 3 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
21 # include <locale/weight.h>
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
31 static void free_charset (re_charset_t *cset);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t *preg);
34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
36 static void optimize_utf8 (re_dfa_t *dfa);
38 static reg_errcode_t analyze (regex_t *preg);
39 static reg_errcode_t preorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
42 static reg_errcode_t postorder (bin_tree_t *root,
43 reg_errcode_t (fn (void *, bin_tree_t *)),
45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 unsigned int constraint);
55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59 static Idx fetch_number (re_string_t *input, re_token_t *token,
61 static int peek_token (re_token_t *token, re_string_t *input,
63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 reg_syntax_t syntax, reg_errcode_t *err);
65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 re_token_t *token, reg_syntax_t syntax,
76 Idx nest, reg_errcode_t *err);
77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 re_dfa_t *dfa, re_token_t *token,
79 reg_syntax_t syntax, reg_errcode_t *err);
80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 re_token_t *token, reg_syntax_t syntax,
83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
85 re_token_t *token, int token_len,
89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
95 Idx *equiv_class_alloc,
96 const unsigned char *name);
97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
100 Idx *char_class_alloc,
101 const char *class_name,
102 reg_syntax_t syntax);
103 #else /* not RE_ENABLE_I18N */
104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 const unsigned char *name);
106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
108 const char *class_name,
109 reg_syntax_t syntax);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 RE_TRANSLATE_TYPE trans,
113 const char *class_name,
115 bool non_match, reg_errcode_t *err);
116 static bin_tree_t *create_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 re_token_type_t type);
119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 bin_tree_t *left, bin_tree_t *right,
121 const re_token_t *token);
122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123 static void free_token (re_token_t *node);
124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
127 /* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
132 static const char __re_error_msgid[] =
134 #define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
186 static const size_t __re_error_msgid_idx[] =
207 /* Entry points for GNU code. */
209 /* re_compile_pattern is the GNU regular expression compiler: it
210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211 Returns 0 if the pattern was valid, otherwise an error string.
213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214 are set in BUFP on entry. */
217 re_compile_pattern (const char *pattern, size_t length,
218 struct re_pattern_buffer *bufp)
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
227 /* Match anchors at newline. */
228 bufp->newline_anchor = 1;
230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
236 weak_alias (__re_compile_pattern, re_compile_pattern)
238 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
239 also be assigned to arbitrarily: each pattern buffer stores its own
240 syntax, so it can be changed between regex compilations. */
241 /* This has no initializer because initialized variables in Emacs
242 become read-only after dumping. */
243 reg_syntax_t re_syntax_options;
246 /* Specify the precise syntax of regexps for compilation. This provides
247 for compatibility for various utilities which historically have
248 different, incompatible syntaxes.
250 The argument SYNTAX is a bit mask comprised of the various bits
251 defined in regex.h. We return the old syntax. */
254 re_set_syntax (reg_syntax_t syntax)
256 reg_syntax_t ret = re_syntax_options;
258 re_syntax_options = syntax;
261 weak_alias (__re_set_syntax, re_set_syntax)
264 re_compile_fastmap (struct re_pattern_buffer *bufp)
266 re_dfa_t *dfa = bufp->buffer;
267 char *fastmap = bufp->fastmap;
269 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271 if (dfa->init_state != dfa->init_state_word)
272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273 if (dfa->init_state != dfa->init_state_nl)
274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275 if (dfa->init_state != dfa->init_state_begbuf)
276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277 bufp->fastmap_accurate = 1;
280 weak_alias (__re_compile_fastmap, re_compile_fastmap)
283 __attribute__ ((always_inline))
284 re_set_fastmap (char *fastmap, bool icase, int ch)
288 fastmap[tolower (ch)] = 1;
291 /* Helper function for re_compile_fastmap.
292 Compile fastmap for the initial_state INIT_STATE. */
295 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
298 re_dfa_t *dfa = bufp->buffer;
300 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
301 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
303 Idx node = init_state->nodes.elems[node_cnt];
304 re_token_type_t type = dfa->nodes[node].type;
306 if (type == CHARACTER)
308 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
309 #ifdef RE_ENABLE_I18N
310 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
312 unsigned char buf[MB_LEN_MAX];
318 *p++ = dfa->nodes[node].opr.c;
319 while (++node < dfa->nodes_len
320 && dfa->nodes[node].type == CHARACTER
321 && dfa->nodes[node].mb_partial)
322 *p++ = dfa->nodes[node].opr.c;
323 memset (&state, '\0', sizeof (state));
324 if (__mbrtowc (&wc, (const char *) buf, p - buf,
326 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
328 re_set_fastmap (fastmap, false, buf[0]);
332 else if (type == SIMPLE_BRACKET)
335 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
338 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
339 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
340 if (w & ((bitset_word_t) 1 << j))
341 re_set_fastmap (fastmap, icase, ch);
344 #ifdef RE_ENABLE_I18N
345 else if (type == COMPLEX_BRACKET)
347 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
351 /* See if we have to try all bytes which start multiple collation
353 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
354 collation element, and don't catch 'b' since 'b' is
355 the only collation element which starts from 'b' (and
356 it is caught by SIMPLE_BRACKET). */
357 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
358 && (cset->ncoll_syms || cset->nranges))
360 const int32_t *table = (const int32_t *)
361 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
362 for (i = 0; i < SBC_MAX; ++i)
364 re_set_fastmap (fastmap, icase, i);
368 /* See if we have to start the match at all multibyte characters,
369 i.e. where we would not find an invalid sequence. This only
370 applies to multibyte character sets; for single byte character
371 sets, the SIMPLE_BRACKET again suffices. */
372 if (dfa->mb_cur_max > 1
373 && (cset->nchar_classes || cset->non_match || cset->nranges
375 || cset->nequiv_classes
383 memset (&mbs, 0, sizeof (mbs));
384 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
385 re_set_fastmap (fastmap, false, (int) c);
392 /* ... Else catch all bytes which can start the mbchars. */
393 for (i = 0; i < cset->nmbchars; ++i)
397 memset (&state, '\0', sizeof (state));
398 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
399 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
400 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
402 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
404 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
409 #endif /* RE_ENABLE_I18N */
410 else if (type == OP_PERIOD
411 #ifdef RE_ENABLE_I18N
412 || type == OP_UTF8_PERIOD
413 #endif /* RE_ENABLE_I18N */
414 || type == END_OF_RE)
416 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
417 if (type == END_OF_RE)
418 bufp->can_be_null = 1;
424 /* Entry point for POSIX code. */
425 /* regcomp takes a regular expression as a string and compiles it.
427 PREG is a regex_t *. We do not expect any fields to be initialized,
428 since POSIX says we shouldn't. Thus, we set
430 'buffer' to the compiled pattern;
431 'used' to the length of the compiled pattern;
432 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
433 REG_EXTENDED bit in CFLAGS is set; otherwise, to
434 RE_SYNTAX_POSIX_BASIC;
435 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
436 'fastmap' to an allocated space for the fastmap;
437 'fastmap_accurate' to zero;
438 're_nsub' to the number of subexpressions in PATTERN.
440 PATTERN is the address of the pattern string.
442 CFLAGS is a series of bits which affect compilation.
444 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
445 use POSIX basic syntax.
447 If REG_NEWLINE is set, then . and [^...] don't match newline.
448 Also, regexec will try a match beginning after every newline.
450 If REG_ICASE is set, then we considers upper- and lowercase
451 versions of letters to be equivalent when matching.
453 If REG_NOSUB is set, then when PREG is passed to regexec, that
454 routine will report only success or failure, and nothing about the
457 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
458 the return codes and their meanings.) */
461 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
464 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
465 : RE_SYNTAX_POSIX_BASIC);
471 /* Try to allocate space for the fastmap. */
472 preg->fastmap = re_malloc (char, SBC_MAX);
473 if (__glibc_unlikely (preg->fastmap == NULL))
476 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
478 /* If REG_NEWLINE is set, newlines are treated differently. */
479 if (cflags & REG_NEWLINE)
480 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
481 syntax &= ~RE_DOT_NEWLINE;
482 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
483 /* It also changes the matching behavior. */
484 preg->newline_anchor = 1;
487 preg->newline_anchor = 0;
488 preg->no_sub = !!(cflags & REG_NOSUB);
489 preg->translate = NULL;
491 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
493 /* POSIX doesn't distinguish between an unmatched open-group and an
494 unmatched close-group: both are REG_EPAREN. */
495 if (ret == REG_ERPAREN)
498 /* We have already checked preg->fastmap != NULL. */
499 if (__glibc_likely (ret == REG_NOERROR))
500 /* Compute the fastmap now, since regexec cannot modify the pattern
501 buffer. This function never fails in this implementation. */
502 (void) re_compile_fastmap (preg);
505 /* Some error occurred while compiling the expression. */
506 re_free (preg->fastmap);
507 preg->fastmap = NULL;
512 libc_hidden_def (__regcomp)
513 weak_alias (__regcomp, regcomp)
515 /* Returns a message corresponding to an error code, ERRCODE, returned
516 from either regcomp or regexec. We don't use PREG here. */
519 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
524 int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
526 if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
527 /* Only error codes returned by the rest of the code should be passed
528 to this routine. If we are given anything else, or if other regex
529 code generates an invalid error code, then the program has a bug.
530 Dump core so we can fix it. */
533 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
535 msg_size = strlen (msg) + 1; /* Includes the null. */
537 if (__glibc_likely (errbuf_size != 0))
539 size_t cpy_size = msg_size;
540 if (__glibc_unlikely (msg_size > errbuf_size))
542 cpy_size = errbuf_size - 1;
543 errbuf[cpy_size] = '\0';
545 memcpy (errbuf, msg, cpy_size);
550 weak_alias (__regerror, regerror)
553 #ifdef RE_ENABLE_I18N
554 /* This static array is used for the map to single-byte characters when
555 UTF-8 is used. Otherwise we would allocate memory just to initialize
556 it the same all the time. UTF-8 is the preferred encoding so this is
557 a worthwhile optimization. */
558 static const bitset_t utf8_sb_map =
560 /* Set the first 128 bits. */
561 # if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
562 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
564 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
565 # error "bitset_word_t is narrower than 32 bits"
566 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
567 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
568 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
569 BITSET_WORD_MAX, BITSET_WORD_MAX,
570 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
574 >> (SBC_MAX % BITSET_WORD_BITS == 0
576 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
583 free_dfa_content (re_dfa_t *dfa)
588 for (i = 0; i < dfa->nodes_len; ++i)
589 free_token (dfa->nodes + i);
590 re_free (dfa->nexts);
591 for (i = 0; i < dfa->nodes_len; ++i)
593 if (dfa->eclosures != NULL)
594 re_node_set_free (dfa->eclosures + i);
595 if (dfa->inveclosures != NULL)
596 re_node_set_free (dfa->inveclosures + i);
597 if (dfa->edests != NULL)
598 re_node_set_free (dfa->edests + i);
600 re_free (dfa->edests);
601 re_free (dfa->eclosures);
602 re_free (dfa->inveclosures);
603 re_free (dfa->nodes);
605 if (dfa->state_table)
606 for (i = 0; i <= dfa->state_hash_mask; ++i)
608 struct re_state_table_entry *entry = dfa->state_table + i;
609 for (j = 0; j < entry->num; ++j)
611 re_dfastate_t *state = entry->array[j];
614 re_free (entry->array);
616 re_free (dfa->state_table);
617 #ifdef RE_ENABLE_I18N
618 if (dfa->sb_char != utf8_sb_map)
619 re_free (dfa->sb_char);
621 re_free (dfa->subexp_map);
623 re_free (dfa->re_str);
630 /* Free dynamically allocated space used by PREG. */
633 regfree (regex_t *preg)
635 re_dfa_t *dfa = preg->buffer;
636 if (__glibc_likely (dfa != NULL))
638 lock_fini (dfa->lock);
639 free_dfa_content (dfa);
644 re_free (preg->fastmap);
645 preg->fastmap = NULL;
647 re_free (preg->translate);
648 preg->translate = NULL;
650 libc_hidden_def (__regfree)
651 weak_alias (__regfree, regfree)
653 /* Entry points compatible with 4.2 BSD regex library. We don't define
654 them unless specifically requested. */
656 #if defined _REGEX_RE_COMP || defined _LIBC
658 /* BSD has one and only one pattern buffer. */
659 static struct re_pattern_buffer re_comp_buf;
663 /* Make these definitions weak in libc, so POSIX programs can redefine
664 these names if they don't use our functions, and still use
665 regcomp/regexec above without link errors. */
668 re_comp (const char *s)
675 if (!re_comp_buf.buffer)
676 return gettext ("No previous regular expression");
680 if (re_comp_buf.buffer)
682 fastmap = re_comp_buf.fastmap;
683 re_comp_buf.fastmap = NULL;
684 __regfree (&re_comp_buf);
685 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
686 re_comp_buf.fastmap = fastmap;
689 if (re_comp_buf.fastmap == NULL)
691 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
692 if (re_comp_buf.fastmap == NULL)
693 return (char *) gettext (__re_error_msgid
694 + __re_error_msgid_idx[(int) REG_ESPACE]);
697 /* Since 're_exec' always passes NULL for the 'regs' argument, we
698 don't need to initialize the pattern buffer fields which affect it. */
700 /* Match anchors at newlines. */
701 re_comp_buf.newline_anchor = 1;
703 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
708 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
709 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
713 libc_freeres_fn (free_mem)
715 __regfree (&re_comp_buf);
719 #endif /* _REGEX_RE_COMP */
721 /* Internal entry point.
722 Compile the regular expression PATTERN, whose length is LENGTH.
723 SYNTAX indicate regular expression's syntax. */
726 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
729 reg_errcode_t err = REG_NOERROR;
733 /* Initialize the pattern buffer. */
734 preg->fastmap_accurate = 0;
735 preg->syntax = syntax;
736 preg->not_bol = preg->not_eol = 0;
739 preg->can_be_null = 0;
740 preg->regs_allocated = REGS_UNALLOCATED;
742 /* Initialize the dfa. */
744 if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
746 /* If zero allocated, but buffer is non-null, try to realloc
747 enough space. This loses if buffer's address is bogus, but
748 that is the user's responsibility. If ->buffer is NULL this
749 is a simple allocation. */
750 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
753 preg->allocated = sizeof (re_dfa_t);
756 preg->used = sizeof (re_dfa_t);
758 err = init_dfa (dfa, length);
759 if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
761 if (__glibc_unlikely (err != REG_NOERROR))
763 free_dfa_content (dfa);
769 /* Note: length+1 will not overflow since it is checked in init_dfa. */
770 dfa->re_str = re_malloc (char, length + 1);
771 strncpy (dfa->re_str, pattern, length + 1);
774 err = re_string_construct (®exp, pattern, length, preg->translate,
775 (syntax & RE_ICASE) != 0, dfa);
776 if (__glibc_unlikely (err != REG_NOERROR))
778 re_compile_internal_free_return:
779 free_workarea_compile (preg);
780 re_string_destruct (®exp);
781 lock_fini (dfa->lock);
782 free_dfa_content (dfa);
788 /* Parse the regular expression, and build a structure tree. */
790 dfa->str_tree = parse (®exp, preg, syntax, &err);
791 if (__glibc_unlikely (dfa->str_tree == NULL))
792 goto re_compile_internal_free_return;
794 /* Analyze the tree and create the nfa. */
795 err = analyze (preg);
796 if (__glibc_unlikely (err != REG_NOERROR))
797 goto re_compile_internal_free_return;
799 #ifdef RE_ENABLE_I18N
800 /* If possible, do searching in single byte encoding to speed things up. */
801 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
805 /* Then create the initial state of the dfa. */
806 err = create_initial_state (dfa);
808 /* Release work areas. */
809 free_workarea_compile (preg);
810 re_string_destruct (®exp);
812 if (__glibc_unlikely (err != REG_NOERROR))
814 lock_fini (dfa->lock);
815 free_dfa_content (dfa);
823 /* Initialize DFA. We use the length of the regular expression PAT_LEN
824 as the initial length of some arrays. */
827 init_dfa (re_dfa_t *dfa, size_t pat_len)
829 __re_size_t table_size;
831 const char *codeset_name;
833 #ifdef RE_ENABLE_I18N
834 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
836 size_t max_i18n_object_size = 0;
838 size_t max_object_size =
839 MAX (sizeof (struct re_state_table_entry),
840 MAX (sizeof (re_token_t),
841 MAX (sizeof (re_node_set),
842 MAX (sizeof (regmatch_t),
843 max_i18n_object_size))));
845 memset (dfa, '\0', sizeof (re_dfa_t));
847 /* Force allocation of str_tree_storage the first time. */
848 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
850 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
851 calculation below, and for similar doubling calculations
852 elsewhere. And it's <= rather than <, because some of the
853 doubling calculations add 1 afterwards. */
854 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
858 dfa->nodes_alloc = pat_len + 1;
859 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
861 /* table_size = 2 ^ ceil(log pat_len) */
862 for (table_size = 1; ; table_size <<= 1)
863 if (table_size > pat_len)
866 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
867 dfa->state_hash_mask = table_size - 1;
869 dfa->mb_cur_max = MB_CUR_MAX;
871 if (dfa->mb_cur_max == 6
872 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
874 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
877 codeset_name = nl_langinfo (CODESET);
878 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
879 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
880 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
881 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
884 /* We check exhaustively in the loop below if this charset is a
885 superset of ASCII. */
886 dfa->map_notascii = 0;
889 #ifdef RE_ENABLE_I18N
890 if (dfa->mb_cur_max > 1)
893 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
898 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
899 if (__glibc_unlikely (dfa->sb_char == NULL))
902 /* Set the bits corresponding to single byte chars. */
903 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
904 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
906 wint_t wch = __btowc (ch);
908 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
910 if (isascii (ch) && wch != ch)
911 dfa->map_notascii = 1;
918 if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
923 /* Initialize WORD_CHAR table, which indicate which character is
924 "word". In this case "word" means that it is the word construction
925 character used by some operators like "\<", "\>", etc. */
928 init_word_char (re_dfa_t *dfa)
933 dfa->word_ops_used = 1;
934 if (__glibc_likely (dfa->map_notascii == 0))
936 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
937 them, an issue when this code is used in Gnulib. */
938 bitset_word_t bits0 = 0x00000000;
939 bitset_word_t bits1 = 0x03ff0000;
940 bitset_word_t bits2 = 0x87fffffe;
941 bitset_word_t bits3 = 0x07fffffe;
942 if (BITSET_WORD_BITS == 64)
944 /* Pacify gcc -Woverflow on 32-bit platformns. */
945 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
946 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
949 else if (BITSET_WORD_BITS == 32)
951 dfa->word_char[0] = bits0;
952 dfa->word_char[1] = bits1;
953 dfa->word_char[2] = bits2;
954 dfa->word_char[3] = bits3;
961 if (__glibc_likely (dfa->is_utf8))
963 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
969 for (; i < BITSET_WORDS; ++i)
970 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
971 if (isalnum (ch) || ch == '_')
972 dfa->word_char[i] |= (bitset_word_t) 1 << j;
975 /* Free the work area which are only used while compiling. */
978 free_workarea_compile (regex_t *preg)
980 re_dfa_t *dfa = preg->buffer;
981 bin_tree_storage_t *storage, *next;
982 for (storage = dfa->str_tree_storage; storage; storage = next)
984 next = storage->next;
987 dfa->str_tree_storage = NULL;
988 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
989 dfa->str_tree = NULL;
990 re_free (dfa->org_indices);
991 dfa->org_indices = NULL;
994 /* Create initial states for all contexts. */
997 create_initial_state (re_dfa_t *dfa)
1001 re_node_set init_nodes;
1003 /* Initial states have the epsilon closure of the node which is
1004 the first node of the regular expression. */
1005 first = dfa->str_tree->first->node_idx;
1006 dfa->init_node = first;
1007 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1008 if (__glibc_unlikely (err != REG_NOERROR))
1011 /* The back-references which are in initial states can epsilon transit,
1012 since in this case all of the subexpressions can be null.
1013 Then we add epsilon closures of the nodes which are the next nodes of
1014 the back-references. */
1015 if (dfa->nbackref > 0)
1016 for (i = 0; i < init_nodes.nelem; ++i)
1018 Idx node_idx = init_nodes.elems[i];
1019 re_token_type_t type = dfa->nodes[node_idx].type;
1022 if (type != OP_BACK_REF)
1024 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1026 re_token_t *clexp_node;
1027 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1028 if (clexp_node->type == OP_CLOSE_SUBEXP
1029 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1032 if (clexp_idx == init_nodes.nelem)
1035 if (type == OP_BACK_REF)
1037 Idx dest_idx = dfa->edests[node_idx].elems[0];
1038 if (!re_node_set_contains (&init_nodes, dest_idx))
1040 reg_errcode_t merge_err
1041 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1042 if (merge_err != REG_NOERROR)
1049 /* It must be the first time to invoke acquire_state. */
1050 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1051 /* We don't check ERR here, since the initial state must not be NULL. */
1052 if (__glibc_unlikely (dfa->init_state == NULL))
1054 if (dfa->init_state->has_constraint)
1056 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1058 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1060 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1064 if (__glibc_unlikely (dfa->init_state_word == NULL
1065 || dfa->init_state_nl == NULL
1066 || dfa->init_state_begbuf == NULL))
1070 dfa->init_state_word = dfa->init_state_nl
1071 = dfa->init_state_begbuf = dfa->init_state;
1073 re_node_set_free (&init_nodes);
1077 #ifdef RE_ENABLE_I18N
1078 /* If it is possible to do searching in single byte encoding instead of UTF-8
1079 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1080 DFA nodes where needed. */
1083 optimize_utf8 (re_dfa_t *dfa)
1087 bool mb_chars = false;
1088 bool has_period = false;
1090 for (node = 0; node < dfa->nodes_len; ++node)
1091 switch (dfa->nodes[node].type)
1094 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1098 switch (dfa->nodes[node].opr.ctx_type)
1106 /* Word anchors etc. cannot be handled. It's okay to test
1107 opr.ctx_type since constraints (for all DFA nodes) are
1108 created by ORing one or more opr.ctx_type values. */
1118 case OP_DUP_ASTERISK:
1119 case OP_OPEN_SUBEXP:
1120 case OP_CLOSE_SUBEXP:
1122 case COMPLEX_BRACKET:
1124 case SIMPLE_BRACKET:
1125 /* Just double check. */
1127 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1129 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1130 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1132 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1142 if (mb_chars || has_period)
1143 for (node = 0; node < dfa->nodes_len; ++node)
1145 if (dfa->nodes[node].type == CHARACTER
1146 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1147 dfa->nodes[node].mb_partial = 0;
1148 else if (dfa->nodes[node].type == OP_PERIOD)
1149 dfa->nodes[node].type = OP_UTF8_PERIOD;
1152 /* The search can be in single byte locale. */
1153 dfa->mb_cur_max = 1;
1155 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1159 /* Analyze the structure tree, and calculate "first", "next", "edest",
1160 "eclosure", and "inveclosure". */
1162 static reg_errcode_t
1163 analyze (regex_t *preg)
1165 re_dfa_t *dfa = preg->buffer;
1168 /* Allocate arrays. */
1169 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1170 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1171 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1172 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1173 if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1174 || dfa->edests == NULL || dfa->eclosures == NULL))
1177 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1178 if (dfa->subexp_map != NULL)
1181 for (i = 0; i < preg->re_nsub; i++)
1182 dfa->subexp_map[i] = i;
1183 preorder (dfa->str_tree, optimize_subexps, dfa);
1184 for (i = 0; i < preg->re_nsub; i++)
1185 if (dfa->subexp_map[i] != i)
1187 if (i == preg->re_nsub)
1189 re_free (dfa->subexp_map);
1190 dfa->subexp_map = NULL;
1194 ret = postorder (dfa->str_tree, lower_subexps, preg);
1195 if (__glibc_unlikely (ret != REG_NOERROR))
1197 ret = postorder (dfa->str_tree, calc_first, dfa);
1198 if (__glibc_unlikely (ret != REG_NOERROR))
1200 preorder (dfa->str_tree, calc_next, dfa);
1201 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1202 if (__glibc_unlikely (ret != REG_NOERROR))
1204 ret = calc_eclosure (dfa);
1205 if (__glibc_unlikely (ret != REG_NOERROR))
1208 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1209 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1210 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1213 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1214 if (__glibc_unlikely (dfa->inveclosures == NULL))
1216 ret = calc_inveclosure (dfa);
1222 /* Our parse trees are very unbalanced, so we cannot use a stack to
1223 implement parse tree visits. Instead, we use parent pointers and
1224 some hairy code in these two functions. */
1225 static reg_errcode_t
1226 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1229 bin_tree_t *node, *prev;
1231 for (node = root; ; )
1233 /* Descend down the tree, preferably to the left (or to the right
1234 if that's the only child). */
1235 while (node->left || node->right)
1243 reg_errcode_t err = fn (extra, node);
1244 if (__glibc_unlikely (err != REG_NOERROR))
1246 if (node->parent == NULL)
1249 node = node->parent;
1251 /* Go up while we have a node that is reached from the right. */
1252 while (node->right == prev || node->right == NULL);
1257 static reg_errcode_t
1258 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1263 for (node = root; ; )
1265 reg_errcode_t err = fn (extra, node);
1266 if (__glibc_unlikely (err != REG_NOERROR))
1269 /* Go to the left node, or up and to the right. */
1274 bin_tree_t *prev = NULL;
1275 while (node->right == prev || node->right == NULL)
1278 node = node->parent;
1287 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1288 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1289 backreferences as well. Requires a preorder visit. */
1290 static reg_errcode_t
1291 optimize_subexps (void *extra, bin_tree_t *node)
1293 re_dfa_t *dfa = (re_dfa_t *) extra;
1295 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1297 int idx = node->token.opr.idx;
1298 node->token.opr.idx = dfa->subexp_map[idx];
1299 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1302 else if (node->token.type == SUBEXP
1303 && node->left && node->left->token.type == SUBEXP)
1305 Idx other_idx = node->left->token.opr.idx;
1307 node->left = node->left->left;
1309 node->left->parent = node;
1311 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1312 if (other_idx < BITSET_WORD_BITS)
1313 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1319 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1320 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1321 static reg_errcode_t
1322 lower_subexps (void *extra, bin_tree_t *node)
1324 regex_t *preg = (regex_t *) extra;
1325 reg_errcode_t err = REG_NOERROR;
1327 if (node->left && node->left->token.type == SUBEXP)
1329 node->left = lower_subexp (&err, preg, node->left);
1331 node->left->parent = node;
1333 if (node->right && node->right->token.type == SUBEXP)
1335 node->right = lower_subexp (&err, preg, node->right);
1337 node->right->parent = node;
1344 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1346 re_dfa_t *dfa = preg->buffer;
1347 bin_tree_t *body = node->left;
1348 bin_tree_t *op, *cls, *tree1, *tree;
1351 /* We do not optimize empty subexpressions, because otherwise we may
1352 have bad CONCAT nodes with NULL children. This is obviously not
1353 very common, so we do not lose much. An example that triggers
1354 this case is the sed "script" /\(\)/x. */
1355 && node->left != NULL
1356 && (node->token.opr.idx >= BITSET_WORD_BITS
1357 || !(dfa->used_bkref_map
1358 & ((bitset_word_t) 1 << node->token.opr.idx))))
1361 /* Convert the SUBEXP node to the concatenation of an
1362 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1363 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1364 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1365 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1366 tree = create_tree (dfa, op, tree1, CONCAT);
1367 if (__glibc_unlikely (tree == NULL || tree1 == NULL
1368 || op == NULL || cls == NULL))
1374 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1375 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1379 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1380 nodes. Requires a postorder visit. */
1381 static reg_errcode_t
1382 calc_first (void *extra, bin_tree_t *node)
1384 re_dfa_t *dfa = (re_dfa_t *) extra;
1385 if (node->token.type == CONCAT)
1387 node->first = node->left->first;
1388 node->node_idx = node->left->node_idx;
1393 node->node_idx = re_dfa_add_node (dfa, node->token);
1394 if (__glibc_unlikely (node->node_idx == -1))
1396 if (node->token.type == ANCHOR)
1397 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1402 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1403 static reg_errcode_t
1404 calc_next (void *extra, bin_tree_t *node)
1406 switch (node->token.type)
1408 case OP_DUP_ASTERISK:
1409 node->left->next = node;
1412 node->left->next = node->right->first;
1413 node->right->next = node->next;
1417 node->left->next = node->next;
1419 node->right->next = node->next;
1425 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1426 static reg_errcode_t
1427 link_nfa_nodes (void *extra, bin_tree_t *node)
1429 re_dfa_t *dfa = (re_dfa_t *) extra;
1430 Idx idx = node->node_idx;
1431 reg_errcode_t err = REG_NOERROR;
1433 switch (node->token.type)
1439 DEBUG_ASSERT (node->next == NULL);
1442 case OP_DUP_ASTERISK:
1446 dfa->has_plural_match = 1;
1447 if (node->left != NULL)
1448 left = node->left->first->node_idx;
1450 left = node->next->node_idx;
1451 if (node->right != NULL)
1452 right = node->right->first->node_idx;
1454 right = node->next->node_idx;
1455 DEBUG_ASSERT (left > -1);
1456 DEBUG_ASSERT (right > -1);
1457 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1462 case OP_OPEN_SUBEXP:
1463 case OP_CLOSE_SUBEXP:
1464 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1468 dfa->nexts[idx] = node->next->node_idx;
1469 if (node->token.type == OP_BACK_REF)
1470 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1474 DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1475 dfa->nexts[idx] = node->next->node_idx;
1482 /* Duplicate the epsilon closure of the node ROOT_NODE.
1483 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1484 to their own constraint. */
1486 static reg_errcode_t
1487 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1488 Idx root_node, unsigned int init_constraint)
1490 Idx org_node, clone_node;
1492 unsigned int constraint = init_constraint;
1493 for (org_node = top_org_node, clone_node = top_clone_node;;)
1495 Idx org_dest, clone_dest;
1496 if (dfa->nodes[org_node].type == OP_BACK_REF)
1498 /* If the back reference epsilon-transit, its destination must
1499 also have the constraint. Then duplicate the epsilon closure
1500 of the destination of the back reference, and store it in
1501 edests of the back reference. */
1502 org_dest = dfa->nexts[org_node];
1503 re_node_set_empty (dfa->edests + clone_node);
1504 clone_dest = duplicate_node (dfa, org_dest, constraint);
1505 if (__glibc_unlikely (clone_dest == -1))
1507 dfa->nexts[clone_node] = dfa->nexts[org_node];
1508 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1509 if (__glibc_unlikely (! ok))
1512 else if (dfa->edests[org_node].nelem == 0)
1514 /* In case of the node can't epsilon-transit, don't duplicate the
1515 destination and store the original destination as the
1516 destination of the node. */
1517 dfa->nexts[clone_node] = dfa->nexts[org_node];
1520 else if (dfa->edests[org_node].nelem == 1)
1522 /* In case of the node can epsilon-transit, and it has only one
1524 org_dest = dfa->edests[org_node].elems[0];
1525 re_node_set_empty (dfa->edests + clone_node);
1526 /* If the node is root_node itself, it means the epsilon closure
1527 has a loop. Then tie it to the destination of the root_node. */
1528 if (org_node == root_node && clone_node != org_node)
1530 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1531 if (__glibc_unlikely (! ok))
1535 /* In case the node has another constraint, append it. */
1536 constraint |= dfa->nodes[org_node].constraint;
1537 clone_dest = duplicate_node (dfa, org_dest, constraint);
1538 if (__glibc_unlikely (clone_dest == -1))
1540 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1541 if (__glibc_unlikely (! ok))
1544 else /* dfa->edests[org_node].nelem == 2 */
1546 /* In case of the node can epsilon-transit, and it has two
1547 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1548 org_dest = dfa->edests[org_node].elems[0];
1549 re_node_set_empty (dfa->edests + clone_node);
1550 /* Search for a duplicated node which satisfies the constraint. */
1551 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1552 if (clone_dest == -1)
1554 /* There is no such duplicated node, create a new one. */
1556 clone_dest = duplicate_node (dfa, org_dest, constraint);
1557 if (__glibc_unlikely (clone_dest == -1))
1559 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560 if (__glibc_unlikely (! ok))
1562 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1563 root_node, constraint);
1564 if (__glibc_unlikely (err != REG_NOERROR))
1569 /* There is a duplicated node which satisfies the constraint,
1570 use it to avoid infinite loop. */
1571 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572 if (__glibc_unlikely (! ok))
1576 org_dest = dfa->edests[org_node].elems[1];
1577 clone_dest = duplicate_node (dfa, org_dest, constraint);
1578 if (__glibc_unlikely (clone_dest == -1))
1580 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1581 if (__glibc_unlikely (! ok))
1584 org_node = org_dest;
1585 clone_node = clone_dest;
1590 /* Search for a node which is duplicated from the node ORG_NODE, and
1591 satisfies the constraint CONSTRAINT. */
1594 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1595 unsigned int constraint)
1598 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1600 if (org_node == dfa->org_indices[idx]
1601 && constraint == dfa->nodes[idx].constraint)
1602 return idx; /* Found. */
1604 return -1; /* Not found. */
1607 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1608 Return the index of the new node, or -1 if insufficient storage is
1612 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1614 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1615 if (__glibc_likely (dup_idx != -1))
1617 dfa->nodes[dup_idx].constraint = constraint;
1618 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1619 dfa->nodes[dup_idx].duplicated = 1;
1621 /* Store the index of the original node. */
1622 dfa->org_indices[dup_idx] = org_idx;
1627 static reg_errcode_t
1628 calc_inveclosure (re_dfa_t *dfa)
1632 for (idx = 0; idx < dfa->nodes_len; ++idx)
1633 re_node_set_init_empty (dfa->inveclosures + idx);
1635 for (src = 0; src < dfa->nodes_len; ++src)
1637 Idx *elems = dfa->eclosures[src].elems;
1638 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1640 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1641 if (__glibc_unlikely (! ok))
1649 /* Calculate "eclosure" for all the node in DFA. */
1651 static reg_errcode_t
1652 calc_eclosure (re_dfa_t *dfa)
1656 DEBUG_ASSERT (dfa->nodes_len > 0);
1658 /* For each nodes, calculate epsilon closure. */
1659 for (node_idx = 0; ; ++node_idx)
1662 re_node_set eclosure_elem;
1663 if (node_idx == dfa->nodes_len)
1671 DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1673 /* If we have already calculated, skip it. */
1674 if (dfa->eclosures[node_idx].nelem != 0)
1676 /* Calculate epsilon closure of 'node_idx'. */
1677 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1678 if (__glibc_unlikely (err != REG_NOERROR))
1681 if (dfa->eclosures[node_idx].nelem == 0)
1684 re_node_set_free (&eclosure_elem);
1690 /* Calculate epsilon closure of NODE. */
1692 static reg_errcode_t
1693 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1697 re_node_set eclosure;
1698 bool incomplete = false;
1699 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1700 if (__glibc_unlikely (err != REG_NOERROR))
1703 /* An epsilon closure includes itself. */
1704 eclosure.elems[eclosure.nelem++] = node;
1706 /* This indicates that we are calculating this node now.
1707 We reference this value to avoid infinite loop. */
1708 dfa->eclosures[node].nelem = -1;
1710 /* If the current node has constraints, duplicate all nodes
1711 since they must inherit the constraints. */
1712 if (dfa->nodes[node].constraint
1713 && dfa->edests[node].nelem
1714 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1716 err = duplicate_node_closure (dfa, node, node, node,
1717 dfa->nodes[node].constraint);
1718 if (__glibc_unlikely (err != REG_NOERROR))
1722 /* Expand each epsilon destination nodes. */
1723 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1724 for (i = 0; i < dfa->edests[node].nelem; ++i)
1726 re_node_set eclosure_elem;
1727 Idx edest = dfa->edests[node].elems[i];
1728 /* If calculating the epsilon closure of 'edest' is in progress,
1729 return intermediate result. */
1730 if (dfa->eclosures[edest].nelem == -1)
1735 /* If we haven't calculated the epsilon closure of 'edest' yet,
1736 calculate now. Otherwise use calculated epsilon closure. */
1737 if (dfa->eclosures[edest].nelem == 0)
1739 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1740 if (__glibc_unlikely (err != REG_NOERROR))
1744 eclosure_elem = dfa->eclosures[edest];
1745 /* Merge the epsilon closure of 'edest'. */
1746 err = re_node_set_merge (&eclosure, &eclosure_elem);
1747 if (__glibc_unlikely (err != REG_NOERROR))
1749 /* If the epsilon closure of 'edest' is incomplete,
1750 the epsilon closure of this node is also incomplete. */
1751 if (dfa->eclosures[edest].nelem == 0)
1754 re_node_set_free (&eclosure_elem);
1758 if (incomplete && !root)
1759 dfa->eclosures[node].nelem = 0;
1761 dfa->eclosures[node] = eclosure;
1762 *new_set = eclosure;
1766 /* Functions for token which are used in the parser. */
1768 /* Fetch a token from INPUT.
1769 We must not use this function inside bracket expressions. */
1772 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1774 re_string_skip_bytes (input, peek_token (result, input, syntax));
1777 /* Peek a token from INPUT, and return the length of the token.
1778 We must not use this function inside bracket expressions. */
1781 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1785 if (re_string_eoi (input))
1787 token->type = END_OF_RE;
1791 c = re_string_peek_byte (input, 0);
1794 token->word_char = 0;
1795 #ifdef RE_ENABLE_I18N
1796 token->mb_partial = 0;
1797 if (input->mb_cur_max > 1
1798 && !re_string_first_byte (input, re_string_cur_idx (input)))
1800 token->type = CHARACTER;
1801 token->mb_partial = 1;
1808 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1810 token->type = BACK_SLASH;
1814 c2 = re_string_peek_byte_case (input, 1);
1816 token->type = CHARACTER;
1817 #ifdef RE_ENABLE_I18N
1818 if (input->mb_cur_max > 1)
1820 wint_t wc = re_string_wchar_at (input,
1821 re_string_cur_idx (input) + 1);
1822 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1826 token->word_char = IS_WORD_CHAR (c2) != 0;
1831 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1832 token->type = OP_ALT;
1834 case '1': case '2': case '3': case '4': case '5':
1835 case '6': case '7': case '8': case '9':
1836 if (!(syntax & RE_NO_BK_REFS))
1838 token->type = OP_BACK_REF;
1839 token->opr.idx = c2 - '1';
1843 if (!(syntax & RE_NO_GNU_OPS))
1845 token->type = ANCHOR;
1846 token->opr.ctx_type = WORD_FIRST;
1850 if (!(syntax & RE_NO_GNU_OPS))
1852 token->type = ANCHOR;
1853 token->opr.ctx_type = WORD_LAST;
1857 if (!(syntax & RE_NO_GNU_OPS))
1859 token->type = ANCHOR;
1860 token->opr.ctx_type = WORD_DELIM;
1864 if (!(syntax & RE_NO_GNU_OPS))
1866 token->type = ANCHOR;
1867 token->opr.ctx_type = NOT_WORD_DELIM;
1871 if (!(syntax & RE_NO_GNU_OPS))
1872 token->type = OP_WORD;
1875 if (!(syntax & RE_NO_GNU_OPS))
1876 token->type = OP_NOTWORD;
1879 if (!(syntax & RE_NO_GNU_OPS))
1880 token->type = OP_SPACE;
1883 if (!(syntax & RE_NO_GNU_OPS))
1884 token->type = OP_NOTSPACE;
1887 if (!(syntax & RE_NO_GNU_OPS))
1889 token->type = ANCHOR;
1890 token->opr.ctx_type = BUF_FIRST;
1894 if (!(syntax & RE_NO_GNU_OPS))
1896 token->type = ANCHOR;
1897 token->opr.ctx_type = BUF_LAST;
1901 if (!(syntax & RE_NO_BK_PARENS))
1902 token->type = OP_OPEN_SUBEXP;
1905 if (!(syntax & RE_NO_BK_PARENS))
1906 token->type = OP_CLOSE_SUBEXP;
1909 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1910 token->type = OP_DUP_PLUS;
1913 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1914 token->type = OP_DUP_QUESTION;
1917 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1918 token->type = OP_OPEN_DUP_NUM;
1921 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1922 token->type = OP_CLOSE_DUP_NUM;
1930 token->type = CHARACTER;
1931 #ifdef RE_ENABLE_I18N
1932 if (input->mb_cur_max > 1)
1934 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1935 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1939 token->word_char = IS_WORD_CHAR (token->opr.c);
1944 if (syntax & RE_NEWLINE_ALT)
1945 token->type = OP_ALT;
1948 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1949 token->type = OP_ALT;
1952 token->type = OP_DUP_ASTERISK;
1955 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1956 token->type = OP_DUP_PLUS;
1959 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1960 token->type = OP_DUP_QUESTION;
1963 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1964 token->type = OP_OPEN_DUP_NUM;
1967 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1968 token->type = OP_CLOSE_DUP_NUM;
1971 if (syntax & RE_NO_BK_PARENS)
1972 token->type = OP_OPEN_SUBEXP;
1975 if (syntax & RE_NO_BK_PARENS)
1976 token->type = OP_CLOSE_SUBEXP;
1979 token->type = OP_OPEN_BRACKET;
1982 token->type = OP_PERIOD;
1985 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1986 && re_string_cur_idx (input) != 0)
1988 char prev = re_string_peek_byte (input, -1);
1989 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1992 token->type = ANCHOR;
1993 token->opr.ctx_type = LINE_FIRST;
1996 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1997 && re_string_cur_idx (input) + 1 != re_string_length (input))
2000 re_string_skip_bytes (input, 1);
2001 peek_token (&next, input, syntax);
2002 re_string_skip_bytes (input, -1);
2003 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2006 token->type = ANCHOR;
2007 token->opr.ctx_type = LINE_LAST;
2015 /* Peek a token from INPUT, and return the length of the token.
2016 We must not use this function out of bracket expressions. */
2019 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2022 if (re_string_eoi (input))
2024 token->type = END_OF_RE;
2027 c = re_string_peek_byte (input, 0);
2030 #ifdef RE_ENABLE_I18N
2031 if (input->mb_cur_max > 1
2032 && !re_string_first_byte (input, re_string_cur_idx (input)))
2034 token->type = CHARACTER;
2037 #endif /* RE_ENABLE_I18N */
2039 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2040 && re_string_cur_idx (input) + 1 < re_string_length (input))
2042 /* In this case, '\' escape a character. */
2044 re_string_skip_bytes (input, 1);
2045 c2 = re_string_peek_byte (input, 0);
2047 token->type = CHARACTER;
2050 if (c == '[') /* '[' is a special char in a bracket exps. */
2054 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2055 c2 = re_string_peek_byte (input, 1);
2063 token->type = OP_OPEN_COLL_ELEM;
2067 token->type = OP_OPEN_EQUIV_CLASS;
2071 if (syntax & RE_CHAR_CLASSES)
2073 token->type = OP_OPEN_CHAR_CLASS;
2078 token->type = CHARACTER;
2088 token->type = OP_CHARSET_RANGE;
2091 token->type = OP_CLOSE_BRACKET;
2094 token->type = OP_NON_MATCH_LIST;
2097 token->type = CHARACTER;
2102 /* Functions for parser. */
2104 /* Entry point of the parser.
2105 Parse the regular expression REGEXP and return the structure tree.
2106 If an error occurs, ERR is set by error code, and return NULL.
2107 This function build the following tree, from regular expression <reg_exp>:
2113 CAT means concatenation.
2114 EOR means end of regular expression. */
2117 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2120 re_dfa_t *dfa = preg->buffer;
2121 bin_tree_t *tree, *eor, *root;
2122 re_token_t current_token;
2123 dfa->syntax = syntax;
2124 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2125 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2126 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2128 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2130 root = create_tree (dfa, tree, eor, CONCAT);
2133 if (__glibc_unlikely (eor == NULL || root == NULL))
2141 /* This function build the following tree, from regular expression
2142 <branch1>|<branch2>:
2148 ALT means alternative, which represents the operator '|'. */
2151 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2152 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2154 re_dfa_t *dfa = preg->buffer;
2155 bin_tree_t *tree, *branch = NULL;
2156 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2157 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2158 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2161 while (token->type == OP_ALT)
2163 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2164 if (token->type != OP_ALT && token->type != END_OF_RE
2165 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2167 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2168 dfa->completed_bkref_map = initial_bkref_map;
2169 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2170 if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2173 postorder (tree, free_tree, NULL);
2176 dfa->completed_bkref_map |= accumulated_bkref_map;
2180 tree = create_tree (dfa, tree, branch, OP_ALT);
2181 if (__glibc_unlikely (tree == NULL))
2190 /* This function build the following tree, from regular expression
2197 CAT means concatenation. */
2200 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2201 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2203 bin_tree_t *tree, *expr;
2204 re_dfa_t *dfa = preg->buffer;
2205 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2206 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2209 while (token->type != OP_ALT && token->type != END_OF_RE
2210 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2212 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2213 if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2216 postorder (tree, free_tree, NULL);
2219 if (tree != NULL && expr != NULL)
2221 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2222 if (newtree == NULL)
2224 postorder (expr, free_tree, NULL);
2225 postorder (tree, free_tree, NULL);
2231 else if (tree == NULL)
2233 /* Otherwise expr == NULL, we don't need to create new tree. */
2238 /* This function build the following tree, from regular expression a*:
2245 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2246 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2248 re_dfa_t *dfa = preg->buffer;
2250 switch (token->type)
2253 tree = create_token_tree (dfa, NULL, NULL, token);
2254 if (__glibc_unlikely (tree == NULL))
2259 #ifdef RE_ENABLE_I18N
2260 if (dfa->mb_cur_max > 1)
2262 while (!re_string_eoi (regexp)
2263 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2265 bin_tree_t *mbc_remain;
2266 fetch_token (token, regexp, syntax);
2267 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2268 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2269 if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2279 case OP_OPEN_SUBEXP:
2280 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2281 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2285 case OP_OPEN_BRACKET:
2286 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2287 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2292 if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2297 dfa->used_bkref_map |= 1 << token->opr.idx;
2298 tree = create_token_tree (dfa, NULL, NULL, token);
2299 if (__glibc_unlikely (tree == NULL))
2305 dfa->has_mb_node = 1;
2308 case OP_OPEN_DUP_NUM:
2309 if (syntax & RE_CONTEXT_INVALID_DUP)
2315 case OP_DUP_ASTERISK:
2317 case OP_DUP_QUESTION:
2318 if (syntax & RE_CONTEXT_INVALID_OPS)
2323 else if (syntax & RE_CONTEXT_INDEP_OPS)
2325 fetch_token (token, regexp, syntax);
2326 return parse_expression (regexp, preg, token, syntax, nest, err);
2329 case OP_CLOSE_SUBEXP:
2330 if ((token->type == OP_CLOSE_SUBEXP)
2331 && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2337 case OP_CLOSE_DUP_NUM:
2338 /* We treat it as a normal character. */
2340 /* Then we can these characters as normal characters. */
2341 token->type = CHARACTER;
2342 /* mb_partial and word_char bits should be initialized already
2344 tree = create_token_tree (dfa, NULL, NULL, token);
2345 if (__glibc_unlikely (tree == NULL))
2353 if ((token->opr.ctx_type
2354 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2355 && dfa->word_ops_used == 0)
2356 init_word_char (dfa);
2357 if (token->opr.ctx_type == WORD_DELIM
2358 || token->opr.ctx_type == NOT_WORD_DELIM)
2360 bin_tree_t *tree_first, *tree_last;
2361 if (token->opr.ctx_type == WORD_DELIM)
2363 token->opr.ctx_type = WORD_FIRST;
2364 tree_first = create_token_tree (dfa, NULL, NULL, token);
2365 token->opr.ctx_type = WORD_LAST;
2369 token->opr.ctx_type = INSIDE_WORD;
2370 tree_first = create_token_tree (dfa, NULL, NULL, token);
2371 token->opr.ctx_type = INSIDE_NOTWORD;
2373 tree_last = create_token_tree (dfa, NULL, NULL, token);
2374 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2375 if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2384 tree = create_token_tree (dfa, NULL, NULL, token);
2385 if (__glibc_unlikely (tree == NULL))
2391 /* We must return here, since ANCHORs can't be followed
2392 by repetition operators.
2393 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2394 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2395 fetch_token (token, regexp, syntax);
2399 tree = create_token_tree (dfa, NULL, NULL, token);
2400 if (__glibc_unlikely (tree == NULL))
2405 if (dfa->mb_cur_max > 1)
2406 dfa->has_mb_node = 1;
2411 tree = build_charclass_op (dfa, regexp->trans,
2414 token->type == OP_NOTWORD, err);
2415 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2421 tree = build_charclass_op (dfa, regexp->trans,
2424 token->type == OP_NOTSPACE, err);
2425 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2438 /* Must not happen? */
2439 DEBUG_ASSERT (false);
2442 fetch_token (token, regexp, syntax);
2444 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2445 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2447 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2449 if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2452 postorder (tree, free_tree, NULL);
2456 /* In BRE consecutive duplications are not allowed. */
2457 if ((syntax & RE_CONTEXT_INVALID_DUP)
2458 && (token->type == OP_DUP_ASTERISK
2459 || token->type == OP_OPEN_DUP_NUM))
2462 postorder (tree, free_tree, NULL);
2471 /* This function build the following tree, from regular expression
2479 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2480 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2482 re_dfa_t *dfa = preg->buffer;
2485 cur_nsub = preg->re_nsub++;
2487 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2489 /* The subexpression may be a null string. */
2490 if (token->type == OP_CLOSE_SUBEXP)
2494 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2495 if (__glibc_unlikely (*err == REG_NOERROR
2496 && token->type != OP_CLOSE_SUBEXP))
2499 postorder (tree, free_tree, NULL);
2502 if (__glibc_unlikely (*err != REG_NOERROR))
2506 if (cur_nsub <= '9' - '1')
2507 dfa->completed_bkref_map |= 1 << cur_nsub;
2509 tree = create_tree (dfa, tree, NULL, SUBEXP);
2510 if (__glibc_unlikely (tree == NULL))
2515 tree->token.opr.idx = cur_nsub;
2519 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2522 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2523 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2525 bin_tree_t *tree = NULL, *old_tree = NULL;
2526 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2527 re_token_t start_token = *token;
2529 if (token->type == OP_OPEN_DUP_NUM)
2532 start = fetch_number (regexp, token, syntax);
2535 if (token->type == CHARACTER && token->opr.c == ',')
2536 start = 0; /* We treat "{,m}" as "{0,m}". */
2539 *err = REG_BADBR; /* <re>{} is invalid. */
2543 if (__glibc_likely (start != -2))
2545 /* We treat "{n}" as "{n,n}". */
2546 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2547 : ((token->type == CHARACTER && token->opr.c == ',')
2548 ? fetch_number (regexp, token, syntax) : -2));
2550 if (__glibc_unlikely (start == -2 || end == -2))
2552 /* Invalid sequence. */
2553 if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2555 if (token->type == END_OF_RE)
2563 /* If the syntax bit is set, rollback. */
2564 re_string_set_index (regexp, start_idx);
2565 *token = start_token;
2566 token->type = CHARACTER;
2567 /* mb_partial and word_char bits should be already initialized by
2572 if (__glibc_unlikely ((end != -1 && start > end)
2573 || token->type != OP_CLOSE_DUP_NUM))
2575 /* First number greater than second. */
2580 if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2588 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2589 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2592 fetch_token (token, regexp, syntax);
2594 if (__glibc_unlikely (elem == NULL))
2596 if (__glibc_unlikely (start == 0 && end == 0))
2598 postorder (elem, free_tree, NULL);
2602 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2603 if (__glibc_unlikely (start > 0))
2606 for (i = 2; i <= start; ++i)
2608 elem = duplicate_tree (elem, dfa);
2609 tree = create_tree (dfa, tree, elem, CONCAT);
2610 if (__glibc_unlikely (elem == NULL || tree == NULL))
2611 goto parse_dup_op_espace;
2617 /* Duplicate ELEM before it is marked optional. */
2618 elem = duplicate_tree (elem, dfa);
2619 if (__glibc_unlikely (elem == NULL))
2620 goto parse_dup_op_espace;
2626 if (elem->token.type == SUBEXP)
2628 uintptr_t subidx = elem->token.opr.idx;
2629 postorder (elem, mark_opt_subexp, (void *) subidx);
2632 tree = create_tree (dfa, elem, NULL,
2633 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2634 if (__glibc_unlikely (tree == NULL))
2635 goto parse_dup_op_espace;
2637 /* This loop is actually executed only when end != -1,
2638 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2639 already created the start+1-th copy. */
2640 if (TYPE_SIGNED (Idx) || end != -1)
2641 for (i = start + 2; i <= end; ++i)
2643 elem = duplicate_tree (elem, dfa);
2644 tree = create_tree (dfa, tree, elem, CONCAT);
2645 if (__glibc_unlikely (elem == NULL || tree == NULL))
2646 goto parse_dup_op_espace;
2648 tree = create_tree (dfa, tree, NULL, OP_ALT);
2649 if (__glibc_unlikely (tree == NULL))
2650 goto parse_dup_op_espace;
2654 tree = create_tree (dfa, old_tree, tree, CONCAT);
2658 parse_dup_op_espace:
2663 /* Size of the names for collating symbol/equivalence_class/character_class.
2664 I'm not sure, but maybe enough. */
2665 #define BRACKET_NAME_BUF_SIZE 32
2669 # ifdef RE_ENABLE_I18N
2670 /* Convert the byte B to the corresponding wide character. In a
2671 unibyte locale, treat B as itself. In a multibyte locale, return
2672 WEOF if B is an encoding error. */
2674 parse_byte (unsigned char b, re_charset_t *mbcset)
2676 return mbcset == NULL ? b : __btowc (b);
2680 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2681 Build the range expression which starts from START_ELEM, and ends
2682 at END_ELEM. The result are written to MBCSET and SBCSET.
2683 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2684 mbcset->range_ends, is a pointer argument since we may
2687 static reg_errcode_t
2688 # ifdef RE_ENABLE_I18N
2689 build_range_exp (const reg_syntax_t syntax,
2691 re_charset_t *mbcset,
2693 const bracket_elem_t *start_elem,
2694 const bracket_elem_t *end_elem)
2695 # else /* not RE_ENABLE_I18N */
2696 build_range_exp (const reg_syntax_t syntax,
2698 const bracket_elem_t *start_elem,
2699 const bracket_elem_t *end_elem)
2700 # endif /* not RE_ENABLE_I18N */
2702 unsigned int start_ch, end_ch;
2703 /* Equivalence Classes and Character Classes can't be a range start/end. */
2704 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2705 || start_elem->type == CHAR_CLASS
2706 || end_elem->type == EQUIV_CLASS
2707 || end_elem->type == CHAR_CLASS))
2710 /* We can handle no multi character collating elements without libc
2712 if (__glibc_unlikely ((start_elem->type == COLL_SYM
2713 && strlen ((char *) start_elem->opr.name) > 1)
2714 || (end_elem->type == COLL_SYM
2715 && strlen ((char *) end_elem->opr.name) > 1)))
2716 return REG_ECOLLATE;
2718 # ifdef RE_ENABLE_I18N
2724 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2725 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2727 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2728 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2730 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2731 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2732 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2733 ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2734 if (start_wc == WEOF || end_wc == WEOF)
2735 return REG_ECOLLATE;
2736 else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2737 && start_wc > end_wc))
2740 /* Got valid collation sequence values, add them as a new entry.
2741 However, for !_LIBC we have no collation elements: if the
2742 character set is single byte, the single byte character set
2743 that we build below suffices. parse_bracket_exp passes
2744 no MBCSET if dfa->mb_cur_max == 1. */
2747 /* Check the space of the arrays. */
2748 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2750 /* There is not enough space, need realloc. */
2751 wchar_t *new_array_start, *new_array_end;
2754 /* +1 in case of mbcset->nranges is 0. */
2755 new_nranges = 2 * mbcset->nranges + 1;
2756 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2757 are NULL if *range_alloc == 0. */
2758 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2760 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2763 if (__glibc_unlikely (new_array_start == NULL
2764 || new_array_end == NULL))
2766 re_free (new_array_start);
2767 re_free (new_array_end);
2771 mbcset->range_starts = new_array_start;
2772 mbcset->range_ends = new_array_end;
2773 *range_alloc = new_nranges;
2776 mbcset->range_starts[mbcset->nranges] = start_wc;
2777 mbcset->range_ends[mbcset->nranges++] = end_wc;
2780 /* Build the table for single byte characters. */
2781 for (wc = 0; wc < SBC_MAX; ++wc)
2783 if (start_wc <= wc && wc <= end_wc)
2784 bitset_set (sbcset, wc);
2787 # else /* not RE_ENABLE_I18N */
2790 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2791 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2793 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2794 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2796 if (start_ch > end_ch)
2798 /* Build the table for single byte characters. */
2799 for (ch = 0; ch < SBC_MAX; ++ch)
2800 if (start_ch <= ch && ch <= end_ch)
2801 bitset_set (sbcset, ch);
2803 # endif /* not RE_ENABLE_I18N */
2806 #endif /* not _LIBC */
2809 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2810 Build the collating element which is represented by NAME.
2811 The result are written to MBCSET and SBCSET.
2812 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2813 pointer argument since we may update it. */
2815 static reg_errcode_t
2816 # ifdef RE_ENABLE_I18N
2817 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2818 Idx *coll_sym_alloc, const unsigned char *name)
2819 # else /* not RE_ENABLE_I18N */
2820 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2821 # endif /* not RE_ENABLE_I18N */
2823 size_t name_len = strlen ((const char *) name);
2824 if (__glibc_unlikely (name_len != 1))
2825 return REG_ECOLLATE;
2828 bitset_set (sbcset, name[0]);
2832 #endif /* not _LIBC */
2834 /* This function parse bracket expression like "[abc]", "[a-c]",
2838 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2839 reg_syntax_t syntax, reg_errcode_t *err)
2842 const unsigned char *collseqmb;
2843 const char *collseqwc;
2846 const int32_t *symb_table;
2847 const unsigned char *extra;
2849 /* Local function for parse_bracket_exp used in _LIBC environment.
2850 Seek the collating symbol entry corresponding to NAME.
2851 Return the index of the symbol in the SYMB_TABLE,
2852 or -1 if not found. */
2855 __attribute__ ((always_inline))
2856 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2860 for (elem = 0; elem < table_size; elem++)
2861 if (symb_table[2 * elem] != 0)
2863 int32_t idx = symb_table[2 * elem + 1];
2864 /* Skip the name of collating element name. */
2865 idx += 1 + extra[idx];
2866 if (/* Compare the length of the name. */
2867 name_len == extra[idx]
2868 /* Compare the name. */
2869 && memcmp (name, &extra[idx + 1], name_len) == 0)
2870 /* Yep, this is the entry. */
2876 /* Local function for parse_bracket_exp used in _LIBC environment.
2877 Look up the collation sequence value of BR_ELEM.
2878 Return the value if succeeded, UINT_MAX otherwise. */
2880 auto inline unsigned int
2881 __attribute__ ((always_inline))
2882 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2884 if (br_elem->type == SB_CHAR)
2887 if (MB_CUR_MAX == 1)
2890 return collseqmb[br_elem->opr.ch];
2893 wint_t wc = __btowc (br_elem->opr.ch);
2894 return __collseq_table_lookup (collseqwc, wc);
2897 else if (br_elem->type == MB_CHAR)
2900 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2902 else if (br_elem->type == COLL_SYM)
2904 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2908 elem = seek_collating_symbol_entry (br_elem->opr.name,
2912 /* We found the entry. */
2913 idx = symb_table[2 * elem + 1];
2914 /* Skip the name of collating element name. */
2915 idx += 1 + extra[idx];
2916 /* Skip the byte sequence of the collating element. */
2917 idx += 1 + extra[idx];
2918 /* Adjust for the alignment. */
2919 idx = (idx + 3) & ~3;
2920 /* Skip the multibyte collation sequence value. */
2921 idx += sizeof (unsigned int);
2922 /* Skip the wide char sequence of the collating element. */
2923 idx += sizeof (unsigned int) *
2924 (1 + *(unsigned int *) (extra + idx));
2925 /* Return the collation sequence value. */
2926 return *(unsigned int *) (extra + idx);
2928 else if (sym_name_len == 1)
2930 /* No valid character. Match it as a single byte
2932 return collseqmb[br_elem->opr.name[0]];
2935 else if (sym_name_len == 1)
2936 return collseqmb[br_elem->opr.name[0]];
2941 /* Local function for parse_bracket_exp used in _LIBC environment.
2942 Build the range expression which starts from START_ELEM, and ends
2943 at END_ELEM. The result are written to MBCSET and SBCSET.
2944 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2945 mbcset->range_ends, is a pointer argument since we may
2948 auto inline reg_errcode_t
2949 __attribute__ ((always_inline))
2950 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2951 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2954 uint32_t start_collseq;
2955 uint32_t end_collseq;
2957 /* Equivalence Classes and Character Classes can't be a range
2959 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2960 || start_elem->type == CHAR_CLASS
2961 || end_elem->type == EQUIV_CLASS
2962 || end_elem->type == CHAR_CLASS))
2965 /* FIXME: Implement rational ranges here, too. */
2966 start_collseq = lookup_collation_sequence_value (start_elem);
2967 end_collseq = lookup_collation_sequence_value (end_elem);
2968 /* Check start/end collation sequence values. */
2969 if (__glibc_unlikely (start_collseq == UINT_MAX
2970 || end_collseq == UINT_MAX))
2971 return REG_ECOLLATE;
2972 if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2973 && start_collseq > end_collseq))
2976 /* Got valid collation sequence values, add them as a new entry.
2977 However, if we have no collation elements, and the character set
2978 is single byte, the single byte character set that we
2979 build below suffices. */
2980 if (nrules > 0 || dfa->mb_cur_max > 1)
2982 /* Check the space of the arrays. */
2983 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2985 /* There is not enough space, need realloc. */
2986 uint32_t *new_array_start;
2987 uint32_t *new_array_end;
2990 /* +1 in case of mbcset->nranges is 0. */
2991 new_nranges = 2 * mbcset->nranges + 1;
2992 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2994 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2997 if (__glibc_unlikely (new_array_start == NULL
2998 || new_array_end == NULL))
3001 mbcset->range_starts = new_array_start;
3002 mbcset->range_ends = new_array_end;
3003 *range_alloc = new_nranges;
3006 mbcset->range_starts[mbcset->nranges] = start_collseq;
3007 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3010 /* Build the table for single byte characters. */
3011 for (ch = 0; ch < SBC_MAX; ch++)
3013 uint32_t ch_collseq;
3015 if (MB_CUR_MAX == 1)
3018 ch_collseq = collseqmb[ch];
3020 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3021 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3022 bitset_set (sbcset, ch);
3027 /* Local function for parse_bracket_exp used in _LIBC environment.
3028 Build the collating element which is represented by NAME.
3029 The result are written to MBCSET and SBCSET.
3030 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3031 pointer argument since we may update it. */
3033 auto inline reg_errcode_t
3034 __attribute__ ((always_inline))
3035 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3036 Idx *coll_sym_alloc, const unsigned char *name)
3039 size_t name_len = strlen ((const char *) name);
3042 elem = seek_collating_symbol_entry (name, name_len);
3045 /* We found the entry. */
3046 idx = symb_table[2 * elem + 1];
3047 /* Skip the name of collating element name. */
3048 idx += 1 + extra[idx];
3050 else if (name_len == 1)
3052 /* No valid character, treat it as a normal
3054 bitset_set (sbcset, name[0]);
3058 return REG_ECOLLATE;
3060 /* Got valid collation sequence, add it as a new entry. */
3061 /* Check the space of the arrays. */
3062 if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3064 /* Not enough, realloc it. */
3065 /* +1 in case of mbcset->ncoll_syms is 0. */
3066 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3067 /* Use realloc since mbcset->coll_syms is NULL
3069 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3070 new_coll_sym_alloc);
3071 if (__glibc_unlikely (new_coll_syms == NULL))
3073 mbcset->coll_syms = new_coll_syms;
3074 *coll_sym_alloc = new_coll_sym_alloc;
3076 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3081 if (__glibc_unlikely (name_len != 1))
3082 return REG_ECOLLATE;
3085 bitset_set (sbcset, name[0]);
3092 re_token_t br_token;
3093 re_bitset_ptr_t sbcset;
3094 #ifdef RE_ENABLE_I18N
3095 re_charset_t *mbcset;
3096 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3097 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3098 #endif /* not RE_ENABLE_I18N */
3099 bool non_match = false;
3100 bin_tree_t *work_tree;
3102 bool first_round = true;
3104 collseqmb = (const unsigned char *)
3105 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3106 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3112 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3113 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3114 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3115 _NL_COLLATE_SYMB_TABLEMB);
3116 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3117 _NL_COLLATE_SYMB_EXTRAMB);
3120 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3121 #ifdef RE_ENABLE_I18N
3122 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3123 #endif /* RE_ENABLE_I18N */
3124 #ifdef RE_ENABLE_I18N
3125 if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3127 if (__glibc_unlikely (sbcset == NULL))
3128 #endif /* RE_ENABLE_I18N */
3131 #ifdef RE_ENABLE_I18N
3138 token_len = peek_token_bracket (token, regexp, syntax);
3139 if (__glibc_unlikely (token->type == END_OF_RE))
3142 goto parse_bracket_exp_free_return;
3144 if (token->type == OP_NON_MATCH_LIST)
3146 #ifdef RE_ENABLE_I18N
3147 mbcset->non_match = 1;
3148 #endif /* not RE_ENABLE_I18N */
3150 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3151 bitset_set (sbcset, '\n');
3152 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3153 token_len = peek_token_bracket (token, regexp, syntax);
3154 if (__glibc_unlikely (token->type == END_OF_RE))
3157 goto parse_bracket_exp_free_return;
3161 /* We treat the first ']' as a normal character. */
3162 if (token->type == OP_CLOSE_BRACKET)
3163 token->type = CHARACTER;
3167 bracket_elem_t start_elem, end_elem;
3168 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3169 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3172 bool is_range_exp = false;
3175 start_elem.opr.name = start_name_buf;
3176 start_elem.type = COLL_SYM;
3177 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3178 syntax, first_round);
3179 if (__glibc_unlikely (ret != REG_NOERROR))
3182 goto parse_bracket_exp_free_return;
3184 first_round = false;
3186 /* Get information about the next token. We need it in any case. */
3187 token_len = peek_token_bracket (token, regexp, syntax);
3189 /* Do not check for ranges if we know they are not allowed. */
3190 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3192 if (__glibc_unlikely (token->type == END_OF_RE))
3195 goto parse_bracket_exp_free_return;
3197 if (token->type == OP_CHARSET_RANGE)
3199 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3200 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3201 if (__glibc_unlikely (token2.type == END_OF_RE))
3204 goto parse_bracket_exp_free_return;
3206 if (token2.type == OP_CLOSE_BRACKET)
3208 /* We treat the last '-' as a normal character. */
3209 re_string_skip_bytes (regexp, -token_len);
3210 token->type = CHARACTER;
3213 is_range_exp = true;
3217 if (is_range_exp == true)
3219 end_elem.opr.name = end_name_buf;
3220 end_elem.type = COLL_SYM;
3221 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3223 if (__glibc_unlikely (ret != REG_NOERROR))
3226 goto parse_bracket_exp_free_return;
3229 token_len = peek_token_bracket (token, regexp, syntax);
3232 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3233 &start_elem, &end_elem);
3235 # ifdef RE_ENABLE_I18N
3236 *err = build_range_exp (syntax, sbcset,
3237 dfa->mb_cur_max > 1 ? mbcset : NULL,
3238 &range_alloc, &start_elem, &end_elem);
3240 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3242 #endif /* RE_ENABLE_I18N */
3243 if (__glibc_unlikely (*err != REG_NOERROR))
3244 goto parse_bracket_exp_free_return;
3248 switch (start_elem.type)
3251 bitset_set (sbcset, start_elem.opr.ch);
3253 #ifdef RE_ENABLE_I18N
3255 /* Check whether the array has enough space. */
3256 if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3258 wchar_t *new_mbchars;
3259 /* Not enough, realloc it. */
3260 /* +1 in case of mbcset->nmbchars is 0. */
3261 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3262 /* Use realloc since array is NULL if *alloc == 0. */
3263 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3265 if (__glibc_unlikely (new_mbchars == NULL))
3266 goto parse_bracket_exp_espace;
3267 mbcset->mbchars = new_mbchars;
3269 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3271 #endif /* RE_ENABLE_I18N */
3273 *err = build_equiv_class (sbcset,
3274 #ifdef RE_ENABLE_I18N
3275 mbcset, &equiv_class_alloc,
3276 #endif /* RE_ENABLE_I18N */
3277 start_elem.opr.name);
3278 if (__glibc_unlikely (*err != REG_NOERROR))
3279 goto parse_bracket_exp_free_return;
3282 *err = build_collating_symbol (sbcset,
3283 #ifdef RE_ENABLE_I18N
3284 mbcset, &coll_sym_alloc,
3285 #endif /* RE_ENABLE_I18N */
3286 start_elem.opr.name);
3287 if (__glibc_unlikely (*err != REG_NOERROR))
3288 goto parse_bracket_exp_free_return;
3291 *err = build_charclass (regexp->trans, sbcset,
3292 #ifdef RE_ENABLE_I18N
3293 mbcset, &char_class_alloc,
3294 #endif /* RE_ENABLE_I18N */
3295 (const char *) start_elem.opr.name,
3297 if (__glibc_unlikely (*err != REG_NOERROR))
3298 goto parse_bracket_exp_free_return;
3301 DEBUG_ASSERT (false);
3305 if (__glibc_unlikely (token->type == END_OF_RE))
3308 goto parse_bracket_exp_free_return;
3310 if (token->type == OP_CLOSE_BRACKET)
3314 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3316 /* If it is non-matching list. */
3318 bitset_not (sbcset);
3320 #ifdef RE_ENABLE_I18N
3321 /* Ensure only single byte characters are set. */
3322 if (dfa->mb_cur_max > 1)
3323 bitset_mask (sbcset, dfa->sb_char);
3325 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3326 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3327 || mbcset->non_match)))
3329 bin_tree_t *mbc_tree;
3331 /* Build a tree for complex bracket. */
3332 dfa->has_mb_node = 1;
3333 br_token.type = COMPLEX_BRACKET;
3334 br_token.opr.mbcset = mbcset;
3335 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3336 if (__glibc_unlikely (mbc_tree == NULL))
3337 goto parse_bracket_exp_espace;
3338 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3339 if (sbcset[sbc_idx])
3341 /* If there are no bits set in sbcset, there is no point
3342 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3343 if (sbc_idx < BITSET_WORDS)
3345 /* Build a tree for simple bracket. */
3346 br_token.type = SIMPLE_BRACKET;
3347 br_token.opr.sbcset = sbcset;
3348 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3349 if (__glibc_unlikely (work_tree == NULL))
3350 goto parse_bracket_exp_espace;
3352 /* Then join them by ALT node. */
3353 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3354 if (__glibc_unlikely (work_tree == NULL))
3355 goto parse_bracket_exp_espace;
3360 work_tree = mbc_tree;
3364 #endif /* not RE_ENABLE_I18N */
3366 #ifdef RE_ENABLE_I18N
3367 free_charset (mbcset);
3369 /* Build a tree for simple bracket. */
3370 br_token.type = SIMPLE_BRACKET;
3371 br_token.opr.sbcset = sbcset;
3372 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3373 if (__glibc_unlikely (work_tree == NULL))
3374 goto parse_bracket_exp_espace;
3378 parse_bracket_exp_espace:
3380 parse_bracket_exp_free_return:
3382 #ifdef RE_ENABLE_I18N
3383 free_charset (mbcset);
3384 #endif /* RE_ENABLE_I18N */
3388 /* Parse an element in the bracket expression. */
3390 static reg_errcode_t
3391 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3392 re_token_t *token, int token_len, re_dfa_t *dfa,
3393 reg_syntax_t syntax, bool accept_hyphen)
3395 #ifdef RE_ENABLE_I18N
3397 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3398 if (cur_char_size > 1)
3400 elem->type = MB_CHAR;
3401 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3402 re_string_skip_bytes (regexp, cur_char_size);
3405 #endif /* RE_ENABLE_I18N */
3406 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3407 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3408 || token->type == OP_OPEN_EQUIV_CLASS)
3409 return parse_bracket_symbol (elem, regexp, token);
3410 if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3412 /* A '-' must only appear as anything but a range indicator before
3413 the closing bracket. Everything else is an error. */
3415 (void) peek_token_bracket (&token2, regexp, syntax);
3416 if (token2.type != OP_CLOSE_BRACKET)
3417 /* The actual error value is not standardized since this whole
3418 case is undefined. But ERANGE makes good sense. */
3421 elem->type = SB_CHAR;
3422 elem->opr.ch = token->opr.c;
3426 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3427 such as [:<character_class>:], [.<collating_element>.], and
3428 [=<equivalent_class>=]. */
3430 static reg_errcode_t
3431 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3434 unsigned char ch, delim = token->opr.c;
3436 if (re_string_eoi(regexp))
3440 if (i >= BRACKET_NAME_BUF_SIZE)
3442 if (token->type == OP_OPEN_CHAR_CLASS)
3443 ch = re_string_fetch_byte_case (regexp);
3445 ch = re_string_fetch_byte (regexp);
3446 if (re_string_eoi(regexp))
3448 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3450 elem->opr.name[i] = ch;
3452 re_string_skip_bytes (regexp, 1);
3453 elem->opr.name[i] = '\0';
3454 switch (token->type)
3456 case OP_OPEN_COLL_ELEM:
3457 elem->type = COLL_SYM;
3459 case OP_OPEN_EQUIV_CLASS:
3460 elem->type = EQUIV_CLASS;
3462 case OP_OPEN_CHAR_CLASS:
3463 elem->type = CHAR_CLASS;
3471 /* Helper function for parse_bracket_exp.
3472 Build the equivalence class which is represented by NAME.
3473 The result are written to MBCSET and SBCSET.
3474 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3475 is a pointer argument since we may update it. */
3477 static reg_errcode_t
3478 #ifdef RE_ENABLE_I18N
3479 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3480 Idx *equiv_class_alloc, const unsigned char *name)
3481 #else /* not RE_ENABLE_I18N */
3482 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3483 #endif /* not RE_ENABLE_I18N */
3486 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3489 const int32_t *table, *indirect;
3490 const unsigned char *weights, *extra, *cp;
3491 unsigned char char_buf[2];
3495 /* Calculate the index for equivalence class. */
3497 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3498 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3499 _NL_COLLATE_WEIGHTMB);
3500 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3501 _NL_COLLATE_EXTRAMB);
3502 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3503 _NL_COLLATE_INDIRECTMB);
3504 idx1 = findidx (table, indirect, extra, &cp, -1);
3505 if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3506 /* This isn't a valid character. */
3507 return REG_ECOLLATE;
3509 /* Build single byte matching table for this equivalence class. */
3510 len = weights[idx1 & 0xffffff];
3511 for (ch = 0; ch < SBC_MAX; ++ch)
3515 idx2 = findidx (table, indirect, extra, &cp, 1);
3520 /* This isn't a valid character. */
3522 /* Compare only if the length matches and the collation rule
3523 index is the same. */
3524 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3525 && memcmp (weights + (idx1 & 0xffffff) + 1,
3526 weights + (idx2 & 0xffffff) + 1, len) == 0)
3527 bitset_set (sbcset, ch);
3529 /* Check whether the array has enough space. */
3530 if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3532 /* Not enough, realloc it. */
3533 /* +1 in case of mbcset->nequiv_classes is 0. */
3534 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3535 /* Use realloc since the array is NULL if *alloc == 0. */
3536 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3538 new_equiv_class_alloc);
3539 if (__glibc_unlikely (new_equiv_classes == NULL))
3541 mbcset->equiv_classes = new_equiv_classes;
3542 *equiv_class_alloc = new_equiv_class_alloc;
3544 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3549 if (__glibc_unlikely (strlen ((const char *) name) != 1))
3550 return REG_ECOLLATE;
3551 bitset_set (sbcset, *name);
3556 /* Helper function for parse_bracket_exp.
3557 Build the character class which is represented by NAME.
3558 The result are written to MBCSET and SBCSET.
3559 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3560 is a pointer argument since we may update it. */
3562 static reg_errcode_t
3563 #ifdef RE_ENABLE_I18N
3564 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3565 re_charset_t *mbcset, Idx *char_class_alloc,
3566 const char *class_name, reg_syntax_t syntax)
3567 #else /* not RE_ENABLE_I18N */
3568 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3569 const char *class_name, reg_syntax_t syntax)
3570 #endif /* not RE_ENABLE_I18N */
3573 const char *name = class_name;
3575 /* In case of REG_ICASE "upper" and "lower" match the both of
3576 upper and lower cases. */
3577 if ((syntax & RE_ICASE)
3578 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3581 #ifdef RE_ENABLE_I18N
3582 /* Check the space of the arrays. */
3583 if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3585 /* Not enough, realloc it. */
3586 /* +1 in case of mbcset->nchar_classes is 0. */
3587 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3588 /* Use realloc since array is NULL if *alloc == 0. */
3589 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3590 new_char_class_alloc);
3591 if (__glibc_unlikely (new_char_classes == NULL))
3593 mbcset->char_classes = new_char_classes;
3594 *char_class_alloc = new_char_class_alloc;
3596 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3597 #endif /* RE_ENABLE_I18N */
3599 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3601 if (__glibc_unlikely (trans != NULL)) \
3603 for (i = 0; i < SBC_MAX; ++i) \
3604 if (ctype_func (i)) \
3605 bitset_set (sbcset, trans[i]); \
3609 for (i = 0; i < SBC_MAX; ++i) \
3610 if (ctype_func (i)) \
3611 bitset_set (sbcset, i); \
3615 if (strcmp (name, "alnum") == 0)
3616 BUILD_CHARCLASS_LOOP (isalnum);
3617 else if (strcmp (name, "cntrl") == 0)
3618 BUILD_CHARCLASS_LOOP (iscntrl);
3619 else if (strcmp (name, "lower") == 0)
3620 BUILD_CHARCLASS_LOOP (islower);
3621 else if (strcmp (name, "space") == 0)
3622 BUILD_CHARCLASS_LOOP (isspace);
3623 else if (strcmp (name, "alpha") == 0)
3624 BUILD_CHARCLASS_LOOP (isalpha);
3625 else if (strcmp (name, "digit") == 0)
3626 BUILD_CHARCLASS_LOOP (isdigit);
3627 else if (strcmp (name, "print") == 0)
3628 BUILD_CHARCLASS_LOOP (isprint);
3629 else if (strcmp (name, "upper") == 0)
3630 BUILD_CHARCLASS_LOOP (isupper);
3631 else if (strcmp (name, "blank") == 0)
3632 BUILD_CHARCLASS_LOOP (isblank);
3633 else if (strcmp (name, "graph") == 0)
3634 BUILD_CHARCLASS_LOOP (isgraph);
3635 else if (strcmp (name, "punct") == 0)
3636 BUILD_CHARCLASS_LOOP (ispunct);
3637 else if (strcmp (name, "xdigit") == 0)
3638 BUILD_CHARCLASS_LOOP (isxdigit);
3646 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3647 const char *class_name,
3648 const char *extra, bool non_match,
3651 re_bitset_ptr_t sbcset;
3652 #ifdef RE_ENABLE_I18N
3653 re_charset_t *mbcset;
3655 #endif /* not RE_ENABLE_I18N */
3659 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3660 if (__glibc_unlikely (sbcset == NULL))
3665 #ifdef RE_ENABLE_I18N
3666 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3667 if (__glibc_unlikely (mbcset == NULL))
3673 mbcset->non_match = non_match;
3674 #endif /* RE_ENABLE_I18N */
3676 /* We don't care the syntax in this case. */
3677 ret = build_charclass (trans, sbcset,
3678 #ifdef RE_ENABLE_I18N
3680 #endif /* RE_ENABLE_I18N */
3683 if (__glibc_unlikely (ret != REG_NOERROR))
3686 #ifdef RE_ENABLE_I18N
3687 free_charset (mbcset);
3688 #endif /* RE_ENABLE_I18N */
3692 /* \w match '_' also. */
3693 for (; *extra; extra++)
3694 bitset_set (sbcset, *extra);
3696 /* If it is non-matching list. */
3698 bitset_not (sbcset);
3700 #ifdef RE_ENABLE_I18N
3701 /* Ensure only single byte characters are set. */
3702 if (dfa->mb_cur_max > 1)
3703 bitset_mask (sbcset, dfa->sb_char);
3706 /* Build a tree for simple bracket. */
3707 re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3708 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3709 if (__glibc_unlikely (tree == NULL))
3710 goto build_word_op_espace;
3712 #ifdef RE_ENABLE_I18N
3713 if (dfa->mb_cur_max > 1)
3715 bin_tree_t *mbc_tree;
3716 /* Build a tree for complex bracket. */
3717 br_token.type = COMPLEX_BRACKET;
3718 br_token.opr.mbcset = mbcset;
3719 dfa->has_mb_node = 1;
3720 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3721 if (__glibc_unlikely (mbc_tree == NULL))
3722 goto build_word_op_espace;
3723 /* Then join them by ALT node. */
3724 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3725 if (__glibc_likely (mbc_tree != NULL))
3730 free_charset (mbcset);
3733 #else /* not RE_ENABLE_I18N */
3735 #endif /* not RE_ENABLE_I18N */
3737 build_word_op_espace:
3739 #ifdef RE_ENABLE_I18N
3740 free_charset (mbcset);
3741 #endif /* RE_ENABLE_I18N */
3746 /* This is intended for the expressions like "a{1,3}".
3747 Fetch a number from 'input', and return the number.
3748 Return -1 if the number field is empty like "{,1}".
3749 Return RE_DUP_MAX + 1 if the number field is too large.
3750 Return -2 if an error occurred. */
3753 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3759 fetch_token (token, input, syntax);
3761 if (__glibc_unlikely (token->type == END_OF_RE))
3763 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3765 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3769 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3774 #ifdef RE_ENABLE_I18N
3776 free_charset (re_charset_t *cset)
3778 re_free (cset->mbchars);
3780 re_free (cset->coll_syms);
3781 re_free (cset->equiv_classes);
3783 re_free (cset->range_starts);
3784 re_free (cset->range_ends);
3785 re_free (cset->char_classes);
3788 #endif /* RE_ENABLE_I18N */
3790 /* Functions for binary tree operation. */
3792 /* Create a tree node. */
3795 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3796 re_token_type_t type)
3798 re_token_t t = { .type = type };
3799 return create_token_tree (dfa, left, right, &t);
3803 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3804 const re_token_t *token)
3807 if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3809 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3811 if (storage == NULL)
3813 storage->next = dfa->str_tree_storage;
3814 dfa->str_tree_storage = storage;
3815 dfa->str_tree_storage_idx = 0;
3817 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3819 tree->parent = NULL;
3821 tree->right = right;
3822 tree->token = *token;
3823 tree->token.duplicated = 0;
3824 tree->token.opt_subexp = 0;
3827 tree->node_idx = -1;
3830 left->parent = tree;
3832 right->parent = tree;
3836 /* Mark the tree SRC as an optional subexpression.
3837 To be called from preorder or postorder. */
3839 static reg_errcode_t
3840 mark_opt_subexp (void *extra, bin_tree_t *node)
3842 Idx idx = (uintptr_t) extra;
3843 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3844 node->token.opt_subexp = 1;
3849 /* Free the allocated memory inside NODE. */
3852 free_token (re_token_t *node)
3854 #ifdef RE_ENABLE_I18N
3855 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3856 free_charset (node->opr.mbcset);
3858 #endif /* RE_ENABLE_I18N */
3859 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3860 re_free (node->opr.sbcset);
3863 /* Worker function for tree walking. Free the allocated memory inside NODE
3864 and its children. */
3866 static reg_errcode_t
3867 free_tree (void *extra, bin_tree_t *node)
3869 free_token (&node->token);
3874 /* Duplicate the node SRC, and return new node. This is a preorder
3875 visit similar to the one implemented by the generic visitor, but
3876 we need more infrastructure to maintain two parallel trees --- so,
3877 it's easier to duplicate. */
3880 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3882 const bin_tree_t *node;
3883 bin_tree_t *dup_root;
3884 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3886 for (node = root; ; )
3888 /* Create a new tree and link it back to the current parent. */
3889 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3892 (*p_new)->parent = dup_node;
3893 (*p_new)->token.duplicated = 1;
3896 /* Go to the left node, or up and to the right. */
3900 p_new = &dup_node->left;
3904 const bin_tree_t *prev = NULL;
3905 while (node->right == prev || node->right == NULL)
3908 node = node->parent;
3909 dup_node = dup_node->parent;
3914 p_new = &dup_node->right;