1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 int length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static void init_word_char (re_dfa_t *dfa);
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
34 static void optimize_utf8 (re_dfa_t *dfa);
36 static reg_errcode_t analyze (regex_t *preg);
37 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
38 static reg_errcode_t preorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
41 static reg_errcode_t postorder (bin_tree_t *root,
42 reg_errcode_t (fn (void *, bin_tree_t *)),
44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
51 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
52 int top_clone_node, int root_node,
53 unsigned int constraint);
54 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
55 unsigned int constraint);
56 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
57 unsigned int constraint);
58 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
59 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
61 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
62 static int fetch_number (re_string_t *input, re_token_t *token,
64 static void fetch_token (re_token_t *result, re_string_t *input,
66 static int peek_token (re_token_t *token, re_string_t *input,
68 static int peek_token_bracket (re_token_t *token, re_string_t *input,
70 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
71 reg_syntax_t syntax, reg_errcode_t *err);
72 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
73 re_token_t *token, reg_syntax_t syntax,
74 int nest, reg_errcode_t *err);
75 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
76 re_token_t *token, reg_syntax_t syntax,
77 int nest, reg_errcode_t *err);
78 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
79 re_token_t *token, reg_syntax_t syntax,
80 int nest, reg_errcode_t *err);
81 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
82 re_token_t *token, reg_syntax_t syntax,
83 int nest, reg_errcode_t *err);
84 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
85 re_dfa_t *dfa, re_token_t *token,
86 reg_syntax_t syntax, reg_errcode_t *err);
87 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
88 re_token_t *token, reg_syntax_t syntax,
90 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
92 re_token_t *token, int token_len,
96 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
100 # ifdef RE_ENABLE_I18N
101 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
102 re_charset_t *mbcset, int *range_alloc,
103 bracket_elem_t *start_elem,
104 bracket_elem_t *end_elem);
105 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
106 re_charset_t *mbcset,
108 const unsigned char *name);
109 # else /* not RE_ENABLE_I18N */
110 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
111 bracket_elem_t *start_elem,
112 bracket_elem_t *end_elem);
113 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
114 const unsigned char *name);
115 # endif /* not RE_ENABLE_I18N */
116 #endif /* not _LIBC */
117 #ifdef RE_ENABLE_I18N
118 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
119 re_charset_t *mbcset,
120 int *equiv_class_alloc,
121 const unsigned char *name);
122 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
123 re_bitset_ptr_t sbcset,
124 re_charset_t *mbcset,
125 int *char_class_alloc,
126 const unsigned char *class_name,
127 reg_syntax_t syntax);
128 #else /* not RE_ENABLE_I18N */
129 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
130 const unsigned char *name);
131 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
132 re_bitset_ptr_t sbcset,
133 const unsigned char *class_name,
134 reg_syntax_t syntax);
135 #endif /* not RE_ENABLE_I18N */
136 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
137 unsigned RE_TRANSLATE_TYPE trans,
138 const unsigned char *class_name,
139 const unsigned char *extra,
140 int non_match, reg_errcode_t *err);
141 static bin_tree_t *create_tree (re_dfa_t *dfa,
142 bin_tree_t *left, bin_tree_t *right,
143 re_token_type_t type);
144 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
145 bin_tree_t *left, bin_tree_t *right,
146 const re_token_t *token);
147 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
148 static void free_token (re_token_t *node);
149 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
150 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
152 /* This table gives an error message for each of the error codes listed
153 in regex.h. Obviously the order here has to be same as there.
154 POSIX doesn't require that we do anything for REG_NOERROR,
155 but why not be nice? */
157 const char __re_error_msgid[] attribute_hidden =
159 #define REG_NOERROR_IDX 0
160 gettext_noop ("Success") /* REG_NOERROR */
162 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
163 gettext_noop ("No match") /* REG_NOMATCH */
165 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
166 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
168 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
169 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
171 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
172 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
174 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
175 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
177 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
178 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
180 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
181 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
183 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
184 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
186 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
187 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
189 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
190 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
192 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
193 gettext_noop ("Invalid range end") /* REG_ERANGE */
195 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
196 gettext_noop ("Memory exhausted") /* REG_ESPACE */
198 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
199 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
201 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
202 gettext_noop ("Premature end of regular expression") /* REG_EEND */
204 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
205 gettext_noop ("Regular expression too big") /* REG_ESIZE */
207 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
208 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
211 const size_t __re_error_msgid_idx[] attribute_hidden =
232 /* Entry points for GNU code. */
234 /* re_compile_pattern is the GNU regular expression compiler: it
235 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
236 Returns 0 if the pattern was valid, otherwise an error string.
238 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
239 are set in BUFP on entry. */
242 re_compile_pattern (pattern, length, bufp)
245 struct re_pattern_buffer *bufp;
249 /* And GNU code determines whether or not to get register information
250 by passing null for the REGS argument to re_match, etc., not by
251 setting no_sub, unless RE_NO_SUB is set. */
252 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
254 /* Match anchors at newline. */
255 bufp->newline_anchor = 1;
257 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
261 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
264 weak_alias (__re_compile_pattern, re_compile_pattern)
267 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
268 also be assigned to arbitrarily: each pattern buffer stores its own
269 syntax, so it can be changed between regex compilations. */
270 /* This has no initializer because initialized variables in Emacs
271 become read-only after dumping. */
272 reg_syntax_t re_syntax_options;
275 /* Specify the precise syntax of regexps for compilation. This provides
276 for compatibility for various utilities which historically have
277 different, incompatible syntaxes.
279 The argument SYNTAX is a bit mask comprised of the various bits
280 defined in regex.h. We return the old syntax. */
283 re_set_syntax (syntax)
286 reg_syntax_t ret = re_syntax_options;
288 re_syntax_options = syntax;
292 weak_alias (__re_set_syntax, re_set_syntax)
296 re_compile_fastmap (bufp)
297 struct re_pattern_buffer *bufp;
299 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
300 char *fastmap = bufp->fastmap;
302 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
303 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
304 if (dfa->init_state != dfa->init_state_word)
305 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
306 if (dfa->init_state != dfa->init_state_nl)
307 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
308 if (dfa->init_state != dfa->init_state_begbuf)
309 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
310 bufp->fastmap_accurate = 1;
314 weak_alias (__re_compile_fastmap, re_compile_fastmap)
318 __attribute ((always_inline))
319 re_set_fastmap (char *fastmap, int icase, int ch)
323 fastmap[tolower (ch)] = 1;
326 /* Helper function for re_compile_fastmap.
327 Compile fastmap for the initial_state INIT_STATE. */
330 re_compile_fastmap_iter (bufp, init_state, fastmap)
332 const re_dfastate_t *init_state;
335 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
337 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
338 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
340 int node = init_state->nodes.elems[node_cnt];
341 re_token_type_t type = dfa->nodes[node].type;
343 if (type == CHARACTER)
345 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
346 #ifdef RE_ENABLE_I18N
347 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
349 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
354 *p++ = dfa->nodes[node].opr.c;
355 while (++node < dfa->nodes_len
356 && dfa->nodes[node].type == CHARACTER
357 && dfa->nodes[node].mb_partial)
358 *p++ = dfa->nodes[node].opr.c;
359 memset (&state, 0, sizeof (state));
360 if (mbrtowc (&wc, (const char *) buf, p - buf,
362 && (__wcrtomb ((char *) buf, towlower (wc), &state)
364 re_set_fastmap (fastmap, 0, buf[0]);
368 else if (type == SIMPLE_BRACKET)
371 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
372 for (j = 0; j < UINT_BITS; ++j, ++ch)
373 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
374 re_set_fastmap (fastmap, icase, ch);
376 #ifdef RE_ENABLE_I18N
377 else if (type == COMPLEX_BRACKET)
380 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
381 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
382 || cset->nranges || cset->nchar_classes)
385 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
387 /* In this case we want to catch the bytes which are
388 the first byte of any collation elements.
389 e.g. In da_DK, we want to catch 'a' since "aa"
390 is a valid collation element, and don't catch
391 'b' since 'b' is the only collation element
392 which starts from 'b'. */
394 const int32_t *table = (const int32_t *)
395 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
396 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
397 for (j = 0; j < UINT_BITS; ++j, ++ch)
399 re_set_fastmap (fastmap, icase, ch);
402 if (dfa->mb_cur_max > 1)
403 for (i = 0; i < SBC_MAX; ++i)
404 if (__btowc (i) == WEOF)
405 re_set_fastmap (fastmap, icase, i);
406 # endif /* not _LIBC */
408 for (i = 0; i < cset->nmbchars; ++i)
412 memset (&state, '\0', sizeof (state));
413 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
414 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
415 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
417 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
419 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
423 #endif /* RE_ENABLE_I18N */
424 else if (type == OP_PERIOD
425 #ifdef RE_ENABLE_I18N
426 || type == OP_UTF8_PERIOD
427 #endif /* RE_ENABLE_I18N */
428 || type == END_OF_RE)
430 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
431 if (type == END_OF_RE)
432 bufp->can_be_null = 1;
438 /* Entry point for POSIX code. */
439 /* regcomp takes a regular expression as a string and compiles it.
441 PREG is a regex_t *. We do not expect any fields to be initialized,
442 since POSIX says we shouldn't. Thus, we set
444 `buffer' to the compiled pattern;
445 `used' to the length of the compiled pattern;
446 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
447 REG_EXTENDED bit in CFLAGS is set; otherwise, to
448 RE_SYNTAX_POSIX_BASIC;
449 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
450 `fastmap' to an allocated space for the fastmap;
451 `fastmap_accurate' to zero;
452 `re_nsub' to the number of subexpressions in PATTERN.
454 PATTERN is the address of the pattern string.
456 CFLAGS is a series of bits which affect compilation.
458 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
459 use POSIX basic syntax.
461 If REG_NEWLINE is set, then . and [^...] don't match newline.
462 Also, regexec will try a match beginning after every newline.
464 If REG_ICASE is set, then we considers upper- and lowercase
465 versions of letters to be equivalent when matching.
467 If REG_NOSUB is set, then when PREG is passed to regexec, that
468 routine will report only success or failure, and nothing about the
471 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
472 the return codes and their meanings.) */
475 regcomp (preg, pattern, cflags)
476 regex_t *__restrict preg;
477 const char *__restrict pattern;
481 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
482 : RE_SYNTAX_POSIX_BASIC);
488 /* Try to allocate space for the fastmap. */
489 preg->fastmap = re_malloc (char, SBC_MAX);
490 if (BE (preg->fastmap == NULL, 0))
493 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
495 /* If REG_NEWLINE is set, newlines are treated differently. */
496 if (cflags & REG_NEWLINE)
497 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
498 syntax &= ~RE_DOT_NEWLINE;
499 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
500 /* It also changes the matching behavior. */
501 preg->newline_anchor = 1;
504 preg->newline_anchor = 0;
505 preg->no_sub = !!(cflags & REG_NOSUB);
506 preg->translate = NULL;
508 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
510 /* POSIX doesn't distinguish between an unmatched open-group and an
511 unmatched close-group: both are REG_EPAREN. */
512 if (ret == REG_ERPAREN)
515 /* We have already checked preg->fastmap != NULL. */
516 if (BE (ret == REG_NOERROR, 1))
517 /* Compute the fastmap now, since regexec cannot modify the pattern
518 buffer. This function never fails in this implementation. */
519 (void) re_compile_fastmap (preg);
522 /* Some error occurred while compiling the expression. */
523 re_free (preg->fastmap);
524 preg->fastmap = NULL;
530 weak_alias (__regcomp, regcomp)
533 /* Returns a message corresponding to an error code, ERRCODE, returned
534 from either regcomp or regexec. We don't use PREG here. */
537 regerror (errcode, preg, errbuf, errbuf_size)
547 || errcode >= (int) (sizeof (__re_error_msgid_idx)
548 / sizeof (__re_error_msgid_idx[0])), 0))
549 /* Only error codes returned by the rest of the code should be passed
550 to this routine. If we are given anything else, or if other regex
551 code generates an invalid error code, then the program has a bug.
552 Dump core so we can fix it. */
555 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
557 msg_size = strlen (msg) + 1; /* Includes the null. */
559 if (BE (errbuf_size != 0, 1))
561 if (BE (msg_size > errbuf_size, 0))
563 #if defined HAVE_MEMPCPY || defined _LIBC
564 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
566 memcpy (errbuf, msg, errbuf_size - 1);
567 errbuf[errbuf_size - 1] = 0;
571 memcpy (errbuf, msg, msg_size);
577 weak_alias (__regerror, regerror)
581 #ifdef RE_ENABLE_I18N
582 /* This static array is used for the map to single-byte characters when
583 UTF-8 is used. Otherwise we would allocate memory just to initialize
584 it the same all the time. UTF-8 is the preferred encoding so this is
585 a worthwhile optimization. */
586 static const bitset utf8_sb_map =
588 /* Set the first 128 bits. */
589 # if UINT_MAX == 0xffffffff
590 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
592 # error "Add case for new unsigned int size"
599 free_dfa_content (re_dfa_t *dfa)
604 for (i = 0; i < dfa->nodes_len; ++i)
605 free_token (dfa->nodes + i);
606 re_free (dfa->nexts);
607 for (i = 0; i < dfa->nodes_len; ++i)
609 if (dfa->eclosures != NULL)
610 re_node_set_free (dfa->eclosures + i);
611 if (dfa->inveclosures != NULL)
612 re_node_set_free (dfa->inveclosures + i);
613 if (dfa->edests != NULL)
614 re_node_set_free (dfa->edests + i);
616 re_free (dfa->edests);
617 re_free (dfa->eclosures);
618 re_free (dfa->inveclosures);
619 re_free (dfa->nodes);
621 if (dfa->state_table)
622 for (i = 0; i <= dfa->state_hash_mask; ++i)
624 struct re_state_table_entry *entry = dfa->state_table + i;
625 for (j = 0; j < entry->num; ++j)
627 re_dfastate_t *state = entry->array[j];
630 re_free (entry->array);
632 re_free (dfa->state_table);
633 #ifdef RE_ENABLE_I18N
634 if (dfa->sb_char != utf8_sb_map)
635 re_free (dfa->sb_char);
637 re_free (dfa->subexp_map);
639 re_free (dfa->re_str);
646 /* Free dynamically allocated space used by PREG. */
652 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
653 if (BE (dfa != NULL, 1))
654 free_dfa_content (dfa);
658 re_free (preg->fastmap);
659 preg->fastmap = NULL;
661 re_free (preg->translate);
662 preg->translate = NULL;
665 weak_alias (__regfree, regfree)
668 /* Entry points compatible with 4.2 BSD regex library. We don't define
669 them unless specifically requested. */
671 #if defined _REGEX_RE_COMP || defined _LIBC
673 /* BSD has one and only one pattern buffer. */
674 static struct re_pattern_buffer re_comp_buf;
678 /* Make these definitions weak in libc, so POSIX programs can redefine
679 these names if they don't use our functions, and still use
680 regcomp/regexec above without link errors. */
691 if (!re_comp_buf.buffer)
692 return gettext ("No previous regular expression");
696 if (re_comp_buf.buffer)
698 fastmap = re_comp_buf.fastmap;
699 re_comp_buf.fastmap = NULL;
700 __regfree (&re_comp_buf);
701 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
702 re_comp_buf.fastmap = fastmap;
705 if (re_comp_buf.fastmap == NULL)
707 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
708 if (re_comp_buf.fastmap == NULL)
709 return (char *) gettext (__re_error_msgid
710 + __re_error_msgid_idx[(int) REG_ESPACE]);
713 /* Since `re_exec' always passes NULL for the `regs' argument, we
714 don't need to initialize the pattern buffer fields which affect it. */
716 /* Match anchors at newlines. */
717 re_comp_buf.newline_anchor = 1;
719 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
724 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
725 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
729 libc_freeres_fn (free_mem)
731 __regfree (&re_comp_buf);
735 #endif /* _REGEX_RE_COMP */
737 /* Internal entry point.
738 Compile the regular expression PATTERN, whose length is LENGTH.
739 SYNTAX indicate regular expression's syntax. */
742 re_compile_internal (preg, pattern, length, syntax)
744 const char * pattern;
748 reg_errcode_t err = REG_NOERROR;
752 /* Initialize the pattern buffer. */
753 preg->fastmap_accurate = 0;
754 preg->syntax = syntax;
755 preg->not_bol = preg->not_eol = 0;
758 preg->can_be_null = 0;
759 preg->regs_allocated = REGS_UNALLOCATED;
761 /* Initialize the dfa. */
762 dfa = (re_dfa_t *) preg->buffer;
763 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
765 /* If zero allocated, but buffer is non-null, try to realloc
766 enough space. This loses if buffer's address is bogus, but
767 that is the user's responsibility. If ->buffer is NULL this
768 is a simple allocation. */
769 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
772 preg->allocated = sizeof (re_dfa_t);
773 preg->buffer = (unsigned char *) dfa;
775 preg->used = sizeof (re_dfa_t);
777 err = init_dfa (dfa, length);
778 if (BE (err != REG_NOERROR, 0))
780 free_dfa_content (dfa);
786 dfa->re_str = re_malloc (char, length + 1);
787 strncpy (dfa->re_str, pattern, length + 1);
790 __libc_lock_init (dfa->lock);
792 err = re_string_construct (®exp, pattern, length, preg->translate,
793 syntax & RE_ICASE, dfa);
794 if (BE (err != REG_NOERROR, 0))
796 re_compile_internal_free_return:
797 free_workarea_compile (preg);
798 re_string_destruct (®exp);
799 free_dfa_content (dfa);
805 /* Parse the regular expression, and build a structure tree. */
807 dfa->str_tree = parse (®exp, preg, syntax, &err);
808 if (BE (dfa->str_tree == NULL, 0))
809 goto re_compile_internal_free_return;
811 /* Analyze the tree and create the nfa. */
812 err = analyze (preg);
813 if (BE (err != REG_NOERROR, 0))
814 goto re_compile_internal_free_return;
816 #ifdef RE_ENABLE_I18N
817 /* If possible, do searching in single byte encoding to speed things up. */
818 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
822 /* Then create the initial state of the dfa. */
823 err = create_initial_state (dfa);
825 /* Release work areas. */
826 free_workarea_compile (preg);
827 re_string_destruct (®exp);
829 if (BE (err != REG_NOERROR, 0))
831 free_dfa_content (dfa);
839 /* Initialize DFA. We use the length of the regular expression PAT_LEN
840 as the initial length of some arrays. */
843 init_dfa (dfa, pat_len)
852 memset (dfa, '\0', sizeof (re_dfa_t));
854 /* Force allocation of str_tree_storage the first time. */
855 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
857 dfa->nodes_alloc = pat_len + 1;
858 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
860 dfa->states_alloc = pat_len + 1;
862 /* table_size = 2 ^ ceil(log pat_len) */
863 for (table_size = 1; table_size > 0; table_size <<= 1)
864 if (table_size > pat_len)
867 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
868 dfa->state_hash_mask = table_size - 1;
870 dfa->mb_cur_max = MB_CUR_MAX;
872 if (dfa->mb_cur_max == 6
873 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
875 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
878 # ifdef HAVE_LANGINFO_CODESET
879 codeset_name = nl_langinfo (CODESET);
881 codeset_name = getenv ("LC_ALL");
882 if (codeset_name == NULL || codeset_name[0] == '\0')
883 codeset_name = getenv ("LC_CTYPE");
884 if (codeset_name == NULL || codeset_name[0] == '\0')
885 codeset_name = getenv ("LANG");
886 if (codeset_name == NULL)
888 else if (strchr (codeset_name, '.') != NULL)
889 codeset_name = strchr (codeset_name, '.') + 1;
892 if (strcasecmp (codeset_name, "UTF-8") == 0
893 || strcasecmp (codeset_name, "UTF8") == 0)
896 /* We check exhaustively in the loop below if this charset is a
897 superset of ASCII. */
898 dfa->map_notascii = 0;
901 #ifdef RE_ENABLE_I18N
902 if (dfa->mb_cur_max > 1)
905 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
910 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
911 if (BE (dfa->sb_char == NULL, 0))
914 /* Clear all bits by, then set those corresponding to single
916 bitset_empty (dfa->sb_char);
918 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
919 for (j = 0; j < UINT_BITS; ++j, ++ch)
921 wchar_t wch = __btowc (ch);
923 dfa->sb_char[i] |= 1 << j;
925 if (isascii (ch) && wch != (wchar_t) ch)
926 dfa->map_notascii = 1;
933 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
938 /* Initialize WORD_CHAR table, which indicate which character is
939 "word". In this case "word" means that it is the word construction
940 character used by some operators like "\<", "\>", etc. */
947 dfa->word_ops_used = 1;
948 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
949 for (j = 0; j < UINT_BITS; ++j, ++ch)
950 if (isalnum (ch) || ch == '_')
951 dfa->word_char[i] |= 1 << j;
954 /* Free the work area which are only used while compiling. */
957 free_workarea_compile (preg)
960 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
961 bin_tree_storage_t *storage, *next;
962 for (storage = dfa->str_tree_storage; storage; storage = next)
964 next = storage->next;
967 dfa->str_tree_storage = NULL;
968 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
969 dfa->str_tree = NULL;
970 re_free (dfa->org_indices);
971 dfa->org_indices = NULL;
974 /* Create initial states for all contexts. */
977 create_initial_state (dfa)
982 re_node_set init_nodes;
984 /* Initial states have the epsilon closure of the node which is
985 the first node of the regular expression. */
986 first = dfa->str_tree->first->node_idx;
987 dfa->init_node = first;
988 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
989 if (BE (err != REG_NOERROR, 0))
992 /* The back-references which are in initial states can epsilon transit,
993 since in this case all of the subexpressions can be null.
994 Then we add epsilon closures of the nodes which are the next nodes of
995 the back-references. */
996 if (dfa->nbackref > 0)
997 for (i = 0; i < init_nodes.nelem; ++i)
999 int node_idx = init_nodes.elems[i];
1000 re_token_type_t type = dfa->nodes[node_idx].type;
1003 if (type != OP_BACK_REF)
1005 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1007 re_token_t *clexp_node;
1008 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1009 if (clexp_node->type == OP_CLOSE_SUBEXP
1010 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1013 if (clexp_idx == init_nodes.nelem)
1016 if (type == OP_BACK_REF)
1018 int dest_idx = dfa->edests[node_idx].elems[0];
1019 if (!re_node_set_contains (&init_nodes, dest_idx))
1021 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1027 /* It must be the first time to invoke acquire_state. */
1028 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1029 /* We don't check ERR here, since the initial state must not be NULL. */
1030 if (BE (dfa->init_state == NULL, 0))
1032 if (dfa->init_state->has_constraint)
1034 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1036 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1038 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1042 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1043 || dfa->init_state_begbuf == NULL, 0))
1047 dfa->init_state_word = dfa->init_state_nl
1048 = dfa->init_state_begbuf = dfa->init_state;
1050 re_node_set_free (&init_nodes);
1054 #ifdef RE_ENABLE_I18N
1055 /* If it is possible to do searching in single byte encoding instead of UTF-8
1056 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1057 DFA nodes where needed. */
1063 int node, i, mb_chars = 0, has_period = 0;
1065 for (node = 0; node < dfa->nodes_len; ++node)
1066 switch (dfa->nodes[node].type)
1069 if (dfa->nodes[node].opr.c >= 0x80)
1073 switch (dfa->nodes[node].opr.idx)
1081 /* Word anchors etc. cannot be handled. */
1091 case OP_DUP_ASTERISK:
1092 case OP_OPEN_SUBEXP:
1093 case OP_CLOSE_SUBEXP:
1095 case COMPLEX_BRACKET:
1097 case SIMPLE_BRACKET:
1098 /* Just double check. */
1099 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1100 if (dfa->nodes[node].opr.sbcset[i])
1107 if (mb_chars || has_period)
1108 for (node = 0; node < dfa->nodes_len; ++node)
1110 if (dfa->nodes[node].type == CHARACTER
1111 && dfa->nodes[node].opr.c >= 0x80)
1112 dfa->nodes[node].mb_partial = 0;
1113 else if (dfa->nodes[node].type == OP_PERIOD)
1114 dfa->nodes[node].type = OP_UTF8_PERIOD;
1117 /* The search can be in single byte locale. */
1118 dfa->mb_cur_max = 1;
1120 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1124 /* Analyze the structure tree, and calculate "first", "next", "edest",
1125 "eclosure", and "inveclosure". */
1127 static reg_errcode_t
1131 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1134 /* Allocate arrays. */
1135 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1136 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1137 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1138 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1139 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1140 || dfa->eclosures == NULL, 0))
1143 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1144 if (dfa->subexp_map != NULL)
1147 for (i = 0; i < preg->re_nsub; i++)
1148 dfa->subexp_map[i] = i;
1149 preorder (dfa->str_tree, optimize_subexps, dfa);
1150 for (i = 0; i < preg->re_nsub; i++)
1151 if (dfa->subexp_map[i] != i)
1153 if (i == preg->re_nsub)
1155 free (dfa->subexp_map);
1156 dfa->subexp_map = NULL;
1160 ret = postorder (dfa->str_tree, lower_subexps, preg);
1161 if (BE (ret != REG_NOERROR, 0))
1163 ret = postorder (dfa->str_tree, calc_first, dfa);
1164 if (BE (ret != REG_NOERROR, 0))
1166 preorder (dfa->str_tree, calc_next, dfa);
1167 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1168 if (BE (ret != REG_NOERROR, 0))
1170 ret = calc_eclosure (dfa);
1171 if (BE (ret != REG_NOERROR, 0))
1174 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1175 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1176 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1179 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1180 if (BE (dfa->inveclosures == NULL, 0))
1182 ret = calc_inveclosure (dfa);
1188 /* Our parse trees are very unbalanced, so we cannot use a stack to
1189 implement parse tree visits. Instead, we use parent pointers and
1190 some hairy code in these two functions. */
1191 static reg_errcode_t
1192 postorder (root, fn, extra)
1194 reg_errcode_t (fn (void *, bin_tree_t *));
1197 bin_tree_t *node, *prev;
1199 for (node = root; ; )
1201 /* Descend down the tree, preferably to the left (or to the right
1202 if that's the only child). */
1203 while (node->left || node->right)
1211 reg_errcode_t err = fn (extra, node);
1212 if (BE (err != REG_NOERROR, 0))
1214 if (node->parent == NULL)
1217 node = node->parent;
1219 /* Go up while we have a node that is reached from the right. */
1220 while (node->right == prev || node->right == NULL);
1225 static reg_errcode_t
1226 preorder (root, fn, extra)
1228 reg_errcode_t (fn (void *, bin_tree_t *));
1233 for (node = root; ; )
1235 reg_errcode_t err = fn (extra, node);
1236 if (BE (err != REG_NOERROR, 0))
1239 /* Go to the left node, or up and to the right. */
1244 bin_tree_t *prev = NULL;
1245 while (node->right == prev || node->right == NULL)
1248 node = node->parent;
1257 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1258 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1259 backreferences as well. Requires a preorder visit. */
1260 static reg_errcode_t
1261 optimize_subexps (extra, node)
1265 re_dfa_t *dfa = (re_dfa_t *) extra;
1267 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1269 int idx = node->token.opr.idx;
1270 node->token.opr.idx = dfa->subexp_map[idx];
1271 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1274 else if (node->token.type == SUBEXP
1275 && node->left && node->left->token.type == SUBEXP)
1277 int other_idx = node->left->token.opr.idx;
1279 node->left = node->left->left;
1281 node->left->parent = node;
1283 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1284 if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1285 dfa->used_bkref_map &= ~(1 << other_idx);
1291 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1292 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1293 static reg_errcode_t
1294 lower_subexps (extra, node)
1298 regex_t *preg = (regex_t *) extra;
1299 reg_errcode_t err = REG_NOERROR;
1301 if (node->left && node->left->token.type == SUBEXP)
1303 node->left = lower_subexp (&err, preg, node->left);
1305 node->left->parent = node;
1307 if (node->right && node->right->token.type == SUBEXP)
1309 node->right = lower_subexp (&err, preg, node->right);
1311 node->right->parent = node;
1318 lower_subexp (err, preg, node)
1323 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1324 bin_tree_t *body = node->left;
1325 bin_tree_t *op, *cls, *tree1, *tree;
1328 /* We do not optimize empty subexpressions, because otherwise we may
1329 have bad CONCAT nodes with NULL children. This is obviously not
1330 very common, so we do not lose much. An example that triggers
1331 this case is the sed "script" /\(\)/x. */
1332 && node->left != NULL
1333 && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1334 || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1337 /* Convert the SUBEXP node to the concatenation of an
1338 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1339 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1340 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1341 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1342 tree = create_tree (dfa, op, tree1, CONCAT);
1343 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1349 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1350 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1354 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1355 nodes. Requires a postorder visit. */
1356 static reg_errcode_t
1357 calc_first (extra, node)
1361 re_dfa_t *dfa = (re_dfa_t *) extra;
1362 if (node->token.type == CONCAT)
1364 node->first = node->left->first;
1365 node->node_idx = node->left->node_idx;
1370 node->node_idx = re_dfa_add_node (dfa, node->token);
1371 if (BE (node->node_idx == -1, 0))
1377 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1378 static reg_errcode_t
1379 calc_next (extra, node)
1383 switch (node->token.type)
1385 case OP_DUP_ASTERISK:
1386 node->left->next = node;
1389 node->left->next = node->right->first;
1390 node->right->next = node->next;
1394 node->left->next = node->next;
1396 node->right->next = node->next;
1402 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1403 static reg_errcode_t
1404 link_nfa_nodes (extra, node)
1408 re_dfa_t *dfa = (re_dfa_t *) extra;
1409 int idx = node->node_idx;
1410 reg_errcode_t err = REG_NOERROR;
1412 switch (node->token.type)
1418 assert (node->next == NULL);
1421 case OP_DUP_ASTERISK:
1425 dfa->has_plural_match = 1;
1426 if (node->left != NULL)
1427 left = node->left->first->node_idx;
1429 left = node->next->node_idx;
1430 if (node->right != NULL)
1431 right = node->right->first->node_idx;
1433 right = node->next->node_idx;
1435 assert (right > -1);
1436 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1441 case OP_OPEN_SUBEXP:
1442 case OP_CLOSE_SUBEXP:
1443 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1447 dfa->nexts[idx] = node->next->node_idx;
1448 if (node->token.type == OP_BACK_REF)
1449 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1453 assert (!IS_EPSILON_NODE (node->token.type));
1454 dfa->nexts[idx] = node->next->node_idx;
1461 /* Duplicate the epsilon closure of the node ROOT_NODE.
1462 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1463 to their own constraint. */
1465 static reg_errcode_t
1466 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1469 int top_org_node, top_clone_node, root_node;
1470 unsigned int init_constraint;
1473 int org_node, clone_node, ret;
1474 unsigned int constraint = init_constraint;
1475 for (org_node = top_org_node, clone_node = top_clone_node;;)
1477 int org_dest, clone_dest;
1478 if (dfa->nodes[org_node].type == OP_BACK_REF)
1480 /* If the back reference epsilon-transit, its destination must
1481 also have the constraint. Then duplicate the epsilon closure
1482 of the destination of the back reference, and store it in
1483 edests of the back reference. */
1484 org_dest = dfa->nexts[org_node];
1485 re_node_set_empty (dfa->edests + clone_node);
1486 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1487 if (BE (err != REG_NOERROR, 0))
1489 dfa->nexts[clone_node] = dfa->nexts[org_node];
1490 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1491 if (BE (ret < 0, 0))
1494 else if (dfa->edests[org_node].nelem == 0)
1496 /* In case of the node can't epsilon-transit, don't duplicate the
1497 destination and store the original destination as the
1498 destination of the node. */
1499 dfa->nexts[clone_node] = dfa->nexts[org_node];
1502 else if (dfa->edests[org_node].nelem == 1)
1504 /* In case of the node can epsilon-transit, and it has only one
1506 org_dest = dfa->edests[org_node].elems[0];
1507 re_node_set_empty (dfa->edests + clone_node);
1508 if (dfa->nodes[org_node].type == ANCHOR)
1510 /* In case of the node has another constraint, append it. */
1511 if (org_node == root_node && clone_node != org_node)
1513 /* ...but if the node is root_node itself, it means the
1514 epsilon closure have a loop, then tie it to the
1515 destination of the root_node. */
1516 ret = re_node_set_insert (dfa->edests + clone_node,
1518 if (BE (ret < 0, 0))
1522 constraint |= dfa->nodes[org_node].opr.ctx_type;
1524 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1525 if (BE (err != REG_NOERROR, 0))
1527 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1528 if (BE (ret < 0, 0))
1531 else /* dfa->edests[org_node].nelem == 2 */
1533 /* In case of the node can epsilon-transit, and it has two
1534 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1535 org_dest = dfa->edests[org_node].elems[0];
1536 re_node_set_empty (dfa->edests + clone_node);
1537 /* Search for a duplicated node which satisfies the constraint. */
1538 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1539 if (clone_dest == -1)
1541 /* There are no such a duplicated node, create a new one. */
1542 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1543 if (BE (err != REG_NOERROR, 0))
1545 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1546 if (BE (ret < 0, 0))
1548 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1549 root_node, constraint);
1550 if (BE (err != REG_NOERROR, 0))
1555 /* There are a duplicated node which satisfy the constraint,
1556 use it to avoid infinite loop. */
1557 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1558 if (BE (ret < 0, 0))
1562 org_dest = dfa->edests[org_node].elems[1];
1563 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1564 if (BE (err != REG_NOERROR, 0))
1566 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1567 if (BE (ret < 0, 0))
1570 org_node = org_dest;
1571 clone_node = clone_dest;
1576 /* Search for a node which is duplicated from the node ORG_NODE, and
1577 satisfies the constraint CONSTRAINT. */
1580 search_duplicated_node (dfa, org_node, constraint)
1583 unsigned int constraint;
1586 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1588 if (org_node == dfa->org_indices[idx]
1589 && constraint == dfa->nodes[idx].constraint)
1590 return idx; /* Found. */
1592 return -1; /* Not found. */
1595 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1596 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1597 otherwise return the error code. */
1599 static reg_errcode_t
1600 duplicate_node (new_idx, dfa, org_idx, constraint)
1602 int *new_idx, org_idx;
1603 unsigned int constraint;
1605 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1606 if (BE (dup_idx == -1, 0))
1608 dfa->nodes[dup_idx].constraint = constraint;
1609 if (dfa->nodes[org_idx].type == ANCHOR)
1610 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1611 dfa->nodes[dup_idx].duplicated = 1;
1613 /* Store the index of the original node. */
1614 dfa->org_indices[dup_idx] = org_idx;
1619 static reg_errcode_t
1620 calc_inveclosure (dfa)
1624 for (idx = 0; idx < dfa->nodes_len; ++idx)
1625 re_node_set_init_empty (dfa->inveclosures + idx);
1627 for (src = 0; src < dfa->nodes_len; ++src)
1629 int *elems = dfa->eclosures[src].elems;
1630 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1632 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1633 if (BE (ret == -1, 0))
1641 /* Calculate "eclosure" for all the node in DFA. */
1643 static reg_errcode_t
1647 int node_idx, incomplete;
1649 assert (dfa->nodes_len > 0);
1652 /* For each nodes, calculate epsilon closure. */
1653 for (node_idx = 0; ; ++node_idx)
1656 re_node_set eclosure_elem;
1657 if (node_idx == dfa->nodes_len)
1666 assert (dfa->eclosures[node_idx].nelem != -1);
1669 /* If we have already calculated, skip it. */
1670 if (dfa->eclosures[node_idx].nelem != 0)
1672 /* Calculate epsilon closure of `node_idx'. */
1673 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1674 if (BE (err != REG_NOERROR, 0))
1677 if (dfa->eclosures[node_idx].nelem == 0)
1680 re_node_set_free (&eclosure_elem);
1686 /* Calculate epsilon closure of NODE. */
1688 static reg_errcode_t
1689 calc_eclosure_iter (new_set, dfa, node, root)
1690 re_node_set *new_set;
1695 unsigned int constraint;
1697 re_node_set eclosure;
1699 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1700 if (BE (err != REG_NOERROR, 0))
1703 /* This indicates that we are calculating this node now.
1704 We reference this value to avoid infinite loop. */
1705 dfa->eclosures[node].nelem = -1;
1707 constraint = ((dfa->nodes[node].type == ANCHOR)
1708 ? dfa->nodes[node].opr.ctx_type : 0);
1709 /* If the current node has constraints, duplicate all nodes.
1710 Since they must inherit the constraints. */
1712 && dfa->edests[node].nelem
1713 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1715 int org_node, cur_node;
1716 org_node = cur_node = node;
1717 err = duplicate_node_closure (dfa, node, node, node, constraint);
1718 if (BE (err != REG_NOERROR, 0))
1722 /* Expand each epsilon destination nodes. */
1723 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1724 for (i = 0; i < dfa->edests[node].nelem; ++i)
1726 re_node_set eclosure_elem;
1727 int edest = dfa->edests[node].elems[i];
1728 /* If calculating the epsilon closure of `edest' is in progress,
1729 return intermediate result. */
1730 if (dfa->eclosures[edest].nelem == -1)
1735 /* If we haven't calculated the epsilon closure of `edest' yet,
1736 calculate now. Otherwise use calculated epsilon closure. */
1737 if (dfa->eclosures[edest].nelem == 0)
1739 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1740 if (BE (err != REG_NOERROR, 0))
1744 eclosure_elem = dfa->eclosures[edest];
1745 /* Merge the epsilon closure of `edest'. */
1746 re_node_set_merge (&eclosure, &eclosure_elem);
1747 /* If the epsilon closure of `edest' is incomplete,
1748 the epsilon closure of this node is also incomplete. */
1749 if (dfa->eclosures[edest].nelem == 0)
1752 re_node_set_free (&eclosure_elem);
1756 /* Epsilon closures include itself. */
1757 re_node_set_insert (&eclosure, node);
1758 if (incomplete && !root)
1759 dfa->eclosures[node].nelem = 0;
1761 dfa->eclosures[node] = eclosure;
1762 *new_set = eclosure;
1766 /* Functions for token which are used in the parser. */
1768 /* Fetch a token from INPUT.
1769 We must not use this function inside bracket expressions. */
1772 fetch_token (result, input, syntax)
1775 reg_syntax_t syntax;
1777 re_string_skip_bytes (input, peek_token (result, input, syntax));
1780 /* Peek a token from INPUT, and return the length of the token.
1781 We must not use this function inside bracket expressions. */
1784 peek_token (token, input, syntax)
1787 reg_syntax_t syntax;
1791 if (re_string_eoi (input))
1793 token->type = END_OF_RE;
1797 c = re_string_peek_byte (input, 0);
1800 token->word_char = 0;
1801 #ifdef RE_ENABLE_I18N
1802 token->mb_partial = 0;
1803 if (input->mb_cur_max > 1 &&
1804 !re_string_first_byte (input, re_string_cur_idx (input)))
1806 token->type = CHARACTER;
1807 token->mb_partial = 1;
1814 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1816 token->type = BACK_SLASH;
1820 c2 = re_string_peek_byte_case (input, 1);
1822 token->type = CHARACTER;
1823 #ifdef RE_ENABLE_I18N
1824 if (input->mb_cur_max > 1)
1826 wint_t wc = re_string_wchar_at (input,
1827 re_string_cur_idx (input) + 1);
1828 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1832 token->word_char = IS_WORD_CHAR (c2) != 0;
1837 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1838 token->type = OP_ALT;
1840 case '1': case '2': case '3': case '4': case '5':
1841 case '6': case '7': case '8': case '9':
1842 if (!(syntax & RE_NO_BK_REFS))
1844 token->type = OP_BACK_REF;
1845 token->opr.idx = c2 - '1';
1849 if (!(syntax & RE_NO_GNU_OPS))
1851 token->type = ANCHOR;
1852 token->opr.ctx_type = WORD_FIRST;
1856 if (!(syntax & RE_NO_GNU_OPS))
1858 token->type = ANCHOR;
1859 token->opr.ctx_type = WORD_LAST;
1863 if (!(syntax & RE_NO_GNU_OPS))
1865 token->type = ANCHOR;
1866 token->opr.ctx_type = WORD_DELIM;
1870 if (!(syntax & RE_NO_GNU_OPS))
1872 token->type = ANCHOR;
1873 token->opr.ctx_type = NOT_WORD_DELIM;
1877 if (!(syntax & RE_NO_GNU_OPS))
1878 token->type = OP_WORD;
1881 if (!(syntax & RE_NO_GNU_OPS))
1882 token->type = OP_NOTWORD;
1885 if (!(syntax & RE_NO_GNU_OPS))
1886 token->type = OP_SPACE;
1889 if (!(syntax & RE_NO_GNU_OPS))
1890 token->type = OP_NOTSPACE;
1893 if (!(syntax & RE_NO_GNU_OPS))
1895 token->type = ANCHOR;
1896 token->opr.ctx_type = BUF_FIRST;
1900 if (!(syntax & RE_NO_GNU_OPS))
1902 token->type = ANCHOR;
1903 token->opr.ctx_type = BUF_LAST;
1907 if (!(syntax & RE_NO_BK_PARENS))
1908 token->type = OP_OPEN_SUBEXP;
1911 if (!(syntax & RE_NO_BK_PARENS))
1912 token->type = OP_CLOSE_SUBEXP;
1915 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1916 token->type = OP_DUP_PLUS;
1919 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1920 token->type = OP_DUP_QUESTION;
1923 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1924 token->type = OP_OPEN_DUP_NUM;
1927 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1928 token->type = OP_CLOSE_DUP_NUM;
1936 token->type = CHARACTER;
1937 #ifdef RE_ENABLE_I18N
1938 if (input->mb_cur_max > 1)
1940 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1941 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1945 token->word_char = IS_WORD_CHAR (token->opr.c);
1950 if (syntax & RE_NEWLINE_ALT)
1951 token->type = OP_ALT;
1954 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1955 token->type = OP_ALT;
1958 token->type = OP_DUP_ASTERISK;
1961 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1962 token->type = OP_DUP_PLUS;
1965 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1966 token->type = OP_DUP_QUESTION;
1969 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1970 token->type = OP_OPEN_DUP_NUM;
1973 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1974 token->type = OP_CLOSE_DUP_NUM;
1977 if (syntax & RE_NO_BK_PARENS)
1978 token->type = OP_OPEN_SUBEXP;
1981 if (syntax & RE_NO_BK_PARENS)
1982 token->type = OP_CLOSE_SUBEXP;
1985 token->type = OP_OPEN_BRACKET;
1988 token->type = OP_PERIOD;
1991 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1992 re_string_cur_idx (input) != 0)
1994 char prev = re_string_peek_byte (input, -1);
1995 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1998 token->type = ANCHOR;
1999 token->opr.ctx_type = LINE_FIRST;
2002 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2003 re_string_cur_idx (input) + 1 != re_string_length (input))
2006 re_string_skip_bytes (input, 1);
2007 peek_token (&next, input, syntax);
2008 re_string_skip_bytes (input, -1);
2009 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2012 token->type = ANCHOR;
2013 token->opr.ctx_type = LINE_LAST;
2021 /* Peek a token from INPUT, and return the length of the token.
2022 We must not use this function out of bracket expressions. */
2025 peek_token_bracket (token, input, syntax)
2028 reg_syntax_t syntax;
2031 if (re_string_eoi (input))
2033 token->type = END_OF_RE;
2036 c = re_string_peek_byte (input, 0);
2039 #ifdef RE_ENABLE_I18N
2040 if (input->mb_cur_max > 1 &&
2041 !re_string_first_byte (input, re_string_cur_idx (input)))
2043 token->type = CHARACTER;
2046 #endif /* RE_ENABLE_I18N */
2048 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2049 && re_string_cur_idx (input) + 1 < re_string_length (input))
2051 /* In this case, '\' escape a character. */
2053 re_string_skip_bytes (input, 1);
2054 c2 = re_string_peek_byte (input, 0);
2056 token->type = CHARACTER;
2059 if (c == '[') /* '[' is a special char in a bracket exps. */
2063 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2064 c2 = re_string_peek_byte (input, 1);
2072 token->type = OP_OPEN_COLL_ELEM;
2075 token->type = OP_OPEN_EQUIV_CLASS;
2078 if (syntax & RE_CHAR_CLASSES)
2080 token->type = OP_OPEN_CHAR_CLASS;
2083 /* else fall through. */
2085 token->type = CHARACTER;
2095 token->type = OP_CHARSET_RANGE;
2098 token->type = OP_CLOSE_BRACKET;
2101 token->type = OP_NON_MATCH_LIST;
2104 token->type = CHARACTER;
2109 /* Functions for parser. */
2111 /* Entry point of the parser.
2112 Parse the regular expression REGEXP and return the structure tree.
2113 If an error is occured, ERR is set by error code, and return NULL.
2114 This function build the following tree, from regular expression <reg_exp>:
2120 CAT means concatenation.
2121 EOR means end of regular expression. */
2124 parse (regexp, preg, syntax, err)
2125 re_string_t *regexp;
2127 reg_syntax_t syntax;
2130 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2131 bin_tree_t *tree, *eor, *root;
2132 re_token_t current_token;
2133 dfa->syntax = syntax;
2134 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2135 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2136 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2138 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2140 root = create_tree (dfa, tree, eor, CONCAT);
2143 if (BE (eor == NULL || root == NULL, 0))
2151 /* This function build the following tree, from regular expression
2152 <branch1>|<branch2>:
2158 ALT means alternative, which represents the operator `|'. */
2161 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2162 re_string_t *regexp;
2165 reg_syntax_t syntax;
2169 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2170 bin_tree_t *tree, *branch = NULL;
2171 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2172 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2175 while (token->type == OP_ALT)
2177 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2178 if (token->type != OP_ALT && token->type != END_OF_RE
2179 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2181 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2182 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2187 tree = create_tree (dfa, tree, branch, OP_ALT);
2188 if (BE (tree == NULL, 0))
2197 /* This function build the following tree, from regular expression
2204 CAT means concatenation. */
2207 parse_branch (regexp, preg, token, syntax, nest, err)
2208 re_string_t *regexp;
2211 reg_syntax_t syntax;
2215 bin_tree_t *tree, *exp;
2216 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2217 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2218 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2221 while (token->type != OP_ALT && token->type != END_OF_RE
2222 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2224 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2225 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2229 if (tree != NULL && exp != NULL)
2231 tree = create_tree (dfa, tree, exp, CONCAT);
2238 else if (tree == NULL)
2240 /* Otherwise exp == NULL, we don't need to create new tree. */
2245 /* This function build the following tree, from regular expression a*:
2252 parse_expression (regexp, preg, token, syntax, nest, err)
2253 re_string_t *regexp;
2256 reg_syntax_t syntax;
2260 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2262 switch (token->type)
2265 tree = create_token_tree (dfa, NULL, NULL, token);
2266 if (BE (tree == NULL, 0))
2271 #ifdef RE_ENABLE_I18N
2272 if (dfa->mb_cur_max > 1)
2274 while (!re_string_eoi (regexp)
2275 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2277 bin_tree_t *mbc_remain;
2278 fetch_token (token, regexp, syntax);
2279 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2280 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2281 if (BE (mbc_remain == NULL || tree == NULL, 0))
2290 case OP_OPEN_SUBEXP:
2291 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2292 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2295 case OP_OPEN_BRACKET:
2296 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2297 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2301 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2306 dfa->used_bkref_map |= 1 << token->opr.idx;
2307 tree = create_token_tree (dfa, NULL, NULL, token);
2308 if (BE (tree == NULL, 0))
2314 dfa->has_mb_node = 1;
2316 case OP_OPEN_DUP_NUM:
2317 if (syntax & RE_CONTEXT_INVALID_DUP)
2323 case OP_DUP_ASTERISK:
2325 case OP_DUP_QUESTION:
2326 if (syntax & RE_CONTEXT_INVALID_OPS)
2331 else if (syntax & RE_CONTEXT_INDEP_OPS)
2333 fetch_token (token, regexp, syntax);
2334 return parse_expression (regexp, preg, token, syntax, nest, err);
2336 /* else fall through */
2337 case OP_CLOSE_SUBEXP:
2338 if ((token->type == OP_CLOSE_SUBEXP) &&
2339 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2344 /* else fall through */
2345 case OP_CLOSE_DUP_NUM:
2346 /* We treat it as a normal character. */
2348 /* Then we can these characters as normal characters. */
2349 token->type = CHARACTER;
2350 /* mb_partial and word_char bits should be initialized already
2352 tree = create_token_tree (dfa, NULL, NULL, token);
2353 if (BE (tree == NULL, 0))
2360 if ((token->opr.ctx_type
2361 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2362 && dfa->word_ops_used == 0)
2363 init_word_char (dfa);
2364 if (token->opr.ctx_type == WORD_DELIM
2365 || token->opr.ctx_type == NOT_WORD_DELIM)
2367 bin_tree_t *tree_first, *tree_last;
2368 if (token->opr.ctx_type == WORD_DELIM)
2370 token->opr.ctx_type = WORD_FIRST;
2371 tree_first = create_token_tree (dfa, NULL, NULL, token);
2372 token->opr.ctx_type = WORD_LAST;
2376 token->opr.ctx_type = INSIDE_WORD;
2377 tree_first = create_token_tree (dfa, NULL, NULL, token);
2378 token->opr.ctx_type = INSIDE_NOTWORD;
2380 tree_last = create_token_tree (dfa, NULL, NULL, token);
2381 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2382 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2390 tree = create_token_tree (dfa, NULL, NULL, token);
2391 if (BE (tree == NULL, 0))
2397 /* We must return here, since ANCHORs can't be followed
2398 by repetition operators.
2399 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2400 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2401 fetch_token (token, regexp, syntax);
2404 tree = create_token_tree (dfa, NULL, NULL, token);
2405 if (BE (tree == NULL, 0))
2410 if (dfa->mb_cur_max > 1)
2411 dfa->has_mb_node = 1;
2415 tree = build_charclass_op (dfa, regexp->trans,
2416 (const unsigned char *) "alnum",
2417 (const unsigned char *) "_",
2418 token->type == OP_NOTWORD, err);
2419 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2424 tree = build_charclass_op (dfa, regexp->trans,
2425 (const unsigned char *) "space",
2426 (const unsigned char *) "",
2427 token->type == OP_NOTSPACE, err);
2428 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2438 /* Must not happen? */
2444 fetch_token (token, regexp, syntax);
2446 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2447 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2449 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2450 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2452 /* In BRE consecutive duplications are not allowed. */
2453 if ((syntax & RE_CONTEXT_INVALID_DUP)
2454 && (token->type == OP_DUP_ASTERISK
2455 || token->type == OP_OPEN_DUP_NUM))
2465 /* This function build the following tree, from regular expression
2473 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2474 re_string_t *regexp;
2477 reg_syntax_t syntax;
2481 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2484 cur_nsub = preg->re_nsub++;
2486 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2488 /* The subexpression may be a null string. */
2489 if (token->type == OP_CLOSE_SUBEXP)
2493 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2494 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2496 if (BE (*err != REG_NOERROR, 0))
2499 dfa->completed_bkref_map |= 1 << cur_nsub;
2501 tree = create_tree (dfa, tree, NULL, SUBEXP);
2502 if (BE (tree == NULL, 0))
2507 tree->token.opr.idx = cur_nsub;
2511 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2514 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2516 re_string_t *regexp;
2519 reg_syntax_t syntax;
2522 bin_tree_t *tree = NULL, *old_tree = NULL;
2523 int i, start, end, start_idx = re_string_cur_idx (regexp);
2524 re_token_t start_token = *token;
2526 if (token->type == OP_OPEN_DUP_NUM)
2529 start = fetch_number (regexp, token, syntax);
2532 if (token->type == CHARACTER && token->opr.c == ',')
2533 start = 0; /* We treat "{,m}" as "{0,m}". */
2536 *err = REG_BADBR; /* <re>{} is invalid. */
2540 if (BE (start != -2, 1))
2542 /* We treat "{n}" as "{n,n}". */
2543 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2544 : ((token->type == CHARACTER && token->opr.c == ',')
2545 ? fetch_number (regexp, token, syntax) : -2));
2547 if (BE (start == -2 || end == -2, 0))
2549 /* Invalid sequence. */
2550 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2552 if (token->type == END_OF_RE)
2560 /* If the syntax bit is set, rollback. */
2561 re_string_set_index (regexp, start_idx);
2562 *token = start_token;
2563 token->type = CHARACTER;
2564 /* mb_partial and word_char bits should be already initialized by
2569 if (BE (end != -1 && start > end, 0))
2571 /* First number greater than second. */
2578 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2579 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2582 fetch_token (token, regexp, syntax);
2584 if (BE (elem == NULL, 0))
2586 if (BE (start == 0 && end == 0, 0))
2588 postorder (elem, free_tree, NULL);
2592 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2593 if (BE (start > 0, 0))
2596 for (i = 2; i <= start; ++i)
2598 elem = duplicate_tree (elem, dfa);
2599 tree = create_tree (dfa, tree, elem, CONCAT);
2600 if (BE (elem == NULL || tree == NULL, 0))
2601 goto parse_dup_op_espace;
2607 /* Duplicate ELEM before it is marked optional. */
2608 elem = duplicate_tree (elem, dfa);
2614 if (elem->token.type == SUBEXP)
2615 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2617 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2618 if (BE (tree == NULL, 0))
2619 goto parse_dup_op_espace;
2621 /* This loop is actually executed only when end != -1,
2622 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2623 already created the start+1-th copy. */
2624 for (i = start + 2; i <= end; ++i)
2626 elem = duplicate_tree (elem, dfa);
2627 tree = create_tree (dfa, tree, elem, CONCAT);
2628 if (BE (elem == NULL || tree == NULL, 0))
2629 goto parse_dup_op_espace;
2631 tree = create_tree (dfa, tree, NULL, OP_ALT);
2632 if (BE (tree == NULL, 0))
2633 goto parse_dup_op_espace;
2637 tree = create_tree (dfa, old_tree, tree, CONCAT);
2641 parse_dup_op_espace:
2646 /* Size of the names for collating symbol/equivalence_class/character_class.
2647 I'm not sure, but maybe enough. */
2648 #define BRACKET_NAME_BUF_SIZE 32
2651 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2652 Build the range expression which starts from START_ELEM, and ends
2653 at END_ELEM. The result are written to MBCSET and SBCSET.
2654 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2655 mbcset->range_ends, is a pointer argument sinse we may
2658 static reg_errcode_t
2659 # ifdef RE_ENABLE_I18N
2660 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2661 re_charset_t *mbcset;
2663 # else /* not RE_ENABLE_I18N */
2664 build_range_exp (sbcset, start_elem, end_elem)
2665 # endif /* not RE_ENABLE_I18N */
2666 re_bitset_ptr_t sbcset;
2667 bracket_elem_t *start_elem, *end_elem;
2669 unsigned int start_ch, end_ch;
2670 /* Equivalence Classes and Character Classes can't be a range start/end. */
2671 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2672 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2676 /* We can handle no multi character collating elements without libc
2678 if (BE ((start_elem->type == COLL_SYM
2679 && strlen ((char *) start_elem->opr.name) > 1)
2680 || (end_elem->type == COLL_SYM
2681 && strlen ((char *) end_elem->opr.name) > 1), 0))
2682 return REG_ECOLLATE;
2684 # ifdef RE_ENABLE_I18N
2686 wchar_t wc, start_wc, end_wc;
2687 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2689 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2690 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2692 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2693 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2695 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2696 ? __btowc (start_ch) : start_elem->opr.wch);
2697 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2698 ? __btowc (end_ch) : end_elem->opr.wch);
2699 if (start_wc == WEOF || end_wc == WEOF)
2700 return REG_ECOLLATE;
2701 cmp_buf[0] = start_wc;
2702 cmp_buf[4] = end_wc;
2703 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2706 /* Got valid collation sequence values, add them as a new entry.
2707 However, for !_LIBC we have no collation elements: if the
2708 character set is single byte, the single byte character set
2709 that we build below suffices. parse_bracket_exp passes
2710 no MBCSET if dfa->mb_cur_max == 1. */
2713 /* Check the space of the arrays. */
2714 if (BE (*range_alloc == mbcset->nranges, 0))
2716 /* There is not enough space, need realloc. */
2717 wchar_t *new_array_start, *new_array_end;
2720 /* +1 in case of mbcset->nranges is 0. */
2721 new_nranges = 2 * mbcset->nranges + 1;
2722 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2723 are NULL if *range_alloc == 0. */
2724 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2726 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2729 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2732 mbcset->range_starts = new_array_start;
2733 mbcset->range_ends = new_array_end;
2734 *range_alloc = new_nranges;
2737 mbcset->range_starts[mbcset->nranges] = start_wc;
2738 mbcset->range_ends[mbcset->nranges++] = end_wc;
2741 /* Build the table for single byte characters. */
2742 for (wc = 0; wc < SBC_MAX; ++wc)
2745 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2746 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2747 bitset_set (sbcset, wc);
2750 # else /* not RE_ENABLE_I18N */
2753 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2754 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2756 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2757 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2759 if (start_ch > end_ch)
2761 /* Build the table for single byte characters. */
2762 for (ch = 0; ch < SBC_MAX; ++ch)
2763 if (start_ch <= ch && ch <= end_ch)
2764 bitset_set (sbcset, ch);
2766 # endif /* not RE_ENABLE_I18N */
2769 #endif /* not _LIBC */
2772 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2773 Build the collating element which is represented by NAME.
2774 The result are written to MBCSET and SBCSET.
2775 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2776 pointer argument since we may update it. */
2778 static reg_errcode_t
2779 # ifdef RE_ENABLE_I18N
2780 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2781 re_charset_t *mbcset;
2782 int *coll_sym_alloc;
2783 # else /* not RE_ENABLE_I18N */
2784 build_collating_symbol (sbcset, name)
2785 # endif /* not RE_ENABLE_I18N */
2786 re_bitset_ptr_t sbcset;
2787 const unsigned char *name;
2789 size_t name_len = strlen ((const char *) name);
2790 if (BE (name_len != 1, 0))
2791 return REG_ECOLLATE;
2794 bitset_set (sbcset, name[0]);
2798 #endif /* not _LIBC */
2800 /* This function parse bracket expression like "[abc]", "[a-c]",
2804 parse_bracket_exp (regexp, dfa, token, syntax, err)
2805 re_string_t *regexp;
2808 reg_syntax_t syntax;
2812 const unsigned char *collseqmb;
2813 const char *collseqwc;
2816 const int32_t *symb_table;
2817 const unsigned char *extra;
2819 /* Local function for parse_bracket_exp used in _LIBC environement.
2820 Seek the collating symbol entry correspondings to NAME.
2821 Return the index of the symbol in the SYMB_TABLE. */
2824 __attribute ((always_inline))
2825 seek_collating_symbol_entry (name, name_len)
2826 const unsigned char *name;
2829 int32_t hash = elem_hash ((const char *) name, name_len);
2830 int32_t elem = hash % table_size;
2831 int32_t second = hash % (table_size - 2);
2832 while (symb_table[2 * elem] != 0)
2834 /* First compare the hashing value. */
2835 if (symb_table[2 * elem] == hash
2836 /* Compare the length of the name. */
2837 && name_len == extra[symb_table[2 * elem + 1]]
2838 /* Compare the name. */
2839 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2842 /* Yep, this is the entry. */
2852 /* Local function for parse_bracket_exp used in _LIBC environement.
2853 Look up the collation sequence value of BR_ELEM.
2854 Return the value if succeeded, UINT_MAX otherwise. */
2856 auto inline unsigned int
2857 __attribute ((always_inline))
2858 lookup_collation_sequence_value (br_elem)
2859 bracket_elem_t *br_elem;
2861 if (br_elem->type == SB_CHAR)
2864 if (MB_CUR_MAX == 1)
2867 return collseqmb[br_elem->opr.ch];
2870 wint_t wc = __btowc (br_elem->opr.ch);
2871 return __collseq_table_lookup (collseqwc, wc);
2874 else if (br_elem->type == MB_CHAR)
2876 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2878 else if (br_elem->type == COLL_SYM)
2880 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2884 elem = seek_collating_symbol_entry (br_elem->opr.name,
2886 if (symb_table[2 * elem] != 0)
2888 /* We found the entry. */
2889 idx = symb_table[2 * elem + 1];
2890 /* Skip the name of collating element name. */
2891 idx += 1 + extra[idx];
2892 /* Skip the byte sequence of the collating element. */
2893 idx += 1 + extra[idx];
2894 /* Adjust for the alignment. */
2895 idx = (idx + 3) & ~3;
2896 /* Skip the multibyte collation sequence value. */
2897 idx += sizeof (unsigned int);
2898 /* Skip the wide char sequence of the collating element. */
2899 idx += sizeof (unsigned int) *
2900 (1 + *(unsigned int *) (extra + idx));
2901 /* Return the collation sequence value. */
2902 return *(unsigned int *) (extra + idx);
2904 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2906 /* No valid character. Match it as a single byte
2908 return collseqmb[br_elem->opr.name[0]];
2911 else if (sym_name_len == 1)
2912 return collseqmb[br_elem->opr.name[0]];
2917 /* Local function for parse_bracket_exp used in _LIBC environement.
2918 Build the range expression which starts from START_ELEM, and ends
2919 at END_ELEM. The result are written to MBCSET and SBCSET.
2920 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2921 mbcset->range_ends, is a pointer argument sinse we may
2924 auto inline reg_errcode_t
2925 __attribute ((always_inline))
2926 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2927 re_charset_t *mbcset;
2929 re_bitset_ptr_t sbcset;
2930 bracket_elem_t *start_elem, *end_elem;
2933 uint32_t start_collseq;
2934 uint32_t end_collseq;
2936 /* Equivalence Classes and Character Classes can't be a range
2938 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2939 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2943 start_collseq = lookup_collation_sequence_value (start_elem);
2944 end_collseq = lookup_collation_sequence_value (end_elem);
2945 /* Check start/end collation sequence values. */
2946 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2947 return REG_ECOLLATE;
2948 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2951 /* Got valid collation sequence values, add them as a new entry.
2952 However, if we have no collation elements, and the character set
2953 is single byte, the single byte character set that we
2954 build below suffices. */
2955 if (nrules > 0 || dfa->mb_cur_max > 1)
2957 /* Check the space of the arrays. */
2958 if (BE (*range_alloc == mbcset->nranges, 0))
2960 /* There is not enough space, need realloc. */
2961 uint32_t *new_array_start;
2962 uint32_t *new_array_end;
2965 /* +1 in case of mbcset->nranges is 0. */
2966 new_nranges = 2 * mbcset->nranges + 1;
2967 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2969 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2972 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2975 mbcset->range_starts = new_array_start;
2976 mbcset->range_ends = new_array_end;
2977 *range_alloc = new_nranges;
2980 mbcset->range_starts[mbcset->nranges] = start_collseq;
2981 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2984 /* Build the table for single byte characters. */
2985 for (ch = 0; ch < SBC_MAX; ch++)
2987 uint32_t ch_collseq;
2989 if (MB_CUR_MAX == 1)
2992 ch_collseq = collseqmb[ch];
2994 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2995 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2996 bitset_set (sbcset, ch);
3001 /* Local function for parse_bracket_exp used in _LIBC environement.
3002 Build the collating element which is represented by NAME.
3003 The result are written to MBCSET and SBCSET.
3004 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3005 pointer argument sinse we may update it. */
3007 auto inline reg_errcode_t
3008 __attribute ((always_inline))
3009 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3010 re_charset_t *mbcset;
3011 int *coll_sym_alloc;
3012 re_bitset_ptr_t sbcset;
3013 const unsigned char *name;
3016 size_t name_len = strlen ((const char *) name);
3019 elem = seek_collating_symbol_entry (name, name_len);
3020 if (symb_table[2 * elem] != 0)
3022 /* We found the entry. */
3023 idx = symb_table[2 * elem + 1];
3024 /* Skip the name of collating element name. */
3025 idx += 1 + extra[idx];
3027 else if (symb_table[2 * elem] == 0 && name_len == 1)
3029 /* No valid character, treat it as a normal
3031 bitset_set (sbcset, name[0]);
3035 return REG_ECOLLATE;
3037 /* Got valid collation sequence, add it as a new entry. */
3038 /* Check the space of the arrays. */
3039 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3041 /* Not enough, realloc it. */
3042 /* +1 in case of mbcset->ncoll_syms is 0. */
3043 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3044 /* Use realloc since mbcset->coll_syms is NULL
3046 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3047 new_coll_sym_alloc);
3048 if (BE (new_coll_syms == NULL, 0))
3050 mbcset->coll_syms = new_coll_syms;
3051 *coll_sym_alloc = new_coll_sym_alloc;
3053 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3058 if (BE (name_len != 1, 0))
3059 return REG_ECOLLATE;
3062 bitset_set (sbcset, name[0]);
3069 re_token_t br_token;
3070 re_bitset_ptr_t sbcset;
3071 #ifdef RE_ENABLE_I18N
3072 re_charset_t *mbcset;
3073 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3074 int equiv_class_alloc = 0, char_class_alloc = 0;
3075 #endif /* not RE_ENABLE_I18N */
3077 bin_tree_t *work_tree;
3079 int first_round = 1;
3081 collseqmb = (const unsigned char *)
3082 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3083 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3089 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3090 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3091 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3092 _NL_COLLATE_SYMB_TABLEMB);
3093 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3094 _NL_COLLATE_SYMB_EXTRAMB);
3097 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3098 #ifdef RE_ENABLE_I18N
3099 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3100 #endif /* RE_ENABLE_I18N */
3101 #ifdef RE_ENABLE_I18N
3102 if (BE (sbcset == NULL || mbcset == NULL, 0))
3104 if (BE (sbcset == NULL, 0))
3105 #endif /* RE_ENABLE_I18N */
3111 token_len = peek_token_bracket (token, regexp, syntax);
3112 if (BE (token->type == END_OF_RE, 0))
3115 goto parse_bracket_exp_free_return;
3117 if (token->type == OP_NON_MATCH_LIST)
3119 #ifdef RE_ENABLE_I18N
3120 mbcset->non_match = 1;
3121 #endif /* not RE_ENABLE_I18N */
3123 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3124 bitset_set (sbcset, '\0');
3125 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3126 token_len = peek_token_bracket (token, regexp, syntax);
3127 if (BE (token->type == END_OF_RE, 0))
3130 goto parse_bracket_exp_free_return;
3134 /* We treat the first ']' as a normal character. */
3135 if (token->type == OP_CLOSE_BRACKET)
3136 token->type = CHARACTER;
3140 bracket_elem_t start_elem, end_elem;
3141 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3142 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3144 int token_len2 = 0, is_range_exp = 0;
3147 start_elem.opr.name = start_name_buf;
3148 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3149 syntax, first_round);
3150 if (BE (ret != REG_NOERROR, 0))
3153 goto parse_bracket_exp_free_return;
3157 /* Get information about the next token. We need it in any case. */
3158 token_len = peek_token_bracket (token, regexp, syntax);
3160 /* Do not check for ranges if we know they are not allowed. */
3161 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3163 if (BE (token->type == END_OF_RE, 0))
3166 goto parse_bracket_exp_free_return;
3168 if (token->type == OP_CHARSET_RANGE)
3170 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3171 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3172 if (BE (token2.type == END_OF_RE, 0))
3175 goto parse_bracket_exp_free_return;
3177 if (token2.type == OP_CLOSE_BRACKET)
3179 /* We treat the last '-' as a normal character. */
3180 re_string_skip_bytes (regexp, -token_len);
3181 token->type = CHARACTER;
3188 if (is_range_exp == 1)
3190 end_elem.opr.name = end_name_buf;
3191 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3193 if (BE (ret != REG_NOERROR, 0))
3196 goto parse_bracket_exp_free_return;
3199 token_len = peek_token_bracket (token, regexp, syntax);
3202 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3203 &start_elem, &end_elem);
3205 # ifdef RE_ENABLE_I18N
3206 *err = build_range_exp (sbcset,
3207 dfa->mb_cur_max > 1 ? mbcset : NULL,
3208 &range_alloc, &start_elem, &end_elem);
3210 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3212 #endif /* RE_ENABLE_I18N */
3213 if (BE (*err != REG_NOERROR, 0))
3214 goto parse_bracket_exp_free_return;
3218 switch (start_elem.type)
3221 bitset_set (sbcset, start_elem.opr.ch);
3223 #ifdef RE_ENABLE_I18N
3225 /* Check whether the array has enough space. */
3226 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3228 wchar_t *new_mbchars;
3229 /* Not enough, realloc it. */
3230 /* +1 in case of mbcset->nmbchars is 0. */
3231 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3232 /* Use realloc since array is NULL if *alloc == 0. */
3233 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3235 if (BE (new_mbchars == NULL, 0))
3236 goto parse_bracket_exp_espace;
3237 mbcset->mbchars = new_mbchars;
3239 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3241 #endif /* RE_ENABLE_I18N */
3243 *err = build_equiv_class (sbcset,
3244 #ifdef RE_ENABLE_I18N
3245 mbcset, &equiv_class_alloc,
3246 #endif /* RE_ENABLE_I18N */
3247 start_elem.opr.name);
3248 if (BE (*err != REG_NOERROR, 0))
3249 goto parse_bracket_exp_free_return;
3252 *err = build_collating_symbol (sbcset,
3253 #ifdef RE_ENABLE_I18N
3254 mbcset, &coll_sym_alloc,
3255 #endif /* RE_ENABLE_I18N */
3256 start_elem.opr.name);
3257 if (BE (*err != REG_NOERROR, 0))
3258 goto parse_bracket_exp_free_return;
3261 *err = build_charclass (regexp->trans, sbcset,
3262 #ifdef RE_ENABLE_I18N
3263 mbcset, &char_class_alloc,
3264 #endif /* RE_ENABLE_I18N */
3265 start_elem.opr.name, syntax);
3266 if (BE (*err != REG_NOERROR, 0))
3267 goto parse_bracket_exp_free_return;
3274 if (BE (token->type == END_OF_RE, 0))
3277 goto parse_bracket_exp_free_return;
3279 if (token->type == OP_CLOSE_BRACKET)
3283 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3285 /* If it is non-matching list. */
3287 bitset_not (sbcset);
3289 #ifdef RE_ENABLE_I18N
3290 /* Ensure only single byte characters are set. */
3291 if (dfa->mb_cur_max > 1)
3292 bitset_mask (sbcset, dfa->sb_char);
3294 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3295 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3296 || mbcset->non_match)))
3298 bin_tree_t *mbc_tree;
3300 /* Build a tree for complex bracket. */
3301 dfa->has_mb_node = 1;
3302 br_token.type = COMPLEX_BRACKET;
3303 br_token.opr.mbcset = mbcset;
3304 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3305 if (BE (mbc_tree == NULL, 0))
3306 goto parse_bracket_exp_espace;
3307 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3308 if (sbcset[sbc_idx])
3310 /* If there are no bits set in sbcset, there is no point
3311 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3312 if (sbc_idx < BITSET_UINTS)
3314 /* Build a tree for simple bracket. */
3315 br_token.type = SIMPLE_BRACKET;
3316 br_token.opr.sbcset = sbcset;
3317 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3318 if (BE (work_tree == NULL, 0))
3319 goto parse_bracket_exp_espace;
3321 /* Then join them by ALT node. */
3322 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3323 if (BE (work_tree == NULL, 0))
3324 goto parse_bracket_exp_espace;
3329 work_tree = mbc_tree;
3333 #endif /* not RE_ENABLE_I18N */
3335 #ifdef RE_ENABLE_I18N
3336 free_charset (mbcset);
3338 /* Build a tree for simple bracket. */
3339 br_token.type = SIMPLE_BRACKET;
3340 br_token.opr.sbcset = sbcset;
3341 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3342 if (BE (work_tree == NULL, 0))
3343 goto parse_bracket_exp_espace;
3347 parse_bracket_exp_espace:
3349 parse_bracket_exp_free_return:
3351 #ifdef RE_ENABLE_I18N
3352 free_charset (mbcset);
3353 #endif /* RE_ENABLE_I18N */
3357 /* Parse an element in the bracket expression. */
3359 static reg_errcode_t
3360 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3362 bracket_elem_t *elem;
3363 re_string_t *regexp;
3367 reg_syntax_t syntax;
3370 #ifdef RE_ENABLE_I18N
3372 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3373 if (cur_char_size > 1)
3375 elem->type = MB_CHAR;
3376 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3377 re_string_skip_bytes (regexp, cur_char_size);
3380 #endif /* RE_ENABLE_I18N */
3381 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3382 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3383 || token->type == OP_OPEN_EQUIV_CLASS)
3384 return parse_bracket_symbol (elem, regexp, token);
3385 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3387 /* A '-' must only appear as anything but a range indicator before
3388 the closing bracket. Everything else is an error. */
3390 (void) peek_token_bracket (&token2, regexp, syntax);
3391 if (token2.type != OP_CLOSE_BRACKET)
3392 /* The actual error value is not standardized since this whole
3393 case is undefined. But ERANGE makes good sense. */
3396 elem->type = SB_CHAR;
3397 elem->opr.ch = token->opr.c;
3401 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3402 such as [:<character_class>:], [.<collating_element>.], and
3403 [=<equivalent_class>=]. */
3405 static reg_errcode_t
3406 parse_bracket_symbol (elem, regexp, token)
3407 bracket_elem_t *elem;
3408 re_string_t *regexp;
3411 unsigned char ch, delim = token->opr.c;
3413 if (re_string_eoi(regexp))
3417 if (i >= BRACKET_NAME_BUF_SIZE)
3419 if (token->type == OP_OPEN_CHAR_CLASS)
3420 ch = re_string_fetch_byte_case (regexp);
3422 ch = re_string_fetch_byte (regexp);
3423 if (re_string_eoi(regexp))
3425 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3427 elem->opr.name[i] = ch;
3429 re_string_skip_bytes (regexp, 1);
3430 elem->opr.name[i] = '\0';
3431 switch (token->type)
3433 case OP_OPEN_COLL_ELEM:
3434 elem->type = COLL_SYM;
3436 case OP_OPEN_EQUIV_CLASS:
3437 elem->type = EQUIV_CLASS;
3439 case OP_OPEN_CHAR_CLASS:
3440 elem->type = CHAR_CLASS;
3448 /* Helper function for parse_bracket_exp.
3449 Build the equivalence class which is represented by NAME.
3450 The result are written to MBCSET and SBCSET.
3451 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3452 is a pointer argument sinse we may update it. */
3454 static reg_errcode_t
3455 #ifdef RE_ENABLE_I18N
3456 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3457 re_charset_t *mbcset;
3458 int *equiv_class_alloc;
3459 #else /* not RE_ENABLE_I18N */
3460 build_equiv_class (sbcset, name)
3461 #endif /* not RE_ENABLE_I18N */
3462 re_bitset_ptr_t sbcset;
3463 const unsigned char *name;
3466 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3469 const int32_t *table, *indirect;
3470 const unsigned char *weights, *extra, *cp;
3471 unsigned char char_buf[2];
3475 /* This #include defines a local function! */
3476 # include <locale/weight.h>
3477 /* Calculate the index for equivalence class. */
3479 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3480 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3481 _NL_COLLATE_WEIGHTMB);
3482 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3483 _NL_COLLATE_EXTRAMB);
3484 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3485 _NL_COLLATE_INDIRECTMB);
3486 idx1 = findidx (&cp);
3487 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3488 /* This isn't a valid character. */
3489 return REG_ECOLLATE;
3491 /* Build single byte matcing table for this equivalence class. */
3492 char_buf[1] = (unsigned char) '\0';
3493 len = weights[idx1];
3494 for (ch = 0; ch < SBC_MAX; ++ch)
3498 idx2 = findidx (&cp);
3503 /* This isn't a valid character. */
3505 if (len == weights[idx2])
3508 while (cnt <= len &&
3509 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3513 bitset_set (sbcset, ch);
3516 /* Check whether the array has enough space. */
3517 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3519 /* Not enough, realloc it. */
3520 /* +1 in case of mbcset->nequiv_classes is 0. */
3521 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3522 /* Use realloc since the array is NULL if *alloc == 0. */
3523 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3525 new_equiv_class_alloc);
3526 if (BE (new_equiv_classes == NULL, 0))
3528 mbcset->equiv_classes = new_equiv_classes;
3529 *equiv_class_alloc = new_equiv_class_alloc;
3531 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3536 if (BE (strlen ((const char *) name) != 1, 0))
3537 return REG_ECOLLATE;
3538 bitset_set (sbcset, *name);
3543 /* Helper function for parse_bracket_exp.
3544 Build the character class which is represented by NAME.
3545 The result are written to MBCSET and SBCSET.
3546 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3547 is a pointer argument sinse we may update it. */
3549 static reg_errcode_t
3550 #ifdef RE_ENABLE_I18N
3551 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3552 re_charset_t *mbcset;
3553 int *char_class_alloc;
3554 #else /* not RE_ENABLE_I18N */
3555 build_charclass (trans, sbcset, class_name, syntax)
3556 #endif /* not RE_ENABLE_I18N */
3557 unsigned RE_TRANSLATE_TYPE trans;
3558 re_bitset_ptr_t sbcset;
3559 const unsigned char *class_name;
3560 reg_syntax_t syntax;
3563 const char *name = (const char *) class_name;
3565 /* In case of REG_ICASE "upper" and "lower" match the both of
3566 upper and lower cases. */
3567 if ((syntax & RE_ICASE)
3568 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3571 #ifdef RE_ENABLE_I18N
3572 /* Check the space of the arrays. */
3573 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3575 /* Not enough, realloc it. */
3576 /* +1 in case of mbcset->nchar_classes is 0. */
3577 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3578 /* Use realloc since array is NULL if *alloc == 0. */
3579 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3580 new_char_class_alloc);
3581 if (BE (new_char_classes == NULL, 0))
3583 mbcset->char_classes = new_char_classes;
3584 *char_class_alloc = new_char_class_alloc;
3586 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3587 #endif /* RE_ENABLE_I18N */
3589 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3590 for (i = 0; i < SBC_MAX; ++i) \
3592 if (ctype_func (i)) \
3594 int ch = trans ? trans[i] : i; \
3595 bitset_set (sbcset, ch); \
3599 if (strcmp (name, "alnum") == 0)
3600 BUILD_CHARCLASS_LOOP (isalnum)
3601 else if (strcmp (name, "cntrl") == 0)
3602 BUILD_CHARCLASS_LOOP (iscntrl)
3603 else if (strcmp (name, "lower") == 0)
3604 BUILD_CHARCLASS_LOOP (islower)
3605 else if (strcmp (name, "space") == 0)
3606 BUILD_CHARCLASS_LOOP (isspace)
3607 else if (strcmp (name, "alpha") == 0)
3608 BUILD_CHARCLASS_LOOP (isalpha)
3609 else if (strcmp (name, "digit") == 0)
3610 BUILD_CHARCLASS_LOOP (isdigit)
3611 else if (strcmp (name, "print") == 0)
3612 BUILD_CHARCLASS_LOOP (isprint)
3613 else if (strcmp (name, "upper") == 0)
3614 BUILD_CHARCLASS_LOOP (isupper)
3615 else if (strcmp (name, "blank") == 0)
3616 BUILD_CHARCLASS_LOOP (isblank)
3617 else if (strcmp (name, "graph") == 0)
3618 BUILD_CHARCLASS_LOOP (isgraph)
3619 else if (strcmp (name, "punct") == 0)
3620 BUILD_CHARCLASS_LOOP (ispunct)
3621 else if (strcmp (name, "xdigit") == 0)
3622 BUILD_CHARCLASS_LOOP (isxdigit)
3630 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3632 unsigned RE_TRANSLATE_TYPE trans;
3633 const unsigned char *class_name;
3634 const unsigned char *extra;
3638 re_bitset_ptr_t sbcset;
3639 #ifdef RE_ENABLE_I18N
3640 re_charset_t *mbcset;
3642 #endif /* not RE_ENABLE_I18N */
3644 re_token_t br_token;
3647 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3648 #ifdef RE_ENABLE_I18N
3649 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3650 #endif /* RE_ENABLE_I18N */
3652 #ifdef RE_ENABLE_I18N
3653 if (BE (sbcset == NULL || mbcset == NULL, 0))
3654 #else /* not RE_ENABLE_I18N */
3655 if (BE (sbcset == NULL, 0))
3656 #endif /* not RE_ENABLE_I18N */
3664 #ifdef RE_ENABLE_I18N
3666 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3667 bitset_set(cset->sbcset, '\0');
3669 mbcset->non_match = 1;
3670 #endif /* not RE_ENABLE_I18N */
3673 /* We don't care the syntax in this case. */
3674 ret = build_charclass (trans, sbcset,
3675 #ifdef RE_ENABLE_I18N
3677 #endif /* RE_ENABLE_I18N */
3680 if (BE (ret != REG_NOERROR, 0))
3683 #ifdef RE_ENABLE_I18N
3684 free_charset (mbcset);
3685 #endif /* RE_ENABLE_I18N */
3689 /* \w match '_' also. */
3690 for (; *extra; extra++)
3691 bitset_set (sbcset, *extra);
3693 /* If it is non-matching list. */
3695 bitset_not (sbcset);
3697 #ifdef RE_ENABLE_I18N
3698 /* Ensure only single byte characters are set. */
3699 if (dfa->mb_cur_max > 1)
3700 bitset_mask (sbcset, dfa->sb_char);
3703 /* Build a tree for simple bracket. */
3704 br_token.type = SIMPLE_BRACKET;
3705 br_token.opr.sbcset = sbcset;
3706 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3707 if (BE (tree == NULL, 0))
3708 goto build_word_op_espace;
3710 #ifdef RE_ENABLE_I18N
3711 if (dfa->mb_cur_max > 1)
3713 bin_tree_t *mbc_tree;
3714 /* Build a tree for complex bracket. */
3715 br_token.type = COMPLEX_BRACKET;
3716 br_token.opr.mbcset = mbcset;
3717 dfa->has_mb_node = 1;
3718 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3719 if (BE (mbc_tree == NULL, 0))
3720 goto build_word_op_espace;
3721 /* Then join them by ALT node. */
3722 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3723 if (BE (mbc_tree != NULL, 1))
3728 free_charset (mbcset);
3731 #else /* not RE_ENABLE_I18N */
3733 #endif /* not RE_ENABLE_I18N */
3735 build_word_op_espace:
3737 #ifdef RE_ENABLE_I18N
3738 free_charset (mbcset);
3739 #endif /* RE_ENABLE_I18N */
3744 /* This is intended for the expressions like "a{1,3}".
3745 Fetch a number from `input', and return the number.
3746 Return -1, if the number field is empty like "{,1}".
3747 Return -2, If an error is occured. */
3750 fetch_number (input, token, syntax)
3753 reg_syntax_t syntax;
3759 fetch_token (token, input, syntax);
3761 if (BE (token->type == END_OF_RE, 0))
3763 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3765 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3766 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3767 num = (num > RE_DUP_MAX) ? -2 : num;
3772 #ifdef RE_ENABLE_I18N
3774 free_charset (re_charset_t *cset)
3776 re_free (cset->mbchars);
3778 re_free (cset->coll_syms);
3779 re_free (cset->equiv_classes);
3780 re_free (cset->range_starts);
3781 re_free (cset->range_ends);
3783 re_free (cset->char_classes);
3786 #endif /* RE_ENABLE_I18N */
3788 /* Functions for binary tree operation. */
3790 /* Create a tree node. */
3793 create_tree (dfa, left, right, type)
3797 re_token_type_t type;
3801 return create_token_tree (dfa, left, right, &t);
3805 create_token_tree (dfa, left, right, token)
3809 const re_token_t *token;
3812 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3814 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3816 if (storage == NULL)
3818 storage->next = dfa->str_tree_storage;
3819 dfa->str_tree_storage = storage;
3820 dfa->str_tree_storage_idx = 0;
3822 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3824 tree->parent = NULL;
3826 tree->right = right;
3827 tree->token = *token;
3828 tree->token.duplicated = 0;
3829 tree->token.opt_subexp = 0;
3832 tree->node_idx = -1;
3835 left->parent = tree;
3837 right->parent = tree;
3841 /* Mark the tree SRC as an optional subexpression.
3842 To be called from preorder or postorder. */
3844 static reg_errcode_t
3845 mark_opt_subexp (extra, node)
3849 int idx = (int) (long) extra;
3850 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3851 node->token.opt_subexp = 1;
3856 /* Free the allocated memory inside NODE. */
3859 free_token (re_token_t *node)
3861 #ifdef RE_ENABLE_I18N
3862 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3863 free_charset (node->opr.mbcset);
3865 #endif /* RE_ENABLE_I18N */
3866 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3867 re_free (node->opr.sbcset);
3870 /* Worker function for tree walking. Free the allocated memory inside NODE
3871 and its children. */
3873 static reg_errcode_t
3874 free_tree (void *extra, bin_tree_t *node)
3876 free_token (&node->token);
3881 /* Duplicate the node SRC, and return new node. This is a preorder
3882 visit similar to the one implemented by the generic visitor, but
3883 we need more infrastructure to maintain two parallel trees --- so,
3884 it's easier to duplicate. */
3887 duplicate_tree (root, dfa)
3888 const bin_tree_t *root;
3891 const bin_tree_t *node;
3892 bin_tree_t *dup_root;
3893 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3895 for (node = root; ; )
3897 /* Create a new tree and link it back to the current parent. */
3898 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3901 (*p_new)->parent = dup_node;
3902 (*p_new)->token.duplicated = 1;
3905 /* Go to the left node, or up and to the right. */
3909 p_new = &dup_node->left;
3913 const bin_tree_t *prev = NULL;
3914 while (node->right == prev || node->right == NULL)
3917 node = node->parent;
3918 dup_node = dup_node->parent;
3923 p_new = &dup_node->right;