1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 int length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static void init_word_char (re_dfa_t *dfa);
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
34 static void optimize_utf8 (re_dfa_t *dfa);
36 static reg_errcode_t analyze (re_dfa_t *dfa);
37 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
38 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
39 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
40 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
41 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
42 int top_clone_node, int root_node,
43 unsigned int constraint);
44 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
45 unsigned int constraint);
46 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
47 unsigned int constraint);
48 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
49 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
51 static void calc_inveclosure (re_dfa_t *dfa);
52 static int fetch_number (re_string_t *input, re_token_t *token,
54 static void fetch_token (re_token_t *result, re_string_t *input,
56 static int peek_token (re_token_t *token, re_string_t *input,
58 static int peek_token_bracket (re_token_t *token, re_string_t *input,
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
82 re_token_t *token, int token_len,
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 # ifdef RE_ENABLE_I18N
91 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
92 re_charset_t *mbcset, int *range_alloc,
93 bracket_elem_t *start_elem,
94 bracket_elem_t *end_elem);
95 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
98 const unsigned char *name);
99 # else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
101 bracket_elem_t *start_elem,
102 bracket_elem_t *end_elem);
103 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
104 const unsigned char *name);
105 # endif /* not RE_ENABLE_I18N */
106 #endif /* not _LIBC */
107 #ifdef RE_ENABLE_I18N
108 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
109 re_charset_t *mbcset,
110 int *equiv_class_alloc,
111 const unsigned char *name);
112 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
113 re_bitset_ptr_t sbcset,
114 re_charset_t *mbcset,
115 int *char_class_alloc,
116 const unsigned char *class_name,
117 reg_syntax_t syntax);
118 #else /* not RE_ENABLE_I18N */
119 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
120 const unsigned char *name);
121 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
122 re_bitset_ptr_t sbcset,
123 const unsigned char *class_name,
124 reg_syntax_t syntax);
125 #endif /* not RE_ENABLE_I18N */
126 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
127 unsigned RE_TRANSLATE_TYPE trans,
128 const unsigned char *class_name,
129 const unsigned char *extra,
130 int non_match, reg_errcode_t *err);
131 static bin_tree_t *create_tree (re_dfa_t *dfa,
132 bin_tree_t *left, bin_tree_t *right,
133 re_token_type_t type, int index);
134 static bin_tree_t *re_dfa_add_tree_node (re_dfa_t *dfa,
135 bin_tree_t *left, bin_tree_t *right,
136 const re_token_t *token)
137 __attribute ((noinline));
138 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
139 static void mark_opt_subexp (const bin_tree_t *src, re_dfa_t *dfa);
140 static void mark_opt_subexp_iter (const bin_tree_t *src, re_dfa_t *dfa, int idx);
142 /* This table gives an error message for each of the error codes listed
143 in regex.h. Obviously the order here has to be same as there.
144 POSIX doesn't require that we do anything for REG_NOERROR,
145 but why not be nice? */
147 const char __re_error_msgid[] attribute_hidden =
149 #define REG_NOERROR_IDX 0
150 gettext_noop ("Success") /* REG_NOERROR */
152 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
153 gettext_noop ("No match") /* REG_NOMATCH */
155 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
156 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
158 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
159 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
161 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
162 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
164 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
165 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
167 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
168 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
170 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
171 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
173 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
174 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
176 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
177 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
179 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
180 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
182 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
183 gettext_noop ("Invalid range end") /* REG_ERANGE */
185 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
186 gettext_noop ("Memory exhausted") /* REG_ESPACE */
188 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
189 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
191 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
192 gettext_noop ("Premature end of regular expression") /* REG_EEND */
194 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
195 gettext_noop ("Regular expression too big") /* REG_ESIZE */
197 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
198 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
201 const size_t __re_error_msgid_idx[] attribute_hidden =
222 /* Entry points for GNU code. */
224 /* re_compile_pattern is the GNU regular expression compiler: it
225 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
226 Returns 0 if the pattern was valid, otherwise an error string.
228 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
229 are set in BUFP on entry. */
232 re_compile_pattern (pattern, length, bufp)
235 struct re_pattern_buffer *bufp;
239 /* And GNU code determines whether or not to get register information
240 by passing null for the REGS argument to re_match, etc., not by
244 /* Match anchors at newline. */
245 bufp->newline_anchor = 1;
247 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
251 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
254 weak_alias (__re_compile_pattern, re_compile_pattern)
257 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
258 also be assigned to arbitrarily: each pattern buffer stores its own
259 syntax, so it can be changed between regex compilations. */
260 /* This has no initializer because initialized variables in Emacs
261 become read-only after dumping. */
262 reg_syntax_t re_syntax_options;
265 /* Specify the precise syntax of regexps for compilation. This provides
266 for compatibility for various utilities which historically have
267 different, incompatible syntaxes.
269 The argument SYNTAX is a bit mask comprised of the various bits
270 defined in regex.h. We return the old syntax. */
273 re_set_syntax (syntax)
276 reg_syntax_t ret = re_syntax_options;
278 re_syntax_options = syntax;
282 weak_alias (__re_set_syntax, re_set_syntax)
286 re_compile_fastmap (bufp)
287 struct re_pattern_buffer *bufp;
289 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
290 char *fastmap = bufp->fastmap;
292 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
293 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
294 if (dfa->init_state != dfa->init_state_word)
295 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
296 if (dfa->init_state != dfa->init_state_nl)
297 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
298 if (dfa->init_state != dfa->init_state_begbuf)
299 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
300 bufp->fastmap_accurate = 1;
304 weak_alias (__re_compile_fastmap, re_compile_fastmap)
308 __attribute ((always_inline))
309 re_set_fastmap (char *fastmap, int icase, int ch)
313 fastmap[tolower (ch)] = 1;
316 /* Helper function for re_compile_fastmap.
317 Compile fastmap for the initial_state INIT_STATE. */
320 re_compile_fastmap_iter (bufp, init_state, fastmap)
322 const re_dfastate_t *init_state;
325 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
327 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
328 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
330 int node = init_state->nodes.elems[node_cnt];
331 re_token_type_t type = dfa->nodes[node].type;
333 if (type == CHARACTER)
335 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
336 #ifdef RE_ENABLE_I18N
337 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
339 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
344 *p++ = dfa->nodes[node].opr.c;
345 while (++node < dfa->nodes_len
346 && dfa->nodes[node].type == CHARACTER
347 && dfa->nodes[node].mb_partial)
348 *p++ = dfa->nodes[node].opr.c;
349 memset (&state, 0, sizeof (state));
350 if (mbrtowc (&wc, (const char *) buf, p - buf,
352 && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
353 re_set_fastmap (fastmap, 0, buf[0]);
357 else if (type == SIMPLE_BRACKET)
360 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
361 for (j = 0; j < UINT_BITS; ++j, ++ch)
362 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
363 re_set_fastmap (fastmap, icase, ch);
365 #ifdef RE_ENABLE_I18N
366 else if (type == COMPLEX_BRACKET)
369 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
370 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
371 || cset->nranges || cset->nchar_classes)
374 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
376 /* In this case we want to catch the bytes which are
377 the first byte of any collation elements.
378 e.g. In da_DK, we want to catch 'a' since "aa"
379 is a valid collation element, and don't catch
380 'b' since 'b' is the only collation element
381 which starts from 'b'. */
383 const int32_t *table = (const int32_t *)
384 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
385 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
386 for (j = 0; j < UINT_BITS; ++j, ++ch)
388 re_set_fastmap (fastmap, icase, ch);
391 if (dfa->mb_cur_max > 1)
392 for (i = 0; i < SBC_MAX; ++i)
393 if (__btowc (i) == WEOF)
394 re_set_fastmap (fastmap, icase, i);
395 # endif /* not _LIBC */
397 for (i = 0; i < cset->nmbchars; ++i)
401 memset (&state, '\0', sizeof (state));
402 __wcrtomb (buf, cset->mbchars[i], &state);
403 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
404 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
406 __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
407 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
411 #endif /* RE_ENABLE_I18N */
412 else if (type == OP_PERIOD
413 #ifdef RE_ENABLE_I18N
414 || type == OP_UTF8_PERIOD
415 #endif /* RE_ENABLE_I18N */
416 || type == END_OF_RE)
418 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
419 if (type == END_OF_RE)
420 bufp->can_be_null = 1;
426 /* Entry point for POSIX code. */
427 /* regcomp takes a regular expression as a string and compiles it.
429 PREG is a regex_t *. We do not expect any fields to be initialized,
430 since POSIX says we shouldn't. Thus, we set
432 `buffer' to the compiled pattern;
433 `used' to the length of the compiled pattern;
434 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
435 REG_EXTENDED bit in CFLAGS is set; otherwise, to
436 RE_SYNTAX_POSIX_BASIC;
437 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
438 `fastmap' to an allocated space for the fastmap;
439 `fastmap_accurate' to zero;
440 `re_nsub' to the number of subexpressions in PATTERN.
442 PATTERN is the address of the pattern string.
444 CFLAGS is a series of bits which affect compilation.
446 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
447 use POSIX basic syntax.
449 If REG_NEWLINE is set, then . and [^...] don't match newline.
450 Also, regexec will try a match beginning after every newline.
452 If REG_ICASE is set, then we considers upper- and lowercase
453 versions of letters to be equivalent when matching.
455 If REG_NOSUB is set, then when PREG is passed to regexec, that
456 routine will report only success or failure, and nothing about the
459 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
460 the return codes and their meanings.) */
463 regcomp (preg, pattern, cflags)
464 regex_t *__restrict preg;
465 const char *__restrict pattern;
469 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
470 : RE_SYNTAX_POSIX_BASIC);
476 /* Try to allocate space for the fastmap. */
477 preg->fastmap = re_malloc (char, SBC_MAX);
478 if (BE (preg->fastmap == NULL, 0))
481 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
483 /* If REG_NEWLINE is set, newlines are treated differently. */
484 if (cflags & REG_NEWLINE)
485 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
486 syntax &= ~RE_DOT_NEWLINE;
487 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
488 /* It also changes the matching behavior. */
489 preg->newline_anchor = 1;
492 preg->newline_anchor = 0;
493 preg->no_sub = !!(cflags & REG_NOSUB);
494 preg->translate = NULL;
496 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
498 /* POSIX doesn't distinguish between an unmatched open-group and an
499 unmatched close-group: both are REG_EPAREN. */
500 if (ret == REG_ERPAREN)
503 /* We have already checked preg->fastmap != NULL. */
504 if (BE (ret == REG_NOERROR, 1))
505 /* Compute the fastmap now, since regexec cannot modify the pattern
506 buffer. This function never fails in this implementation. */
507 (void) re_compile_fastmap (preg);
510 /* Some error occurred while compiling the expression. */
511 re_free (preg->fastmap);
512 preg->fastmap = NULL;
518 weak_alias (__regcomp, regcomp)
521 /* Returns a message corresponding to an error code, ERRCODE, returned
522 from either regcomp or regexec. We don't use PREG here. */
525 regerror (errcode, preg, errbuf, errbuf_size)
535 || errcode >= (int) (sizeof (__re_error_msgid_idx)
536 / sizeof (__re_error_msgid_idx[0])), 0))
537 /* Only error codes returned by the rest of the code should be passed
538 to this routine. If we are given anything else, or if other regex
539 code generates an invalid error code, then the program has a bug.
540 Dump core so we can fix it. */
543 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
545 msg_size = strlen (msg) + 1; /* Includes the null. */
547 if (BE (errbuf_size != 0, 1))
549 if (BE (msg_size > errbuf_size, 0))
551 #if defined HAVE_MEMPCPY || defined _LIBC
552 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
554 memcpy (errbuf, msg, errbuf_size - 1);
555 errbuf[errbuf_size - 1] = 0;
559 memcpy (errbuf, msg, msg_size);
565 weak_alias (__regerror, regerror)
569 #ifdef RE_ENABLE_I18N
570 /* This static array is used for the map to single-byte characters when
571 UTF-8 is used. Otherwise we would allocate memory just to initialize
572 it the same all the time. UTF-8 is the preferred encoding so this is
573 a worthwhile optimization. */
574 static const bitset utf8_sb_map =
576 /* Set the first 128 bits. */
577 # if UINT_MAX == 0xffffffff
578 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
580 # error "Add case for new unsigned int size"
587 free_dfa_content (re_dfa_t *dfa)
591 re_free (dfa->subexps);
594 for (i = 0; i < dfa->nodes_len; ++i)
596 re_token_t *node = dfa->nodes + i;
597 #ifdef RE_ENABLE_I18N
598 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
599 free_charset (node->opr.mbcset);
601 #endif /* RE_ENABLE_I18N */
602 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
603 re_free (node->opr.sbcset);
605 re_free (dfa->nexts);
606 for (i = 0; i < dfa->nodes_len; ++i)
608 if (dfa->eclosures != NULL)
609 re_node_set_free (dfa->eclosures + i);
610 if (dfa->inveclosures != NULL)
611 re_node_set_free (dfa->inveclosures + i);
612 if (dfa->edests != NULL)
613 re_node_set_free (dfa->edests + i);
615 re_free (dfa->edests);
616 re_free (dfa->eclosures);
617 re_free (dfa->inveclosures);
618 re_free (dfa->nodes);
620 if (dfa->state_table)
621 for (i = 0; i <= dfa->state_hash_mask; ++i)
623 struct re_state_table_entry *entry = dfa->state_table + i;
624 for (j = 0; j < entry->num; ++j)
626 re_dfastate_t *state = entry->array[j];
629 re_free (entry->array);
631 re_free (dfa->state_table);
632 #ifdef RE_ENABLE_I18N
633 if (dfa->sb_char != utf8_sb_map)
634 re_free (dfa->sb_char);
637 re_free (dfa->re_str);
644 /* Free dynamically allocated space used by PREG. */
650 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
651 if (BE (dfa != NULL, 1))
652 free_dfa_content (dfa);
656 re_free (preg->fastmap);
657 preg->fastmap = NULL;
659 re_free (preg->translate);
660 preg->translate = NULL;
663 weak_alias (__regfree, regfree)
666 /* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
669 #if defined _REGEX_RE_COMP || defined _LIBC
671 /* BSD has one and only one pattern buffer. */
672 static struct re_pattern_buffer re_comp_buf;
676 /* Make these definitions weak in libc, so POSIX programs can redefine
677 these names if they don't use our functions, and still use
678 regcomp/regexec above without link errors. */
689 if (!re_comp_buf.buffer)
690 return gettext ("No previous regular expression");
694 if (re_comp_buf.buffer)
696 fastmap = re_comp_buf.fastmap;
697 re_comp_buf.fastmap = NULL;
698 __regfree (&re_comp_buf);
699 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
700 re_comp_buf.fastmap = fastmap;
703 if (re_comp_buf.fastmap == NULL)
705 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
706 if (re_comp_buf.fastmap == NULL)
707 return (char *) gettext (__re_error_msgid
708 + __re_error_msgid_idx[(int) REG_ESPACE]);
711 /* Since `re_exec' always passes NULL for the `regs' argument, we
712 don't need to initialize the pattern buffer fields which affect it. */
714 /* Match anchors at newlines. */
715 re_comp_buf.newline_anchor = 1;
717 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
722 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
723 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
727 libc_freeres_fn (free_mem)
729 __regfree (&re_comp_buf);
733 #endif /* _REGEX_RE_COMP */
735 /* Internal entry point.
736 Compile the regular expression PATTERN, whose length is LENGTH.
737 SYNTAX indicate regular expression's syntax. */
740 re_compile_internal (preg, pattern, length, syntax)
742 const char * pattern;
746 reg_errcode_t err = REG_NOERROR;
750 /* Initialize the pattern buffer. */
751 preg->fastmap_accurate = 0;
752 preg->syntax = syntax;
753 preg->not_bol = preg->not_eol = 0;
756 preg->can_be_null = 0;
757 preg->regs_allocated = REGS_UNALLOCATED;
759 /* Initialize the dfa. */
760 dfa = (re_dfa_t *) preg->buffer;
761 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
763 /* If zero allocated, but buffer is non-null, try to realloc
764 enough space. This loses if buffer's address is bogus, but
765 that is the user's responsibility. If ->buffer is NULL this
766 is a simple allocation. */
767 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
770 preg->allocated = sizeof (re_dfa_t);
771 preg->buffer = (unsigned char *) dfa;
773 preg->used = sizeof (re_dfa_t);
775 err = init_dfa (dfa, length);
776 if (BE (err != REG_NOERROR, 0))
778 free_dfa_content (dfa);
784 dfa->re_str = re_malloc (char, length + 1);
785 strncpy (dfa->re_str, pattern, length + 1);
788 err = re_string_construct (®exp, pattern, length, preg->translate,
789 syntax & RE_ICASE, dfa);
790 if (BE (err != REG_NOERROR, 0))
792 re_compile_internal_free_return:
793 free_workarea_compile (preg);
794 re_string_destruct (®exp);
795 free_dfa_content (dfa);
801 /* Parse the regular expression, and build a structure tree. */
803 dfa->str_tree = parse (®exp, preg, syntax, &err);
804 if (BE (dfa->str_tree == NULL, 0))
805 goto re_compile_internal_free_return;
807 #ifdef RE_ENABLE_I18N
808 /* If possible, do searching in single byte encoding to speed things up. */
809 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
813 /* Analyze the tree and collect information which is necessary to
816 if (BE (err != REG_NOERROR, 0))
817 goto re_compile_internal_free_return;
819 /* Then create the initial state of the dfa. */
820 err = create_initial_state (dfa);
822 /* Release work areas. */
823 free_workarea_compile (preg);
824 re_string_destruct (®exp);
826 if (BE (err != REG_NOERROR, 0))
828 free_dfa_content (dfa);
836 /* Initialize DFA. We use the length of the regular expression PAT_LEN
837 as the initial length of some arrays. */
840 init_dfa (dfa, pat_len)
849 memset (dfa, '\0', sizeof (re_dfa_t));
851 /* Force allocation of str_tree_storage the first time. */
852 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
854 dfa->nodes_alloc = pat_len + 1;
855 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
857 dfa->states_alloc = pat_len + 1;
859 /* table_size = 2 ^ ceil(log pat_len) */
860 for (table_size = 1; table_size > 0; table_size <<= 1)
861 if (table_size > pat_len)
864 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
865 dfa->state_hash_mask = table_size - 1;
867 dfa->subexps_alloc = 1;
868 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
870 dfa->mb_cur_max = MB_CUR_MAX;
872 if (dfa->mb_cur_max == 6
873 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
875 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
878 # ifdef HAVE_LANGINFO_CODESET
879 codeset_name = nl_langinfo (CODESET);
881 codeset_name = getenv ("LC_ALL");
882 if (codeset_name == NULL || codeset[0] == '\0')
883 codeset_name = getenv ("LC_CTYPE");
884 if (codeset_name == NULL || codeset[0] == '\0')
885 codeset_name = getenv ("LANG");
886 if (codeset_name == NULL)
888 else if (strchr (codeset_name, '.') != NULL)
889 codeset_name = strchr (codeset_name, '.') + 1;
892 if (strcasecmp (codeset_name, "UTF-8") == 0
893 || strcasecmp (codeset_name, "UTF8") == 0)
896 /* We check exhaustively in the loop below if this charset is a
897 superset of ASCII. */
898 dfa->map_notascii = 0;
901 #ifdef RE_ENABLE_I18N
902 if (dfa->mb_cur_max > 1)
905 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
910 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
911 if (BE (dfa->sb_char == NULL, 0))
914 /* Clear all bits by, then set those corresponding to single
916 bitset_empty (dfa->sb_char);
918 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
919 for (j = 0; j < UINT_BITS; ++j, ++ch)
921 wchar_t wch = __btowc (ch);
923 dfa->sb_char[i] |= 1 << j;
925 if (isascii (ch) && wch != (wchar_t) ch)
926 dfa->map_notascii = 1;
933 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
934 || dfa->subexps == NULL, 0))
939 /* Initialize WORD_CHAR table, which indicate which character is
940 "word". In this case "word" means that it is the word construction
941 character used by some operators like "\<", "\>", etc. */
948 dfa->word_ops_used = 1;
949 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
950 for (j = 0; j < UINT_BITS; ++j, ++ch)
951 if (isalnum (ch) || ch == '_')
952 dfa->word_char[i] |= 1 << j;
955 /* Free the work area which are only used while compiling. */
958 free_workarea_compile (preg)
961 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
962 bin_tree_storage_t *storage, *next;
963 for (storage = dfa->str_tree_storage; storage; storage = next)
965 next = storage->next;
968 dfa->str_tree_storage = NULL;
969 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
970 dfa->str_tree = NULL;
971 re_free (dfa->org_indices);
972 dfa->org_indices = NULL;
975 /* Create initial states for all contexts. */
978 create_initial_state (dfa)
983 re_node_set init_nodes;
985 /* Initial states have the epsilon closure of the node which is
986 the first node of the regular expression. */
987 first = dfa->str_tree->first;
988 dfa->init_node = first;
989 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
990 if (BE (err != REG_NOERROR, 0))
993 /* The back-references which are in initial states can epsilon transit,
994 since in this case all of the subexpressions can be null.
995 Then we add epsilon closures of the nodes which are the next nodes of
996 the back-references. */
997 if (dfa->nbackref > 0)
998 for (i = 0; i < init_nodes.nelem; ++i)
1000 int node_idx = init_nodes.elems[i];
1001 re_token_type_t type = dfa->nodes[node_idx].type;
1004 if (type != OP_BACK_REF)
1006 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1008 re_token_t *clexp_node;
1009 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1010 if (clexp_node->type == OP_CLOSE_SUBEXP
1011 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
1014 if (clexp_idx == init_nodes.nelem)
1017 if (type == OP_BACK_REF)
1019 int dest_idx = dfa->edests[node_idx].elems[0];
1020 if (!re_node_set_contains (&init_nodes, dest_idx))
1022 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1028 /* It must be the first time to invoke acquire_state. */
1029 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1030 /* We don't check ERR here, since the initial state must not be NULL. */
1031 if (BE (dfa->init_state == NULL, 0))
1033 if (dfa->init_state->has_constraint)
1035 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1037 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1039 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1043 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1044 || dfa->init_state_begbuf == NULL, 0))
1048 dfa->init_state_word = dfa->init_state_nl
1049 = dfa->init_state_begbuf = dfa->init_state;
1051 re_node_set_free (&init_nodes);
1055 #ifdef RE_ENABLE_I18N
1056 /* If it is possible to do searching in single byte encoding instead of UTF-8
1057 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1058 DFA nodes where needed. */
1064 int node, i, mb_chars = 0, has_period = 0;
1066 for (node = 0; node < dfa->nodes_len; ++node)
1067 switch (dfa->nodes[node].type)
1070 if (dfa->nodes[node].opr.c >= 0x80)
1074 switch (dfa->nodes[node].opr.idx)
1082 /* Word anchors etc. cannot be handled. */
1092 case OP_DUP_ASTERISK:
1093 case OP_DUP_QUESTION:
1094 case OP_OPEN_SUBEXP:
1095 case OP_CLOSE_SUBEXP:
1097 case SIMPLE_BRACKET:
1098 /* Just double check. */
1099 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1100 if (dfa->nodes[node].opr.sbcset[i])
1107 if (mb_chars || has_period)
1108 for (node = 0; node < dfa->nodes_len; ++node)
1110 if (dfa->nodes[node].type == CHARACTER
1111 && dfa->nodes[node].opr.c >= 0x80)
1112 dfa->nodes[node].mb_partial = 0;
1113 else if (dfa->nodes[node].type == OP_PERIOD)
1114 dfa->nodes[node].type = OP_UTF8_PERIOD;
1117 /* The search can be in single byte locale. */
1118 dfa->mb_cur_max = 1;
1120 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1124 /* Analyze the structure tree, and calculate "first", "next", "edest",
1125 "eclosure", and "inveclosure". */
1127 static reg_errcode_t
1134 /* Allocate arrays. */
1135 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1136 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1137 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1138 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1139 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1140 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1141 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
1143 /* Initialize them. */
1144 for (i = 0; i < dfa->nodes_len; ++i)
1147 re_node_set_init_empty (dfa->edests + i);
1148 re_node_set_init_empty (dfa->eclosures + i);
1149 re_node_set_init_empty (dfa->inveclosures + i);
1152 ret = analyze_tree (dfa, dfa->str_tree);
1153 if (BE (ret == REG_NOERROR, 1))
1155 ret = calc_eclosure (dfa);
1156 if (ret == REG_NOERROR)
1157 calc_inveclosure (dfa);
1162 /* Helper functions for analyze.
1163 This function calculate "first", "next", and "edest" for the subtree
1164 whose root is NODE. */
1166 static reg_errcode_t
1167 analyze_tree (dfa, node)
1172 if (node->first == -1)
1173 calc_first (dfa, node);
1174 if (node->next == -1)
1175 calc_next (dfa, node);
1176 if (node->eclosure.nelem == 0)
1177 calc_epsdest (dfa, node);
1178 /* Calculate "first" etc. for the left child. */
1179 if (node->left != NULL)
1181 ret = analyze_tree (dfa, node->left);
1182 if (BE (ret != REG_NOERROR, 0))
1185 /* Calculate "first" etc. for the right child. */
1186 if (node->right != NULL)
1188 ret = analyze_tree (dfa, node->right);
1189 if (BE (ret != REG_NOERROR, 0))
1195 /* Calculate "first" for the node NODE. */
1197 calc_first (dfa, node)
1202 idx = node->node_idx;
1203 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1208 case OP_OPEN_BRACKET:
1209 case OP_CLOSE_BRACKET:
1210 case OP_OPEN_DUP_NUM:
1211 case OP_CLOSE_DUP_NUM:
1213 case OP_NON_MATCH_LIST:
1214 case OP_OPEN_COLL_ELEM:
1215 case OP_CLOSE_COLL_ELEM:
1216 case OP_OPEN_EQUIV_CLASS:
1217 case OP_CLOSE_EQUIV_CLASS:
1218 case OP_OPEN_CHAR_CLASS:
1219 case OP_CLOSE_CHAR_CLASS:
1220 /* These must not appear here. */
1226 case OP_DUP_ASTERISK:
1227 case OP_DUP_QUESTION:
1228 #ifdef RE_ENABLE_I18N
1229 case OP_UTF8_PERIOD:
1230 case COMPLEX_BRACKET:
1231 #endif /* RE_ENABLE_I18N */
1232 case SIMPLE_BRACKET:
1235 case OP_OPEN_SUBEXP:
1236 case OP_CLOSE_SUBEXP:
1242 /* else fall through */
1245 assert (node->left != NULL);
1247 if (node->left->first == -1)
1248 calc_first (dfa, node->left);
1249 node->first = node->left->first;
1254 /* Calculate "next" for the node NODE. */
1257 calc_next (dfa, node)
1262 bin_tree_t *parent = node->parent;
1266 idx = node->node_idx;
1267 if (node->type == 0)
1268 dfa->nexts[idx] = node->next;
1272 idx = parent->node_idx;
1273 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1277 case OP_DUP_ASTERISK:
1281 if (parent->left == node)
1283 if (parent->right->first == -1)
1284 calc_first (dfa, parent->right);
1285 node->next = parent->right->first;
1288 /* else fall through */
1290 if (parent->next == -1)
1291 calc_next (dfa, parent);
1292 node->next = parent->next;
1295 idx = node->node_idx;
1296 if (node->type == 0)
1297 dfa->nexts[idx] = node->next;
1300 /* Calculate "edest" for the node NODE. */
1303 calc_epsdest (dfa, node)
1308 idx = node->node_idx;
1309 if (node->type == 0)
1311 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1312 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1314 if (node->left->first == -1)
1315 calc_first (dfa, node->left);
1316 if (node->next == -1)
1317 calc_next (dfa, node);
1318 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1321 else if (dfa->nodes[idx].type == OP_ALT)
1324 if (node->left != NULL)
1326 if (node->left->first == -1)
1327 calc_first (dfa, node->left);
1328 left = node->left->first;
1332 if (node->next == -1)
1333 calc_next (dfa, node);
1336 if (node->right != NULL)
1338 if (node->right->first == -1)
1339 calc_first (dfa, node->right);
1340 right = node->right->first;
1344 if (node->next == -1)
1345 calc_next (dfa, node);
1348 re_node_set_init_2 (dfa->edests + idx, left, right);
1350 else if (dfa->nodes[idx].type == ANCHOR
1351 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1352 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1353 || dfa->nodes[idx].type == OP_BACK_REF)
1354 re_node_set_init_1 (dfa->edests + idx, node->next);
1356 assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
1360 /* Duplicate the epsilon closure of the node ROOT_NODE.
1361 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1362 to their own constraint. */
1364 static reg_errcode_t
1365 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1368 int top_org_node, top_clone_node, root_node;
1369 unsigned int init_constraint;
1372 int org_node, clone_node, ret;
1373 unsigned int constraint = init_constraint;
1374 for (org_node = top_org_node, clone_node = top_clone_node;;)
1376 int org_dest, clone_dest;
1377 if (dfa->nodes[org_node].type == OP_BACK_REF)
1379 /* If the back reference epsilon-transit, its destination must
1380 also have the constraint. Then duplicate the epsilon closure
1381 of the destination of the back reference, and store it in
1382 edests of the back reference. */
1383 org_dest = dfa->nexts[org_node];
1384 re_node_set_empty (dfa->edests + clone_node);
1385 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1386 if (BE (err != REG_NOERROR, 0))
1388 dfa->nexts[clone_node] = dfa->nexts[org_node];
1389 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1390 if (BE (ret < 0, 0))
1393 else if (dfa->edests[org_node].nelem == 0)
1395 /* In case of the node can't epsilon-transit, don't duplicate the
1396 destination and store the original destination as the
1397 destination of the node. */
1398 dfa->nexts[clone_node] = dfa->nexts[org_node];
1401 else if (dfa->edests[org_node].nelem == 1)
1403 /* In case of the node can epsilon-transit, and it has only one
1405 org_dest = dfa->edests[org_node].elems[0];
1406 re_node_set_empty (dfa->edests + clone_node);
1407 if (dfa->nodes[org_node].type == ANCHOR)
1409 /* In case of the node has another constraint, append it. */
1410 if (org_node == root_node && clone_node != org_node)
1412 /* ...but if the node is root_node itself, it means the
1413 epsilon closure have a loop, then tie it to the
1414 destination of the root_node. */
1415 ret = re_node_set_insert (dfa->edests + clone_node,
1417 if (BE (ret < 0, 0))
1421 constraint |= dfa->nodes[org_node].opr.ctx_type;
1423 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1424 if (BE (err != REG_NOERROR, 0))
1426 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1427 if (BE (ret < 0, 0))
1430 else /* dfa->edests[org_node].nelem == 2 */
1432 /* In case of the node can epsilon-transit, and it has two
1433 destinations. E.g. '|', '*', '+', '?'. */
1434 org_dest = dfa->edests[org_node].elems[0];
1435 re_node_set_empty (dfa->edests + clone_node);
1436 /* Search for a duplicated node which satisfies the constraint. */
1437 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1438 if (clone_dest == -1)
1440 /* There are no such a duplicated node, create a new one. */
1441 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1442 if (BE (err != REG_NOERROR, 0))
1444 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1445 if (BE (ret < 0, 0))
1447 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1448 root_node, constraint);
1449 if (BE (err != REG_NOERROR, 0))
1454 /* There are a duplicated node which satisfy the constraint,
1455 use it to avoid infinite loop. */
1456 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1457 if (BE (ret < 0, 0))
1461 org_dest = dfa->edests[org_node].elems[1];
1462 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1463 if (BE (err != REG_NOERROR, 0))
1465 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1466 if (BE (ret < 0, 0))
1469 org_node = org_dest;
1470 clone_node = clone_dest;
1475 /* Search for a node which is duplicated from the node ORG_NODE, and
1476 satisfies the constraint CONSTRAINT. */
1479 search_duplicated_node (dfa, org_node, constraint)
1482 unsigned int constraint;
1485 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1487 if (org_node == dfa->org_indices[idx]
1488 && constraint == dfa->nodes[idx].constraint)
1489 return idx; /* Found. */
1491 return -1; /* Not found. */
1494 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1495 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1496 otherwise return the error code. */
1498 static reg_errcode_t
1499 duplicate_node (new_idx, dfa, org_idx, constraint)
1501 int *new_idx, org_idx;
1502 unsigned int constraint;
1504 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1);
1505 if (BE (dup_idx == -1, 0))
1507 dfa->nodes[dup_idx].constraint = constraint;
1508 if (dfa->nodes[org_idx].type == ANCHOR)
1509 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1510 dfa->nodes[dup_idx].duplicated = 1;
1511 re_node_set_init_empty (dfa->edests + dup_idx);
1512 re_node_set_init_empty (dfa->eclosures + dup_idx);
1513 re_node_set_init_empty (dfa->inveclosures + dup_idx);
1515 /* Store the index of the original node. */
1516 dfa->org_indices[dup_idx] = org_idx;
1522 calc_inveclosure (dfa)
1526 for (src = 0; src < dfa->nodes_len; ++src)
1528 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1530 dest = dfa->eclosures[src].elems[idx];
1531 re_node_set_insert (dfa->inveclosures + dest, src);
1536 /* Calculate "eclosure" for all the node in DFA. */
1538 static reg_errcode_t
1542 int node_idx, incomplete;
1544 assert (dfa->nodes_len > 0);
1547 /* For each nodes, calculate epsilon closure. */
1548 for (node_idx = 0; ; ++node_idx)
1551 re_node_set eclosure_elem;
1552 if (node_idx == dfa->nodes_len)
1561 assert (dfa->eclosures[node_idx].nelem != -1);
1563 /* If we have already calculated, skip it. */
1564 if (dfa->eclosures[node_idx].nelem != 0)
1566 /* Calculate epsilon closure of `node_idx'. */
1567 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1568 if (BE (err != REG_NOERROR, 0))
1571 if (dfa->eclosures[node_idx].nelem == 0)
1574 re_node_set_free (&eclosure_elem);
1580 /* Calculate epsilon closure of NODE. */
1582 static reg_errcode_t
1583 calc_eclosure_iter (new_set, dfa, node, root)
1584 re_node_set *new_set;
1589 unsigned int constraint;
1591 re_node_set eclosure;
1593 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1594 if (BE (err != REG_NOERROR, 0))
1597 /* This indicates that we are calculating this node now.
1598 We reference this value to avoid infinite loop. */
1599 dfa->eclosures[node].nelem = -1;
1601 constraint = ((dfa->nodes[node].type == ANCHOR)
1602 ? dfa->nodes[node].opr.ctx_type : 0);
1603 /* If the current node has constraints, duplicate all nodes.
1604 Since they must inherit the constraints. */
1605 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1607 int org_node, cur_node;
1608 org_node = cur_node = node;
1609 err = duplicate_node_closure (dfa, node, node, node, constraint);
1610 if (BE (err != REG_NOERROR, 0))
1614 /* Expand each epsilon destination nodes. */
1615 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1616 for (i = 0; i < dfa->edests[node].nelem; ++i)
1618 re_node_set eclosure_elem;
1619 int edest = dfa->edests[node].elems[i];
1620 /* If calculating the epsilon closure of `edest' is in progress,
1621 return intermediate result. */
1622 if (dfa->eclosures[edest].nelem == -1)
1627 /* If we haven't calculated the epsilon closure of `edest' yet,
1628 calculate now. Otherwise use calculated epsilon closure. */
1629 if (dfa->eclosures[edest].nelem == 0)
1631 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1632 if (BE (err != REG_NOERROR, 0))
1636 eclosure_elem = dfa->eclosures[edest];
1637 /* Merge the epsilon closure of `edest'. */
1638 re_node_set_merge (&eclosure, &eclosure_elem);
1639 /* If the epsilon closure of `edest' is incomplete,
1640 the epsilon closure of this node is also incomplete. */
1641 if (dfa->eclosures[edest].nelem == 0)
1644 re_node_set_free (&eclosure_elem);
1648 /* Epsilon closures include itself. */
1649 re_node_set_insert (&eclosure, node);
1650 if (incomplete && !root)
1651 dfa->eclosures[node].nelem = 0;
1653 dfa->eclosures[node] = eclosure;
1654 *new_set = eclosure;
1658 /* Functions for token which are used in the parser. */
1660 /* Fetch a token from INPUT.
1661 We must not use this function inside bracket expressions. */
1664 fetch_token (result, input, syntax)
1667 reg_syntax_t syntax;
1669 re_string_skip_bytes (input, peek_token (result, input, syntax));
1672 /* Peek a token from INPUT, and return the length of the token.
1673 We must not use this function inside bracket expressions. */
1676 peek_token (token, input, syntax)
1679 reg_syntax_t syntax;
1683 if (re_string_eoi (input))
1685 token->type = END_OF_RE;
1689 c = re_string_peek_byte (input, 0);
1692 token->word_char = 0;
1693 #ifdef RE_ENABLE_I18N
1694 token->mb_partial = 0;
1695 if (input->mb_cur_max > 1 &&
1696 !re_string_first_byte (input, re_string_cur_idx (input)))
1698 token->type = CHARACTER;
1699 token->mb_partial = 1;
1706 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1708 token->type = BACK_SLASH;
1712 c2 = re_string_peek_byte_case (input, 1);
1714 token->type = CHARACTER;
1715 #ifdef RE_ENABLE_I18N
1716 if (input->mb_cur_max > 1)
1718 wint_t wc = re_string_wchar_at (input,
1719 re_string_cur_idx (input) + 1);
1720 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1724 token->word_char = IS_WORD_CHAR (c2) != 0;
1729 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1730 token->type = OP_ALT;
1732 case '1': case '2': case '3': case '4': case '5':
1733 case '6': case '7': case '8': case '9':
1734 if (!(syntax & RE_NO_BK_REFS))
1736 token->type = OP_BACK_REF;
1737 token->opr.idx = c2 - '0';
1741 if (!(syntax & RE_NO_GNU_OPS))
1743 token->type = ANCHOR;
1744 token->opr.ctx_type = WORD_FIRST;
1748 if (!(syntax & RE_NO_GNU_OPS))
1750 token->type = ANCHOR;
1751 token->opr.ctx_type = WORD_LAST;
1755 if (!(syntax & RE_NO_GNU_OPS))
1757 token->type = ANCHOR;
1758 token->opr.ctx_type = WORD_DELIM;
1762 if (!(syntax & RE_NO_GNU_OPS))
1764 token->type = ANCHOR;
1765 token->opr.ctx_type = INSIDE_WORD;
1769 if (!(syntax & RE_NO_GNU_OPS))
1770 token->type = OP_WORD;
1773 if (!(syntax & RE_NO_GNU_OPS))
1774 token->type = OP_NOTWORD;
1777 if (!(syntax & RE_NO_GNU_OPS))
1778 token->type = OP_SPACE;
1781 if (!(syntax & RE_NO_GNU_OPS))
1782 token->type = OP_NOTSPACE;
1785 if (!(syntax & RE_NO_GNU_OPS))
1787 token->type = ANCHOR;
1788 token->opr.ctx_type = BUF_FIRST;
1792 if (!(syntax & RE_NO_GNU_OPS))
1794 token->type = ANCHOR;
1795 token->opr.ctx_type = BUF_LAST;
1799 if (!(syntax & RE_NO_BK_PARENS))
1800 token->type = OP_OPEN_SUBEXP;
1803 if (!(syntax & RE_NO_BK_PARENS))
1804 token->type = OP_CLOSE_SUBEXP;
1807 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1808 token->type = OP_DUP_PLUS;
1811 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1812 token->type = OP_DUP_QUESTION;
1815 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1816 token->type = OP_OPEN_DUP_NUM;
1819 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1820 token->type = OP_CLOSE_DUP_NUM;
1828 token->type = CHARACTER;
1829 #ifdef RE_ENABLE_I18N
1830 if (input->mb_cur_max > 1)
1832 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1833 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1837 token->word_char = IS_WORD_CHAR (token->opr.c);
1842 if (syntax & RE_NEWLINE_ALT)
1843 token->type = OP_ALT;
1846 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1847 token->type = OP_ALT;
1850 token->type = OP_DUP_ASTERISK;
1853 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1854 token->type = OP_DUP_PLUS;
1857 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1858 token->type = OP_DUP_QUESTION;
1861 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1862 token->type = OP_OPEN_DUP_NUM;
1865 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1866 token->type = OP_CLOSE_DUP_NUM;
1869 if (syntax & RE_NO_BK_PARENS)
1870 token->type = OP_OPEN_SUBEXP;
1873 if (syntax & RE_NO_BK_PARENS)
1874 token->type = OP_CLOSE_SUBEXP;
1877 token->type = OP_OPEN_BRACKET;
1880 token->type = OP_PERIOD;
1883 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1884 re_string_cur_idx (input) != 0)
1886 char prev = re_string_peek_byte (input, -1);
1887 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1890 token->type = ANCHOR;
1891 token->opr.ctx_type = LINE_FIRST;
1894 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1895 re_string_cur_idx (input) + 1 != re_string_length (input))
1898 re_string_skip_bytes (input, 1);
1899 peek_token (&next, input, syntax);
1900 re_string_skip_bytes (input, -1);
1901 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1904 token->type = ANCHOR;
1905 token->opr.ctx_type = LINE_LAST;
1913 /* Peek a token from INPUT, and return the length of the token.
1914 We must not use this function out of bracket expressions. */
1917 peek_token_bracket (token, input, syntax)
1920 reg_syntax_t syntax;
1923 if (re_string_eoi (input))
1925 token->type = END_OF_RE;
1928 c = re_string_peek_byte (input, 0);
1931 #ifdef RE_ENABLE_I18N
1932 if (input->mb_cur_max > 1 &&
1933 !re_string_first_byte (input, re_string_cur_idx (input)))
1935 token->type = CHARACTER;
1938 #endif /* RE_ENABLE_I18N */
1940 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1941 && re_string_cur_idx (input) + 1 < re_string_length (input))
1943 /* In this case, '\' escape a character. */
1945 re_string_skip_bytes (input, 1);
1946 c2 = re_string_peek_byte (input, 0);
1948 token->type = CHARACTER;
1951 if (c == '[') /* '[' is a special char in a bracket exps. */
1955 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1956 c2 = re_string_peek_byte (input, 1);
1964 token->type = OP_OPEN_COLL_ELEM;
1967 token->type = OP_OPEN_EQUIV_CLASS;
1970 if (syntax & RE_CHAR_CLASSES)
1972 token->type = OP_OPEN_CHAR_CLASS;
1975 /* else fall through. */
1977 token->type = CHARACTER;
1987 token->type = OP_CHARSET_RANGE;
1990 token->type = OP_CLOSE_BRACKET;
1993 token->type = OP_NON_MATCH_LIST;
1996 token->type = CHARACTER;
2001 /* Functions for parser. */
2003 /* Entry point of the parser.
2004 Parse the regular expression REGEXP and return the structure tree.
2005 If an error is occured, ERR is set by error code, and return NULL.
2006 This function build the following tree, from regular expression <reg_exp>:
2012 CAT means concatenation.
2013 EOR means end of regular expression. */
2016 parse (regexp, preg, syntax, err)
2017 re_string_t *regexp;
2019 reg_syntax_t syntax;
2022 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2023 bin_tree_t *tree, *eor, *root;
2024 re_token_t current_token;
2025 dfa->syntax = syntax;
2026 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2027 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2028 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2030 eor = re_dfa_add_tree_node (dfa, NULL, NULL, ¤t_token);
2032 root = create_tree (dfa, tree, eor, CONCAT, 0);
2035 if (BE (eor == NULL || root == NULL, 0))
2043 /* This function build the following tree, from regular expression
2044 <branch1>|<branch2>:
2050 ALT means alternative, which represents the operator `|'. */
2053 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2054 re_string_t *regexp;
2057 reg_syntax_t syntax;
2061 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2062 bin_tree_t *tree, *branch = NULL;
2063 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2064 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2067 while (token->type == OP_ALT)
2069 re_token_t alt_token = *token;
2070 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2071 if (token->type != OP_ALT && token->type != END_OF_RE
2072 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2074 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2075 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2080 tree = re_dfa_add_tree_node (dfa, tree, branch, &alt_token);
2081 if (BE (tree == NULL, 0))
2086 dfa->has_plural_match = 1;
2091 /* This function build the following tree, from regular expression
2098 CAT means concatenation. */
2101 parse_branch (regexp, preg, token, syntax, nest, err)
2102 re_string_t *regexp;
2105 reg_syntax_t syntax;
2109 bin_tree_t *tree, *exp;
2110 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2111 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2112 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2115 while (token->type != OP_ALT && token->type != END_OF_RE
2116 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2118 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2119 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2123 if (tree != NULL && exp != NULL)
2125 tree = create_tree (dfa, tree, exp, CONCAT, 0);
2132 else if (tree == NULL)
2134 /* Otherwise exp == NULL, we don't need to create new tree. */
2139 /* This function build the following tree, from regular expression a*:
2146 parse_expression (regexp, preg, token, syntax, nest, err)
2147 re_string_t *regexp;
2150 reg_syntax_t syntax;
2154 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2156 switch (token->type)
2159 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2160 if (BE (tree == NULL, 0))
2165 #ifdef RE_ENABLE_I18N
2166 if (dfa->mb_cur_max > 1)
2168 while (!re_string_eoi (regexp)
2169 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2171 bin_tree_t *mbc_remain;
2172 fetch_token (token, regexp, syntax);
2173 mbc_remain = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2174 tree = create_tree (dfa, tree, mbc_remain, CONCAT, 0);
2175 if (BE (mbc_remain == NULL || tree == NULL, 0))
2184 case OP_OPEN_SUBEXP:
2185 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2186 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2189 case OP_OPEN_BRACKET:
2190 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2191 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2195 if (BE (preg->re_nsub < token->opr.idx
2196 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
2201 dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
2202 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2203 if (BE (tree == NULL, 0))
2209 dfa->has_mb_node = 1;
2211 case OP_OPEN_DUP_NUM:
2212 if (syntax & RE_CONTEXT_INVALID_DUP)
2218 case OP_DUP_ASTERISK:
2220 case OP_DUP_QUESTION:
2221 if (syntax & RE_CONTEXT_INVALID_OPS)
2226 else if (syntax & RE_CONTEXT_INDEP_OPS)
2228 fetch_token (token, regexp, syntax);
2229 return parse_expression (regexp, preg, token, syntax, nest, err);
2231 /* else fall through */
2232 case OP_CLOSE_SUBEXP:
2233 if ((token->type == OP_CLOSE_SUBEXP) &&
2234 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2239 /* else fall through */
2240 case OP_CLOSE_DUP_NUM:
2241 /* We treat it as a normal character. */
2243 /* Then we can these characters as normal characters. */
2244 token->type = CHARACTER;
2245 /* mb_partial and word_char bits should be initialized already
2247 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2248 if (BE (tree == NULL, 0))
2255 if ((token->opr.ctx_type
2256 & (WORD_DELIM | INSIDE_WORD | WORD_FIRST | WORD_LAST))
2257 && dfa->word_ops_used == 0)
2258 init_word_char (dfa);
2259 if (token->opr.ctx_type == WORD_DELIM)
2261 bin_tree_t *tree_first, *tree_last;
2262 token->opr.ctx_type = WORD_FIRST;
2263 tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2264 token->opr.ctx_type = WORD_LAST;
2265 tree_last = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2266 token->type = OP_ALT;
2267 tree = re_dfa_add_tree_node (dfa, tree_first, tree_last, token);
2268 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2276 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2277 if (BE (tree == NULL, 0))
2283 /* We must return here, since ANCHORs can't be followed
2284 by repetition operators.
2285 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2286 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2287 fetch_token (token, regexp, syntax);
2290 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2291 if (BE (tree == NULL, 0))
2296 if (dfa->mb_cur_max > 1)
2297 dfa->has_mb_node = 1;
2301 tree = build_charclass_op (dfa, regexp->trans,
2302 (const unsigned char *) "alnum",
2303 (const unsigned char *) "_",
2304 token->type == OP_NOTWORD, err);
2305 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2310 tree = build_charclass_op (dfa, regexp->trans,
2311 (const unsigned char *) "space",
2312 (const unsigned char *) "",
2313 token->type == OP_NOTSPACE, err);
2314 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2324 /* Must not happen? */
2330 fetch_token (token, regexp, syntax);
2332 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2333 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2335 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2336 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2338 /* In BRE consecutive duplications are not allowed. */
2339 if ((syntax & RE_CONTEXT_INVALID_DUP)
2340 && (token->type == OP_DUP_ASTERISK
2341 || token->type == OP_OPEN_DUP_NUM))
2346 dfa->has_plural_match = 1;
2352 /* This function build the following tree, from regular expression
2360 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2361 re_string_t *regexp;
2364 reg_syntax_t syntax;
2368 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2369 bin_tree_t *tree, *left_par, *right_par;
2371 cur_nsub = preg->re_nsub++;
2372 if (BE (dfa->subexps_alloc < preg->re_nsub, 0))
2374 re_subexp_t *new_array;
2375 dfa->subexps_alloc *= 2;
2376 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2377 if (BE (new_array == NULL, 0))
2379 dfa->subexps_alloc /= 2;
2383 dfa->subexps = new_array;
2385 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2386 dfa->subexps[cur_nsub].end = -1;
2388 left_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2389 if (BE (left_par == NULL, 0))
2394 dfa->nodes[left_par->node_idx].opr.idx = cur_nsub;
2395 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2397 /* The subexpression may be a null string. */
2398 if (token->type == OP_CLOSE_SUBEXP)
2402 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2403 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2406 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2411 right_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2412 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2413 tree = ((tree == NULL) ? right_par
2414 : create_tree (dfa, tree, right_par, CONCAT, 0));
2415 tree = create_tree (dfa, left_par, tree, CONCAT, 0);
2416 if (BE (right_par == NULL || tree == NULL, 0))
2421 dfa->nodes[right_par->node_idx].opr.idx = cur_nsub;
2426 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2429 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2431 re_string_t *regexp;
2434 reg_syntax_t syntax;
2437 re_token_t dup_token;
2438 bin_tree_t *tree = NULL;
2439 int i, start, end, start_idx = re_string_cur_idx (regexp);
2440 re_token_t start_token = *token;
2442 if (token->type == OP_OPEN_DUP_NUM)
2445 start = fetch_number (regexp, token, syntax);
2448 if (token->type == CHARACTER && token->opr.c == ',')
2449 start = 0; /* We treat "{,m}" as "{0,m}". */
2452 *err = REG_BADBR; /* <re>{} is invalid. */
2456 if (BE (start != -2, 1))
2458 /* We treat "{n}" as "{n,n}". */
2459 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2460 : ((token->type == CHARACTER && token->opr.c == ',')
2461 ? fetch_number (regexp, token, syntax) : -2));
2463 if (BE (start == -2 || end == -2, 0))
2465 /* Invalid sequence. */
2466 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2468 if (token->type == END_OF_RE)
2476 /* If the syntax bit is set, rollback. */
2477 re_string_set_index (regexp, start_idx);
2478 *token = start_token;
2479 token->type = CHARACTER;
2480 /* mb_partial and word_char bits should be already initialized by
2485 if (BE (end != -1 && start > end, 0))
2487 /* First number greater than second. */
2494 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2495 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2498 /* Treat "<re>{0}*" etc. as "<re>{0}". */
2499 if (BE (elem == NULL, 0))
2502 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2503 else if (BE (start > 0, 0))
2506 for (i = 2; i <= start; ++i)
2508 elem = duplicate_tree (elem, dfa);
2509 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2510 if (BE (elem == NULL || tree == NULL, 0))
2511 goto parse_dup_op_espace;
2515 if (BE (end != start, 1))
2517 dup_token.type = (end == -1 ? OP_DUP_ASTERISK : OP_DUP_QUESTION);
2518 if (BE (start > 0, 0))
2520 elem = duplicate_tree (elem, dfa);
2521 if (BE (elem == NULL, 0))
2522 goto parse_dup_op_espace;
2524 /* This subexpression will be marked as optional, so that
2525 empty matches do not touch the registers. */
2526 mark_opt_subexp (elem, dfa);
2528 /* Prepare the tree with the modifier. */
2529 elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2530 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2534 /* We do not need to duplicate the tree because we have not
2536 mark_opt_subexp (elem, dfa);
2537 tree = elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2540 if (BE (elem == NULL || tree == NULL, 0))
2541 goto parse_dup_op_espace;
2543 /* This loop is actually executed only when end != -1,
2544 to rewrite <re>{0,n} as <re>?<re>?<re>?... We have
2545 already created the start+1-th copy. */
2546 for (i = start + 2; i <= end; ++i)
2548 elem = duplicate_tree (elem, dfa);
2549 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2550 if (BE (elem == NULL || tree == NULL, 0))
2558 fetch_token (token, regexp, syntax);
2561 parse_dup_op_espace:
2566 /* Size of the names for collating symbol/equivalence_class/character_class.
2567 I'm not sure, but maybe enough. */
2568 #define BRACKET_NAME_BUF_SIZE 32
2571 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2572 Build the range expression which starts from START_ELEM, and ends
2573 at END_ELEM. The result are written to MBCSET and SBCSET.
2574 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2575 mbcset->range_ends, is a pointer argument sinse we may
2578 static reg_errcode_t
2579 # ifdef RE_ENABLE_I18N
2580 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2581 re_charset_t *mbcset;
2583 # else /* not RE_ENABLE_I18N */
2584 build_range_exp (sbcset, start_elem, end_elem)
2585 # endif /* not RE_ENABLE_I18N */
2586 re_bitset_ptr_t sbcset;
2587 bracket_elem_t *start_elem, *end_elem;
2589 unsigned int start_ch, end_ch;
2590 /* Equivalence Classes and Character Classes can't be a range start/end. */
2591 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2592 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2596 /* We can handle no multi character collating elements without libc
2598 if (BE ((start_elem->type == COLL_SYM
2599 && strlen ((char *) start_elem->opr.name) > 1)
2600 || (end_elem->type == COLL_SYM
2601 && strlen ((char *) end_elem->opr.name) > 1), 0))
2602 return REG_ECOLLATE;
2604 # ifdef RE_ENABLE_I18N
2606 wchar_t wc, start_wc, end_wc;
2607 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2609 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2610 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2612 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2613 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2615 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2616 ? __btowc (start_ch) : start_elem->opr.wch);
2617 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2618 ? __btowc (end_ch) : end_elem->opr.wch);
2619 if (start_wc == WEOF || end_wc == WEOF)
2620 return REG_ECOLLATE;
2621 cmp_buf[0] = start_wc;
2622 cmp_buf[4] = end_wc;
2623 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2626 /* Got valid collation sequence values, add them as a new entry.
2627 However, for !_LIBC we have no collation elements: if the
2628 character set is single byte, the single byte character set
2629 that we build below suffices. parse_bracket_exp passes
2630 no MBCSET if dfa->mb_cur_max == 1. */
2633 /* Check the space of the arrays. */
2634 if (BE (*range_alloc == mbcset->nranges, 0))
2636 /* There is not enough space, need realloc. */
2637 wchar_t *new_array_start, *new_array_end;
2640 /* +1 in case of mbcset->nranges is 0. */
2641 new_nranges = 2 * mbcset->nranges + 1;
2642 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2643 are NULL if *range_alloc == 0. */
2644 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2646 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2649 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2652 mbcset->range_starts = new_array_start;
2653 mbcset->range_ends = new_array_end;
2654 *range_alloc = new_nranges;
2657 mbcset->range_starts[mbcset->nranges] = start_wc;
2658 mbcset->range_ends[mbcset->nranges++] = end_wc;
2661 /* Build the table for single byte characters. */
2662 for (wc = 0; wc < SBC_MAX; ++wc)
2665 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2666 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2667 bitset_set (sbcset, wc);
2670 # else /* not RE_ENABLE_I18N */
2673 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2674 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2676 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2677 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2679 if (start_ch > end_ch)
2681 /* Build the table for single byte characters. */
2682 for (ch = 0; ch < SBC_MAX; ++ch)
2683 if (start_ch <= ch && ch <= end_ch)
2684 bitset_set (sbcset, ch);
2686 # endif /* not RE_ENABLE_I18N */
2689 #endif /* not _LIBC */
2692 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2693 Build the collating element which is represented by NAME.
2694 The result are written to MBCSET and SBCSET.
2695 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2696 pointer argument since we may update it. */
2698 static reg_errcode_t
2699 # ifdef RE_ENABLE_I18N
2700 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2701 re_charset_t *mbcset;
2702 int *coll_sym_alloc;
2703 # else /* not RE_ENABLE_I18N */
2704 build_collating_symbol (sbcset, name)
2705 # endif /* not RE_ENABLE_I18N */
2706 re_bitset_ptr_t sbcset;
2707 const unsigned char *name;
2709 size_t name_len = strlen ((const char *) name);
2710 if (BE (name_len != 1, 0))
2711 return REG_ECOLLATE;
2714 bitset_set (sbcset, name[0]);
2718 #endif /* not _LIBC */
2720 /* This function parse bracket expression like "[abc]", "[a-c]",
2724 parse_bracket_exp (regexp, dfa, token, syntax, err)
2725 re_string_t *regexp;
2728 reg_syntax_t syntax;
2732 const unsigned char *collseqmb;
2733 const char *collseqwc;
2736 const int32_t *symb_table;
2737 const unsigned char *extra;
2739 /* Local function for parse_bracket_exp used in _LIBC environement.
2740 Seek the collating symbol entry correspondings to NAME.
2741 Return the index of the symbol in the SYMB_TABLE. */
2744 __attribute ((always_inline))
2745 seek_collating_symbol_entry (name, name_len)
2746 const unsigned char *name;
2749 int32_t hash = elem_hash ((const char *) name, name_len);
2750 int32_t elem = hash % table_size;
2751 int32_t second = hash % (table_size - 2);
2752 while (symb_table[2 * elem] != 0)
2754 /* First compare the hashing value. */
2755 if (symb_table[2 * elem] == hash
2756 /* Compare the length of the name. */
2757 && name_len == extra[symb_table[2 * elem + 1]]
2758 /* Compare the name. */
2759 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2762 /* Yep, this is the entry. */
2772 /* Local function for parse_bracket_exp used in _LIBC environement.
2773 Look up the collation sequence value of BR_ELEM.
2774 Return the value if succeeded, UINT_MAX otherwise. */
2776 auto inline unsigned int
2777 __attribute ((always_inline))
2778 lookup_collation_sequence_value (br_elem)
2779 bracket_elem_t *br_elem;
2781 if (br_elem->type == SB_CHAR)
2784 if (MB_CUR_MAX == 1)
2787 return collseqmb[br_elem->opr.ch];
2790 wint_t wc = __btowc (br_elem->opr.ch);
2791 return __collseq_table_lookup (collseqwc, wc);
2794 else if (br_elem->type == MB_CHAR)
2796 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2798 else if (br_elem->type == COLL_SYM)
2800 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2804 elem = seek_collating_symbol_entry (br_elem->opr.name,
2806 if (symb_table[2 * elem] != 0)
2808 /* We found the entry. */
2809 idx = symb_table[2 * elem + 1];
2810 /* Skip the name of collating element name. */
2811 idx += 1 + extra[idx];
2812 /* Skip the byte sequence of the collating element. */
2813 idx += 1 + extra[idx];
2814 /* Adjust for the alignment. */
2815 idx = (idx + 3) & ~3;
2816 /* Skip the multibyte collation sequence value. */
2817 idx += sizeof (unsigned int);
2818 /* Skip the wide char sequence of the collating element. */
2819 idx += sizeof (unsigned int) *
2820 (1 + *(unsigned int *) (extra + idx));
2821 /* Return the collation sequence value. */
2822 return *(unsigned int *) (extra + idx);
2824 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2826 /* No valid character. Match it as a single byte
2828 return collseqmb[br_elem->opr.name[0]];
2831 else if (sym_name_len == 1)
2832 return collseqmb[br_elem->opr.name[0]];
2837 /* Local function for parse_bracket_exp used in _LIBC environement.
2838 Build the range expression which starts from START_ELEM, and ends
2839 at END_ELEM. The result are written to MBCSET and SBCSET.
2840 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2841 mbcset->range_ends, is a pointer argument sinse we may
2844 auto inline reg_errcode_t
2845 __attribute ((always_inline))
2846 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2847 re_charset_t *mbcset;
2849 re_bitset_ptr_t sbcset;
2850 bracket_elem_t *start_elem, *end_elem;
2853 uint32_t start_collseq;
2854 uint32_t end_collseq;
2856 /* Equivalence Classes and Character Classes can't be a range
2858 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2859 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2863 start_collseq = lookup_collation_sequence_value (start_elem);
2864 end_collseq = lookup_collation_sequence_value (end_elem);
2865 /* Check start/end collation sequence values. */
2866 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2867 return REG_ECOLLATE;
2868 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2871 /* Got valid collation sequence values, add them as a new entry.
2872 However, if we have no collation elements, and the character set
2873 is single byte, the single byte character set that we
2874 build below suffices. */
2875 if (nrules > 0 || dfa->mb_cur_max > 1)
2877 /* Check the space of the arrays. */
2878 if (BE (*range_alloc == mbcset->nranges, 0))
2880 /* There is not enough space, need realloc. */
2881 uint32_t *new_array_start;
2882 uint32_t *new_array_end;
2885 /* +1 in case of mbcset->nranges is 0. */
2886 new_nranges = 2 * mbcset->nranges + 1;
2887 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2889 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2892 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2895 mbcset->range_starts = new_array_start;
2896 mbcset->range_ends = new_array_end;
2897 *range_alloc = new_nranges;
2900 mbcset->range_starts[mbcset->nranges] = start_collseq;
2901 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2904 /* Build the table for single byte characters. */
2905 for (ch = 0; ch < SBC_MAX; ch++)
2907 uint32_t ch_collseq;
2909 if (MB_CUR_MAX == 1)
2912 ch_collseq = collseqmb[ch];
2914 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2915 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2916 bitset_set (sbcset, ch);
2921 /* Local function for parse_bracket_exp used in _LIBC environement.
2922 Build the collating element which is represented by NAME.
2923 The result are written to MBCSET and SBCSET.
2924 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2925 pointer argument sinse we may update it. */
2927 auto inline reg_errcode_t
2928 __attribute ((always_inline))
2929 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2930 re_charset_t *mbcset;
2931 int *coll_sym_alloc;
2932 re_bitset_ptr_t sbcset;
2933 const unsigned char *name;
2936 size_t name_len = strlen ((const char *) name);
2939 elem = seek_collating_symbol_entry (name, name_len);
2940 if (symb_table[2 * elem] != 0)
2942 /* We found the entry. */
2943 idx = symb_table[2 * elem + 1];
2944 /* Skip the name of collating element name. */
2945 idx += 1 + extra[idx];
2947 else if (symb_table[2 * elem] == 0 && name_len == 1)
2949 /* No valid character, treat it as a normal
2951 bitset_set (sbcset, name[0]);
2955 return REG_ECOLLATE;
2957 /* Got valid collation sequence, add it as a new entry. */
2958 /* Check the space of the arrays. */
2959 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2961 /* Not enough, realloc it. */
2962 /* +1 in case of mbcset->ncoll_syms is 0. */
2963 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2964 /* Use realloc since mbcset->coll_syms is NULL
2966 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2967 new_coll_sym_alloc);
2968 if (BE (new_coll_syms == NULL, 0))
2970 mbcset->coll_syms = new_coll_syms;
2971 *coll_sym_alloc = new_coll_sym_alloc;
2973 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2978 if (BE (name_len != 1, 0))
2979 return REG_ECOLLATE;
2982 bitset_set (sbcset, name[0]);
2989 re_token_t br_token;
2990 re_bitset_ptr_t sbcset;
2991 #ifdef RE_ENABLE_I18N
2992 re_charset_t *mbcset;
2993 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2994 int equiv_class_alloc = 0, char_class_alloc = 0;
2995 #endif /* not RE_ENABLE_I18N */
2997 bin_tree_t *work_tree;
2999 int first_round = 1;
3001 collseqmb = (const unsigned char *)
3002 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3003 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3009 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3010 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3011 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3012 _NL_COLLATE_SYMB_TABLEMB);
3013 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3014 _NL_COLLATE_SYMB_EXTRAMB);
3017 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3018 #ifdef RE_ENABLE_I18N
3019 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3020 #endif /* RE_ENABLE_I18N */
3021 #ifdef RE_ENABLE_I18N
3022 if (BE (sbcset == NULL || mbcset == NULL, 0))
3024 if (BE (sbcset == NULL, 0))
3025 #endif /* RE_ENABLE_I18N */
3031 token_len = peek_token_bracket (token, regexp, syntax);
3032 if (BE (token->type == END_OF_RE, 0))
3035 goto parse_bracket_exp_free_return;
3037 if (token->type == OP_NON_MATCH_LIST)
3039 #ifdef RE_ENABLE_I18N
3040 mbcset->non_match = 1;
3041 #endif /* not RE_ENABLE_I18N */
3043 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3044 bitset_set (sbcset, '\0');
3045 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3046 token_len = peek_token_bracket (token, regexp, syntax);
3047 if (BE (token->type == END_OF_RE, 0))
3050 goto parse_bracket_exp_free_return;
3054 /* We treat the first ']' as a normal character. */
3055 if (token->type == OP_CLOSE_BRACKET)
3056 token->type = CHARACTER;
3060 bracket_elem_t start_elem, end_elem;
3061 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3062 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3064 int token_len2 = 0, is_range_exp = 0;
3067 start_elem.opr.name = start_name_buf;
3068 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3069 syntax, first_round);
3070 if (BE (ret != REG_NOERROR, 0))
3073 goto parse_bracket_exp_free_return;
3077 /* Get information about the next token. We need it in any case. */
3078 token_len = peek_token_bracket (token, regexp, syntax);
3080 /* Do not check for ranges if we know they are not allowed. */
3081 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3083 if (BE (token->type == END_OF_RE, 0))
3086 goto parse_bracket_exp_free_return;
3088 if (token->type == OP_CHARSET_RANGE)
3090 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3091 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3092 if (BE (token2.type == END_OF_RE, 0))
3095 goto parse_bracket_exp_free_return;
3097 if (token2.type == OP_CLOSE_BRACKET)
3099 /* We treat the last '-' as a normal character. */
3100 re_string_skip_bytes (regexp, -token_len);
3101 token->type = CHARACTER;
3108 if (is_range_exp == 1)
3110 end_elem.opr.name = end_name_buf;
3111 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3113 if (BE (ret != REG_NOERROR, 0))
3116 goto parse_bracket_exp_free_return;
3119 token_len = peek_token_bracket (token, regexp, syntax);
3122 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3123 &start_elem, &end_elem);
3125 # ifdef RE_ENABLE_I18N
3126 *err = build_range_exp (sbcset,
3127 dfa->mb_cur_max > 1 ? mbcset : NULL,
3128 &range_alloc, &start_elem, &end_elem);
3130 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3132 #endif /* RE_ENABLE_I18N */
3133 if (BE (*err != REG_NOERROR, 0))
3134 goto parse_bracket_exp_free_return;
3138 switch (start_elem.type)
3141 bitset_set (sbcset, start_elem.opr.ch);
3143 #ifdef RE_ENABLE_I18N
3145 /* Check whether the array has enough space. */
3146 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3148 wchar_t *new_mbchars;
3149 /* Not enough, realloc it. */
3150 /* +1 in case of mbcset->nmbchars is 0. */
3151 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3152 /* Use realloc since array is NULL if *alloc == 0. */
3153 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3155 if (BE (new_mbchars == NULL, 0))
3156 goto parse_bracket_exp_espace;
3157 mbcset->mbchars = new_mbchars;
3159 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3161 #endif /* RE_ENABLE_I18N */
3163 *err = build_equiv_class (sbcset,
3164 #ifdef RE_ENABLE_I18N
3165 mbcset, &equiv_class_alloc,
3166 #endif /* RE_ENABLE_I18N */
3167 start_elem.opr.name);
3168 if (BE (*err != REG_NOERROR, 0))
3169 goto parse_bracket_exp_free_return;
3172 *err = build_collating_symbol (sbcset,
3173 #ifdef RE_ENABLE_I18N
3174 mbcset, &coll_sym_alloc,
3175 #endif /* RE_ENABLE_I18N */
3176 start_elem.opr.name);
3177 if (BE (*err != REG_NOERROR, 0))
3178 goto parse_bracket_exp_free_return;
3181 *err = build_charclass (regexp->trans, sbcset,
3182 #ifdef RE_ENABLE_I18N
3183 mbcset, &char_class_alloc,
3184 #endif /* RE_ENABLE_I18N */
3185 start_elem.opr.name, syntax);
3186 if (BE (*err != REG_NOERROR, 0))
3187 goto parse_bracket_exp_free_return;
3194 if (BE (token->type == END_OF_RE, 0))
3197 goto parse_bracket_exp_free_return;
3199 if (token->type == OP_CLOSE_BRACKET)
3203 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3205 /* If it is non-matching list. */
3207 bitset_not (sbcset);
3209 #ifdef RE_ENABLE_I18N
3210 /* Ensure only single byte characters are set. */
3211 if (dfa->mb_cur_max > 1)
3212 bitset_mask (sbcset, dfa->sb_char);
3213 #endif /* RE_ENABLE_I18N */
3215 /* Build a tree for simple bracket. */
3216 br_token.type = SIMPLE_BRACKET;
3217 br_token.opr.sbcset = sbcset;
3218 work_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3219 if (BE (work_tree == NULL, 0))
3220 goto parse_bracket_exp_espace;
3222 #ifdef RE_ENABLE_I18N
3223 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3224 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3225 || mbcset->non_match)))
3227 re_token_t alt_token;
3228 bin_tree_t *mbc_tree;
3230 /* Build a tree for complex bracket. */
3231 dfa->has_mb_node = 1;
3232 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3233 if (sbcset[sbc_idx])
3235 /* If there are no bits set in sbcset, there is no point
3236 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3237 if (sbc_idx == BITSET_UINTS)
3240 dfa->nodes[work_tree->node_idx].type = COMPLEX_BRACKET;
3241 dfa->nodes[work_tree->node_idx].opr.mbcset = mbcset;
3244 br_token.type = COMPLEX_BRACKET;
3245 br_token.opr.mbcset = mbcset;
3246 mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3247 if (BE (mbc_tree == NULL, 0))
3248 goto parse_bracket_exp_espace;
3249 /* Then join them by ALT node. */
3250 alt_token.type = OP_ALT;
3251 dfa->has_plural_match = 1;
3252 work_tree = re_dfa_add_tree_node (dfa, work_tree, mbc_tree, &alt_token);
3253 if (BE (mbc_tree != NULL, 1))
3258 free_charset (mbcset);
3261 #else /* not RE_ENABLE_I18N */
3263 #endif /* not RE_ENABLE_I18N */
3265 parse_bracket_exp_espace:
3267 parse_bracket_exp_free_return:
3269 #ifdef RE_ENABLE_I18N
3270 free_charset (mbcset);
3271 #endif /* RE_ENABLE_I18N */
3275 /* Parse an element in the bracket expression. */
3277 static reg_errcode_t
3278 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3280 bracket_elem_t *elem;
3281 re_string_t *regexp;
3285 reg_syntax_t syntax;
3288 #ifdef RE_ENABLE_I18N
3290 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3291 if (cur_char_size > 1)
3293 elem->type = MB_CHAR;
3294 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3295 re_string_skip_bytes (regexp, cur_char_size);
3298 #endif /* RE_ENABLE_I18N */
3299 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3300 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3301 || token->type == OP_OPEN_EQUIV_CLASS)
3302 return parse_bracket_symbol (elem, regexp, token);
3303 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3305 /* A '-' must only appear as anything but a range indicator before
3306 the closing bracket. Everything else is an error. */
3308 (void) peek_token_bracket (&token2, regexp, syntax);
3309 if (token2.type != OP_CLOSE_BRACKET)
3310 /* The actual error value is not standardized since this whole
3311 case is undefined. But ERANGE makes good sense. */
3314 elem->type = SB_CHAR;
3315 elem->opr.ch = token->opr.c;
3319 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3320 such as [:<character_class>:], [.<collating_element>.], and
3321 [=<equivalent_class>=]. */
3323 static reg_errcode_t
3324 parse_bracket_symbol (elem, regexp, token)
3325 bracket_elem_t *elem;
3326 re_string_t *regexp;
3329 unsigned char ch, delim = token->opr.c;
3331 if (re_string_eoi(regexp))
3335 if (i >= BRACKET_NAME_BUF_SIZE)
3337 if (token->type == OP_OPEN_CHAR_CLASS)
3338 ch = re_string_fetch_byte_case (regexp);
3340 ch = re_string_fetch_byte (regexp);
3341 if (re_string_eoi(regexp))
3343 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3345 elem->opr.name[i] = ch;
3347 re_string_skip_bytes (regexp, 1);
3348 elem->opr.name[i] = '\0';
3349 switch (token->type)
3351 case OP_OPEN_COLL_ELEM:
3352 elem->type = COLL_SYM;
3354 case OP_OPEN_EQUIV_CLASS:
3355 elem->type = EQUIV_CLASS;
3357 case OP_OPEN_CHAR_CLASS:
3358 elem->type = CHAR_CLASS;
3366 /* Helper function for parse_bracket_exp.
3367 Build the equivalence class which is represented by NAME.
3368 The result are written to MBCSET and SBCSET.
3369 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3370 is a pointer argument sinse we may update it. */
3372 static reg_errcode_t
3373 #ifdef RE_ENABLE_I18N
3374 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3375 re_charset_t *mbcset;
3376 int *equiv_class_alloc;
3377 #else /* not RE_ENABLE_I18N */
3378 build_equiv_class (sbcset, name)
3379 #endif /* not RE_ENABLE_I18N */
3380 re_bitset_ptr_t sbcset;
3381 const unsigned char *name;
3384 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3387 const int32_t *table, *indirect;
3388 const unsigned char *weights, *extra, *cp;
3389 unsigned char char_buf[2];
3393 /* This #include defines a local function! */
3394 # include <locale/weight.h>
3395 /* Calculate the index for equivalence class. */
3397 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3398 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3399 _NL_COLLATE_WEIGHTMB);
3400 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3401 _NL_COLLATE_EXTRAMB);
3402 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3403 _NL_COLLATE_INDIRECTMB);
3404 idx1 = findidx (&cp);
3405 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3406 /* This isn't a valid character. */
3407 return REG_ECOLLATE;
3409 /* Build single byte matcing table for this equivalence class. */
3410 char_buf[1] = (unsigned char) '\0';
3411 len = weights[idx1];
3412 for (ch = 0; ch < SBC_MAX; ++ch)
3416 idx2 = findidx (&cp);
3421 /* This isn't a valid character. */
3423 if (len == weights[idx2])
3426 while (cnt <= len &&
3427 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3431 bitset_set (sbcset, ch);
3434 /* Check whether the array has enough space. */
3435 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3437 /* Not enough, realloc it. */
3438 /* +1 in case of mbcset->nequiv_classes is 0. */
3439 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3440 /* Use realloc since the array is NULL if *alloc == 0. */
3441 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3443 new_equiv_class_alloc);
3444 if (BE (new_equiv_classes == NULL, 0))
3446 mbcset->equiv_classes = new_equiv_classes;
3447 *equiv_class_alloc = new_equiv_class_alloc;
3449 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3454 if (BE (strlen ((const char *) name) != 1, 0))
3455 return REG_ECOLLATE;
3456 bitset_set (sbcset, *name);
3461 /* Helper function for parse_bracket_exp.
3462 Build the character class which is represented by NAME.
3463 The result are written to MBCSET and SBCSET.
3464 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3465 is a pointer argument sinse we may update it. */
3467 static reg_errcode_t
3468 #ifdef RE_ENABLE_I18N
3469 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3470 re_charset_t *mbcset;
3471 int *char_class_alloc;
3472 #else /* not RE_ENABLE_I18N */
3473 build_charclass (trans, sbcset, class_name, syntax)
3474 #endif /* not RE_ENABLE_I18N */
3475 unsigned RE_TRANSLATE_TYPE trans;
3476 re_bitset_ptr_t sbcset;
3477 const unsigned char *class_name;
3478 reg_syntax_t syntax;
3481 const char *name = (const char *) class_name;
3483 /* In case of REG_ICASE "upper" and "lower" match the both of
3484 upper and lower cases. */
3485 if ((syntax & RE_ICASE)
3486 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3489 #ifdef RE_ENABLE_I18N
3490 /* Check the space of the arrays. */
3491 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3493 /* Not enough, realloc it. */
3494 /* +1 in case of mbcset->nchar_classes is 0. */
3495 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3496 /* Use realloc since array is NULL if *alloc == 0. */
3497 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3498 new_char_class_alloc);
3499 if (BE (new_char_classes == NULL, 0))
3501 mbcset->char_classes = new_char_classes;
3502 *char_class_alloc = new_char_class_alloc;
3504 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3505 #endif /* RE_ENABLE_I18N */
3507 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3508 for (i = 0; i < SBC_MAX; ++i) \
3510 if (ctype_func (i)) \
3512 int ch = trans ? trans[i] : i; \
3513 bitset_set (sbcset, ch); \
3517 if (strcmp (name, "alnum") == 0)
3518 BUILD_CHARCLASS_LOOP (isalnum)
3519 else if (strcmp (name, "cntrl") == 0)
3520 BUILD_CHARCLASS_LOOP (iscntrl)
3521 else if (strcmp (name, "lower") == 0)
3522 BUILD_CHARCLASS_LOOP (islower)
3523 else if (strcmp (name, "space") == 0)
3524 BUILD_CHARCLASS_LOOP (isspace)
3525 else if (strcmp (name, "alpha") == 0)
3526 BUILD_CHARCLASS_LOOP (isalpha)
3527 else if (strcmp (name, "digit") == 0)
3528 BUILD_CHARCLASS_LOOP (isdigit)
3529 else if (strcmp (name, "print") == 0)
3530 BUILD_CHARCLASS_LOOP (isprint)
3531 else if (strcmp (name, "upper") == 0)
3532 BUILD_CHARCLASS_LOOP (isupper)
3533 else if (strcmp (name, "blank") == 0)
3534 BUILD_CHARCLASS_LOOP (isblank)
3535 else if (strcmp (name, "graph") == 0)
3536 BUILD_CHARCLASS_LOOP (isgraph)
3537 else if (strcmp (name, "punct") == 0)
3538 BUILD_CHARCLASS_LOOP (ispunct)
3539 else if (strcmp (name, "xdigit") == 0)
3540 BUILD_CHARCLASS_LOOP (isxdigit)
3548 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3550 unsigned RE_TRANSLATE_TYPE trans;
3551 const unsigned char *class_name;
3552 const unsigned char *extra;
3556 re_bitset_ptr_t sbcset;
3557 #ifdef RE_ENABLE_I18N
3558 re_charset_t *mbcset;
3560 #endif /* not RE_ENABLE_I18N */
3562 re_token_t br_token;
3565 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3566 #ifdef RE_ENABLE_I18N
3567 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3568 #endif /* RE_ENABLE_I18N */
3570 #ifdef RE_ENABLE_I18N
3571 if (BE (sbcset == NULL || mbcset == NULL, 0))
3572 #else /* not RE_ENABLE_I18N */
3573 if (BE (sbcset == NULL, 0))
3574 #endif /* not RE_ENABLE_I18N */
3582 #ifdef RE_ENABLE_I18N
3584 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3585 bitset_set(cset->sbcset, '\0');
3587 mbcset->non_match = 1;
3588 #endif /* not RE_ENABLE_I18N */
3591 /* We don't care the syntax in this case. */
3592 ret = build_charclass (trans, sbcset,
3593 #ifdef RE_ENABLE_I18N
3595 #endif /* RE_ENABLE_I18N */
3598 if (BE (ret != REG_NOERROR, 0))
3601 #ifdef RE_ENABLE_I18N
3602 free_charset (mbcset);
3603 #endif /* RE_ENABLE_I18N */
3607 /* \w match '_' also. */
3608 for (; *extra; extra++)
3609 bitset_set (sbcset, *extra);
3611 /* If it is non-matching list. */
3613 bitset_not (sbcset);
3615 #ifdef RE_ENABLE_I18N
3616 /* Ensure only single byte characters are set. */
3617 if (dfa->mb_cur_max > 1)
3618 bitset_mask (sbcset, dfa->sb_char);
3621 /* Build a tree for simple bracket. */
3622 br_token.type = SIMPLE_BRACKET;
3623 br_token.opr.sbcset = sbcset;
3624 tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3625 if (BE (tree == NULL, 0))
3626 goto build_word_op_espace;
3628 #ifdef RE_ENABLE_I18N
3629 if (dfa->mb_cur_max > 1)
3631 re_token_t alt_token;
3632 bin_tree_t *mbc_tree;
3633 /* Build a tree for complex bracket. */
3634 br_token.type = COMPLEX_BRACKET;
3635 br_token.opr.mbcset = mbcset;
3636 dfa->has_mb_node = 1;
3637 mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3638 if (BE (mbc_tree == NULL, 0))
3639 goto build_word_op_espace;
3640 /* Then join them by ALT node. */
3641 alt_token.type = OP_ALT;
3642 dfa->has_plural_match = 1;
3643 tree = re_dfa_add_tree_node (dfa, tree, mbc_tree, &alt_token);
3644 if (BE (mbc_tree != NULL, 1))
3649 free_charset (mbcset);
3652 #else /* not RE_ENABLE_I18N */
3654 #endif /* not RE_ENABLE_I18N */
3656 build_word_op_espace:
3658 #ifdef RE_ENABLE_I18N
3659 free_charset (mbcset);
3660 #endif /* RE_ENABLE_I18N */
3665 /* This is intended for the expressions like "a{1,3}".
3666 Fetch a number from `input', and return the number.
3667 Return -1, if the number field is empty like "{,1}".
3668 Return -2, If an error is occured. */
3671 fetch_number (input, token, syntax)
3674 reg_syntax_t syntax;
3680 fetch_token (token, input, syntax);
3682 if (BE (token->type == END_OF_RE, 0))
3684 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3686 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3687 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3688 num = (num > RE_DUP_MAX) ? -2 : num;
3693 #ifdef RE_ENABLE_I18N
3695 free_charset (re_charset_t *cset)
3697 re_free (cset->mbchars);
3699 re_free (cset->coll_syms);
3700 re_free (cset->equiv_classes);
3701 re_free (cset->range_starts);
3702 re_free (cset->range_ends);
3704 re_free (cset->char_classes);
3707 #endif /* RE_ENABLE_I18N */
3709 /* Functions for binary tree operation. */
3711 /* Create a tree node. */
3714 create_tree (dfa, left, right, type, index)
3718 re_token_type_t type;
3722 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3724 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3726 if (storage == NULL)
3728 storage->next = dfa->str_tree_storage;
3729 dfa->str_tree_storage = storage;
3730 dfa->str_tree_storage_idx = 0;
3732 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3734 tree->parent = NULL;
3736 tree->right = right;
3738 tree->node_idx = index;
3741 re_node_set_init_empty (&tree->eclosure);
3744 left->parent = tree;
3746 right->parent = tree;
3750 /* Create both a DFA node and a tree for it. */
3753 re_dfa_add_tree_node (dfa, left, right, token)
3757 const re_token_t *token;
3759 int new_idx = re_dfa_add_node (dfa, *token, 0);
3764 return create_tree (dfa, left, right, 0, new_idx);
3767 /* Mark the tree SRC as an optional subexpression. */
3770 mark_opt_subexp (src, dfa)
3771 const bin_tree_t *src;
3774 /* Pass an OPT_SUBEXP_IDX which is != 1 if the duplicated tree is
3776 if (src->type == CONCAT
3777 && src->left->type == NON_TYPE
3778 && dfa->nodes[src->left->node_idx].type == OP_OPEN_SUBEXP)
3779 mark_opt_subexp_iter (src, dfa, dfa->nodes[src->left->node_idx].opr.idx);
3783 /* Recursive tree walker for mark_opt_subexp. */
3786 mark_opt_subexp_iter (src, dfa, idx)
3787 const bin_tree_t *src;
3793 if (src->type == NON_TYPE)
3795 node_idx = src->node_idx;
3796 if ((dfa->nodes[node_idx].type == OP_OPEN_SUBEXP
3797 || dfa->nodes[node_idx].type == OP_CLOSE_SUBEXP)
3798 && dfa->nodes[node_idx].opr.idx == idx)
3799 dfa->nodes[node_idx].opt_subexp = 1;
3802 if (src->left != NULL)
3803 mark_opt_subexp_iter (src->left, dfa, idx);
3805 if (src->right != NULL)
3806 mark_opt_subexp_iter (src->right, dfa, idx);
3810 /* Duplicate the node SRC, and return new node. */
3813 duplicate_tree (src, dfa)
3814 const bin_tree_t *src;
3817 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3819 /* Since node indies must be according to Post-order of the tree,
3820 we must duplicate the left at first. */
3821 if (src->left != NULL)
3823 left = duplicate_tree (src->left, dfa);
3828 /* Secondaly, duplicate the right. */
3829 if (src->right != NULL)
3831 right = duplicate_tree (src->right, dfa);
3836 /* At last, duplicate itself. */
3837 if (src->type == NON_TYPE)
3839 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3840 dfa->nodes[new_node_idx].duplicated = 1;
3841 if (BE (new_node_idx == -1, 0))
3845 new_node_idx = src->type;
3847 new_tree = create_tree (dfa, left, right, src->type, new_node_idx);