posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           int length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  27 static void init_word_char (re_dfa_t *dfa);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 static reg_errcode_t analyze (re_dfa_t *dfa);
  37 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
  38 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
  39 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
  40 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
  41 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  42                                              int top_clone_node, int root_node,
  43                                              unsigned int constraint);
  44 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  45                                      unsigned int constraint);
  46 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  47                                    unsigned int constraint);
  48 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  49 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  50                                          int node, int root);
  51 static void calc_inveclosure (re_dfa_t *dfa);
  52 static int fetch_number (re_string_t *input, re_token_t *token,
  53                          reg_syntax_t syntax);
  54 static void fetch_token (re_token_t *result, re_string_t *input,
  55                          reg_syntax_t syntax);
  56 static int peek_token (re_token_t *token, re_string_t *input,
  57                         reg_syntax_t syntax);
  58 static int peek_token_bracket (re_token_t *token, re_string_t *input,
  59                                reg_syntax_t syntax);
  60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  61                           reg_syntax_t syntax, reg_errcode_t *err);
  62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  63                                   re_token_t *token, reg_syntax_t syntax,
  64                                   int nest, reg_errcode_t *err);
  65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  66                                  re_token_t *token, reg_syntax_t syntax,
  67                                  int nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  69                                      re_token_t *token, reg_syntax_t syntax,
  70                                      int nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  72                                   re_token_t *token, reg_syntax_t syntax,
  73                                   int nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  75                                  re_dfa_t *dfa, re_token_t *token,
  76                                  reg_syntax_t syntax, reg_errcode_t *err);
  77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  78                                       re_token_t *token, reg_syntax_t syntax,
  79                                       reg_errcode_t *err);
  80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  81                                             re_string_t *regexp,
  82                                             re_token_t *token, int token_len,
  83                                             re_dfa_t *dfa,
  84                                             reg_syntax_t syntax,
  85                                             int accept_hyphen);
  86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  87                                           re_string_t *regexp,
  88                                           re_token_t *token);
  89 #ifndef _LIBC
  90 # ifdef RE_ENABLE_I18N
  91 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
  92                                       re_charset_t *mbcset, int *range_alloc,
  93                                       bracket_elem_t *start_elem,
  94                                       bracket_elem_t *end_elem);
  95 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
  96                                              re_charset_t *mbcset,
  97                                              int *coll_sym_alloc,
  98                                              const unsigned char *name);
  99 # else /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 101                                       bracket_elem_t *start_elem,
 102                                       bracket_elem_t *end_elem);
 103 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 104                                              const unsigned char *name);
 105 # endif /* not RE_ENABLE_I18N */
 106 #endif /* not _LIBC */
 107 #ifdef RE_ENABLE_I18N
 108 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 109                                         re_charset_t *mbcset,
 110                                         int *equiv_class_alloc,
 111                                         const unsigned char *name);
 112 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 113                                       re_bitset_ptr_t sbcset,
 114                                       re_charset_t *mbcset,
 115                                       int *char_class_alloc,
 116                                       const unsigned char *class_name,
 117                                       reg_syntax_t syntax);
 118 #else  /* not RE_ENABLE_I18N */
 119 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 120                                         const unsigned char *name);
 121 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 122                                       re_bitset_ptr_t sbcset,
 123                                       const unsigned char *class_name,
 124                                       reg_syntax_t syntax);
 125 #endif /* not RE_ENABLE_I18N */
 126 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 127                                        unsigned RE_TRANSLATE_TYPE trans,
 128                                        const unsigned char *class_name,
 129                                        const unsigned char *extra,
 130                                        int non_match, reg_errcode_t *err);
 131 static bin_tree_t *create_tree (re_dfa_t *dfa,
 132                                 bin_tree_t *left, bin_tree_t *right,
 133                                 re_token_type_t type, int index);
 134 static bin_tree_t *re_dfa_add_tree_node (re_dfa_t *dfa,
 135                                          bin_tree_t *left, bin_tree_t *right,
 136                                          const re_token_t *token)
 137   __attribute ((noinline));
 138 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 139 static void mark_opt_subexp (const bin_tree_t *src, re_dfa_t *dfa);
 140 static void mark_opt_subexp_iter (const bin_tree_t *src, re_dfa_t *dfa, int idx);
 141 \f
 142 /* This table gives an error message for each of the error codes listed
 143    in regex.h.  Obviously the order here has to be same as there.
 144    POSIX doesn't require that we do anything for REG_NOERROR,
 145    but why not be nice?  */
 146
 147 const char __re_error_msgid[] attribute_hidden =
 148   {
 149 #define REG_NOERROR_IDX 0
 150     gettext_noop ("Success")    /* REG_NOERROR */
 151     "\0"
 152 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 153     gettext_noop ("No match")   /* REG_NOMATCH */
 154     "\0"
 155 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 156     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 157     "\0"
 158 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 159     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 160     "\0"
 161 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 162     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 163     "\0"
 164 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 165     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 166     "\0"
 167 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 168     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 169     "\0"
 170 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 171     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 172     "\0"
 173 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 174     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 175     "\0"
 176 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 177     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 178     "\0"
 179 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 180     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 181     "\0"
 182 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 183     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 184     "\0"
 185 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 186     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 187     "\0"
 188 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 189     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 190     "\0"
 191 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 192     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 193     "\0"
 194 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 195     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 196     "\0"
 197 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 198     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 199   };
 200
 201 const size_t __re_error_msgid_idx[] attribute_hidden =
 202   {
 203     REG_NOERROR_IDX,
 204     REG_NOMATCH_IDX,
 205     REG_BADPAT_IDX,
 206     REG_ECOLLATE_IDX,
 207     REG_ECTYPE_IDX,
 208     REG_EESCAPE_IDX,
 209     REG_ESUBREG_IDX,
 210     REG_EBRACK_IDX,
 211     REG_EPAREN_IDX,
 212     REG_EBRACE_IDX,
 213     REG_BADBR_IDX,
 214     REG_ERANGE_IDX,
 215     REG_ESPACE_IDX,
 216     REG_BADRPT_IDX,
 217     REG_EEND_IDX,
 218     REG_ESIZE_IDX,
 219     REG_ERPAREN_IDX
 220   };
 221 \f
 222 /* Entry points for GNU code.  */
 223
 224 /* re_compile_pattern is the GNU regular expression compiler: it
 225    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 226    Returns 0 if the pattern was valid, otherwise an error string.
 227
 228    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 229    are set in BUFP on entry.  */
 230
 231 const char *
 232 re_compile_pattern (pattern, length, bufp)
 233     const char *pattern;
 234     size_t length;
 235     struct re_pattern_buffer *bufp;
 236 {
 237   reg_errcode_t ret;
 238
 239   /* And GNU code determines whether or not to get register information
 240      by passing null for the REGS argument to re_match, etc., not by
 241      setting no_sub.  */
 242   bufp->no_sub = 0;
 243
 244   /* Match anchors at newline.  */
 245   bufp->newline_anchor = 1;
 246
 247   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 248
 249   if (!ret)
 250     return NULL;
 251   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 252 }
 253 #ifdef _LIBC
 254 weak_alias (__re_compile_pattern, re_compile_pattern)
 255 #endif
 256
 257 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 258    also be assigned to arbitrarily: each pattern buffer stores its own
 259    syntax, so it can be changed between regex compilations.  */
 260 /* This has no initializer because initialized variables in Emacs
 261    become read-only after dumping.  */
 262 reg_syntax_t re_syntax_options;
 263
 264
 265 /* Specify the precise syntax of regexps for compilation.  This provides
 266    for compatibility for various utilities which historically have
 267    different, incompatible syntaxes.
 268
 269    The argument SYNTAX is a bit mask comprised of the various bits
 270    defined in regex.h.  We return the old syntax.  */
 271
 272 reg_syntax_t
 273 re_set_syntax (syntax)
 274     reg_syntax_t syntax;
 275 {
 276   reg_syntax_t ret = re_syntax_options;
 277
 278   re_syntax_options = syntax;
 279   return ret;
 280 }
 281 #ifdef _LIBC
 282 weak_alias (__re_set_syntax, re_set_syntax)
 283 #endif
 284
 285 int
 286 re_compile_fastmap (bufp)
 287     struct re_pattern_buffer *bufp;
 288 {
 289   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 290   char *fastmap = bufp->fastmap;
 291
 292   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 293   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 294   if (dfa->init_state != dfa->init_state_word)
 295     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 296   if (dfa->init_state != dfa->init_state_nl)
 297     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 298   if (dfa->init_state != dfa->init_state_begbuf)
 299     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 300   bufp->fastmap_accurate = 1;
 301   return 0;
 302 }
 303 #ifdef _LIBC
 304 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 305 #endif
 306
 307 static inline void
 308 __attribute ((always_inline))
 309 re_set_fastmap (char *fastmap, int icase, int ch)
 310 {
 311   fastmap[ch] = 1;
 312   if (icase)
 313     fastmap[tolower (ch)] = 1;
 314 }
 315
 316 /* Helper function for re_compile_fastmap.
 317    Compile fastmap for the initial_state INIT_STATE.  */
 318
 319 static void
 320 re_compile_fastmap_iter (bufp, init_state, fastmap)
 321      regex_t *bufp;
 322      const re_dfastate_t *init_state;
 323      char *fastmap;
 324 {
 325   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 326   int node_cnt;
 327   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 328   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 329     {
 330       int node = init_state->nodes.elems[node_cnt];
 331       re_token_type_t type = dfa->nodes[node].type;
 332
 333       if (type == CHARACTER)
 334         {
 335           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 336 #ifdef RE_ENABLE_I18N
 337           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 338             {
 339               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 340               wchar_t wc;
 341               mbstate_t state;
 342
 343               p = buf;
 344               *p++ = dfa->nodes[node].opr.c;
 345               while (++node < dfa->nodes_len
 346                      && dfa->nodes[node].type == CHARACTER
 347                      && dfa->nodes[node].mb_partial)
 348                 *p++ = dfa->nodes[node].opr.c;
 349               memset (&state, 0, sizeof (state));
 350               if (mbrtowc (&wc, (const char *) buf, p - buf,
 351                            &state) == p - buf
 352                   && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
 353                 re_set_fastmap (fastmap, 0, buf[0]);
 354             }
 355 #endif
 356         }
 357       else if (type == SIMPLE_BRACKET)
 358         {
 359           int i, j, ch;
 360           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 361             for (j = 0; j < UINT_BITS; ++j, ++ch)
 362               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 363                 re_set_fastmap (fastmap, icase, ch);
 364         }
 365 #ifdef RE_ENABLE_I18N
 366       else if (type == COMPLEX_BRACKET)
 367         {
 368           int i;
 369           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 370           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 371               || cset->nranges || cset->nchar_classes)
 372             {
 373 # ifdef _LIBC
 374               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 375                 {
 376                   /* In this case we want to catch the bytes which are
 377                      the first byte of any collation elements.
 378                      e.g. In da_DK, we want to catch 'a' since "aa"
 379                           is a valid collation element, and don't catch
 380                           'b' since 'b' is the only collation element
 381                           which starts from 'b'.  */
 382                   int j, ch;
 383                   const int32_t *table = (const int32_t *)
 384                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 385                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 386                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 387                       if (table[ch] < 0)
 388                         re_set_fastmap (fastmap, icase, ch);
 389                 }
 390 # else
 391               if (dfa->mb_cur_max > 1)
 392                 for (i = 0; i < SBC_MAX; ++i)
 393                   if (__btowc (i) == WEOF)
 394                     re_set_fastmap (fastmap, icase, i);
 395 # endif /* not _LIBC */
 396             }
 397           for (i = 0; i < cset->nmbchars; ++i)
 398             {
 399               char buf[256];
 400               mbstate_t state;
 401               memset (&state, '\0', sizeof (state));
 402               __wcrtomb (buf, cset->mbchars[i], &state);
 403               re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 404               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 405                 {
 406                   __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
 407                   re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 408                 }
 409             }
 410         }
 411 #endif /* RE_ENABLE_I18N */
 412       else if (type == OP_PERIOD
 413 #ifdef RE_ENABLE_I18N
 414                || type == OP_UTF8_PERIOD
 415 #endif /* RE_ENABLE_I18N */
 416                || type == END_OF_RE)
 417         {
 418           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 419           if (type == END_OF_RE)
 420             bufp->can_be_null = 1;
 421           return;
 422         }
 423     }
 424 }
 425 \f
 426 /* Entry point for POSIX code.  */
 427 /* regcomp takes a regular expression as a string and compiles it.
 428
 429    PREG is a regex_t *.  We do not expect any fields to be initialized,
 430    since POSIX says we shouldn't.  Thus, we set
 431
 432      `buffer' to the compiled pattern;
 433      `used' to the length of the compiled pattern;
 434      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 435        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 436        RE_SYNTAX_POSIX_BASIC;
 437      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 438      `fastmap' to an allocated space for the fastmap;
 439      `fastmap_accurate' to zero;
 440      `re_nsub' to the number of subexpressions in PATTERN.
 441
 442    PATTERN is the address of the pattern string.
 443
 444    CFLAGS is a series of bits which affect compilation.
 445
 446      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 447      use POSIX basic syntax.
 448
 449      If REG_NEWLINE is set, then . and [^...] don't match newline.
 450      Also, regexec will try a match beginning after every newline.
 451
 452      If REG_ICASE is set, then we considers upper- and lowercase
 453      versions of letters to be equivalent when matching.
 454
 455      If REG_NOSUB is set, then when PREG is passed to regexec, that
 456      routine will report only success or failure, and nothing about the
 457      registers.
 458
 459    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 460    the return codes and their meanings.)  */
 461
 462 int
 463 regcomp (preg, pattern, cflags)
 464     regex_t *__restrict preg;
 465     const char *__restrict pattern;
 466     int cflags;
 467 {
 468   reg_errcode_t ret;
 469   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 470                          : RE_SYNTAX_POSIX_BASIC);
 471
 472   preg->buffer = NULL;
 473   preg->allocated = 0;
 474   preg->used = 0;
 475
 476   /* Try to allocate space for the fastmap.  */
 477   preg->fastmap = re_malloc (char, SBC_MAX);
 478   if (BE (preg->fastmap == NULL, 0))
 479     return REG_ESPACE;
 480
 481   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 482
 483   /* If REG_NEWLINE is set, newlines are treated differently.  */
 484   if (cflags & REG_NEWLINE)
 485     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 486       syntax &= ~RE_DOT_NEWLINE;
 487       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 488       /* It also changes the matching behavior.  */
 489       preg->newline_anchor = 1;
 490     }
 491   else
 492     preg->newline_anchor = 0;
 493   preg->no_sub = !!(cflags & REG_NOSUB);
 494   preg->translate = NULL;
 495
 496   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 497
 498   /* POSIX doesn't distinguish between an unmatched open-group and an
 499      unmatched close-group: both are REG_EPAREN.  */
 500   if (ret == REG_ERPAREN)
 501     ret = REG_EPAREN;
 502
 503   /* We have already checked preg->fastmap != NULL.  */
 504   if (BE (ret == REG_NOERROR, 1))
 505     /* Compute the fastmap now, since regexec cannot modify the pattern
 506        buffer.  This function never fails in this implementation.  */
 507     (void) re_compile_fastmap (preg);
 508   else
 509     {
 510       /* Some error occurred while compiling the expression.  */
 511       re_free (preg->fastmap);
 512       preg->fastmap = NULL;
 513     }
 514
 515   return (int) ret;
 516 }
 517 #ifdef _LIBC
 518 weak_alias (__regcomp, regcomp)
 519 #endif
 520
 521 /* Returns a message corresponding to an error code, ERRCODE, returned
 522    from either regcomp or regexec.   We don't use PREG here.  */
 523
 524 size_t
 525 regerror (errcode, preg, errbuf, errbuf_size)
 526     int errcode;
 527     const regex_t *preg;
 528     char *errbuf;
 529     size_t errbuf_size;
 530 {
 531   const char *msg;
 532   size_t msg_size;
 533
 534   if (BE (errcode < 0
 535           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 536                                / sizeof (__re_error_msgid_idx[0])), 0))
 537     /* Only error codes returned by the rest of the code should be passed
 538        to this routine.  If we are given anything else, or if other regex
 539        code generates an invalid error code, then the program has a bug.
 540        Dump core so we can fix it.  */
 541     abort ();
 542
 543   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 544
 545   msg_size = strlen (msg) + 1; /* Includes the null.  */
 546
 547   if (BE (errbuf_size != 0, 1))
 548     {
 549       if (BE (msg_size > errbuf_size, 0))
 550         {
 551 #if defined HAVE_MEMPCPY || defined _LIBC
 552           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 553 #else
 554           memcpy (errbuf, msg, errbuf_size - 1);
 555           errbuf[errbuf_size - 1] = 0;
 556 #endif
 557         }
 558       else
 559         memcpy (errbuf, msg, msg_size);
 560     }
 561
 562   return msg_size;
 563 }
 564 #ifdef _LIBC
 565 weak_alias (__regerror, regerror)
 566 #endif
 567
 568
 569 #ifdef RE_ENABLE_I18N
 570 /* This static array is used for the map to single-byte characters when
 571    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 572    it the same all the time.  UTF-8 is the preferred encoding so this is
 573    a worthwhile optimization.  */
 574 static const bitset utf8_sb_map =
 575 {
 576   /* Set the first 128 bits.  */
 577 # if UINT_MAX == 0xffffffff
 578   0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
 579 # else
 580 #  error "Add case for new unsigned int size"
 581 # endif
 582 };
 583 #endif
 584
 585
 586 static void
 587 free_dfa_content (re_dfa_t *dfa)
 588 {
 589   int i, j;
 590
 591   re_free (dfa->subexps);
 592
 593   if (dfa->nodes)
 594     for (i = 0; i < dfa->nodes_len; ++i)
 595       {
 596         re_token_t *node = dfa->nodes + i;
 597 #ifdef RE_ENABLE_I18N
 598         if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
 599           free_charset (node->opr.mbcset);
 600         else
 601 #endif /* RE_ENABLE_I18N */
 602           if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
 603             re_free (node->opr.sbcset);
 604       }
 605   re_free (dfa->nexts);
 606   for (i = 0; i < dfa->nodes_len; ++i)
 607     {
 608       if (dfa->eclosures != NULL)
 609         re_node_set_free (dfa->eclosures + i);
 610       if (dfa->inveclosures != NULL)
 611         re_node_set_free (dfa->inveclosures + i);
 612       if (dfa->edests != NULL)
 613         re_node_set_free (dfa->edests + i);
 614     }
 615   re_free (dfa->edests);
 616   re_free (dfa->eclosures);
 617   re_free (dfa->inveclosures);
 618   re_free (dfa->nodes);
 619
 620   if (dfa->state_table)
 621     for (i = 0; i <= dfa->state_hash_mask; ++i)
 622       {
 623         struct re_state_table_entry *entry = dfa->state_table + i;
 624         for (j = 0; j < entry->num; ++j)
 625           {
 626             re_dfastate_t *state = entry->array[j];
 627             free_state (state);
 628           }
 629         re_free (entry->array);
 630       }
 631   re_free (dfa->state_table);
 632 #ifdef RE_ENABLE_I18N
 633   if (dfa->sb_char != utf8_sb_map)
 634     re_free (dfa->sb_char);
 635 #endif
 636 #ifdef DEBUG
 637   re_free (dfa->re_str);
 638 #endif
 639
 640   re_free (dfa);
 641 }
 642
 643
 644 /* Free dynamically allocated space used by PREG.  */
 645
 646 void
 647 regfree (preg)
 648     regex_t *preg;
 649 {
 650   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 651   if (BE (dfa != NULL, 1))
 652     free_dfa_content (dfa);
 653   preg->buffer = NULL;
 654   preg->allocated = 0;
 655
 656   re_free (preg->fastmap);
 657   preg->fastmap = NULL;
 658
 659   re_free (preg->translate);
 660   preg->translate = NULL;
 661 }
 662 #ifdef _LIBC
 663 weak_alias (__regfree, regfree)
 664 #endif
 665 \f
 666 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 667    them unless specifically requested.  */
 668
 669 #if defined _REGEX_RE_COMP || defined _LIBC
 670
 671 /* BSD has one and only one pattern buffer.  */
 672 static struct re_pattern_buffer re_comp_buf;
 673
 674 char *
 675 # ifdef _LIBC
 676 /* Make these definitions weak in libc, so POSIX programs can redefine
 677    these names if they don't use our functions, and still use
 678    regcomp/regexec above without link errors.  */
 679 weak_function
 680 # endif
 681 re_comp (s)
 682      const char *s;
 683 {
 684   reg_errcode_t ret;
 685   char *fastmap;
 686
 687   if (!s)
 688     {
 689       if (!re_comp_buf.buffer)
 690         return gettext ("No previous regular expression");
 691       return 0;
 692     }
 693
 694   if (re_comp_buf.buffer)
 695     {
 696       fastmap = re_comp_buf.fastmap;
 697       re_comp_buf.fastmap = NULL;
 698       __regfree (&re_comp_buf);
 699       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 700       re_comp_buf.fastmap = fastmap;
 701     }
 702
 703   if (re_comp_buf.fastmap == NULL)
 704     {
 705       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 706       if (re_comp_buf.fastmap == NULL)
 707         return (char *) gettext (__re_error_msgid
 708                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 709     }
 710
 711   /* Since `re_exec' always passes NULL for the `regs' argument, we
 712      don't need to initialize the pattern buffer fields which affect it.  */
 713
 714   /* Match anchors at newlines.  */
 715   re_comp_buf.newline_anchor = 1;
 716
 717   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 718
 719   if (!ret)
 720     return NULL;
 721
 722   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 723   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 724 }
 725
 726 #ifdef _LIBC
 727 libc_freeres_fn (free_mem)
 728 {
 729   __regfree (&re_comp_buf);
 730 }
 731 #endif
 732
 733 #endif /* _REGEX_RE_COMP */
 734 \f
 735 /* Internal entry point.
 736    Compile the regular expression PATTERN, whose length is LENGTH.
 737    SYNTAX indicate regular expression's syntax.  */
 738
 739 static reg_errcode_t
 740 re_compile_internal (preg, pattern, length, syntax)
 741      regex_t *preg;
 742      const char * pattern;
 743      int length;
 744      reg_syntax_t syntax;
 745 {
 746   reg_errcode_t err = REG_NOERROR;
 747   re_dfa_t *dfa;
 748   re_string_t regexp;
 749
 750   /* Initialize the pattern buffer.  */
 751   preg->fastmap_accurate = 0;
 752   preg->syntax = syntax;
 753   preg->not_bol = preg->not_eol = 0;
 754   preg->used = 0;
 755   preg->re_nsub = 0;
 756   preg->can_be_null = 0;
 757   preg->regs_allocated = REGS_UNALLOCATED;
 758
 759   /* Initialize the dfa.  */
 760   dfa = (re_dfa_t *) preg->buffer;
 761   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 762     {
 763       /* If zero allocated, but buffer is non-null, try to realloc
 764          enough space.  This loses if buffer's address is bogus, but
 765          that is the user's responsibility.  If ->buffer is NULL this
 766          is a simple allocation.  */
 767       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 768       if (dfa == NULL)
 769         return REG_ESPACE;
 770       preg->allocated = sizeof (re_dfa_t);
 771       preg->buffer = (unsigned char *) dfa;
 772     }
 773   preg->used = sizeof (re_dfa_t);
 774
 775   err = init_dfa (dfa, length);
 776   if (BE (err != REG_NOERROR, 0))
 777     {
 778       free_dfa_content (dfa);
 779       preg->buffer = NULL;
 780       preg->allocated = 0;
 781       return err;
 782     }
 783 #ifdef DEBUG
 784   dfa->re_str = re_malloc (char, length + 1);
 785   strncpy (dfa->re_str, pattern, length + 1);
 786 #endif
 787
 788   err = re_string_construct (&regexp, pattern, length, preg->translate,
 789                              syntax & RE_ICASE, dfa);
 790   if (BE (err != REG_NOERROR, 0))
 791     {
 792     re_compile_internal_free_return:
 793       free_workarea_compile (preg);
 794       re_string_destruct (&regexp);
 795       free_dfa_content (dfa);
 796       preg->buffer = NULL;
 797       preg->allocated = 0;
 798       return err;
 799     }
 800
 801   /* Parse the regular expression, and build a structure tree.  */
 802   preg->re_nsub = 0;
 803   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 804   if (BE (dfa->str_tree == NULL, 0))
 805     goto re_compile_internal_free_return;
 806
 807 #ifdef RE_ENABLE_I18N
 808   /* If possible, do searching in single byte encoding to speed things up.  */
 809   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 810     optimize_utf8 (dfa);
 811 #endif
 812
 813   /* Analyze the tree and collect information which is necessary to
 814      create the dfa.  */
 815   err = analyze (dfa);
 816   if (BE (err != REG_NOERROR, 0))
 817     goto re_compile_internal_free_return;
 818
 819   /* Then create the initial state of the dfa.  */
 820   err = create_initial_state (dfa);
 821
 822   /* Release work areas.  */
 823   free_workarea_compile (preg);
 824   re_string_destruct (&regexp);
 825
 826   if (BE (err != REG_NOERROR, 0))
 827     {
 828       free_dfa_content (dfa);
 829       preg->buffer = NULL;
 830       preg->allocated = 0;
 831     }
 832
 833   return err;
 834 }
 835
 836 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 837    as the initial length of some arrays.  */
 838
 839 static reg_errcode_t
 840 init_dfa (dfa, pat_len)
 841      re_dfa_t *dfa;
 842      int pat_len;
 843 {
 844   int table_size;
 845 #ifndef _LIBC
 846   char *codeset_name;
 847 #endif
 848
 849   memset (dfa, '\0', sizeof (re_dfa_t));
 850
 851   /* Force allocation of str_tree_storage the first time.  */
 852   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 853
 854   dfa->nodes_alloc = pat_len + 1;
 855   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 856
 857   dfa->states_alloc = pat_len + 1;
 858
 859   /*  table_size = 2 ^ ceil(log pat_len) */
 860   for (table_size = 1; table_size > 0; table_size <<= 1)
 861     if (table_size > pat_len)
 862       break;
 863
 864   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 865   dfa->state_hash_mask = table_size - 1;
 866
 867   dfa->subexps_alloc = 1;
 868   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
 869
 870   dfa->mb_cur_max = MB_CUR_MAX;
 871 #ifdef _LIBC
 872   if (dfa->mb_cur_max == 6
 873       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 874     dfa->is_utf8 = 1;
 875   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 876                        != 0);
 877 #else
 878 # ifdef HAVE_LANGINFO_CODESET
 879   codeset_name = nl_langinfo (CODESET);
 880 # else
 881   codeset_name = getenv ("LC_ALL");
 882   if (codeset_name == NULL || codeset[0] == '\0')
 883     codeset_name = getenv ("LC_CTYPE");
 884   if (codeset_name == NULL || codeset[0] == '\0')
 885     codeset_name = getenv ("LANG");
 886   if (codeset_name == NULL)
 887     codeset_name = "";
 888   else if (strchr (codeset_name, '.') !=  NULL)
 889     codeset_name = strchr (codeset_name, '.') + 1;
 890 # endif
 891
 892   if (strcasecmp (codeset_name, "UTF-8") == 0
 893       || strcasecmp (codeset_name, "UTF8") == 0)
 894     dfa->is_utf8 = 1;
 895
 896   /* We check exhaustively in the loop below if this charset is a
 897      superset of ASCII.  */
 898   dfa->map_notascii = 0;
 899 #endif
 900
 901 #ifdef RE_ENABLE_I18N
 902   if (dfa->mb_cur_max > 1)
 903     {
 904       if (dfa->is_utf8)
 905         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 906       else
 907         {
 908           int i, j, ch;
 909
 910           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 911           if (BE (dfa->sb_char == NULL, 0))
 912             return REG_ESPACE;
 913
 914           /* Clear all bits by, then set those corresponding to single
 915              byte chars.  */
 916           bitset_empty (dfa->sb_char);
 917
 918           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 919             for (j = 0; j < UINT_BITS; ++j, ++ch)
 920               {
 921                 wchar_t wch = __btowc (ch);
 922                 if (wch != WEOF)
 923                   dfa->sb_char[i] |= 1 << j;
 924 # ifndef _LIBC
 925                 if (isascii (ch) && wch != (wchar_t) ch)
 926                   dfa->map_notascii = 1;
 927 # endif
 928               }
 929         }
 930     }
 931 #endif
 932
 933   if (BE (dfa->nodes == NULL || dfa->state_table == NULL
 934           || dfa->subexps == NULL, 0))
 935     return REG_ESPACE;
 936   return REG_NOERROR;
 937 }
 938
 939 /* Initialize WORD_CHAR table, which indicate which character is
 940    "word".  In this case "word" means that it is the word construction
 941    character used by some operators like "\<", "\>", etc.  */
 942
 943 static void
 944 init_word_char (dfa)
 945      re_dfa_t *dfa;
 946 {
 947   int i, j, ch;
 948   dfa->word_ops_used = 1;
 949   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 950     for (j = 0; j < UINT_BITS; ++j, ++ch)
 951       if (isalnum (ch) || ch == '_')
 952         dfa->word_char[i] |= 1 << j;
 953 }
 954
 955 /* Free the work area which are only used while compiling.  */
 956
 957 static void
 958 free_workarea_compile (preg)
 959      regex_t *preg;
 960 {
 961   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 962   bin_tree_storage_t *storage, *next;
 963   for (storage = dfa->str_tree_storage; storage; storage = next)
 964     {
 965       next = storage->next;
 966       re_free (storage);
 967     }
 968   dfa->str_tree_storage = NULL;
 969   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 970   dfa->str_tree = NULL;
 971   re_free (dfa->org_indices);
 972   dfa->org_indices = NULL;
 973 }
 974
 975 /* Create initial states for all contexts.  */
 976
 977 static reg_errcode_t
 978 create_initial_state (dfa)
 979      re_dfa_t *dfa;
 980 {
 981   int first, i;
 982   reg_errcode_t err;
 983   re_node_set init_nodes;
 984
 985   /* Initial states have the epsilon closure of the node which is
 986      the first node of the regular expression.  */
 987   first = dfa->str_tree->first;
 988   dfa->init_node = first;
 989   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 990   if (BE (err != REG_NOERROR, 0))
 991     return err;
 992
 993   /* The back-references which are in initial states can epsilon transit,
 994      since in this case all of the subexpressions can be null.
 995      Then we add epsilon closures of the nodes which are the next nodes of
 996      the back-references.  */
 997   if (dfa->nbackref > 0)
 998     for (i = 0; i < init_nodes.nelem; ++i)
 999       {
1000         int node_idx = init_nodes.elems[i];
1001         re_token_type_t type = dfa->nodes[node_idx].type;
1002
1003         int clexp_idx;
1004         if (type != OP_BACK_REF)
1005           continue;
1006         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1007           {
1008             re_token_t *clexp_node;
1009             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1010             if (clexp_node->type == OP_CLOSE_SUBEXP
1011                 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
1012               break;
1013           }
1014         if (clexp_idx == init_nodes.nelem)
1015           continue;
1016
1017         if (type == OP_BACK_REF)
1018           {
1019             int dest_idx = dfa->edests[node_idx].elems[0];
1020             if (!re_node_set_contains (&init_nodes, dest_idx))
1021               {
1022                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1023                 i = 0;
1024               }
1025           }
1026       }
1027
1028   /* It must be the first time to invoke acquire_state.  */
1029   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1030   /* We don't check ERR here, since the initial state must not be NULL.  */
1031   if (BE (dfa->init_state == NULL, 0))
1032     return err;
1033   if (dfa->init_state->has_constraint)
1034     {
1035       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1036                                                        CONTEXT_WORD);
1037       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1038                                                      CONTEXT_NEWLINE);
1039       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1040                                                          &init_nodes,
1041                                                          CONTEXT_NEWLINE
1042                                                          | CONTEXT_BEGBUF);
1043       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1044               || dfa->init_state_begbuf == NULL, 0))
1045         return err;
1046     }
1047   else
1048     dfa->init_state_word = dfa->init_state_nl
1049       = dfa->init_state_begbuf = dfa->init_state;
1050
1051   re_node_set_free (&init_nodes);
1052   return REG_NOERROR;
1053 }
1054 \f
1055 #ifdef RE_ENABLE_I18N
1056 /* If it is possible to do searching in single byte encoding instead of UTF-8
1057    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1058    DFA nodes where needed.  */
1059
1060 static void
1061 optimize_utf8 (dfa)
1062      re_dfa_t *dfa;
1063 {
1064   int node, i, mb_chars = 0, has_period = 0;
1065
1066   for (node = 0; node < dfa->nodes_len; ++node)
1067     switch (dfa->nodes[node].type)
1068       {
1069       case CHARACTER:
1070         if (dfa->nodes[node].opr.c >= 0x80)
1071           mb_chars = 1;
1072         break;
1073       case ANCHOR:
1074         switch (dfa->nodes[node].opr.idx)
1075           {
1076           case LINE_FIRST:
1077           case LINE_LAST:
1078           case BUF_FIRST:
1079           case BUF_LAST:
1080             break;
1081           default:
1082             /* Word anchors etc. cannot be handled.  */
1083             return;
1084           }
1085         break;
1086       case OP_PERIOD:
1087         has_period = 1;
1088         break;
1089       case OP_BACK_REF:
1090       case OP_ALT:
1091       case END_OF_RE:
1092       case OP_DUP_ASTERISK:
1093       case OP_DUP_QUESTION:
1094       case OP_OPEN_SUBEXP:
1095       case OP_CLOSE_SUBEXP:
1096         break;
1097       case SIMPLE_BRACKET:
1098         /* Just double check.  */
1099         for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1100           if (dfa->nodes[node].opr.sbcset[i])
1101             return;
1102         break;
1103       default:
1104         return;
1105       }
1106
1107   if (mb_chars || has_period)
1108     for (node = 0; node < dfa->nodes_len; ++node)
1109       {
1110         if (dfa->nodes[node].type == CHARACTER
1111             && dfa->nodes[node].opr.c >= 0x80)
1112           dfa->nodes[node].mb_partial = 0;
1113         else if (dfa->nodes[node].type == OP_PERIOD)
1114           dfa->nodes[node].type = OP_UTF8_PERIOD;
1115       }
1116
1117   /* The search can be in single byte locale.  */
1118   dfa->mb_cur_max = 1;
1119   dfa->is_utf8 = 0;
1120   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1121 }
1122 #endif
1123 \f
1124 /* Analyze the structure tree, and calculate "first", "next", "edest",
1125    "eclosure", and "inveclosure".  */
1126
1127 static reg_errcode_t
1128 analyze (dfa)
1129      re_dfa_t *dfa;
1130 {
1131   int i;
1132   reg_errcode_t ret;
1133
1134   /* Allocate arrays.  */
1135   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1136   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1137   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1138   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1139   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1140   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1141           || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
1142     return REG_ESPACE;
1143   /* Initialize them.  */
1144   for (i = 0; i < dfa->nodes_len; ++i)
1145     {
1146       dfa->nexts[i] = -1;
1147       re_node_set_init_empty (dfa->edests + i);
1148       re_node_set_init_empty (dfa->eclosures + i);
1149       re_node_set_init_empty (dfa->inveclosures + i);
1150     }
1151
1152   ret = analyze_tree (dfa, dfa->str_tree);
1153   if (BE (ret == REG_NOERROR, 1))
1154     {
1155       ret = calc_eclosure (dfa);
1156       if (ret == REG_NOERROR)
1157         calc_inveclosure (dfa);
1158     }
1159   return ret;
1160 }
1161
1162 /* Helper functions for analyze.
1163    This function calculate "first", "next", and "edest" for the subtree
1164    whose root is NODE.  */
1165
1166 static reg_errcode_t
1167 analyze_tree (dfa, node)
1168      re_dfa_t *dfa;
1169      bin_tree_t *node;
1170 {
1171   reg_errcode_t ret;
1172   if (node->first == -1)
1173     calc_first (dfa, node);
1174   if (node->next == -1)
1175     calc_next (dfa, node);
1176   if (node->eclosure.nelem == 0)
1177     calc_epsdest (dfa, node);
1178   /* Calculate "first" etc. for the left child.  */
1179   if (node->left != NULL)
1180     {
1181       ret = analyze_tree (dfa, node->left);
1182       if (BE (ret != REG_NOERROR, 0))
1183         return ret;
1184     }
1185   /* Calculate "first" etc. for the right child.  */
1186   if (node->right != NULL)
1187     {
1188       ret = analyze_tree (dfa, node->right);
1189       if (BE (ret != REG_NOERROR, 0))
1190         return ret;
1191     }
1192   return REG_NOERROR;
1193 }
1194
1195 /* Calculate "first" for the node NODE.  */
1196 static void
1197 calc_first (dfa, node)
1198      re_dfa_t *dfa;
1199      bin_tree_t *node;
1200 {
1201   int idx, type;
1202   idx = node->node_idx;
1203   type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1204
1205   switch (type)
1206     {
1207 #ifdef DEBUG
1208     case OP_OPEN_BRACKET:
1209     case OP_CLOSE_BRACKET:
1210     case OP_OPEN_DUP_NUM:
1211     case OP_CLOSE_DUP_NUM:
1212     case OP_DUP_PLUS:
1213     case OP_NON_MATCH_LIST:
1214     case OP_OPEN_COLL_ELEM:
1215     case OP_CLOSE_COLL_ELEM:
1216     case OP_OPEN_EQUIV_CLASS:
1217     case OP_CLOSE_EQUIV_CLASS:
1218     case OP_OPEN_CHAR_CLASS:
1219     case OP_CLOSE_CHAR_CLASS:
1220       /* These must not appear here.  */
1221       assert (0);
1222 #endif
1223     case END_OF_RE:
1224     case CHARACTER:
1225     case OP_PERIOD:
1226     case OP_DUP_ASTERISK:
1227     case OP_DUP_QUESTION:
1228 #ifdef RE_ENABLE_I18N
1229     case OP_UTF8_PERIOD:
1230     case COMPLEX_BRACKET:
1231 #endif /* RE_ENABLE_I18N */
1232     case SIMPLE_BRACKET:
1233     case OP_BACK_REF:
1234     case ANCHOR:
1235     case OP_OPEN_SUBEXP:
1236     case OP_CLOSE_SUBEXP:
1237       node->first = idx;
1238       break;
1239     case OP_ALT:
1240       node->first = idx;
1241       break;
1242       /* else fall through */
1243     default:
1244 #ifdef DEBUG
1245       assert (node->left != NULL);
1246 #endif
1247       if (node->left->first == -1)
1248         calc_first (dfa, node->left);
1249       node->first = node->left->first;
1250       break;
1251     }
1252 }
1253
1254 /* Calculate "next" for the node NODE.  */
1255
1256 static void
1257 calc_next (dfa, node)
1258      re_dfa_t *dfa;
1259      bin_tree_t *node;
1260 {
1261   int idx, type;
1262   bin_tree_t *parent = node->parent;
1263   if (parent == NULL)
1264     {
1265       node->next = -1;
1266       idx = node->node_idx;
1267       if (node->type == 0)
1268         dfa->nexts[idx] = node->next;
1269       return;
1270     }
1271
1272   idx = parent->node_idx;
1273   type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1274
1275   switch (type)
1276     {
1277     case OP_DUP_ASTERISK:
1278       node->next = idx;
1279       break;
1280     case CONCAT:
1281       if (parent->left == node)
1282         {
1283           if (parent->right->first == -1)
1284             calc_first (dfa, parent->right);
1285           node->next = parent->right->first;
1286           break;
1287         }
1288       /* else fall through */
1289     default:
1290       if (parent->next == -1)
1291         calc_next (dfa, parent);
1292       node->next = parent->next;
1293       break;
1294     }
1295   idx = node->node_idx;
1296   if (node->type == 0)
1297     dfa->nexts[idx] = node->next;
1298 }
1299
1300 /* Calculate "edest" for the node NODE.  */
1301
1302 static void
1303 calc_epsdest (dfa, node)
1304      re_dfa_t *dfa;
1305      bin_tree_t *node;
1306 {
1307   int idx;
1308   idx = node->node_idx;
1309   if (node->type == 0)
1310     {
1311       if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1312           || dfa->nodes[idx].type == OP_DUP_QUESTION)
1313         {
1314           if (node->left->first == -1)
1315             calc_first (dfa, node->left);
1316           if (node->next == -1)
1317             calc_next (dfa, node);
1318           re_node_set_init_2 (dfa->edests + idx, node->left->first,
1319                               node->next);
1320         }
1321       else if (dfa->nodes[idx].type == OP_ALT)
1322         {
1323           int left, right;
1324           if (node->left != NULL)
1325             {
1326               if (node->left->first == -1)
1327                 calc_first (dfa, node->left);
1328               left = node->left->first;
1329             }
1330           else
1331             {
1332               if (node->next == -1)
1333                 calc_next (dfa, node);
1334               left = node->next;
1335             }
1336           if (node->right != NULL)
1337             {
1338               if (node->right->first == -1)
1339                 calc_first (dfa, node->right);
1340               right = node->right->first;
1341             }
1342           else
1343             {
1344               if (node->next == -1)
1345                 calc_next (dfa, node);
1346               right = node->next;
1347             }
1348           re_node_set_init_2 (dfa->edests + idx, left, right);
1349         }
1350       else if (dfa->nodes[idx].type == ANCHOR
1351                || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1352                || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1353                || dfa->nodes[idx].type == OP_BACK_REF)
1354         re_node_set_init_1 (dfa->edests + idx, node->next);
1355       else
1356         assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
1357     }
1358 }
1359
1360 /* Duplicate the epsilon closure of the node ROOT_NODE.
1361    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1362    to their own constraint.  */
1363
1364 static reg_errcode_t
1365 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1366                         init_constraint)
1367      re_dfa_t *dfa;
1368      int top_org_node, top_clone_node, root_node;
1369      unsigned int init_constraint;
1370 {
1371   reg_errcode_t err;
1372   int org_node, clone_node, ret;
1373   unsigned int constraint = init_constraint;
1374   for (org_node = top_org_node, clone_node = top_clone_node;;)
1375     {
1376       int org_dest, clone_dest;
1377       if (dfa->nodes[org_node].type == OP_BACK_REF)
1378         {
1379           /* If the back reference epsilon-transit, its destination must
1380              also have the constraint.  Then duplicate the epsilon closure
1381              of the destination of the back reference, and store it in
1382              edests of the back reference.  */
1383           org_dest = dfa->nexts[org_node];
1384           re_node_set_empty (dfa->edests + clone_node);
1385           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1386           if (BE (err != REG_NOERROR, 0))
1387             return err;
1388           dfa->nexts[clone_node] = dfa->nexts[org_node];
1389           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1390           if (BE (ret < 0, 0))
1391             return REG_ESPACE;
1392         }
1393       else if (dfa->edests[org_node].nelem == 0)
1394         {
1395           /* In case of the node can't epsilon-transit, don't duplicate the
1396              destination and store the original destination as the
1397              destination of the node.  */
1398           dfa->nexts[clone_node] = dfa->nexts[org_node];
1399           break;
1400         }
1401       else if (dfa->edests[org_node].nelem == 1)
1402         {
1403           /* In case of the node can epsilon-transit, and it has only one
1404              destination.  */
1405           org_dest = dfa->edests[org_node].elems[0];
1406           re_node_set_empty (dfa->edests + clone_node);
1407           if (dfa->nodes[org_node].type == ANCHOR)
1408             {
1409               /* In case of the node has another constraint, append it.  */
1410               if (org_node == root_node && clone_node != org_node)
1411                 {
1412                   /* ...but if the node is root_node itself, it means the
1413                      epsilon closure have a loop, then tie it to the
1414                      destination of the root_node.  */
1415                   ret = re_node_set_insert (dfa->edests + clone_node,
1416                                             org_dest);
1417                   if (BE (ret < 0, 0))
1418                     return REG_ESPACE;
1419                   break;
1420                 }
1421               constraint |= dfa->nodes[org_node].opr.ctx_type;
1422             }
1423           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1424           if (BE (err != REG_NOERROR, 0))
1425             return err;
1426           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1427           if (BE (ret < 0, 0))
1428             return REG_ESPACE;
1429         }
1430       else /* dfa->edests[org_node].nelem == 2 */
1431         {
1432           /* In case of the node can epsilon-transit, and it has two
1433              destinations. E.g. '|', '*', '+', '?'.   */
1434           org_dest = dfa->edests[org_node].elems[0];
1435           re_node_set_empty (dfa->edests + clone_node);
1436           /* Search for a duplicated node which satisfies the constraint.  */
1437           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1438           if (clone_dest == -1)
1439             {
1440               /* There are no such a duplicated node, create a new one.  */
1441               err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1442               if (BE (err != REG_NOERROR, 0))
1443                 return err;
1444               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1445               if (BE (ret < 0, 0))
1446                 return REG_ESPACE;
1447               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1448                                             root_node, constraint);
1449               if (BE (err != REG_NOERROR, 0))
1450                 return err;
1451             }
1452           else
1453             {
1454               /* There are a duplicated node which satisfy the constraint,
1455                  use it to avoid infinite loop.  */
1456               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1457               if (BE (ret < 0, 0))
1458                 return REG_ESPACE;
1459             }
1460
1461           org_dest = dfa->edests[org_node].elems[1];
1462           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1463           if (BE (err != REG_NOERROR, 0))
1464             return err;
1465           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1466           if (BE (ret < 0, 0))
1467             return REG_ESPACE;
1468         }
1469       org_node = org_dest;
1470       clone_node = clone_dest;
1471     }
1472   return REG_NOERROR;
1473 }
1474
1475 /* Search for a node which is duplicated from the node ORG_NODE, and
1476    satisfies the constraint CONSTRAINT.  */
1477
1478 static int
1479 search_duplicated_node (dfa, org_node, constraint)
1480      re_dfa_t *dfa;
1481      int org_node;
1482      unsigned int constraint;
1483 {
1484   int idx;
1485   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1486     {
1487       if (org_node == dfa->org_indices[idx]
1488           && constraint == dfa->nodes[idx].constraint)
1489         return idx; /* Found.  */
1490     }
1491   return -1; /* Not found.  */
1492 }
1493
1494 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1495    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1496    otherwise return the error code.  */
1497
1498 static reg_errcode_t
1499 duplicate_node (new_idx, dfa, org_idx, constraint)
1500      re_dfa_t *dfa;
1501      int *new_idx, org_idx;
1502      unsigned int constraint;
1503 {
1504   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1);
1505   if (BE (dup_idx == -1, 0))
1506     return REG_ESPACE;
1507   dfa->nodes[dup_idx].constraint = constraint;
1508   if (dfa->nodes[org_idx].type == ANCHOR)
1509     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1510   dfa->nodes[dup_idx].duplicated = 1;
1511   re_node_set_init_empty (dfa->edests + dup_idx);
1512   re_node_set_init_empty (dfa->eclosures + dup_idx);
1513   re_node_set_init_empty (dfa->inveclosures + dup_idx);
1514
1515   /* Store the index of the original node.  */
1516   dfa->org_indices[dup_idx] = org_idx;
1517   *new_idx = dup_idx;
1518   return REG_NOERROR;
1519 }
1520
1521 static void
1522 calc_inveclosure (dfa)
1523      re_dfa_t *dfa;
1524 {
1525   int src, idx, dest;
1526   for (src = 0; src < dfa->nodes_len; ++src)
1527     {
1528       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1529         {
1530           dest = dfa->eclosures[src].elems[idx];
1531           re_node_set_insert (dfa->inveclosures + dest, src);
1532         }
1533     }
1534 }
1535
1536 /* Calculate "eclosure" for all the node in DFA.  */
1537
1538 static reg_errcode_t
1539 calc_eclosure (dfa)
1540      re_dfa_t *dfa;
1541 {
1542   int node_idx, incomplete;
1543 #ifdef DEBUG
1544   assert (dfa->nodes_len > 0);
1545 #endif
1546   incomplete = 0;
1547   /* For each nodes, calculate epsilon closure.  */
1548   for (node_idx = 0; ; ++node_idx)
1549     {
1550       reg_errcode_t err;
1551       re_node_set eclosure_elem;
1552       if (node_idx == dfa->nodes_len)
1553         {
1554           if (!incomplete)
1555             break;
1556           incomplete = 0;
1557           node_idx = 0;
1558         }
1559
1560 #ifdef DEBUG
1561       assert (dfa->eclosures[node_idx].nelem != -1);
1562 #endif
1563       /* If we have already calculated, skip it.  */
1564       if (dfa->eclosures[node_idx].nelem != 0)
1565         continue;
1566       /* Calculate epsilon closure of `node_idx'.  */
1567       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1568       if (BE (err != REG_NOERROR, 0))
1569         return err;
1570
1571       if (dfa->eclosures[node_idx].nelem == 0)
1572         {
1573           incomplete = 1;
1574           re_node_set_free (&eclosure_elem);
1575         }
1576     }
1577   return REG_NOERROR;
1578 }
1579
1580 /* Calculate epsilon closure of NODE.  */
1581
1582 static reg_errcode_t
1583 calc_eclosure_iter (new_set, dfa, node, root)
1584      re_node_set *new_set;
1585      re_dfa_t *dfa;
1586      int node, root;
1587 {
1588   reg_errcode_t err;
1589   unsigned int constraint;
1590   int i, incomplete;
1591   re_node_set eclosure;
1592   incomplete = 0;
1593   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1594   if (BE (err != REG_NOERROR, 0))
1595     return err;
1596
1597   /* This indicates that we are calculating this node now.
1598      We reference this value to avoid infinite loop.  */
1599   dfa->eclosures[node].nelem = -1;
1600
1601   constraint = ((dfa->nodes[node].type == ANCHOR)
1602                 ? dfa->nodes[node].opr.ctx_type : 0);
1603   /* If the current node has constraints, duplicate all nodes.
1604      Since they must inherit the constraints.  */
1605   if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1606     {
1607       int org_node, cur_node;
1608       org_node = cur_node = node;
1609       err = duplicate_node_closure (dfa, node, node, node, constraint);
1610       if (BE (err != REG_NOERROR, 0))
1611         return err;
1612     }
1613
1614   /* Expand each epsilon destination nodes.  */
1615   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1616     for (i = 0; i < dfa->edests[node].nelem; ++i)
1617       {
1618         re_node_set eclosure_elem;
1619         int edest = dfa->edests[node].elems[i];
1620         /* If calculating the epsilon closure of `edest' is in progress,
1621            return intermediate result.  */
1622         if (dfa->eclosures[edest].nelem == -1)
1623           {
1624             incomplete = 1;
1625             continue;
1626           }
1627         /* If we haven't calculated the epsilon closure of `edest' yet,
1628            calculate now. Otherwise use calculated epsilon closure.  */
1629         if (dfa->eclosures[edest].nelem == 0)
1630           {
1631             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1632             if (BE (err != REG_NOERROR, 0))
1633               return err;
1634           }
1635         else
1636           eclosure_elem = dfa->eclosures[edest];
1637         /* Merge the epsilon closure of `edest'.  */
1638         re_node_set_merge (&eclosure, &eclosure_elem);
1639         /* If the epsilon closure of `edest' is incomplete,
1640            the epsilon closure of this node is also incomplete.  */
1641         if (dfa->eclosures[edest].nelem == 0)
1642           {
1643             incomplete = 1;
1644             re_node_set_free (&eclosure_elem);
1645           }
1646       }
1647
1648   /* Epsilon closures include itself.  */
1649   re_node_set_insert (&eclosure, node);
1650   if (incomplete && !root)
1651     dfa->eclosures[node].nelem = 0;
1652   else
1653     dfa->eclosures[node] = eclosure;
1654   *new_set = eclosure;
1655   return REG_NOERROR;
1656 }
1657 \f
1658 /* Functions for token which are used in the parser.  */
1659
1660 /* Fetch a token from INPUT.
1661    We must not use this function inside bracket expressions.  */
1662
1663 static void
1664 fetch_token (result, input, syntax)
1665      re_token_t *result;
1666      re_string_t *input;
1667      reg_syntax_t syntax;
1668 {
1669   re_string_skip_bytes (input, peek_token (result, input, syntax));
1670 }
1671
1672 /* Peek a token from INPUT, and return the length of the token.
1673    We must not use this function inside bracket expressions.  */
1674
1675 static int
1676 peek_token (token, input, syntax)
1677      re_token_t *token;
1678      re_string_t *input;
1679      reg_syntax_t syntax;
1680 {
1681   unsigned char c;
1682
1683   if (re_string_eoi (input))
1684     {
1685       token->type = END_OF_RE;
1686       return 0;
1687     }
1688
1689   c = re_string_peek_byte (input, 0);
1690   token->opr.c = c;
1691
1692   token->word_char = 0;
1693 #ifdef RE_ENABLE_I18N
1694   token->mb_partial = 0;
1695   if (input->mb_cur_max > 1 &&
1696       !re_string_first_byte (input, re_string_cur_idx (input)))
1697     {
1698       token->type = CHARACTER;
1699       token->mb_partial = 1;
1700       return 1;
1701     }
1702 #endif
1703   if (c == '\\')
1704     {
1705       unsigned char c2;
1706       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1707         {
1708           token->type = BACK_SLASH;
1709           return 1;
1710         }
1711
1712       c2 = re_string_peek_byte_case (input, 1);
1713       token->opr.c = c2;
1714       token->type = CHARACTER;
1715 #ifdef RE_ENABLE_I18N
1716       if (input->mb_cur_max > 1)
1717         {
1718           wint_t wc = re_string_wchar_at (input,
1719                                           re_string_cur_idx (input) + 1);
1720           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1721         }
1722       else
1723 #endif
1724         token->word_char = IS_WORD_CHAR (c2) != 0;
1725
1726       switch (c2)
1727         {
1728         case '|':
1729           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1730             token->type = OP_ALT;
1731           break;
1732         case '1': case '2': case '3': case '4': case '5':
1733         case '6': case '7': case '8': case '9':
1734           if (!(syntax & RE_NO_BK_REFS))
1735             {
1736               token->type = OP_BACK_REF;
1737               token->opr.idx = c2 - '0';
1738             }
1739           break;
1740         case '<':
1741           if (!(syntax & RE_NO_GNU_OPS))
1742             {
1743               token->type = ANCHOR;
1744               token->opr.ctx_type = WORD_FIRST;
1745             }
1746           break;
1747         case '>':
1748           if (!(syntax & RE_NO_GNU_OPS))
1749             {
1750               token->type = ANCHOR;
1751               token->opr.ctx_type = WORD_LAST;
1752             }
1753           break;
1754         case 'b':
1755           if (!(syntax & RE_NO_GNU_OPS))
1756             {
1757               token->type = ANCHOR;
1758               token->opr.ctx_type = WORD_DELIM;
1759             }
1760           break;
1761         case 'B':
1762           if (!(syntax & RE_NO_GNU_OPS))
1763             {
1764               token->type = ANCHOR;
1765               token->opr.ctx_type = INSIDE_WORD;
1766             }
1767           break;
1768         case 'w':
1769           if (!(syntax & RE_NO_GNU_OPS))
1770             token->type = OP_WORD;
1771           break;
1772         case 'W':
1773           if (!(syntax & RE_NO_GNU_OPS))
1774             token->type = OP_NOTWORD;
1775           break;
1776         case 's':
1777           if (!(syntax & RE_NO_GNU_OPS))
1778             token->type = OP_SPACE;
1779           break;
1780         case 'S':
1781           if (!(syntax & RE_NO_GNU_OPS))
1782             token->type = OP_NOTSPACE;
1783           break;
1784         case '`':
1785           if (!(syntax & RE_NO_GNU_OPS))
1786             {
1787               token->type = ANCHOR;
1788               token->opr.ctx_type = BUF_FIRST;
1789             }
1790           break;
1791         case '\'':
1792           if (!(syntax & RE_NO_GNU_OPS))
1793             {
1794               token->type = ANCHOR;
1795               token->opr.ctx_type = BUF_LAST;
1796             }
1797           break;
1798         case '(':
1799           if (!(syntax & RE_NO_BK_PARENS))
1800             token->type = OP_OPEN_SUBEXP;
1801           break;
1802         case ')':
1803           if (!(syntax & RE_NO_BK_PARENS))
1804             token->type = OP_CLOSE_SUBEXP;
1805           break;
1806         case '+':
1807           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1808             token->type = OP_DUP_PLUS;
1809           break;
1810         case '?':
1811           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1812             token->type = OP_DUP_QUESTION;
1813           break;
1814         case '{':
1815           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1816             token->type = OP_OPEN_DUP_NUM;
1817           break;
1818         case '}':
1819           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1820             token->type = OP_CLOSE_DUP_NUM;
1821           break;
1822         default:
1823           break;
1824         }
1825       return 2;
1826     }
1827
1828   token->type = CHARACTER;
1829 #ifdef RE_ENABLE_I18N
1830   if (input->mb_cur_max > 1)
1831     {
1832       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1833       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1834     }
1835   else
1836 #endif
1837     token->word_char = IS_WORD_CHAR (token->opr.c);
1838
1839   switch (c)
1840     {
1841     case '\n':
1842       if (syntax & RE_NEWLINE_ALT)
1843         token->type = OP_ALT;
1844       break;
1845     case '|':
1846       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1847         token->type = OP_ALT;
1848       break;
1849     case '*':
1850       token->type = OP_DUP_ASTERISK;
1851       break;
1852     case '+':
1853       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1854         token->type = OP_DUP_PLUS;
1855       break;
1856     case '?':
1857       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1858         token->type = OP_DUP_QUESTION;
1859       break;
1860     case '{':
1861       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1862         token->type = OP_OPEN_DUP_NUM;
1863       break;
1864     case '}':
1865       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1866         token->type = OP_CLOSE_DUP_NUM;
1867       break;
1868     case '(':
1869       if (syntax & RE_NO_BK_PARENS)
1870         token->type = OP_OPEN_SUBEXP;
1871       break;
1872     case ')':
1873       if (syntax & RE_NO_BK_PARENS)
1874         token->type = OP_CLOSE_SUBEXP;
1875       break;
1876     case '[':
1877       token->type = OP_OPEN_BRACKET;
1878       break;
1879     case '.':
1880       token->type = OP_PERIOD;
1881       break;
1882     case '^':
1883       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1884           re_string_cur_idx (input) != 0)
1885         {
1886           char prev = re_string_peek_byte (input, -1);
1887           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1888             break;
1889         }
1890       token->type = ANCHOR;
1891       token->opr.ctx_type = LINE_FIRST;
1892       break;
1893     case '$':
1894       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1895           re_string_cur_idx (input) + 1 != re_string_length (input))
1896         {
1897           re_token_t next;
1898           re_string_skip_bytes (input, 1);
1899           peek_token (&next, input, syntax);
1900           re_string_skip_bytes (input, -1);
1901           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1902             break;
1903         }
1904       token->type = ANCHOR;
1905       token->opr.ctx_type = LINE_LAST;
1906       break;
1907     default:
1908       break;
1909     }
1910   return 1;
1911 }
1912
1913 /* Peek a token from INPUT, and return the length of the token.
1914    We must not use this function out of bracket expressions.  */
1915
1916 static int
1917 peek_token_bracket (token, input, syntax)
1918      re_token_t *token;
1919      re_string_t *input;
1920      reg_syntax_t syntax;
1921 {
1922   unsigned char c;
1923   if (re_string_eoi (input))
1924     {
1925       token->type = END_OF_RE;
1926       return 0;
1927     }
1928   c = re_string_peek_byte (input, 0);
1929   token->opr.c = c;
1930
1931 #ifdef RE_ENABLE_I18N
1932   if (input->mb_cur_max > 1 &&
1933       !re_string_first_byte (input, re_string_cur_idx (input)))
1934     {
1935       token->type = CHARACTER;
1936       return 1;
1937     }
1938 #endif /* RE_ENABLE_I18N */
1939
1940   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1941       && re_string_cur_idx (input) + 1 < re_string_length (input))
1942     {
1943       /* In this case, '\' escape a character.  */
1944       unsigned char c2;
1945       re_string_skip_bytes (input, 1);
1946       c2 = re_string_peek_byte (input, 0);
1947       token->opr.c = c2;
1948       token->type = CHARACTER;
1949       return 1;
1950     }
1951   if (c == '[') /* '[' is a special char in a bracket exps.  */
1952     {
1953       unsigned char c2;
1954       int token_len;
1955       if (re_string_cur_idx (input) + 1 < re_string_length (input))
1956         c2 = re_string_peek_byte (input, 1);
1957       else
1958         c2 = 0;
1959       token->opr.c = c2;
1960       token_len = 2;
1961       switch (c2)
1962         {
1963         case '.':
1964           token->type = OP_OPEN_COLL_ELEM;
1965           break;
1966         case '=':
1967           token->type = OP_OPEN_EQUIV_CLASS;
1968           break;
1969         case ':':
1970           if (syntax & RE_CHAR_CLASSES)
1971             {
1972               token->type = OP_OPEN_CHAR_CLASS;
1973               break;
1974             }
1975           /* else fall through.  */
1976         default:
1977           token->type = CHARACTER;
1978           token->opr.c = c;
1979           token_len = 1;
1980           break;
1981         }
1982       return token_len;
1983     }
1984   switch (c)
1985     {
1986     case '-':
1987       token->type = OP_CHARSET_RANGE;
1988       break;
1989     case ']':
1990       token->type = OP_CLOSE_BRACKET;
1991       break;
1992     case '^':
1993       token->type = OP_NON_MATCH_LIST;
1994       break;
1995     default:
1996       token->type = CHARACTER;
1997     }
1998   return 1;
1999 }
2000 \f
2001 /* Functions for parser.  */
2002
2003 /* Entry point of the parser.
2004    Parse the regular expression REGEXP and return the structure tree.
2005    If an error is occured, ERR is set by error code, and return NULL.
2006    This function build the following tree, from regular expression <reg_exp>:
2007            CAT
2008            / \
2009           /   \
2010    <reg_exp>  EOR
2011
2012    CAT means concatenation.
2013    EOR means end of regular expression.  */
2014
2015 static bin_tree_t *
2016 parse (regexp, preg, syntax, err)
2017      re_string_t *regexp;
2018      regex_t *preg;
2019      reg_syntax_t syntax;
2020      reg_errcode_t *err;
2021 {
2022   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2023   bin_tree_t *tree, *eor, *root;
2024   re_token_t current_token;
2025   dfa->syntax = syntax;
2026   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2027   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2028   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2029     return NULL;
2030   eor = re_dfa_add_tree_node (dfa, NULL, NULL, &current_token);
2031   if (tree != NULL)
2032     root = create_tree (dfa, tree, eor, CONCAT, 0);
2033   else
2034     root = eor;
2035   if (BE (eor == NULL || root == NULL, 0))
2036     {
2037       *err = REG_ESPACE;
2038       return NULL;
2039     }
2040   return root;
2041 }
2042
2043 /* This function build the following tree, from regular expression
2044    <branch1>|<branch2>:
2045            ALT
2046            / \
2047           /   \
2048    <branch1> <branch2>
2049
2050    ALT means alternative, which represents the operator `|'.  */
2051
2052 static bin_tree_t *
2053 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2054      re_string_t *regexp;
2055      regex_t *preg;
2056      re_token_t *token;
2057      reg_syntax_t syntax;
2058      int nest;
2059      reg_errcode_t *err;
2060 {
2061   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2062   bin_tree_t *tree, *branch = NULL;
2063   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2064   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2065     return NULL;
2066
2067   while (token->type == OP_ALT)
2068     {
2069       re_token_t alt_token = *token;
2070       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2071       if (token->type != OP_ALT && token->type != END_OF_RE
2072           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2073         {
2074           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2075           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2076             return NULL;
2077         }
2078       else
2079         branch = NULL;
2080       tree = re_dfa_add_tree_node (dfa, tree, branch, &alt_token);
2081       if (BE (tree == NULL, 0))
2082         {
2083           *err = REG_ESPACE;
2084           return NULL;
2085         }
2086       dfa->has_plural_match = 1;
2087     }
2088   return tree;
2089 }
2090
2091 /* This function build the following tree, from regular expression
2092    <exp1><exp2>:
2093         CAT
2094         / \
2095        /   \
2096    <exp1> <exp2>
2097
2098    CAT means concatenation.  */
2099
2100 static bin_tree_t *
2101 parse_branch (regexp, preg, token, syntax, nest, err)
2102      re_string_t *regexp;
2103      regex_t *preg;
2104      re_token_t *token;
2105      reg_syntax_t syntax;
2106      int nest;
2107      reg_errcode_t *err;
2108 {
2109   bin_tree_t *tree, *exp;
2110   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2111   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2112   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2113     return NULL;
2114
2115   while (token->type != OP_ALT && token->type != END_OF_RE
2116          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2117     {
2118       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2119       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2120         {
2121           return NULL;
2122         }
2123       if (tree != NULL && exp != NULL)
2124         {
2125           tree = create_tree (dfa, tree, exp, CONCAT, 0);
2126           if (tree == NULL)
2127             {
2128               *err = REG_ESPACE;
2129               return NULL;
2130             }
2131         }
2132       else if (tree == NULL)
2133         tree = exp;
2134       /* Otherwise exp == NULL, we don't need to create new tree.  */
2135     }
2136   return tree;
2137 }
2138
2139 /* This function build the following tree, from regular expression a*:
2140          *
2141          |
2142          a
2143 */
2144
2145 static bin_tree_t *
2146 parse_expression (regexp, preg, token, syntax, nest, err)
2147      re_string_t *regexp;
2148      regex_t *preg;
2149      re_token_t *token;
2150      reg_syntax_t syntax;
2151      int nest;
2152      reg_errcode_t *err;
2153 {
2154   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2155   bin_tree_t *tree;
2156   switch (token->type)
2157     {
2158     case CHARACTER:
2159       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2160       if (BE (tree == NULL, 0))
2161         {
2162           *err = REG_ESPACE;
2163           return NULL;
2164         }
2165 #ifdef RE_ENABLE_I18N
2166       if (dfa->mb_cur_max > 1)
2167         {
2168           while (!re_string_eoi (regexp)
2169                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2170             {
2171               bin_tree_t *mbc_remain;
2172               fetch_token (token, regexp, syntax);
2173               mbc_remain = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2174               tree = create_tree (dfa, tree, mbc_remain, CONCAT, 0);
2175               if (BE (mbc_remain == NULL || tree == NULL, 0))
2176                 {
2177                   *err = REG_ESPACE;
2178                   return NULL;
2179                 }
2180             }
2181         }
2182 #endif
2183       break;
2184     case OP_OPEN_SUBEXP:
2185       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2186       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2187         return NULL;
2188       break;
2189     case OP_OPEN_BRACKET:
2190       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2191       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2192         return NULL;
2193       break;
2194     case OP_BACK_REF:
2195       if (BE (preg->re_nsub < token->opr.idx
2196               || dfa->subexps[token->opr.idx - 1].end == -1, 0))
2197         {
2198           *err = REG_ESUBREG;
2199           return NULL;
2200         }
2201       dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
2202       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2203       if (BE (tree == NULL, 0))
2204         {
2205           *err = REG_ESPACE;
2206           return NULL;
2207         }
2208       ++dfa->nbackref;
2209       dfa->has_mb_node = 1;
2210       break;
2211     case OP_OPEN_DUP_NUM:
2212       if (syntax & RE_CONTEXT_INVALID_DUP)
2213         {
2214           *err = REG_BADRPT;
2215           return NULL;
2216         }
2217       /* FALLTHROUGH */
2218     case OP_DUP_ASTERISK:
2219     case OP_DUP_PLUS:
2220     case OP_DUP_QUESTION:
2221       if (syntax & RE_CONTEXT_INVALID_OPS)
2222         {
2223           *err = REG_BADRPT;
2224           return NULL;
2225         }
2226       else if (syntax & RE_CONTEXT_INDEP_OPS)
2227         {
2228           fetch_token (token, regexp, syntax);
2229           return parse_expression (regexp, preg, token, syntax, nest, err);
2230         }
2231       /* else fall through  */
2232     case OP_CLOSE_SUBEXP:
2233       if ((token->type == OP_CLOSE_SUBEXP) &&
2234           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2235         {
2236           *err = REG_ERPAREN;
2237           return NULL;
2238         }
2239       /* else fall through  */
2240     case OP_CLOSE_DUP_NUM:
2241       /* We treat it as a normal character.  */
2242
2243       /* Then we can these characters as normal characters.  */
2244       token->type = CHARACTER;
2245       /* mb_partial and word_char bits should be initialized already
2246          by peek_token.  */
2247       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2248       if (BE (tree == NULL, 0))
2249         {
2250           *err = REG_ESPACE;
2251           return NULL;
2252         }
2253       break;
2254     case ANCHOR:
2255       if ((token->opr.ctx_type
2256            & (WORD_DELIM | INSIDE_WORD | WORD_FIRST | WORD_LAST))
2257           && dfa->word_ops_used == 0)
2258         init_word_char (dfa);
2259       if (token->opr.ctx_type == WORD_DELIM)
2260         {
2261           bin_tree_t *tree_first, *tree_last;
2262           token->opr.ctx_type = WORD_FIRST;
2263           tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2264           token->opr.ctx_type = WORD_LAST;
2265           tree_last = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2266           token->type = OP_ALT;
2267           tree = re_dfa_add_tree_node (dfa, tree_first, tree_last, token);
2268           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2269             {
2270               *err = REG_ESPACE;
2271               return NULL;
2272             }
2273         }
2274       else
2275         {
2276           tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2277           if (BE (tree == NULL, 0))
2278             {
2279               *err = REG_ESPACE;
2280               return NULL;
2281             }
2282         }
2283       /* We must return here, since ANCHORs can't be followed
2284          by repetition operators.
2285          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2286              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2287       fetch_token (token, regexp, syntax);
2288       return tree;
2289     case OP_PERIOD:
2290       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2291       if (BE (tree == NULL, 0))
2292         {
2293           *err = REG_ESPACE;
2294           return NULL;
2295         }
2296       if (dfa->mb_cur_max > 1)
2297         dfa->has_mb_node = 1;
2298       break;
2299     case OP_WORD:
2300     case OP_NOTWORD:
2301       tree = build_charclass_op (dfa, regexp->trans,
2302                                  (const unsigned char *) "alnum",
2303                                  (const unsigned char *) "_",
2304                                  token->type == OP_NOTWORD, err);
2305       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2306         return NULL;
2307       break;
2308     case OP_SPACE:
2309     case OP_NOTSPACE:
2310       tree = build_charclass_op (dfa, regexp->trans,
2311                                  (const unsigned char *) "space",
2312                                  (const unsigned char *) "",
2313                                  token->type == OP_NOTSPACE, err);
2314       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2315         return NULL;
2316       break;
2317     case OP_ALT:
2318     case END_OF_RE:
2319       return NULL;
2320     case BACK_SLASH:
2321       *err = REG_EESCAPE;
2322       return NULL;
2323     default:
2324       /* Must not happen?  */
2325 #ifdef DEBUG
2326       assert (0);
2327 #endif
2328       return NULL;
2329     }
2330   fetch_token (token, regexp, syntax);
2331
2332   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2333          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2334     {
2335       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2336       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2337         return NULL;
2338       /* In BRE consecutive duplications are not allowed.  */
2339       if ((syntax & RE_CONTEXT_INVALID_DUP)
2340           && (token->type == OP_DUP_ASTERISK
2341               || token->type == OP_OPEN_DUP_NUM))
2342         {
2343           *err = REG_BADRPT;
2344           return NULL;
2345         }
2346       dfa->has_plural_match = 1;
2347     }
2348
2349   return tree;
2350 }
2351
2352 /* This function build the following tree, from regular expression
2353    (<reg_exp>):
2354          SUBEXP
2355             |
2356         <reg_exp>
2357 */
2358
2359 static bin_tree_t *
2360 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2361      re_string_t *regexp;
2362      regex_t *preg;
2363      re_token_t *token;
2364      reg_syntax_t syntax;
2365      int nest;
2366      reg_errcode_t *err;
2367 {
2368   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2369   bin_tree_t *tree, *left_par, *right_par;
2370   size_t cur_nsub;
2371   cur_nsub = preg->re_nsub++;
2372   if (BE (dfa->subexps_alloc < preg->re_nsub, 0))
2373     {
2374       re_subexp_t *new_array;
2375       dfa->subexps_alloc *= 2;
2376       new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2377       if (BE (new_array == NULL, 0))
2378         {
2379           dfa->subexps_alloc /= 2;
2380           *err = REG_ESPACE;
2381           return NULL;
2382         }
2383       dfa->subexps = new_array;
2384     }
2385   dfa->subexps[cur_nsub].start = dfa->nodes_len;
2386   dfa->subexps[cur_nsub].end = -1;
2387
2388   left_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2389   if (BE (left_par == NULL, 0))
2390     {
2391       *err = REG_ESPACE;
2392       return NULL;
2393     }
2394   dfa->nodes[left_par->node_idx].opr.idx = cur_nsub;
2395   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2396
2397   /* The subexpression may be a null string.  */
2398   if (token->type == OP_CLOSE_SUBEXP)
2399     tree = NULL;
2400   else
2401     {
2402       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2403       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2404         return NULL;
2405     }
2406   if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2407     {
2408       *err = REG_EPAREN;
2409       return NULL;
2410     }
2411   right_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2412   dfa->subexps[cur_nsub].end = dfa->nodes_len;
2413   tree = ((tree == NULL) ? right_par
2414           : create_tree (dfa, tree, right_par, CONCAT, 0));
2415   tree = create_tree (dfa, left_par, tree, CONCAT, 0);
2416   if (BE (right_par == NULL || tree == NULL, 0))
2417     {
2418       *err = REG_ESPACE;
2419       return NULL;
2420     }
2421   dfa->nodes[right_par->node_idx].opr.idx = cur_nsub;
2422
2423   return tree;
2424 }
2425
2426 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2427
2428 static bin_tree_t *
2429 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2430      bin_tree_t *elem;
2431      re_string_t *regexp;
2432      re_dfa_t *dfa;
2433      re_token_t *token;
2434      reg_syntax_t syntax;
2435      reg_errcode_t *err;
2436 {
2437   re_token_t dup_token;
2438   bin_tree_t *tree = NULL;
2439   int i, start, end, start_idx = re_string_cur_idx (regexp);
2440   re_token_t start_token = *token;
2441
2442   if (token->type == OP_OPEN_DUP_NUM)
2443     {
2444       end = 0;
2445       start = fetch_number (regexp, token, syntax);
2446       if (start == -1)
2447         {
2448           if (token->type == CHARACTER && token->opr.c == ',')
2449             start = 0; /* We treat "{,m}" as "{0,m}".  */
2450           else
2451             {
2452               *err = REG_BADBR; /* <re>{} is invalid.  */
2453               return NULL;
2454             }
2455         }
2456       if (BE (start != -2, 1))
2457         {
2458           /* We treat "{n}" as "{n,n}".  */
2459           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2460                  : ((token->type == CHARACTER && token->opr.c == ',')
2461                     ? fetch_number (regexp, token, syntax) : -2));
2462         }
2463       if (BE (start == -2 || end == -2, 0))
2464         {
2465           /* Invalid sequence.  */
2466           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2467             {
2468               if (token->type == END_OF_RE)
2469                 *err = REG_EBRACE;
2470               else
2471                 *err = REG_BADBR;
2472
2473               return NULL;
2474             }
2475
2476           /* If the syntax bit is set, rollback.  */
2477           re_string_set_index (regexp, start_idx);
2478           *token = start_token;
2479           token->type = CHARACTER;
2480           /* mb_partial and word_char bits should be already initialized by
2481              peek_token.  */
2482           return elem;
2483         }
2484
2485       if (BE (end != -1 && start > end, 0))
2486         {
2487           /* First number greater than second.  */
2488           *err = REG_BADBR;
2489           return NULL;
2490         }
2491     }
2492   else
2493     {
2494       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2495       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2496     }
2497
2498   /* Treat "<re>{0}*" etc. as "<re>{0}".  */
2499   if (BE (elem == NULL, 0))
2500     start = end = 0;
2501
2502   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2503   else if (BE (start > 0, 0))
2504     {
2505       tree = elem;
2506       for (i = 2; i <= start; ++i)
2507         {
2508           elem = duplicate_tree (elem, dfa);
2509           tree = create_tree (dfa, tree, elem, CONCAT, 0);
2510           if (BE (elem == NULL || tree == NULL, 0))
2511             goto parse_dup_op_espace;
2512         }
2513     }
2514
2515   if (BE (end != start, 1))
2516     {
2517       dup_token.type = (end == -1 ? OP_DUP_ASTERISK : OP_DUP_QUESTION);
2518       if (BE (start > 0, 0))
2519         {
2520           elem = duplicate_tree (elem, dfa);
2521           if (BE (elem == NULL, 0))
2522             goto parse_dup_op_espace;
2523
2524           /* This subexpression will be marked as optional, so that
2525              empty matches do not touch the registers.  */
2526           mark_opt_subexp (elem, dfa);
2527
2528           /* Prepare the tree with the modifier.  */
2529           elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2530           tree = create_tree (dfa, tree, elem, CONCAT, 0);
2531         }
2532       else
2533         {
2534           /* We do not need to duplicate the tree because we have not
2535              created it yet.  */
2536           mark_opt_subexp (elem, dfa);
2537           tree = elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2538         }
2539
2540       if (BE (elem == NULL || tree == NULL, 0))
2541         goto parse_dup_op_espace;
2542
2543       /* This loop is actually executed only when end != -1,
2544          to rewrite <re>{0,n} as <re>?<re>?<re>?...  We have
2545          already created the start+1-th copy.  */
2546       for (i = start + 2; i <= end; ++i)
2547         {
2548           elem = duplicate_tree (elem, dfa);
2549           tree = create_tree (dfa, tree, elem, CONCAT, 0);
2550           if (BE (elem == NULL || tree == NULL, 0))
2551             {
2552               *err = REG_ESPACE;
2553               return NULL;
2554             }
2555         }
2556     }
2557
2558   fetch_token (token, regexp, syntax);
2559   return tree;
2560
2561  parse_dup_op_espace:
2562   *err = REG_ESPACE;
2563   return NULL;
2564 }
2565
2566 /* Size of the names for collating symbol/equivalence_class/character_class.
2567    I'm not sure, but maybe enough.  */
2568 #define BRACKET_NAME_BUF_SIZE 32
2569
2570 #ifndef _LIBC
2571   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2572      Build the range expression which starts from START_ELEM, and ends
2573      at END_ELEM.  The result are written to MBCSET and SBCSET.
2574      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2575      mbcset->range_ends, is a pointer argument sinse we may
2576      update it.  */
2577
2578 static reg_errcode_t
2579 # ifdef RE_ENABLE_I18N
2580 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2581      re_charset_t *mbcset;
2582      int *range_alloc;
2583 # else /* not RE_ENABLE_I18N */
2584 build_range_exp (sbcset, start_elem, end_elem)
2585 # endif /* not RE_ENABLE_I18N */
2586      re_bitset_ptr_t sbcset;
2587      bracket_elem_t *start_elem, *end_elem;
2588 {
2589   unsigned int start_ch, end_ch;
2590   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2591   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2592           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2593           0))
2594     return REG_ERANGE;
2595
2596   /* We can handle no multi character collating elements without libc
2597      support.  */
2598   if (BE ((start_elem->type == COLL_SYM
2599            && strlen ((char *) start_elem->opr.name) > 1)
2600           || (end_elem->type == COLL_SYM
2601               && strlen ((char *) end_elem->opr.name) > 1), 0))
2602     return REG_ECOLLATE;
2603
2604 # ifdef RE_ENABLE_I18N
2605   {
2606     wchar_t wc, start_wc, end_wc;
2607     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2608
2609     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2610                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2611                    : 0));
2612     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2613               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2614                  : 0));
2615     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2616                 ? __btowc (start_ch) : start_elem->opr.wch);
2617     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2618               ? __btowc (end_ch) : end_elem->opr.wch);
2619     if (start_wc == WEOF || end_wc == WEOF)
2620       return REG_ECOLLATE;
2621     cmp_buf[0] = start_wc;
2622     cmp_buf[4] = end_wc;
2623     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2624       return REG_ERANGE;
2625
2626     /* Got valid collation sequence values, add them as a new entry.
2627        However, for !_LIBC we have no collation elements: if the
2628        character set is single byte, the single byte character set
2629        that we build below suffices.  parse_bracket_exp passes
2630        no MBCSET if dfa->mb_cur_max == 1.  */
2631     if (mbcset)
2632       {
2633         /* Check the space of the arrays.  */
2634         if (BE (*range_alloc == mbcset->nranges, 0))
2635           {
2636             /* There is not enough space, need realloc.  */
2637             wchar_t *new_array_start, *new_array_end;
2638             int new_nranges;
2639
2640             /* +1 in case of mbcset->nranges is 0.  */
2641             new_nranges = 2 * mbcset->nranges + 1;
2642             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2643                are NULL if *range_alloc == 0.  */
2644             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2645                                           new_nranges);
2646             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2647                                         new_nranges);
2648
2649             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2650               return REG_ESPACE;
2651
2652             mbcset->range_starts = new_array_start;
2653             mbcset->range_ends = new_array_end;
2654             *range_alloc = new_nranges;
2655           }
2656
2657         mbcset->range_starts[mbcset->nranges] = start_wc;
2658         mbcset->range_ends[mbcset->nranges++] = end_wc;
2659       }
2660
2661     /* Build the table for single byte characters.  */
2662     for (wc = 0; wc < SBC_MAX; ++wc)
2663       {
2664         cmp_buf[2] = wc;
2665         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2666             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2667           bitset_set (sbcset, wc);
2668       }
2669   }
2670 # else /* not RE_ENABLE_I18N */
2671   {
2672     unsigned int ch;
2673     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2674                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2675                    : 0));
2676     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2677               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2678                  : 0));
2679     if (start_ch > end_ch)
2680       return REG_ERANGE;
2681     /* Build the table for single byte characters.  */
2682     for (ch = 0; ch < SBC_MAX; ++ch)
2683       if (start_ch <= ch  && ch <= end_ch)
2684         bitset_set (sbcset, ch);
2685   }
2686 # endif /* not RE_ENABLE_I18N */
2687   return REG_NOERROR;
2688 }
2689 #endif /* not _LIBC */
2690
2691 #ifndef _LIBC
2692 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2693    Build the collating element which is represented by NAME.
2694    The result are written to MBCSET and SBCSET.
2695    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2696    pointer argument since we may update it.  */
2697
2698 static reg_errcode_t
2699 # ifdef RE_ENABLE_I18N
2700 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2701      re_charset_t *mbcset;
2702      int *coll_sym_alloc;
2703 # else /* not RE_ENABLE_I18N */
2704 build_collating_symbol (sbcset, name)
2705 # endif /* not RE_ENABLE_I18N */
2706      re_bitset_ptr_t sbcset;
2707      const unsigned char *name;
2708 {
2709   size_t name_len = strlen ((const char *) name);
2710   if (BE (name_len != 1, 0))
2711     return REG_ECOLLATE;
2712   else
2713     {
2714       bitset_set (sbcset, name[0]);
2715       return REG_NOERROR;
2716     }
2717 }
2718 #endif /* not _LIBC */
2719
2720 /* This function parse bracket expression like "[abc]", "[a-c]",
2721    "[[.a-a.]]" etc.  */
2722
2723 static bin_tree_t *
2724 parse_bracket_exp (regexp, dfa, token, syntax, err)
2725      re_string_t *regexp;
2726      re_dfa_t *dfa;
2727      re_token_t *token;
2728      reg_syntax_t syntax;
2729      reg_errcode_t *err;
2730 {
2731 #ifdef _LIBC
2732   const unsigned char *collseqmb;
2733   const char *collseqwc;
2734   uint32_t nrules;
2735   int32_t table_size;
2736   const int32_t *symb_table;
2737   const unsigned char *extra;
2738
2739   /* Local function for parse_bracket_exp used in _LIBC environement.
2740      Seek the collating symbol entry correspondings to NAME.
2741      Return the index of the symbol in the SYMB_TABLE.  */
2742
2743   auto inline int32_t
2744   __attribute ((always_inline))
2745   seek_collating_symbol_entry (name, name_len)
2746          const unsigned char *name;
2747          size_t name_len;
2748     {
2749       int32_t hash = elem_hash ((const char *) name, name_len);
2750       int32_t elem = hash % table_size;
2751       int32_t second = hash % (table_size - 2);
2752       while (symb_table[2 * elem] != 0)
2753         {
2754           /* First compare the hashing value.  */
2755           if (symb_table[2 * elem] == hash
2756               /* Compare the length of the name.  */
2757               && name_len == extra[symb_table[2 * elem + 1]]
2758               /* Compare the name.  */
2759               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2760                          name_len) == 0)
2761             {
2762               /* Yep, this is the entry.  */
2763               break;
2764             }
2765
2766           /* Next entry.  */
2767           elem += second;
2768         }
2769       return elem;
2770     }
2771
2772   /* Local function for parse_bracket_exp used in _LIBC environement.
2773      Look up the collation sequence value of BR_ELEM.
2774      Return the value if succeeded, UINT_MAX otherwise.  */
2775
2776   auto inline unsigned int
2777   __attribute ((always_inline))
2778   lookup_collation_sequence_value (br_elem)
2779          bracket_elem_t *br_elem;
2780     {
2781       if (br_elem->type == SB_CHAR)
2782         {
2783           /*
2784           if (MB_CUR_MAX == 1)
2785           */
2786           if (nrules == 0)
2787             return collseqmb[br_elem->opr.ch];
2788           else
2789             {
2790               wint_t wc = __btowc (br_elem->opr.ch);
2791               return __collseq_table_lookup (collseqwc, wc);
2792             }
2793         }
2794       else if (br_elem->type == MB_CHAR)
2795         {
2796           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2797         }
2798       else if (br_elem->type == COLL_SYM)
2799         {
2800           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2801           if (nrules != 0)
2802             {
2803               int32_t elem, idx;
2804               elem = seek_collating_symbol_entry (br_elem->opr.name,
2805                                                   sym_name_len);
2806               if (symb_table[2 * elem] != 0)
2807                 {
2808                   /* We found the entry.  */
2809                   idx = symb_table[2 * elem + 1];
2810                   /* Skip the name of collating element name.  */
2811                   idx += 1 + extra[idx];
2812                   /* Skip the byte sequence of the collating element.  */
2813                   idx += 1 + extra[idx];
2814                   /* Adjust for the alignment.  */
2815                   idx = (idx + 3) & ~3;
2816                   /* Skip the multibyte collation sequence value.  */
2817                   idx += sizeof (unsigned int);
2818                   /* Skip the wide char sequence of the collating element.  */
2819                   idx += sizeof (unsigned int) *
2820                     (1 + *(unsigned int *) (extra + idx));
2821                   /* Return the collation sequence value.  */
2822                   return *(unsigned int *) (extra + idx);
2823                 }
2824               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2825                 {
2826                   /* No valid character.  Match it as a single byte
2827                      character.  */
2828                   return collseqmb[br_elem->opr.name[0]];
2829                 }
2830             }
2831           else if (sym_name_len == 1)
2832             return collseqmb[br_elem->opr.name[0]];
2833         }
2834       return UINT_MAX;
2835     }
2836
2837   /* Local function for parse_bracket_exp used in _LIBC environement.
2838      Build the range expression which starts from START_ELEM, and ends
2839      at END_ELEM.  The result are written to MBCSET and SBCSET.
2840      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2841      mbcset->range_ends, is a pointer argument sinse we may
2842      update it.  */
2843
2844   auto inline reg_errcode_t
2845   __attribute ((always_inline))
2846   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2847          re_charset_t *mbcset;
2848          int *range_alloc;
2849          re_bitset_ptr_t sbcset;
2850          bracket_elem_t *start_elem, *end_elem;
2851     {
2852       unsigned int ch;
2853       uint32_t start_collseq;
2854       uint32_t end_collseq;
2855
2856       /* Equivalence Classes and Character Classes can't be a range
2857          start/end.  */
2858       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2859               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2860               0))
2861         return REG_ERANGE;
2862
2863       start_collseq = lookup_collation_sequence_value (start_elem);
2864       end_collseq = lookup_collation_sequence_value (end_elem);
2865       /* Check start/end collation sequence values.  */
2866       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2867         return REG_ECOLLATE;
2868       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2869         return REG_ERANGE;
2870
2871       /* Got valid collation sequence values, add them as a new entry.
2872          However, if we have no collation elements, and the character set
2873          is single byte, the single byte character set that we
2874          build below suffices. */
2875       if (nrules > 0 || dfa->mb_cur_max > 1)
2876         {
2877           /* Check the space of the arrays.  */
2878           if (BE (*range_alloc == mbcset->nranges, 0))
2879             {
2880               /* There is not enough space, need realloc.  */
2881               uint32_t *new_array_start;
2882               uint32_t *new_array_end;
2883               int new_nranges;
2884
2885               /* +1 in case of mbcset->nranges is 0.  */
2886               new_nranges = 2 * mbcset->nranges + 1;
2887               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2888                                             new_nranges);
2889               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2890                                           new_nranges);
2891
2892               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2893                 return REG_ESPACE;
2894
2895               mbcset->range_starts = new_array_start;
2896               mbcset->range_ends = new_array_end;
2897               *range_alloc = new_nranges;
2898             }
2899
2900           mbcset->range_starts[mbcset->nranges] = start_collseq;
2901           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2902         }
2903
2904       /* Build the table for single byte characters.  */
2905       for (ch = 0; ch < SBC_MAX; ch++)
2906         {
2907           uint32_t ch_collseq;
2908           /*
2909           if (MB_CUR_MAX == 1)
2910           */
2911           if (nrules == 0)
2912             ch_collseq = collseqmb[ch];
2913           else
2914             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2915           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2916             bitset_set (sbcset, ch);
2917         }
2918       return REG_NOERROR;
2919     }
2920
2921   /* Local function for parse_bracket_exp used in _LIBC environement.
2922      Build the collating element which is represented by NAME.
2923      The result are written to MBCSET and SBCSET.
2924      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2925      pointer argument sinse we may update it.  */
2926
2927   auto inline reg_errcode_t
2928   __attribute ((always_inline))
2929   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2930          re_charset_t *mbcset;
2931          int *coll_sym_alloc;
2932          re_bitset_ptr_t sbcset;
2933          const unsigned char *name;
2934     {
2935       int32_t elem, idx;
2936       size_t name_len = strlen ((const char *) name);
2937       if (nrules != 0)
2938         {
2939           elem = seek_collating_symbol_entry (name, name_len);
2940           if (symb_table[2 * elem] != 0)
2941             {
2942               /* We found the entry.  */
2943               idx = symb_table[2 * elem + 1];
2944               /* Skip the name of collating element name.  */
2945               idx += 1 + extra[idx];
2946             }
2947           else if (symb_table[2 * elem] == 0 && name_len == 1)
2948             {
2949               /* No valid character, treat it as a normal
2950                  character.  */
2951               bitset_set (sbcset, name[0]);
2952               return REG_NOERROR;
2953             }
2954           else
2955             return REG_ECOLLATE;
2956
2957           /* Got valid collation sequence, add it as a new entry.  */
2958           /* Check the space of the arrays.  */
2959           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2960             {
2961               /* Not enough, realloc it.  */
2962               /* +1 in case of mbcset->ncoll_syms is 0.  */
2963               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2964               /* Use realloc since mbcset->coll_syms is NULL
2965                  if *alloc == 0.  */
2966               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2967                                                    new_coll_sym_alloc);
2968               if (BE (new_coll_syms == NULL, 0))
2969                 return REG_ESPACE;
2970               mbcset->coll_syms = new_coll_syms;
2971               *coll_sym_alloc = new_coll_sym_alloc;
2972             }
2973           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2974           return REG_NOERROR;
2975         }
2976       else
2977         {
2978           if (BE (name_len != 1, 0))
2979             return REG_ECOLLATE;
2980           else
2981             {
2982               bitset_set (sbcset, name[0]);
2983               return REG_NOERROR;
2984             }
2985         }
2986     }
2987 #endif
2988
2989   re_token_t br_token;
2990   re_bitset_ptr_t sbcset;
2991 #ifdef RE_ENABLE_I18N
2992   re_charset_t *mbcset;
2993   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2994   int equiv_class_alloc = 0, char_class_alloc = 0;
2995 #endif /* not RE_ENABLE_I18N */
2996   int non_match = 0;
2997   bin_tree_t *work_tree;
2998   int token_len;
2999   int first_round = 1;
3000 #ifdef _LIBC
3001   collseqmb = (const unsigned char *)
3002     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3003   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3004   if (nrules)
3005     {
3006       /*
3007       if (MB_CUR_MAX > 1)
3008       */
3009         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3010       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3011       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3012                                                   _NL_COLLATE_SYMB_TABLEMB);
3013       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3014                                                    _NL_COLLATE_SYMB_EXTRAMB);
3015     }
3016 #endif
3017   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3018 #ifdef RE_ENABLE_I18N
3019   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3020 #endif /* RE_ENABLE_I18N */
3021 #ifdef RE_ENABLE_I18N
3022   if (BE (sbcset == NULL || mbcset == NULL, 0))
3023 #else
3024   if (BE (sbcset == NULL, 0))
3025 #endif /* RE_ENABLE_I18N */
3026     {
3027       *err = REG_ESPACE;
3028       return NULL;
3029     }
3030
3031   token_len = peek_token_bracket (token, regexp, syntax);
3032   if (BE (token->type == END_OF_RE, 0))
3033     {
3034       *err = REG_BADPAT;
3035       goto parse_bracket_exp_free_return;
3036     }
3037   if (token->type == OP_NON_MATCH_LIST)
3038     {
3039 #ifdef RE_ENABLE_I18N
3040       mbcset->non_match = 1;
3041 #endif /* not RE_ENABLE_I18N */
3042       non_match = 1;
3043       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3044         bitset_set (sbcset, '\0');
3045       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3046       token_len = peek_token_bracket (token, regexp, syntax);
3047       if (BE (token->type == END_OF_RE, 0))
3048         {
3049           *err = REG_BADPAT;
3050           goto parse_bracket_exp_free_return;
3051         }
3052     }
3053
3054   /* We treat the first ']' as a normal character.  */
3055   if (token->type == OP_CLOSE_BRACKET)
3056     token->type = CHARACTER;
3057
3058   while (1)
3059     {
3060       bracket_elem_t start_elem, end_elem;
3061       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3062       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3063       reg_errcode_t ret;
3064       int token_len2 = 0, is_range_exp = 0;
3065       re_token_t token2;
3066
3067       start_elem.opr.name = start_name_buf;
3068       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3069                                    syntax, first_round);
3070       if (BE (ret != REG_NOERROR, 0))
3071         {
3072           *err = ret;
3073           goto parse_bracket_exp_free_return;
3074         }
3075       first_round = 0;
3076
3077       /* Get information about the next token.  We need it in any case.  */
3078       token_len = peek_token_bracket (token, regexp, syntax);
3079
3080       /* Do not check for ranges if we know they are not allowed.  */
3081       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3082         {
3083           if (BE (token->type == END_OF_RE, 0))
3084             {
3085               *err = REG_EBRACK;
3086               goto parse_bracket_exp_free_return;
3087             }
3088           if (token->type == OP_CHARSET_RANGE)
3089             {
3090               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3091               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3092               if (BE (token2.type == END_OF_RE, 0))
3093                 {
3094                   *err = REG_EBRACK;
3095                   goto parse_bracket_exp_free_return;
3096                 }
3097               if (token2.type == OP_CLOSE_BRACKET)
3098                 {
3099                   /* We treat the last '-' as a normal character.  */
3100                   re_string_skip_bytes (regexp, -token_len);
3101                   token->type = CHARACTER;
3102                 }
3103               else
3104                 is_range_exp = 1;
3105             }
3106         }
3107
3108       if (is_range_exp == 1)
3109         {
3110           end_elem.opr.name = end_name_buf;
3111           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3112                                        dfa, syntax, 1);
3113           if (BE (ret != REG_NOERROR, 0))
3114             {
3115               *err = ret;
3116               goto parse_bracket_exp_free_return;
3117             }
3118
3119           token_len = peek_token_bracket (token, regexp, syntax);
3120
3121 #ifdef _LIBC
3122           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3123                                   &start_elem, &end_elem);
3124 #else
3125 # ifdef RE_ENABLE_I18N
3126           *err = build_range_exp (sbcset,
3127                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3128                                   &range_alloc, &start_elem, &end_elem);
3129 # else
3130           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3131 # endif
3132 #endif /* RE_ENABLE_I18N */
3133           if (BE (*err != REG_NOERROR, 0))
3134             goto parse_bracket_exp_free_return;
3135         }
3136       else
3137         {
3138           switch (start_elem.type)
3139             {
3140             case SB_CHAR:
3141               bitset_set (sbcset, start_elem.opr.ch);
3142               break;
3143 #ifdef RE_ENABLE_I18N
3144             case MB_CHAR:
3145               /* Check whether the array has enough space.  */
3146               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3147                 {
3148                   wchar_t *new_mbchars;
3149                   /* Not enough, realloc it.  */
3150                   /* +1 in case of mbcset->nmbchars is 0.  */
3151                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3152                   /* Use realloc since array is NULL if *alloc == 0.  */
3153                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3154                                             mbchar_alloc);
3155                   if (BE (new_mbchars == NULL, 0))
3156                     goto parse_bracket_exp_espace;
3157                   mbcset->mbchars = new_mbchars;
3158                 }
3159               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3160               break;
3161 #endif /* RE_ENABLE_I18N */
3162             case EQUIV_CLASS:
3163               *err = build_equiv_class (sbcset,
3164 #ifdef RE_ENABLE_I18N
3165                                         mbcset, &equiv_class_alloc,
3166 #endif /* RE_ENABLE_I18N */
3167                                         start_elem.opr.name);
3168               if (BE (*err != REG_NOERROR, 0))
3169                 goto parse_bracket_exp_free_return;
3170               break;
3171             case COLL_SYM:
3172               *err = build_collating_symbol (sbcset,
3173 #ifdef RE_ENABLE_I18N
3174                                              mbcset, &coll_sym_alloc,
3175 #endif /* RE_ENABLE_I18N */
3176                                              start_elem.opr.name);
3177               if (BE (*err != REG_NOERROR, 0))
3178                 goto parse_bracket_exp_free_return;
3179               break;
3180             case CHAR_CLASS:
3181               *err = build_charclass (regexp->trans, sbcset,
3182 #ifdef RE_ENABLE_I18N
3183                                       mbcset, &char_class_alloc,
3184 #endif /* RE_ENABLE_I18N */
3185                                       start_elem.opr.name, syntax);
3186               if (BE (*err != REG_NOERROR, 0))
3187                goto parse_bracket_exp_free_return;
3188               break;
3189             default:
3190               assert (0);
3191               break;
3192             }
3193         }
3194       if (BE (token->type == END_OF_RE, 0))
3195         {
3196           *err = REG_EBRACK;
3197           goto parse_bracket_exp_free_return;
3198         }
3199       if (token->type == OP_CLOSE_BRACKET)
3200         break;
3201     }
3202
3203   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3204
3205   /* If it is non-matching list.  */
3206   if (non_match)
3207     bitset_not (sbcset);
3208
3209 #ifdef RE_ENABLE_I18N
3210   /* Ensure only single byte characters are set.  */
3211   if (dfa->mb_cur_max > 1)
3212     bitset_mask (sbcset, dfa->sb_char);
3213 #endif /* RE_ENABLE_I18N */
3214
3215   /* Build a tree for simple bracket.  */
3216   br_token.type = SIMPLE_BRACKET;
3217   br_token.opr.sbcset = sbcset;
3218   work_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3219   if (BE (work_tree == NULL, 0))
3220     goto parse_bracket_exp_espace;
3221
3222 #ifdef RE_ENABLE_I18N
3223   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3224       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3225                                                      || mbcset->non_match)))
3226     {
3227       re_token_t alt_token;
3228       bin_tree_t *mbc_tree;
3229       int sbc_idx;
3230       /* Build a tree for complex bracket.  */
3231       dfa->has_mb_node = 1;
3232       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3233         if (sbcset[sbc_idx])
3234           break;
3235       /* If there are no bits set in sbcset, there is no point
3236          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3237       if (sbc_idx == BITSET_UINTS)
3238         {
3239           re_free (sbcset);
3240           dfa->nodes[work_tree->node_idx].type = COMPLEX_BRACKET;
3241           dfa->nodes[work_tree->node_idx].opr.mbcset = mbcset;
3242           return work_tree;
3243         }
3244       br_token.type = COMPLEX_BRACKET;
3245       br_token.opr.mbcset = mbcset;
3246       mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3247       if (BE (mbc_tree == NULL, 0))
3248         goto parse_bracket_exp_espace;
3249       /* Then join them by ALT node.  */
3250       alt_token.type = OP_ALT;
3251       dfa->has_plural_match = 1;
3252       work_tree = re_dfa_add_tree_node (dfa, work_tree, mbc_tree, &alt_token);
3253       if (BE (mbc_tree != NULL, 1))
3254         return work_tree;
3255     }
3256   else
3257     {
3258       free_charset (mbcset);
3259       return work_tree;
3260     }
3261 #else /* not RE_ENABLE_I18N */
3262   return work_tree;
3263 #endif /* not RE_ENABLE_I18N */
3264
3265  parse_bracket_exp_espace:
3266   *err = REG_ESPACE;
3267  parse_bracket_exp_free_return:
3268   re_free (sbcset);
3269 #ifdef RE_ENABLE_I18N
3270   free_charset (mbcset);
3271 #endif /* RE_ENABLE_I18N */
3272   return NULL;
3273 }
3274
3275 /* Parse an element in the bracket expression.  */
3276
3277 static reg_errcode_t
3278 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3279                        accept_hyphen)
3280      bracket_elem_t *elem;
3281      re_string_t *regexp;
3282      re_token_t *token;
3283      int token_len;
3284      re_dfa_t *dfa;
3285      reg_syntax_t syntax;
3286      int accept_hyphen;
3287 {
3288 #ifdef RE_ENABLE_I18N
3289   int cur_char_size;
3290   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3291   if (cur_char_size > 1)
3292     {
3293       elem->type = MB_CHAR;
3294       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3295       re_string_skip_bytes (regexp, cur_char_size);
3296       return REG_NOERROR;
3297     }
3298 #endif /* RE_ENABLE_I18N */
3299   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3300   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3301       || token->type == OP_OPEN_EQUIV_CLASS)
3302     return parse_bracket_symbol (elem, regexp, token);
3303   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3304     {
3305       /* A '-' must only appear as anything but a range indicator before
3306          the closing bracket.  Everything else is an error.  */
3307       re_token_t token2;
3308       (void) peek_token_bracket (&token2, regexp, syntax);
3309       if (token2.type != OP_CLOSE_BRACKET)
3310         /* The actual error value is not standardized since this whole
3311            case is undefined.  But ERANGE makes good sense.  */
3312         return REG_ERANGE;
3313     }
3314   elem->type = SB_CHAR;
3315   elem->opr.ch = token->opr.c;
3316   return REG_NOERROR;
3317 }
3318
3319 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3320    such as [:<character_class>:], [.<collating_element>.], and
3321    [=<equivalent_class>=].  */
3322
3323 static reg_errcode_t
3324 parse_bracket_symbol (elem, regexp, token)
3325      bracket_elem_t *elem;
3326      re_string_t *regexp;
3327      re_token_t *token;
3328 {
3329   unsigned char ch, delim = token->opr.c;
3330   int i = 0;
3331   if (re_string_eoi(regexp))
3332     return REG_EBRACK;
3333   for (;; ++i)
3334     {
3335       if (i >= BRACKET_NAME_BUF_SIZE)
3336         return REG_EBRACK;
3337       if (token->type == OP_OPEN_CHAR_CLASS)
3338         ch = re_string_fetch_byte_case (regexp);
3339       else
3340         ch = re_string_fetch_byte (regexp);
3341       if (re_string_eoi(regexp))
3342         return REG_EBRACK;
3343       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3344         break;
3345       elem->opr.name[i] = ch;
3346     }
3347   re_string_skip_bytes (regexp, 1);
3348   elem->opr.name[i] = '\0';
3349   switch (token->type)
3350     {
3351     case OP_OPEN_COLL_ELEM:
3352       elem->type = COLL_SYM;
3353       break;
3354     case OP_OPEN_EQUIV_CLASS:
3355       elem->type = EQUIV_CLASS;
3356       break;
3357     case OP_OPEN_CHAR_CLASS:
3358       elem->type = CHAR_CLASS;
3359       break;
3360     default:
3361       break;
3362     }
3363   return REG_NOERROR;
3364 }
3365
3366   /* Helper function for parse_bracket_exp.
3367      Build the equivalence class which is represented by NAME.
3368      The result are written to MBCSET and SBCSET.
3369      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3370      is a pointer argument sinse we may update it.  */
3371
3372 static reg_errcode_t
3373 #ifdef RE_ENABLE_I18N
3374 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3375      re_charset_t *mbcset;
3376      int *equiv_class_alloc;
3377 #else /* not RE_ENABLE_I18N */
3378 build_equiv_class (sbcset, name)
3379 #endif /* not RE_ENABLE_I18N */
3380      re_bitset_ptr_t sbcset;
3381      const unsigned char *name;
3382 {
3383 #if defined _LIBC
3384   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3385   if (nrules != 0)
3386     {
3387       const int32_t *table, *indirect;
3388       const unsigned char *weights, *extra, *cp;
3389       unsigned char char_buf[2];
3390       int32_t idx1, idx2;
3391       unsigned int ch;
3392       size_t len;
3393       /* This #include defines a local function!  */
3394 # include <locale/weight.h>
3395       /* Calculate the index for equivalence class.  */
3396       cp = name;
3397       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3398       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3399                                                _NL_COLLATE_WEIGHTMB);
3400       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3401                                                    _NL_COLLATE_EXTRAMB);
3402       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3403                                                 _NL_COLLATE_INDIRECTMB);
3404       idx1 = findidx (&cp);
3405       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3406         /* This isn't a valid character.  */
3407         return REG_ECOLLATE;
3408
3409       /* Build single byte matcing table for this equivalence class.  */
3410       char_buf[1] = (unsigned char) '\0';
3411       len = weights[idx1];
3412       for (ch = 0; ch < SBC_MAX; ++ch)
3413         {
3414           char_buf[0] = ch;
3415           cp = char_buf;
3416           idx2 = findidx (&cp);
3417 /*
3418           idx2 = table[ch];
3419 */
3420           if (idx2 == 0)
3421             /* This isn't a valid character.  */
3422             continue;
3423           if (len == weights[idx2])
3424             {
3425               int cnt = 0;
3426               while (cnt <= len &&
3427                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3428                 ++cnt;
3429
3430               if (cnt > len)
3431                 bitset_set (sbcset, ch);
3432             }
3433         }
3434       /* Check whether the array has enough space.  */
3435       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3436         {
3437           /* Not enough, realloc it.  */
3438           /* +1 in case of mbcset->nequiv_classes is 0.  */
3439           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3440           /* Use realloc since the array is NULL if *alloc == 0.  */
3441           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3442                                                    int32_t,
3443                                                    new_equiv_class_alloc);
3444           if (BE (new_equiv_classes == NULL, 0))
3445             return REG_ESPACE;
3446           mbcset->equiv_classes = new_equiv_classes;
3447           *equiv_class_alloc = new_equiv_class_alloc;
3448         }
3449       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3450     }
3451   else
3452 #endif /* _LIBC */
3453     {
3454       if (BE (strlen ((const char *) name) != 1, 0))
3455         return REG_ECOLLATE;
3456       bitset_set (sbcset, *name);
3457     }
3458   return REG_NOERROR;
3459 }
3460
3461   /* Helper function for parse_bracket_exp.
3462      Build the character class which is represented by NAME.
3463      The result are written to MBCSET and SBCSET.
3464      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3465      is a pointer argument sinse we may update it.  */
3466
3467 static reg_errcode_t
3468 #ifdef RE_ENABLE_I18N
3469 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3470      re_charset_t *mbcset;
3471      int *char_class_alloc;
3472 #else /* not RE_ENABLE_I18N */
3473 build_charclass (trans, sbcset, class_name, syntax)
3474 #endif /* not RE_ENABLE_I18N */
3475      unsigned RE_TRANSLATE_TYPE trans;
3476      re_bitset_ptr_t sbcset;
3477      const unsigned char *class_name;
3478      reg_syntax_t syntax;
3479 {
3480   int i;
3481   const char *name = (const char *) class_name;
3482
3483   /* In case of REG_ICASE "upper" and "lower" match the both of
3484      upper and lower cases.  */
3485   if ((syntax & RE_ICASE)
3486       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3487     name = "alpha";
3488
3489 #ifdef RE_ENABLE_I18N
3490   /* Check the space of the arrays.  */
3491   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3492     {
3493       /* Not enough, realloc it.  */
3494       /* +1 in case of mbcset->nchar_classes is 0.  */
3495       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3496       /* Use realloc since array is NULL if *alloc == 0.  */
3497       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3498                                                new_char_class_alloc);
3499       if (BE (new_char_classes == NULL, 0))
3500         return REG_ESPACE;
3501       mbcset->char_classes = new_char_classes;
3502       *char_class_alloc = new_char_class_alloc;
3503     }
3504   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3505 #endif /* RE_ENABLE_I18N */
3506
3507 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3508     for (i = 0; i < SBC_MAX; ++i)               \
3509       {                                         \
3510         if (ctype_func (i))                     \
3511           {                                     \
3512             int ch = trans ? trans[i] : i;      \
3513             bitset_set (sbcset, ch);            \
3514           }                                     \
3515       }
3516
3517   if (strcmp (name, "alnum") == 0)
3518     BUILD_CHARCLASS_LOOP (isalnum)
3519   else if (strcmp (name, "cntrl") == 0)
3520     BUILD_CHARCLASS_LOOP (iscntrl)
3521   else if (strcmp (name, "lower") == 0)
3522     BUILD_CHARCLASS_LOOP (islower)
3523   else if (strcmp (name, "space") == 0)
3524     BUILD_CHARCLASS_LOOP (isspace)
3525   else if (strcmp (name, "alpha") == 0)
3526     BUILD_CHARCLASS_LOOP (isalpha)
3527   else if (strcmp (name, "digit") == 0)
3528     BUILD_CHARCLASS_LOOP (isdigit)
3529   else if (strcmp (name, "print") == 0)
3530     BUILD_CHARCLASS_LOOP (isprint)
3531   else if (strcmp (name, "upper") == 0)
3532     BUILD_CHARCLASS_LOOP (isupper)
3533   else if (strcmp (name, "blank") == 0)
3534     BUILD_CHARCLASS_LOOP (isblank)
3535   else if (strcmp (name, "graph") == 0)
3536     BUILD_CHARCLASS_LOOP (isgraph)
3537   else if (strcmp (name, "punct") == 0)
3538     BUILD_CHARCLASS_LOOP (ispunct)
3539   else if (strcmp (name, "xdigit") == 0)
3540     BUILD_CHARCLASS_LOOP (isxdigit)
3541   else
3542     return REG_ECTYPE;
3543
3544   return REG_NOERROR;
3545 }
3546
3547 static bin_tree_t *
3548 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3549      re_dfa_t *dfa;
3550      unsigned RE_TRANSLATE_TYPE trans;
3551      const unsigned char *class_name;
3552      const unsigned char *extra;
3553      int non_match;
3554      reg_errcode_t *err;
3555 {
3556   re_bitset_ptr_t sbcset;
3557 #ifdef RE_ENABLE_I18N
3558   re_charset_t *mbcset;
3559   int alloc = 0;
3560 #endif /* not RE_ENABLE_I18N */
3561   reg_errcode_t ret;
3562   re_token_t br_token;
3563   bin_tree_t *tree;
3564
3565   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3566 #ifdef RE_ENABLE_I18N
3567   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3568 #endif /* RE_ENABLE_I18N */
3569
3570 #ifdef RE_ENABLE_I18N
3571   if (BE (sbcset == NULL || mbcset == NULL, 0))
3572 #else /* not RE_ENABLE_I18N */
3573   if (BE (sbcset == NULL, 0))
3574 #endif /* not RE_ENABLE_I18N */
3575     {
3576       *err = REG_ESPACE;
3577       return NULL;
3578     }
3579
3580   if (non_match)
3581     {
3582 #ifdef RE_ENABLE_I18N
3583       /*
3584       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3585         bitset_set(cset->sbcset, '\0');
3586       */
3587       mbcset->non_match = 1;
3588 #endif /* not RE_ENABLE_I18N */
3589     }
3590
3591   /* We don't care the syntax in this case.  */
3592   ret = build_charclass (trans, sbcset,
3593 #ifdef RE_ENABLE_I18N
3594                          mbcset, &alloc,
3595 #endif /* RE_ENABLE_I18N */
3596                          class_name, 0);
3597
3598   if (BE (ret != REG_NOERROR, 0))
3599     {
3600       re_free (sbcset);
3601 #ifdef RE_ENABLE_I18N
3602       free_charset (mbcset);
3603 #endif /* RE_ENABLE_I18N */
3604       *err = ret;
3605       return NULL;
3606     }
3607   /* \w match '_' also.  */
3608   for (; *extra; extra++)
3609     bitset_set (sbcset, *extra);
3610
3611   /* If it is non-matching list.  */
3612   if (non_match)
3613     bitset_not (sbcset);
3614
3615 #ifdef RE_ENABLE_I18N
3616   /* Ensure only single byte characters are set.  */
3617   if (dfa->mb_cur_max > 1)
3618     bitset_mask (sbcset, dfa->sb_char);
3619 #endif
3620
3621   /* Build a tree for simple bracket.  */
3622   br_token.type = SIMPLE_BRACKET;
3623   br_token.opr.sbcset = sbcset;
3624   tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3625   if (BE (tree == NULL, 0))
3626     goto build_word_op_espace;
3627
3628 #ifdef RE_ENABLE_I18N
3629   if (dfa->mb_cur_max > 1)
3630     {
3631       re_token_t alt_token;
3632       bin_tree_t *mbc_tree;
3633       /* Build a tree for complex bracket.  */
3634       br_token.type = COMPLEX_BRACKET;
3635       br_token.opr.mbcset = mbcset;
3636       dfa->has_mb_node = 1;
3637       mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3638       if (BE (mbc_tree == NULL, 0))
3639         goto build_word_op_espace;
3640       /* Then join them by ALT node.  */
3641       alt_token.type = OP_ALT;
3642       dfa->has_plural_match = 1;
3643       tree = re_dfa_add_tree_node (dfa, tree, mbc_tree, &alt_token);
3644       if (BE (mbc_tree != NULL, 1))
3645         return tree;
3646     }
3647   else
3648     {
3649       free_charset (mbcset);
3650       return tree;
3651     }
3652 #else /* not RE_ENABLE_I18N */
3653   return tree;
3654 #endif /* not RE_ENABLE_I18N */
3655
3656  build_word_op_espace:
3657   re_free (sbcset);
3658 #ifdef RE_ENABLE_I18N
3659   free_charset (mbcset);
3660 #endif /* RE_ENABLE_I18N */
3661   *err = REG_ESPACE;
3662   return NULL;
3663 }
3664
3665 /* This is intended for the expressions like "a{1,3}".
3666    Fetch a number from `input', and return the number.
3667    Return -1, if the number field is empty like "{,1}".
3668    Return -2, If an error is occured.  */
3669
3670 static int
3671 fetch_number (input, token, syntax)
3672      re_string_t *input;
3673      re_token_t *token;
3674      reg_syntax_t syntax;
3675 {
3676   int num = -1;
3677   unsigned char c;
3678   while (1)
3679     {
3680       fetch_token (token, input, syntax);
3681       c = token->opr.c;
3682       if (BE (token->type == END_OF_RE, 0))
3683         return -2;
3684       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3685         break;
3686       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3687              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3688       num = (num > RE_DUP_MAX) ? -2 : num;
3689     }
3690   return num;
3691 }
3692 \f
3693 #ifdef RE_ENABLE_I18N
3694 static void
3695 free_charset (re_charset_t *cset)
3696 {
3697   re_free (cset->mbchars);
3698 # ifdef _LIBC
3699   re_free (cset->coll_syms);
3700   re_free (cset->equiv_classes);
3701   re_free (cset->range_starts);
3702   re_free (cset->range_ends);
3703 # endif
3704   re_free (cset->char_classes);
3705   re_free (cset);
3706 }
3707 #endif /* RE_ENABLE_I18N */
3708 \f
3709 /* Functions for binary tree operation.  */
3710
3711 /* Create a tree node.  */
3712
3713 static bin_tree_t *
3714 create_tree (dfa, left, right, type, index)
3715      re_dfa_t *dfa;
3716      bin_tree_t *left;
3717      bin_tree_t *right;
3718      re_token_type_t type;
3719      int index;
3720 {
3721   bin_tree_t *tree;
3722   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3723     {
3724       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3725
3726       if (storage == NULL)
3727         return NULL;
3728       storage->next = dfa->str_tree_storage;
3729       dfa->str_tree_storage = storage;
3730       dfa->str_tree_storage_idx = 0;
3731     }
3732   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3733
3734   tree->parent = NULL;
3735   tree->left = left;
3736   tree->right = right;
3737   tree->type = type;
3738   tree->node_idx = index;
3739   tree->first = -1;
3740   tree->next = -1;
3741   re_node_set_init_empty (&tree->eclosure);
3742
3743   if (left != NULL)
3744     left->parent = tree;
3745   if (right != NULL)
3746     right->parent = tree;
3747   return tree;
3748 }
3749
3750 /* Create both a DFA node and a tree for it.  */
3751
3752 static bin_tree_t *
3753 re_dfa_add_tree_node (dfa, left, right, token)
3754      re_dfa_t *dfa;
3755      bin_tree_t *left;
3756      bin_tree_t *right;
3757      const re_token_t *token;
3758 {
3759   int new_idx = re_dfa_add_node (dfa, *token, 0);
3760
3761   if (new_idx == -1)
3762     return NULL;
3763
3764   return create_tree (dfa, left, right, 0, new_idx);
3765 }
3766
3767 /* Mark the tree SRC as an optional subexpression.  */
3768
3769 static void
3770 mark_opt_subexp (src, dfa)
3771      const bin_tree_t *src;
3772      re_dfa_t *dfa;
3773 {
3774   /* Pass an OPT_SUBEXP_IDX which is != 1 if the duplicated tree is
3775      a subexpression.  */
3776   if (src->type == CONCAT
3777       && src->left->type == NON_TYPE
3778       && dfa->nodes[src->left->node_idx].type == OP_OPEN_SUBEXP)
3779     mark_opt_subexp_iter (src, dfa, dfa->nodes[src->left->node_idx].opr.idx);
3780 }
3781
3782
3783 /* Recursive tree walker for mark_opt_subexp.  */
3784
3785 static void
3786 mark_opt_subexp_iter (src, dfa, idx)
3787      const bin_tree_t *src;
3788      re_dfa_t *dfa;
3789      int idx;
3790 {
3791   int node_idx;
3792
3793   if (src->type == NON_TYPE)
3794     {
3795       node_idx = src->node_idx;
3796       if ((dfa->nodes[node_idx].type == OP_OPEN_SUBEXP
3797            || dfa->nodes[node_idx].type == OP_CLOSE_SUBEXP)
3798           && dfa->nodes[node_idx].opr.idx == idx)
3799         dfa->nodes[node_idx].opt_subexp = 1;
3800      }
3801
3802   if (src->left != NULL)
3803     mark_opt_subexp_iter (src->left, dfa, idx);
3804
3805   if (src->right != NULL)
3806     mark_opt_subexp_iter (src->right, dfa, idx);
3807 }
3808
3809
3810 /* Duplicate the node SRC, and return new node.  */
3811
3812 static bin_tree_t *
3813 duplicate_tree (src, dfa)
3814      const bin_tree_t *src;
3815      re_dfa_t *dfa;
3816 {
3817   bin_tree_t *left = NULL, *right = NULL, *new_tree;
3818   int new_node_idx;
3819   /* Since node indies must be according to Post-order of the tree,
3820      we must duplicate the left at first.  */
3821   if (src->left != NULL)
3822     {
3823       left = duplicate_tree (src->left, dfa);
3824       if (left == NULL)
3825         return NULL;
3826     }
3827
3828   /* Secondaly, duplicate the right.  */
3829   if (src->right != NULL)
3830     {
3831       right = duplicate_tree (src->right, dfa);
3832       if (right == NULL)
3833         return NULL;
3834     }
3835
3836   /* At last, duplicate itself.  */
3837   if (src->type == NON_TYPE)
3838     {
3839       new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3840       dfa->nodes[new_node_idx].duplicated = 1;
3841       if (BE (new_node_idx == -1, 0))
3842         return NULL;
3843     }
3844   else
3845     new_node_idx = src->type;
3846
3847   new_tree = create_tree (dfa, left, right, src->type, new_node_idx);
3848   return new_tree;
3849 }