posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2014 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #include <stdint.h>
  21
  22 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  23                                           size_t length, reg_syntax_t syntax);
  24 static void re_compile_fastmap_iter (regex_t *bufp,
  25                                      const re_dfastate_t *init_state,
  26                                      char *fastmap);
  27 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 static reg_errcode_t analyze (regex_t *preg);
  37 static reg_errcode_t preorder (bin_tree_t *root,
  38                                reg_errcode_t (fn (void *, bin_tree_t *)),
  39                                void *extra);
  40 static reg_errcode_t postorder (bin_tree_t *root,
  41                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  42                                 void *extra);
  43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  46                                  bin_tree_t *node);
  47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  50 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  51 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  52                                    unsigned int constraint);
  53 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  54 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  55                                          int node, int root);
  56 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  57 static int fetch_number (re_string_t *input, re_token_t *token,
  58                          reg_syntax_t syntax);
  59 static int peek_token (re_token_t *token, re_string_t *input,
  60                         reg_syntax_t syntax) internal_function;
  61 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  62                           reg_syntax_t syntax, reg_errcode_t *err);
  63 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  64                                   re_token_t *token, reg_syntax_t syntax,
  65                                   int nest, reg_errcode_t *err);
  66 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  67                                  re_token_t *token, reg_syntax_t syntax,
  68                                  int nest, reg_errcode_t *err);
  69 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  70                                      re_token_t *token, reg_syntax_t syntax,
  71                                      int nest, reg_errcode_t *err);
  72 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  73                                   re_token_t *token, reg_syntax_t syntax,
  74                                   int nest, reg_errcode_t *err);
  75 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  76                                  re_dfa_t *dfa, re_token_t *token,
  77                                  reg_syntax_t syntax, reg_errcode_t *err);
  78 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  79                                       re_token_t *token, reg_syntax_t syntax,
  80                                       reg_errcode_t *err);
  81 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  82                                             re_string_t *regexp,
  83                                             re_token_t *token, int token_len,
  84                                             re_dfa_t *dfa,
  85                                             reg_syntax_t syntax,
  86                                             int accept_hyphen);
  87 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  88                                           re_string_t *regexp,
  89                                           re_token_t *token);
  90 #ifdef RE_ENABLE_I18N
  91 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  92                                         re_charset_t *mbcset,
  93                                         int *equiv_class_alloc,
  94                                         const unsigned char *name);
  95 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  96                                       bitset_t sbcset,
  97                                       re_charset_t *mbcset,
  98                                       int *char_class_alloc,
  99                                       const unsigned char *class_name,
 100                                       reg_syntax_t syntax);
 101 #else  /* not RE_ENABLE_I18N */
 102 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 103                                         const unsigned char *name);
 104 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 105                                       bitset_t sbcset,
 106                                       const unsigned char *class_name,
 107                                       reg_syntax_t syntax);
 108 #endif /* not RE_ENABLE_I18N */
 109 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 110                                        RE_TRANSLATE_TYPE trans,
 111                                        const unsigned char *class_name,
 112                                        const unsigned char *extra,
 113                                        int non_match, reg_errcode_t *err);
 114 static bin_tree_t *create_tree (re_dfa_t *dfa,
 115                                 bin_tree_t *left, bin_tree_t *right,
 116                                 re_token_type_t type);
 117 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 118                                       bin_tree_t *left, bin_tree_t *right,
 119                                       const re_token_t *token);
 120 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 121 static void free_token (re_token_t *node);
 122 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 123 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 124 \f
 125 /* This table gives an error message for each of the error codes listed
 126    in regex.h.  Obviously the order here has to be same as there.
 127    POSIX doesn't require that we do anything for REG_NOERROR,
 128    but why not be nice?  */
 129
 130 const char __re_error_msgid[] attribute_hidden =
 131   {
 132 #define REG_NOERROR_IDX 0
 133     gettext_noop ("Success")    /* REG_NOERROR */
 134     "\0"
 135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 136     gettext_noop ("No match")   /* REG_NOMATCH */
 137     "\0"
 138 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 139     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 140     "\0"
 141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 142     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 143     "\0"
 144 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 145     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 146     "\0"
 147 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 148     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 149     "\0"
 150 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 151     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 152     "\0"
 153 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 154     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 155     "\0"
 156 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 157     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 158     "\0"
 159 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 160     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 161     "\0"
 162 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 163     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 164     "\0"
 165 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 166     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 167     "\0"
 168 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 169     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 170     "\0"
 171 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 172     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 173     "\0"
 174 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 175     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 176     "\0"
 177 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 178     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 179     "\0"
 180 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 181     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 182   };
 183
 184 const size_t __re_error_msgid_idx[] attribute_hidden =
 185   {
 186     REG_NOERROR_IDX,
 187     REG_NOMATCH_IDX,
 188     REG_BADPAT_IDX,
 189     REG_ECOLLATE_IDX,
 190     REG_ECTYPE_IDX,
 191     REG_EESCAPE_IDX,
 192     REG_ESUBREG_IDX,
 193     REG_EBRACK_IDX,
 194     REG_EPAREN_IDX,
 195     REG_EBRACE_IDX,
 196     REG_BADBR_IDX,
 197     REG_ERANGE_IDX,
 198     REG_ESPACE_IDX,
 199     REG_BADRPT_IDX,
 200     REG_EEND_IDX,
 201     REG_ESIZE_IDX,
 202     REG_ERPAREN_IDX
 203   };
 204 \f
 205 /* Entry points for GNU code.  */
 206
 207 /* re_compile_pattern is the GNU regular expression compiler: it
 208    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 209    Returns 0 if the pattern was valid, otherwise an error string.
 210
 211    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 212    are set in BUFP on entry.  */
 213
 214 const char *
 215 re_compile_pattern (pattern, length, bufp)
 216     const char *pattern;
 217     size_t length;
 218     struct re_pattern_buffer *bufp;
 219 {
 220   reg_errcode_t ret;
 221
 222   /* And GNU code determines whether or not to get register information
 223      by passing null for the REGS argument to re_match, etc., not by
 224      setting no_sub, unless RE_NO_SUB is set.  */
 225   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 226
 227   /* Match anchors at newline.  */
 228   bufp->newline_anchor = 1;
 229
 230   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 231
 232   if (!ret)
 233     return NULL;
 234   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 235 }
 236 #ifdef _LIBC
 237 weak_alias (__re_compile_pattern, re_compile_pattern)
 238 #endif
 239
 240 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 241    also be assigned to arbitrarily: each pattern buffer stores its own
 242    syntax, so it can be changed between regex compilations.  */
 243 /* This has no initializer because initialized variables in Emacs
 244    become read-only after dumping.  */
 245 reg_syntax_t re_syntax_options;
 246
 247
 248 /* Specify the precise syntax of regexps for compilation.  This provides
 249    for compatibility for various utilities which historically have
 250    different, incompatible syntaxes.
 251
 252    The argument SYNTAX is a bit mask comprised of the various bits
 253    defined in regex.h.  We return the old syntax.  */
 254
 255 reg_syntax_t
 256 re_set_syntax (syntax)
 257     reg_syntax_t syntax;
 258 {
 259   reg_syntax_t ret = re_syntax_options;
 260
 261   re_syntax_options = syntax;
 262   return ret;
 263 }
 264 #ifdef _LIBC
 265 weak_alias (__re_set_syntax, re_set_syntax)
 266 #endif
 267
 268 int
 269 re_compile_fastmap (bufp)
 270     struct re_pattern_buffer *bufp;
 271 {
 272   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 273   char *fastmap = bufp->fastmap;
 274
 275   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 276   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 277   if (dfa->init_state != dfa->init_state_word)
 278     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 279   if (dfa->init_state != dfa->init_state_nl)
 280     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 281   if (dfa->init_state != dfa->init_state_begbuf)
 282     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 283   bufp->fastmap_accurate = 1;
 284   return 0;
 285 }
 286 #ifdef _LIBC
 287 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 288 #endif
 289
 290 static inline void
 291 __attribute ((always_inline))
 292 re_set_fastmap (char *fastmap, int icase, int ch)
 293 {
 294   fastmap[ch] = 1;
 295   if (icase)
 296     fastmap[tolower (ch)] = 1;
 297 }
 298
 299 /* Helper function for re_compile_fastmap.
 300    Compile fastmap for the initial_state INIT_STATE.  */
 301
 302 static void
 303 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 304                          char *fastmap)
 305 {
 306   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 307   int node_cnt;
 308   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 309   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 310     {
 311       int node = init_state->nodes.elems[node_cnt];
 312       re_token_type_t type = dfa->nodes[node].type;
 313
 314       if (type == CHARACTER)
 315         {
 316           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 317 #ifdef RE_ENABLE_I18N
 318           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 319             {
 320               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 321               wchar_t wc;
 322               mbstate_t state;
 323
 324               p = buf;
 325               *p++ = dfa->nodes[node].opr.c;
 326               while (++node < dfa->nodes_len
 327                      && dfa->nodes[node].type == CHARACTER
 328                      && dfa->nodes[node].mb_partial)
 329                 *p++ = dfa->nodes[node].opr.c;
 330               memset (&state, '\0', sizeof (state));
 331               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 332                              &state) == p - buf
 333                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 334                       != (size_t) -1))
 335                 re_set_fastmap (fastmap, 0, buf[0]);
 336             }
 337 #endif
 338         }
 339       else if (type == SIMPLE_BRACKET)
 340         {
 341           int i, ch;
 342           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 343             {
 344               int j;
 345               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 346               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 347                 if (w & ((bitset_word_t) 1 << j))
 348                   re_set_fastmap (fastmap, icase, ch);
 349             }
 350         }
 351 #ifdef RE_ENABLE_I18N
 352       else if (type == COMPLEX_BRACKET)
 353         {
 354           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 355           int i;
 356
 357 # ifdef _LIBC
 358           /* See if we have to try all bytes which start multiple collation
 359              elements.
 360              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 361                   collation element, and don't catch 'b' since 'b' is
 362                   the only collation element which starts from 'b' (and
 363                   it is caught by SIMPLE_BRACKET).  */
 364               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 365                   && (cset->ncoll_syms || cset->nranges))
 366                 {
 367                   const int32_t *table = (const int32_t *)
 368                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 369                   for (i = 0; i < SBC_MAX; ++i)
 370                     if (table[i] < 0)
 371                       re_set_fastmap (fastmap, icase, i);
 372                 }
 373 # endif /* _LIBC */
 374
 375           /* See if we have to start the match at all multibyte characters,
 376              i.e. where we would not find an invalid sequence.  This only
 377              applies to multibyte character sets; for single byte character
 378              sets, the SIMPLE_BRACKET again suffices.  */
 379           if (dfa->mb_cur_max > 1
 380               && (cset->nchar_classes || cset->non_match || cset->nranges
 381 # ifdef _LIBC
 382                   || cset->nequiv_classes
 383 # endif /* _LIBC */
 384                  ))
 385             {
 386               unsigned char c = 0;
 387               do
 388                 {
 389                   mbstate_t mbs;
 390                   memset (&mbs, 0, sizeof (mbs));
 391                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 392                     re_set_fastmap (fastmap, false, (int) c);
 393                 }
 394               while (++c != 0);
 395             }
 396
 397           else
 398             {
 399               /* ... Else catch all bytes which can start the mbchars.  */
 400               for (i = 0; i < cset->nmbchars; ++i)
 401                 {
 402                   char buf[256];
 403                   mbstate_t state;
 404                   memset (&state, '\0', sizeof (state));
 405                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 406                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 407                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 408                     {
 409                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 410                           != (size_t) -1)
 411                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 412                     }
 413                 }
 414             }
 415         }
 416 #endif /* RE_ENABLE_I18N */
 417       else if (type == OP_PERIOD
 418 #ifdef RE_ENABLE_I18N
 419                || type == OP_UTF8_PERIOD
 420 #endif /* RE_ENABLE_I18N */
 421                || type == END_OF_RE)
 422         {
 423           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 424           if (type == END_OF_RE)
 425             bufp->can_be_null = 1;
 426           return;
 427         }
 428     }
 429 }
 430 \f
 431 /* Entry point for POSIX code.  */
 432 /* regcomp takes a regular expression as a string and compiles it.
 433
 434    PREG is a regex_t *.  We do not expect any fields to be initialized,
 435    since POSIX says we shouldn't.  Thus, we set
 436
 437      `buffer' to the compiled pattern;
 438      `used' to the length of the compiled pattern;
 439      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 440        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 441        RE_SYNTAX_POSIX_BASIC;
 442      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 443      `fastmap' to an allocated space for the fastmap;
 444      `fastmap_accurate' to zero;
 445      `re_nsub' to the number of subexpressions in PATTERN.
 446
 447    PATTERN is the address of the pattern string.
 448
 449    CFLAGS is a series of bits which affect compilation.
 450
 451      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 452      use POSIX basic syntax.
 453
 454      If REG_NEWLINE is set, then . and [^...] don't match newline.
 455      Also, regexec will try a match beginning after every newline.
 456
 457      If REG_ICASE is set, then we considers upper- and lowercase
 458      versions of letters to be equivalent when matching.
 459
 460      If REG_NOSUB is set, then when PREG is passed to regexec, that
 461      routine will report only success or failure, and nothing about the
 462      registers.
 463
 464    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 465    the return codes and their meanings.)  */
 466
 467 int
 468 regcomp (preg, pattern, cflags)
 469     regex_t *__restrict preg;
 470     const char *__restrict pattern;
 471     int cflags;
 472 {
 473   reg_errcode_t ret;
 474   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 475                          : RE_SYNTAX_POSIX_BASIC);
 476
 477   preg->buffer = NULL;
 478   preg->allocated = 0;
 479   preg->used = 0;
 480
 481   /* Try to allocate space for the fastmap.  */
 482   preg->fastmap = re_malloc (char, SBC_MAX);
 483   if (BE (preg->fastmap == NULL, 0))
 484     return REG_ESPACE;
 485
 486   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 487
 488   /* If REG_NEWLINE is set, newlines are treated differently.  */
 489   if (cflags & REG_NEWLINE)
 490     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 491       syntax &= ~RE_DOT_NEWLINE;
 492       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 493       /* It also changes the matching behavior.  */
 494       preg->newline_anchor = 1;
 495     }
 496   else
 497     preg->newline_anchor = 0;
 498   preg->no_sub = !!(cflags & REG_NOSUB);
 499   preg->translate = NULL;
 500
 501   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 502
 503   /* POSIX doesn't distinguish between an unmatched open-group and an
 504      unmatched close-group: both are REG_EPAREN.  */
 505   if (ret == REG_ERPAREN)
 506     ret = REG_EPAREN;
 507
 508   /* We have already checked preg->fastmap != NULL.  */
 509   if (BE (ret == REG_NOERROR, 1))
 510     /* Compute the fastmap now, since regexec cannot modify the pattern
 511        buffer.  This function never fails in this implementation.  */
 512     (void) re_compile_fastmap (preg);
 513   else
 514     {
 515       /* Some error occurred while compiling the expression.  */
 516       re_free (preg->fastmap);
 517       preg->fastmap = NULL;
 518     }
 519
 520   return (int) ret;
 521 }
 522 #ifdef _LIBC
 523 weak_alias (__regcomp, regcomp)
 524 #endif
 525
 526 /* Returns a message corresponding to an error code, ERRCODE, returned
 527    from either regcomp or regexec.   We don't use PREG here.  */
 528
 529 size_t
 530 regerror (errcode, preg, errbuf, errbuf_size)
 531     int errcode;
 532     const regex_t *__restrict preg;
 533     char *__restrict errbuf;
 534     size_t errbuf_size;
 535 {
 536   const char *msg;
 537   size_t msg_size;
 538
 539   if (BE (errcode < 0
 540           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 541                                / sizeof (__re_error_msgid_idx[0])), 0))
 542     /* Only error codes returned by the rest of the code should be passed
 543        to this routine.  If we are given anything else, or if other regex
 544        code generates an invalid error code, then the program has a bug.
 545        Dump core so we can fix it.  */
 546     abort ();
 547
 548   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 549
 550   msg_size = strlen (msg) + 1; /* Includes the null.  */
 551
 552   if (BE (errbuf_size != 0, 1))
 553     {
 554       if (BE (msg_size > errbuf_size, 0))
 555         {
 556 #if defined HAVE_MEMPCPY || defined _LIBC
 557           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 558 #else
 559           memcpy (errbuf, msg, errbuf_size - 1);
 560           errbuf[errbuf_size - 1] = 0;
 561 #endif
 562         }
 563       else
 564         memcpy (errbuf, msg, msg_size);
 565     }
 566
 567   return msg_size;
 568 }
 569 #ifdef _LIBC
 570 weak_alias (__regerror, regerror)
 571 #endif
 572
 573
 574 #ifdef RE_ENABLE_I18N
 575 /* This static array is used for the map to single-byte characters when
 576    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 577    it the same all the time.  UTF-8 is the preferred encoding so this is
 578    a worthwhile optimization.  */
 579 static const bitset_t utf8_sb_map =
 580 {
 581   /* Set the first 128 bits.  */
 582   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 583 };
 584 #endif
 585
 586
 587 static void
 588 free_dfa_content (re_dfa_t *dfa)
 589 {
 590   int i, j;
 591
 592   if (dfa->nodes)
 593     for (i = 0; i < dfa->nodes_len; ++i)
 594       free_token (dfa->nodes + i);
 595   re_free (dfa->nexts);
 596   for (i = 0; i < dfa->nodes_len; ++i)
 597     {
 598       if (dfa->eclosures != NULL)
 599         re_node_set_free (dfa->eclosures + i);
 600       if (dfa->inveclosures != NULL)
 601         re_node_set_free (dfa->inveclosures + i);
 602       if (dfa->edests != NULL)
 603         re_node_set_free (dfa->edests + i);
 604     }
 605   re_free (dfa->edests);
 606   re_free (dfa->eclosures);
 607   re_free (dfa->inveclosures);
 608   re_free (dfa->nodes);
 609
 610   if (dfa->state_table)
 611     for (i = 0; i <= dfa->state_hash_mask; ++i)
 612       {
 613         struct re_state_table_entry *entry = dfa->state_table + i;
 614         for (j = 0; j < entry->num; ++j)
 615           {
 616             re_dfastate_t *state = entry->array[j];
 617             free_state (state);
 618           }
 619         re_free (entry->array);
 620       }
 621   re_free (dfa->state_table);
 622 #ifdef RE_ENABLE_I18N
 623   if (dfa->sb_char != utf8_sb_map)
 624     re_free (dfa->sb_char);
 625 #endif
 626   re_free (dfa->subexp_map);
 627 #ifdef DEBUG
 628   re_free (dfa->re_str);
 629 #endif
 630
 631   re_free (dfa);
 632 }
 633
 634
 635 /* Free dynamically allocated space used by PREG.  */
 636
 637 void
 638 regfree (preg)
 639     regex_t *preg;
 640 {
 641   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 642   if (BE (dfa != NULL, 1))
 643     free_dfa_content (dfa);
 644   preg->buffer = NULL;
 645   preg->allocated = 0;
 646
 647   re_free (preg->fastmap);
 648   preg->fastmap = NULL;
 649
 650   re_free (preg->translate);
 651   preg->translate = NULL;
 652 }
 653 #ifdef _LIBC
 654 weak_alias (__regfree, regfree)
 655 #endif
 656 \f
 657 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 658    them unless specifically requested.  */
 659
 660 #if defined _REGEX_RE_COMP || defined _LIBC
 661
 662 /* BSD has one and only one pattern buffer.  */
 663 static struct re_pattern_buffer re_comp_buf;
 664
 665 char *
 666 # ifdef _LIBC
 667 /* Make these definitions weak in libc, so POSIX programs can redefine
 668    these names if they don't use our functions, and still use
 669    regcomp/regexec above without link errors.  */
 670 weak_function
 671 # endif
 672 re_comp (s)
 673      const char *s;
 674 {
 675   reg_errcode_t ret;
 676   char *fastmap;
 677
 678   if (!s)
 679     {
 680       if (!re_comp_buf.buffer)
 681         return gettext ("No previous regular expression");
 682       return 0;
 683     }
 684
 685   if (re_comp_buf.buffer)
 686     {
 687       fastmap = re_comp_buf.fastmap;
 688       re_comp_buf.fastmap = NULL;
 689       __regfree (&re_comp_buf);
 690       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 691       re_comp_buf.fastmap = fastmap;
 692     }
 693
 694   if (re_comp_buf.fastmap == NULL)
 695     {
 696       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 697       if (re_comp_buf.fastmap == NULL)
 698         return (char *) gettext (__re_error_msgid
 699                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 700     }
 701
 702   /* Since `re_exec' always passes NULL for the `regs' argument, we
 703      don't need to initialize the pattern buffer fields which affect it.  */
 704
 705   /* Match anchors at newlines.  */
 706   re_comp_buf.newline_anchor = 1;
 707
 708   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 709
 710   if (!ret)
 711     return NULL;
 712
 713   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 714   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 715 }
 716
 717 #ifdef _LIBC
 718 libc_freeres_fn (free_mem)
 719 {
 720   __regfree (&re_comp_buf);
 721 }
 722 #endif
 723
 724 #endif /* _REGEX_RE_COMP */
 725 \f
 726 /* Internal entry point.
 727    Compile the regular expression PATTERN, whose length is LENGTH.
 728    SYNTAX indicate regular expression's syntax.  */
 729
 730 static reg_errcode_t
 731 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 732                      reg_syntax_t syntax)
 733 {
 734   reg_errcode_t err = REG_NOERROR;
 735   re_dfa_t *dfa;
 736   re_string_t regexp;
 737
 738   /* Initialize the pattern buffer.  */
 739   preg->fastmap_accurate = 0;
 740   preg->syntax = syntax;
 741   preg->not_bol = preg->not_eol = 0;
 742   preg->used = 0;
 743   preg->re_nsub = 0;
 744   preg->can_be_null = 0;
 745   preg->regs_allocated = REGS_UNALLOCATED;
 746
 747   /* Initialize the dfa.  */
 748   dfa = (re_dfa_t *) preg->buffer;
 749   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 750     {
 751       /* If zero allocated, but buffer is non-null, try to realloc
 752          enough space.  This loses if buffer's address is bogus, but
 753          that is the user's responsibility.  If ->buffer is NULL this
 754          is a simple allocation.  */
 755       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 756       if (dfa == NULL)
 757         return REG_ESPACE;
 758       preg->allocated = sizeof (re_dfa_t);
 759       preg->buffer = (unsigned char *) dfa;
 760     }
 761   preg->used = sizeof (re_dfa_t);
 762
 763   err = init_dfa (dfa, length);
 764   if (BE (err != REG_NOERROR, 0))
 765     {
 766       free_dfa_content (dfa);
 767       preg->buffer = NULL;
 768       preg->allocated = 0;
 769       return err;
 770     }
 771 #ifdef DEBUG
 772   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 773   dfa->re_str = re_malloc (char, length + 1);
 774   strncpy (dfa->re_str, pattern, length + 1);
 775 #endif
 776
 777   __libc_lock_init (dfa->lock);
 778
 779   err = re_string_construct (&regexp, pattern, length, preg->translate,
 780                              syntax & RE_ICASE, dfa);
 781   if (BE (err != REG_NOERROR, 0))
 782     {
 783     re_compile_internal_free_return:
 784       free_workarea_compile (preg);
 785       re_string_destruct (&regexp);
 786       free_dfa_content (dfa);
 787       preg->buffer = NULL;
 788       preg->allocated = 0;
 789       return err;
 790     }
 791
 792   /* Parse the regular expression, and build a structure tree.  */
 793   preg->re_nsub = 0;
 794   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 795   if (BE (dfa->str_tree == NULL, 0))
 796     goto re_compile_internal_free_return;
 797
 798   /* Analyze the tree and create the nfa.  */
 799   err = analyze (preg);
 800   if (BE (err != REG_NOERROR, 0))
 801     goto re_compile_internal_free_return;
 802
 803 #ifdef RE_ENABLE_I18N
 804   /* If possible, do searching in single byte encoding to speed things up.  */
 805   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 806     optimize_utf8 (dfa);
 807 #endif
 808
 809   /* Then create the initial state of the dfa.  */
 810   err = create_initial_state (dfa);
 811
 812   /* Release work areas.  */
 813   free_workarea_compile (preg);
 814   re_string_destruct (&regexp);
 815
 816   if (BE (err != REG_NOERROR, 0))
 817     {
 818       free_dfa_content (dfa);
 819       preg->buffer = NULL;
 820       preg->allocated = 0;
 821     }
 822
 823   return err;
 824 }
 825
 826 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 827    as the initial length of some arrays.  */
 828
 829 static reg_errcode_t
 830 init_dfa (re_dfa_t *dfa, size_t pat_len)
 831 {
 832   unsigned int table_size;
 833 #ifndef _LIBC
 834   char *codeset_name;
 835 #endif
 836
 837   memset (dfa, '\0', sizeof (re_dfa_t));
 838
 839   /* Force allocation of str_tree_storage the first time.  */
 840   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 841
 842   /* Avoid overflows.  */
 843   if (pat_len == SIZE_MAX)
 844     return REG_ESPACE;
 845
 846   dfa->nodes_alloc = pat_len + 1;
 847   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 848
 849   /*  table_size = 2 ^ ceil(log pat_len) */
 850   for (table_size = 1; ; table_size <<= 1)
 851     if (table_size > pat_len)
 852       break;
 853
 854   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 855   dfa->state_hash_mask = table_size - 1;
 856
 857   dfa->mb_cur_max = MB_CUR_MAX;
 858 #ifdef _LIBC
 859   if (dfa->mb_cur_max == 6
 860       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 861     dfa->is_utf8 = 1;
 862   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 863                        != 0);
 864 #else
 865 # ifdef HAVE_LANGINFO_CODESET
 866   codeset_name = nl_langinfo (CODESET);
 867 # else
 868   codeset_name = getenv ("LC_ALL");
 869   if (codeset_name == NULL || codeset_name[0] == '\0')
 870     codeset_name = getenv ("LC_CTYPE");
 871   if (codeset_name == NULL || codeset_name[0] == '\0')
 872     codeset_name = getenv ("LANG");
 873   if (codeset_name == NULL)
 874     codeset_name = "";
 875   else if (strchr (codeset_name, '.') !=  NULL)
 876     codeset_name = strchr (codeset_name, '.') + 1;
 877 # endif
 878
 879   if (strcasecmp (codeset_name, "UTF-8") == 0
 880       || strcasecmp (codeset_name, "UTF8") == 0)
 881     dfa->is_utf8 = 1;
 882
 883   /* We check exhaustively in the loop below if this charset is a
 884      superset of ASCII.  */
 885   dfa->map_notascii = 0;
 886 #endif
 887
 888 #ifdef RE_ENABLE_I18N
 889   if (dfa->mb_cur_max > 1)
 890     {
 891       if (dfa->is_utf8)
 892         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 893       else
 894         {
 895           int i, j, ch;
 896
 897           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 898           if (BE (dfa->sb_char == NULL, 0))
 899             return REG_ESPACE;
 900
 901           /* Set the bits corresponding to single byte chars.  */
 902           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 903             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 904               {
 905                 wint_t wch = __btowc (ch);
 906                 if (wch != WEOF)
 907                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 908 # ifndef _LIBC
 909                 if (isascii (ch) && wch != ch)
 910                   dfa->map_notascii = 1;
 911 # endif
 912               }
 913         }
 914     }
 915 #endif
 916
 917   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 918     return REG_ESPACE;
 919   return REG_NOERROR;
 920 }
 921
 922 /* Initialize WORD_CHAR table, which indicate which character is
 923    "word".  In this case "word" means that it is the word construction
 924    character used by some operators like "\<", "\>", etc.  */
 925
 926 static void
 927 internal_function
 928 init_word_char (re_dfa_t *dfa)
 929 {
 930   dfa->word_ops_used = 1;
 931   int i = 0;
 932   int ch = 0;
 933   if (BE (dfa->map_notascii == 0, 1))
 934     {
 935       if (sizeof (dfa->word_char[0]) == 8)
 936         {
 937           /* The extra temporaries here avoid "implicitly truncated"
 938              warnings in the case when this is dead code, i.e. 32-bit.  */
 939           const uint64_t wc0 = UINT64_C (0x03ff000000000000);
 940           const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
 941           dfa->word_char[0] = wc0;
 942           dfa->word_char[1] = wc1;
 943           i = 2;
 944         }
 945       else if (sizeof (dfa->word_char[0]) == 4)
 946         {
 947           dfa->word_char[0] = UINT32_C (0x00000000);
 948           dfa->word_char[1] = UINT32_C (0x03ff0000);
 949           dfa->word_char[2] = UINT32_C (0x87fffffe);
 950           dfa->word_char[3] = UINT32_C (0x07fffffe);
 951           i = 4;
 952         }
 953       else
 954         abort ();
 955       ch = 128;
 956
 957       if (BE (dfa->is_utf8, 1))
 958         {
 959           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 960           return;
 961         }
 962     }
 963
 964   for (; i < BITSET_WORDS; ++i)
 965     for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 966       if (isalnum (ch) || ch == '_')
 967         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 968 }
 969
 970 /* Free the work area which are only used while compiling.  */
 971
 972 static void
 973 free_workarea_compile (regex_t *preg)
 974 {
 975   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 976   bin_tree_storage_t *storage, *next;
 977   for (storage = dfa->str_tree_storage; storage; storage = next)
 978     {
 979       next = storage->next;
 980       re_free (storage);
 981     }
 982   dfa->str_tree_storage = NULL;
 983   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 984   dfa->str_tree = NULL;
 985   re_free (dfa->org_indices);
 986   dfa->org_indices = NULL;
 987 }
 988
 989 /* Create initial states for all contexts.  */
 990
 991 static reg_errcode_t
 992 create_initial_state (re_dfa_t *dfa)
 993 {
 994   int first, i;
 995   reg_errcode_t err;
 996   re_node_set init_nodes;
 997
 998   /* Initial states have the epsilon closure of the node which is
 999      the first node of the regular expression.  */
1000   first = dfa->str_tree->first->node_idx;
1001   dfa->init_node = first;
1002   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1003   if (BE (err != REG_NOERROR, 0))
1004     return err;
1005
1006   /* The back-references which are in initial states can epsilon transit,
1007      since in this case all of the subexpressions can be null.
1008      Then we add epsilon closures of the nodes which are the next nodes of
1009      the back-references.  */
1010   if (dfa->nbackref > 0)
1011     for (i = 0; i < init_nodes.nelem; ++i)
1012       {
1013         int node_idx = init_nodes.elems[i];
1014         re_token_type_t type = dfa->nodes[node_idx].type;
1015
1016         int clexp_idx;
1017         if (type != OP_BACK_REF)
1018           continue;
1019         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1020           {
1021             re_token_t *clexp_node;
1022             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1023             if (clexp_node->type == OP_CLOSE_SUBEXP
1024                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1025               break;
1026           }
1027         if (clexp_idx == init_nodes.nelem)
1028           continue;
1029
1030         if (type == OP_BACK_REF)
1031           {
1032             int dest_idx = dfa->edests[node_idx].elems[0];
1033             if (!re_node_set_contains (&init_nodes, dest_idx))
1034               {
1035                 reg_errcode_t err = re_node_set_merge (&init_nodes,
1036                                                        dfa->eclosures
1037                                                        + dest_idx);
1038                 if (err != REG_NOERROR)
1039                   return err;
1040                 i = 0;
1041               }
1042           }
1043       }
1044
1045   /* It must be the first time to invoke acquire_state.  */
1046   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1047   /* We don't check ERR here, since the initial state must not be NULL.  */
1048   if (BE (dfa->init_state == NULL, 0))
1049     return err;
1050   if (dfa->init_state->has_constraint)
1051     {
1052       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1053                                                        CONTEXT_WORD);
1054       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1055                                                      CONTEXT_NEWLINE);
1056       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1057                                                          &init_nodes,
1058                                                          CONTEXT_NEWLINE
1059                                                          | CONTEXT_BEGBUF);
1060       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1061               || dfa->init_state_begbuf == NULL, 0))
1062         return err;
1063     }
1064   else
1065     dfa->init_state_word = dfa->init_state_nl
1066       = dfa->init_state_begbuf = dfa->init_state;
1067
1068   re_node_set_free (&init_nodes);
1069   return REG_NOERROR;
1070 }
1071 \f
1072 #ifdef RE_ENABLE_I18N
1073 /* If it is possible to do searching in single byte encoding instead of UTF-8
1074    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1075    DFA nodes where needed.  */
1076
1077 static void
1078 optimize_utf8 (re_dfa_t *dfa)
1079 {
1080   int node, i, mb_chars = 0, has_period = 0;
1081
1082   for (node = 0; node < dfa->nodes_len; ++node)
1083     switch (dfa->nodes[node].type)
1084       {
1085       case CHARACTER:
1086         if (dfa->nodes[node].opr.c >= 0x80)
1087           mb_chars = 1;
1088         break;
1089       case ANCHOR:
1090         switch (dfa->nodes[node].opr.ctx_type)
1091           {
1092           case LINE_FIRST:
1093           case LINE_LAST:
1094           case BUF_FIRST:
1095           case BUF_LAST:
1096             break;
1097           default:
1098             /* Word anchors etc. cannot be handled.  It's okay to test
1099                opr.ctx_type since constraints (for all DFA nodes) are
1100                created by ORing one or more opr.ctx_type values.  */
1101             return;
1102           }
1103         break;
1104       case OP_PERIOD:
1105         has_period = 1;
1106         break;
1107       case OP_BACK_REF:
1108       case OP_ALT:
1109       case END_OF_RE:
1110       case OP_DUP_ASTERISK:
1111       case OP_OPEN_SUBEXP:
1112       case OP_CLOSE_SUBEXP:
1113         break;
1114       case COMPLEX_BRACKET:
1115         return;
1116       case SIMPLE_BRACKET:
1117         /* Just double check.  The non-ASCII range starts at 0x80.  */
1118         assert (0x80 % BITSET_WORD_BITS == 0);
1119         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1120           if (dfa->nodes[node].opr.sbcset[i])
1121             return;
1122         break;
1123       default:
1124         abort ();
1125       }
1126
1127   if (mb_chars || has_period)
1128     for (node = 0; node < dfa->nodes_len; ++node)
1129       {
1130         if (dfa->nodes[node].type == CHARACTER
1131             && dfa->nodes[node].opr.c >= 0x80)
1132           dfa->nodes[node].mb_partial = 0;
1133         else if (dfa->nodes[node].type == OP_PERIOD)
1134           dfa->nodes[node].type = OP_UTF8_PERIOD;
1135       }
1136
1137   /* The search can be in single byte locale.  */
1138   dfa->mb_cur_max = 1;
1139   dfa->is_utf8 = 0;
1140   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1141 }
1142 #endif
1143 \f
1144 /* Analyze the structure tree, and calculate "first", "next", "edest",
1145    "eclosure", and "inveclosure".  */
1146
1147 static reg_errcode_t
1148 analyze (regex_t *preg)
1149 {
1150   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1151   reg_errcode_t ret;
1152
1153   /* Allocate arrays.  */
1154   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1155   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1156   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1157   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1158   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1159           || dfa->eclosures == NULL, 0))
1160     return REG_ESPACE;
1161
1162   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1163   if (dfa->subexp_map != NULL)
1164     {
1165       int i;
1166       for (i = 0; i < preg->re_nsub; i++)
1167         dfa->subexp_map[i] = i;
1168       preorder (dfa->str_tree, optimize_subexps, dfa);
1169       for (i = 0; i < preg->re_nsub; i++)
1170         if (dfa->subexp_map[i] != i)
1171           break;
1172       if (i == preg->re_nsub)
1173         {
1174           free (dfa->subexp_map);
1175           dfa->subexp_map = NULL;
1176         }
1177     }
1178
1179   ret = postorder (dfa->str_tree, lower_subexps, preg);
1180   if (BE (ret != REG_NOERROR, 0))
1181     return ret;
1182   ret = postorder (dfa->str_tree, calc_first, dfa);
1183   if (BE (ret != REG_NOERROR, 0))
1184     return ret;
1185   preorder (dfa->str_tree, calc_next, dfa);
1186   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1187   if (BE (ret != REG_NOERROR, 0))
1188     return ret;
1189   ret = calc_eclosure (dfa);
1190   if (BE (ret != REG_NOERROR, 0))
1191     return ret;
1192
1193   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1194      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1195   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1196       || dfa->nbackref)
1197     {
1198       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1199       if (BE (dfa->inveclosures == NULL, 0))
1200         return REG_ESPACE;
1201       ret = calc_inveclosure (dfa);
1202     }
1203
1204   return ret;
1205 }
1206
1207 /* Our parse trees are very unbalanced, so we cannot use a stack to
1208    implement parse tree visits.  Instead, we use parent pointers and
1209    some hairy code in these two functions.  */
1210 static reg_errcode_t
1211 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1212            void *extra)
1213 {
1214   bin_tree_t *node, *prev;
1215
1216   for (node = root; ; )
1217     {
1218       /* Descend down the tree, preferably to the left (or to the right
1219          if that's the only child).  */
1220       while (node->left || node->right)
1221         if (node->left)
1222           node = node->left;
1223         else
1224           node = node->right;
1225
1226       do
1227         {
1228           reg_errcode_t err = fn (extra, node);
1229           if (BE (err != REG_NOERROR, 0))
1230             return err;
1231           if (node->parent == NULL)
1232             return REG_NOERROR;
1233           prev = node;
1234           node = node->parent;
1235         }
1236       /* Go up while we have a node that is reached from the right.  */
1237       while (node->right == prev || node->right == NULL);
1238       node = node->right;
1239     }
1240 }
1241
1242 static reg_errcode_t
1243 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1244           void *extra)
1245 {
1246   bin_tree_t *node;
1247
1248   for (node = root; ; )
1249     {
1250       reg_errcode_t err = fn (extra, node);
1251       if (BE (err != REG_NOERROR, 0))
1252         return err;
1253
1254       /* Go to the left node, or up and to the right.  */
1255       if (node->left)
1256         node = node->left;
1257       else
1258         {
1259           bin_tree_t *prev = NULL;
1260           while (node->right == prev || node->right == NULL)
1261             {
1262               prev = node;
1263               node = node->parent;
1264               if (!node)
1265                 return REG_NOERROR;
1266             }
1267           node = node->right;
1268         }
1269     }
1270 }
1271
1272 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1273    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1274    backreferences as well.  Requires a preorder visit.  */
1275 static reg_errcode_t
1276 optimize_subexps (void *extra, bin_tree_t *node)
1277 {
1278   re_dfa_t *dfa = (re_dfa_t *) extra;
1279
1280   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1281     {
1282       int idx = node->token.opr.idx;
1283       node->token.opr.idx = dfa->subexp_map[idx];
1284       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1285     }
1286
1287   else if (node->token.type == SUBEXP
1288            && node->left && node->left->token.type == SUBEXP)
1289     {
1290       int other_idx = node->left->token.opr.idx;
1291
1292       node->left = node->left->left;
1293       if (node->left)
1294         node->left->parent = node;
1295
1296       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1297       if (other_idx < BITSET_WORD_BITS)
1298           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1299     }
1300
1301   return REG_NOERROR;
1302 }
1303
1304 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1305    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1306 static reg_errcode_t
1307 lower_subexps (void *extra, bin_tree_t *node)
1308 {
1309   regex_t *preg = (regex_t *) extra;
1310   reg_errcode_t err = REG_NOERROR;
1311
1312   if (node->left && node->left->token.type == SUBEXP)
1313     {
1314       node->left = lower_subexp (&err, preg, node->left);
1315       if (node->left)
1316         node->left->parent = node;
1317     }
1318   if (node->right && node->right->token.type == SUBEXP)
1319     {
1320       node->right = lower_subexp (&err, preg, node->right);
1321       if (node->right)
1322         node->right->parent = node;
1323     }
1324
1325   return err;
1326 }
1327
1328 static bin_tree_t *
1329 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1330 {
1331   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1332   bin_tree_t *body = node->left;
1333   bin_tree_t *op, *cls, *tree1, *tree;
1334
1335   if (preg->no_sub
1336       /* We do not optimize empty subexpressions, because otherwise we may
1337          have bad CONCAT nodes with NULL children.  This is obviously not
1338          very common, so we do not lose much.  An example that triggers
1339          this case is the sed "script" /\(\)/x.  */
1340       && node->left != NULL
1341       && (node->token.opr.idx >= BITSET_WORD_BITS
1342           || !(dfa->used_bkref_map
1343                & ((bitset_word_t) 1 << node->token.opr.idx))))
1344     return node->left;
1345
1346   /* Convert the SUBEXP node to the concatenation of an
1347      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1348   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1349   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1350   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1351   tree = create_tree (dfa, op, tree1, CONCAT);
1352   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1353     {
1354       *err = REG_ESPACE;
1355       return NULL;
1356     }
1357
1358   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1359   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1360   return tree;
1361 }
1362
1363 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1364    nodes.  Requires a postorder visit.  */
1365 static reg_errcode_t
1366 calc_first (void *extra, bin_tree_t *node)
1367 {
1368   re_dfa_t *dfa = (re_dfa_t *) extra;
1369   if (node->token.type == CONCAT)
1370     {
1371       node->first = node->left->first;
1372       node->node_idx = node->left->node_idx;
1373     }
1374   else
1375     {
1376       node->first = node;
1377       node->node_idx = re_dfa_add_node (dfa, node->token);
1378       if (BE (node->node_idx == -1, 0))
1379         return REG_ESPACE;
1380       if (node->token.type == ANCHOR)
1381         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1382     }
1383   return REG_NOERROR;
1384 }
1385
1386 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1387 static reg_errcode_t
1388 calc_next (void *extra, bin_tree_t *node)
1389 {
1390   switch (node->token.type)
1391     {
1392     case OP_DUP_ASTERISK:
1393       node->left->next = node;
1394       break;
1395     case CONCAT:
1396       node->left->next = node->right->first;
1397       node->right->next = node->next;
1398       break;
1399     default:
1400       if (node->left)
1401         node->left->next = node->next;
1402       if (node->right)
1403         node->right->next = node->next;
1404       break;
1405     }
1406   return REG_NOERROR;
1407 }
1408
1409 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1410 static reg_errcode_t
1411 link_nfa_nodes (void *extra, bin_tree_t *node)
1412 {
1413   re_dfa_t *dfa = (re_dfa_t *) extra;
1414   int idx = node->node_idx;
1415   reg_errcode_t err = REG_NOERROR;
1416
1417   switch (node->token.type)
1418     {
1419     case CONCAT:
1420       break;
1421
1422     case END_OF_RE:
1423       assert (node->next == NULL);
1424       break;
1425
1426     case OP_DUP_ASTERISK:
1427     case OP_ALT:
1428       {
1429         int left, right;
1430         dfa->has_plural_match = 1;
1431         if (node->left != NULL)
1432           left = node->left->first->node_idx;
1433         else
1434           left = node->next->node_idx;
1435         if (node->right != NULL)
1436           right = node->right->first->node_idx;
1437         else
1438           right = node->next->node_idx;
1439         assert (left > -1);
1440         assert (right > -1);
1441         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1442       }
1443       break;
1444
1445     case ANCHOR:
1446     case OP_OPEN_SUBEXP:
1447     case OP_CLOSE_SUBEXP:
1448       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1449       break;
1450
1451     case OP_BACK_REF:
1452       dfa->nexts[idx] = node->next->node_idx;
1453       if (node->token.type == OP_BACK_REF)
1454         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1455       break;
1456
1457     default:
1458       assert (!IS_EPSILON_NODE (node->token.type));
1459       dfa->nexts[idx] = node->next->node_idx;
1460       break;
1461     }
1462
1463   return err;
1464 }
1465
1466 /* Duplicate the epsilon closure of the node ROOT_NODE.
1467    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1468    to their own constraint.  */
1469
1470 static reg_errcode_t
1471 internal_function
1472 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1473                         int root_node, unsigned int init_constraint)
1474 {
1475   int org_node, clone_node, ret;
1476   unsigned int constraint = init_constraint;
1477   for (org_node = top_org_node, clone_node = top_clone_node;;)
1478     {
1479       int org_dest, clone_dest;
1480       if (dfa->nodes[org_node].type == OP_BACK_REF)
1481         {
1482           /* If the back reference epsilon-transit, its destination must
1483              also have the constraint.  Then duplicate the epsilon closure
1484              of the destination of the back reference, and store it in
1485              edests of the back reference.  */
1486           org_dest = dfa->nexts[org_node];
1487           re_node_set_empty (dfa->edests + clone_node);
1488           clone_dest = duplicate_node (dfa, org_dest, constraint);
1489           if (BE (clone_dest == -1, 0))
1490             return REG_ESPACE;
1491           dfa->nexts[clone_node] = dfa->nexts[org_node];
1492           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1493           if (BE (ret < 0, 0))
1494             return REG_ESPACE;
1495         }
1496       else if (dfa->edests[org_node].nelem == 0)
1497         {
1498           /* In case of the node can't epsilon-transit, don't duplicate the
1499              destination and store the original destination as the
1500              destination of the node.  */
1501           dfa->nexts[clone_node] = dfa->nexts[org_node];
1502           break;
1503         }
1504       else if (dfa->edests[org_node].nelem == 1)
1505         {
1506           /* In case of the node can epsilon-transit, and it has only one
1507              destination.  */
1508           org_dest = dfa->edests[org_node].elems[0];
1509           re_node_set_empty (dfa->edests + clone_node);
1510           /* If the node is root_node itself, it means the epsilon clsoure
1511              has a loop.   Then tie it to the destination of the root_node.  */
1512           if (org_node == root_node && clone_node != org_node)
1513             {
1514               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1515               if (BE (ret < 0, 0))
1516                 return REG_ESPACE;
1517               break;
1518             }
1519           /* In case of the node has another constraint, add it.  */
1520           constraint |= dfa->nodes[org_node].constraint;
1521           clone_dest = duplicate_node (dfa, org_dest, constraint);
1522           if (BE (clone_dest == -1, 0))
1523             return REG_ESPACE;
1524           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1525           if (BE (ret < 0, 0))
1526             return REG_ESPACE;
1527         }
1528       else /* dfa->edests[org_node].nelem == 2 */
1529         {
1530           /* In case of the node can epsilon-transit, and it has two
1531              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1532           org_dest = dfa->edests[org_node].elems[0];
1533           re_node_set_empty (dfa->edests + clone_node);
1534           /* Search for a duplicated node which satisfies the constraint.  */
1535           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1536           if (clone_dest == -1)
1537             {
1538               /* There is no such duplicated node, create a new one.  */
1539               reg_errcode_t err;
1540               clone_dest = duplicate_node (dfa, org_dest, constraint);
1541               if (BE (clone_dest == -1, 0))
1542                 return REG_ESPACE;
1543               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1544               if (BE (ret < 0, 0))
1545                 return REG_ESPACE;
1546               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1547                                             root_node, constraint);
1548               if (BE (err != REG_NOERROR, 0))
1549                 return err;
1550             }
1551           else
1552             {
1553               /* There is a duplicated node which satisfies the constraint,
1554                  use it to avoid infinite loop.  */
1555               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556               if (BE (ret < 0, 0))
1557                 return REG_ESPACE;
1558             }
1559
1560           org_dest = dfa->edests[org_node].elems[1];
1561           clone_dest = duplicate_node (dfa, org_dest, constraint);
1562           if (BE (clone_dest == -1, 0))
1563             return REG_ESPACE;
1564           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1565           if (BE (ret < 0, 0))
1566             return REG_ESPACE;
1567         }
1568       org_node = org_dest;
1569       clone_node = clone_dest;
1570     }
1571   return REG_NOERROR;
1572 }
1573
1574 /* Search for a node which is duplicated from the node ORG_NODE, and
1575    satisfies the constraint CONSTRAINT.  */
1576
1577 static int
1578 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1579                         unsigned int constraint)
1580 {
1581   int idx;
1582   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1583     {
1584       if (org_node == dfa->org_indices[idx]
1585           && constraint == dfa->nodes[idx].constraint)
1586         return idx; /* Found.  */
1587     }
1588   return -1; /* Not found.  */
1589 }
1590
1591 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1592    Return the index of the new node, or -1 if insufficient storage is
1593    available.  */
1594
1595 static int
1596 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1597 {
1598   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1599   if (BE (dup_idx != -1, 1))
1600     {
1601       dfa->nodes[dup_idx].constraint = constraint;
1602       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1603       dfa->nodes[dup_idx].duplicated = 1;
1604
1605       /* Store the index of the original node.  */
1606       dfa->org_indices[dup_idx] = org_idx;
1607     }
1608   return dup_idx;
1609 }
1610
1611 static reg_errcode_t
1612 calc_inveclosure (re_dfa_t *dfa)
1613 {
1614   int src, idx, ret;
1615   for (idx = 0; idx < dfa->nodes_len; ++idx)
1616     re_node_set_init_empty (dfa->inveclosures + idx);
1617
1618   for (src = 0; src < dfa->nodes_len; ++src)
1619     {
1620       int *elems = dfa->eclosures[src].elems;
1621       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1622         {
1623           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1624           if (BE (ret == -1, 0))
1625             return REG_ESPACE;
1626         }
1627     }
1628
1629   return REG_NOERROR;
1630 }
1631
1632 /* Calculate "eclosure" for all the node in DFA.  */
1633
1634 static reg_errcode_t
1635 calc_eclosure (re_dfa_t *dfa)
1636 {
1637   int node_idx, incomplete;
1638 #ifdef DEBUG
1639   assert (dfa->nodes_len > 0);
1640 #endif
1641   incomplete = 0;
1642   /* For each nodes, calculate epsilon closure.  */
1643   for (node_idx = 0; ; ++node_idx)
1644     {
1645       reg_errcode_t err;
1646       re_node_set eclosure_elem;
1647       if (node_idx == dfa->nodes_len)
1648         {
1649           if (!incomplete)
1650             break;
1651           incomplete = 0;
1652           node_idx = 0;
1653         }
1654
1655 #ifdef DEBUG
1656       assert (dfa->eclosures[node_idx].nelem != -1);
1657 #endif
1658
1659       /* If we have already calculated, skip it.  */
1660       if (dfa->eclosures[node_idx].nelem != 0)
1661         continue;
1662       /* Calculate epsilon closure of `node_idx'.  */
1663       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1664       if (BE (err != REG_NOERROR, 0))
1665         return err;
1666
1667       if (dfa->eclosures[node_idx].nelem == 0)
1668         {
1669           incomplete = 1;
1670           re_node_set_free (&eclosure_elem);
1671         }
1672     }
1673   return REG_NOERROR;
1674 }
1675
1676 /* Calculate epsilon closure of NODE.  */
1677
1678 static reg_errcode_t
1679 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1680 {
1681   reg_errcode_t err;
1682   int i;
1683   re_node_set eclosure;
1684   int ret;
1685   int incomplete = 0;
1686   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1687   if (BE (err != REG_NOERROR, 0))
1688     return err;
1689
1690   /* This indicates that we are calculating this node now.
1691      We reference this value to avoid infinite loop.  */
1692   dfa->eclosures[node].nelem = -1;
1693
1694   /* If the current node has constraints, duplicate all nodes
1695      since they must inherit the constraints.  */
1696   if (dfa->nodes[node].constraint
1697       && dfa->edests[node].nelem
1698       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1699     {
1700       err = duplicate_node_closure (dfa, node, node, node,
1701                                     dfa->nodes[node].constraint);
1702       if (BE (err != REG_NOERROR, 0))
1703         return err;
1704     }
1705
1706   /* Expand each epsilon destination nodes.  */
1707   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1708     for (i = 0; i < dfa->edests[node].nelem; ++i)
1709       {
1710         re_node_set eclosure_elem;
1711         int edest = dfa->edests[node].elems[i];
1712         /* If calculating the epsilon closure of `edest' is in progress,
1713            return intermediate result.  */
1714         if (dfa->eclosures[edest].nelem == -1)
1715           {
1716             incomplete = 1;
1717             continue;
1718           }
1719         /* If we haven't calculated the epsilon closure of `edest' yet,
1720            calculate now. Otherwise use calculated epsilon closure.  */
1721         if (dfa->eclosures[edest].nelem == 0)
1722           {
1723             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1724             if (BE (err != REG_NOERROR, 0))
1725               return err;
1726           }
1727         else
1728           eclosure_elem = dfa->eclosures[edest];
1729         /* Merge the epsilon closure of `edest'.  */
1730         err = re_node_set_merge (&eclosure, &eclosure_elem);
1731         if (BE (err != REG_NOERROR, 0))
1732           return err;
1733         /* If the epsilon closure of `edest' is incomplete,
1734            the epsilon closure of this node is also incomplete.  */
1735         if (dfa->eclosures[edest].nelem == 0)
1736           {
1737             incomplete = 1;
1738             re_node_set_free (&eclosure_elem);
1739           }
1740       }
1741
1742   /* An epsilon closure includes itself.  */
1743   ret = re_node_set_insert (&eclosure, node);
1744   if (BE (ret < 0, 0))
1745     return REG_ESPACE;
1746   if (incomplete && !root)
1747     dfa->eclosures[node].nelem = 0;
1748   else
1749     dfa->eclosures[node] = eclosure;
1750   *new_set = eclosure;
1751   return REG_NOERROR;
1752 }
1753 \f
1754 /* Functions for token which are used in the parser.  */
1755
1756 /* Fetch a token from INPUT.
1757    We must not use this function inside bracket expressions.  */
1758
1759 static void
1760 internal_function
1761 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1762 {
1763   re_string_skip_bytes (input, peek_token (result, input, syntax));
1764 }
1765
1766 /* Peek a token from INPUT, and return the length of the token.
1767    We must not use this function inside bracket expressions.  */
1768
1769 static int
1770 internal_function
1771 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1772 {
1773   unsigned char c;
1774
1775   if (re_string_eoi (input))
1776     {
1777       token->type = END_OF_RE;
1778       return 0;
1779     }
1780
1781   c = re_string_peek_byte (input, 0);
1782   token->opr.c = c;
1783
1784   token->word_char = 0;
1785 #ifdef RE_ENABLE_I18N
1786   token->mb_partial = 0;
1787   if (input->mb_cur_max > 1 &&
1788       !re_string_first_byte (input, re_string_cur_idx (input)))
1789     {
1790       token->type = CHARACTER;
1791       token->mb_partial = 1;
1792       return 1;
1793     }
1794 #endif
1795   if (c == '\\')
1796     {
1797       unsigned char c2;
1798       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1799         {
1800           token->type = BACK_SLASH;
1801           return 1;
1802         }
1803
1804       c2 = re_string_peek_byte_case (input, 1);
1805       token->opr.c = c2;
1806       token->type = CHARACTER;
1807 #ifdef RE_ENABLE_I18N
1808       if (input->mb_cur_max > 1)
1809         {
1810           wint_t wc = re_string_wchar_at (input,
1811                                           re_string_cur_idx (input) + 1);
1812           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1813         }
1814       else
1815 #endif
1816         token->word_char = IS_WORD_CHAR (c2) != 0;
1817
1818       switch (c2)
1819         {
1820         case '|':
1821           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1822             token->type = OP_ALT;
1823           break;
1824         case '1': case '2': case '3': case '4': case '5':
1825         case '6': case '7': case '8': case '9':
1826           if (!(syntax & RE_NO_BK_REFS))
1827             {
1828               token->type = OP_BACK_REF;
1829               token->opr.idx = c2 - '1';
1830             }
1831           break;
1832         case '<':
1833           if (!(syntax & RE_NO_GNU_OPS))
1834             {
1835               token->type = ANCHOR;
1836               token->opr.ctx_type = WORD_FIRST;
1837             }
1838           break;
1839         case '>':
1840           if (!(syntax & RE_NO_GNU_OPS))
1841             {
1842               token->type = ANCHOR;
1843               token->opr.ctx_type = WORD_LAST;
1844             }
1845           break;
1846         case 'b':
1847           if (!(syntax & RE_NO_GNU_OPS))
1848             {
1849               token->type = ANCHOR;
1850               token->opr.ctx_type = WORD_DELIM;
1851             }
1852           break;
1853         case 'B':
1854           if (!(syntax & RE_NO_GNU_OPS))
1855             {
1856               token->type = ANCHOR;
1857               token->opr.ctx_type = NOT_WORD_DELIM;
1858             }
1859           break;
1860         case 'w':
1861           if (!(syntax & RE_NO_GNU_OPS))
1862             token->type = OP_WORD;
1863           break;
1864         case 'W':
1865           if (!(syntax & RE_NO_GNU_OPS))
1866             token->type = OP_NOTWORD;
1867           break;
1868         case 's':
1869           if (!(syntax & RE_NO_GNU_OPS))
1870             token->type = OP_SPACE;
1871           break;
1872         case 'S':
1873           if (!(syntax & RE_NO_GNU_OPS))
1874             token->type = OP_NOTSPACE;
1875           break;
1876         case '`':
1877           if (!(syntax & RE_NO_GNU_OPS))
1878             {
1879               token->type = ANCHOR;
1880               token->opr.ctx_type = BUF_FIRST;
1881             }
1882           break;
1883         case '\'':
1884           if (!(syntax & RE_NO_GNU_OPS))
1885             {
1886               token->type = ANCHOR;
1887               token->opr.ctx_type = BUF_LAST;
1888             }
1889           break;
1890         case '(':
1891           if (!(syntax & RE_NO_BK_PARENS))
1892             token->type = OP_OPEN_SUBEXP;
1893           break;
1894         case ')':
1895           if (!(syntax & RE_NO_BK_PARENS))
1896             token->type = OP_CLOSE_SUBEXP;
1897           break;
1898         case '+':
1899           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1900             token->type = OP_DUP_PLUS;
1901           break;
1902         case '?':
1903           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1904             token->type = OP_DUP_QUESTION;
1905           break;
1906         case '{':
1907           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1908             token->type = OP_OPEN_DUP_NUM;
1909           break;
1910         case '}':
1911           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1912             token->type = OP_CLOSE_DUP_NUM;
1913           break;
1914         default:
1915           break;
1916         }
1917       return 2;
1918     }
1919
1920   token->type = CHARACTER;
1921 #ifdef RE_ENABLE_I18N
1922   if (input->mb_cur_max > 1)
1923     {
1924       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1925       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1926     }
1927   else
1928 #endif
1929     token->word_char = IS_WORD_CHAR (token->opr.c);
1930
1931   switch (c)
1932     {
1933     case '\n':
1934       if (syntax & RE_NEWLINE_ALT)
1935         token->type = OP_ALT;
1936       break;
1937     case '|':
1938       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1939         token->type = OP_ALT;
1940       break;
1941     case '*':
1942       token->type = OP_DUP_ASTERISK;
1943       break;
1944     case '+':
1945       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1946         token->type = OP_DUP_PLUS;
1947       break;
1948     case '?':
1949       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1950         token->type = OP_DUP_QUESTION;
1951       break;
1952     case '{':
1953       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1954         token->type = OP_OPEN_DUP_NUM;
1955       break;
1956     case '}':
1957       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1958         token->type = OP_CLOSE_DUP_NUM;
1959       break;
1960     case '(':
1961       if (syntax & RE_NO_BK_PARENS)
1962         token->type = OP_OPEN_SUBEXP;
1963       break;
1964     case ')':
1965       if (syntax & RE_NO_BK_PARENS)
1966         token->type = OP_CLOSE_SUBEXP;
1967       break;
1968     case '[':
1969       token->type = OP_OPEN_BRACKET;
1970       break;
1971     case '.':
1972       token->type = OP_PERIOD;
1973       break;
1974     case '^':
1975       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1976           re_string_cur_idx (input) != 0)
1977         {
1978           char prev = re_string_peek_byte (input, -1);
1979           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1980             break;
1981         }
1982       token->type = ANCHOR;
1983       token->opr.ctx_type = LINE_FIRST;
1984       break;
1985     case '$':
1986       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1987           re_string_cur_idx (input) + 1 != re_string_length (input))
1988         {
1989           re_token_t next;
1990           re_string_skip_bytes (input, 1);
1991           peek_token (&next, input, syntax);
1992           re_string_skip_bytes (input, -1);
1993           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1994             break;
1995         }
1996       token->type = ANCHOR;
1997       token->opr.ctx_type = LINE_LAST;
1998       break;
1999     default:
2000       break;
2001     }
2002   return 1;
2003 }
2004
2005 /* Peek a token from INPUT, and return the length of the token.
2006    We must not use this function out of bracket expressions.  */
2007
2008 static int
2009 internal_function
2010 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2011 {
2012   unsigned char c;
2013   if (re_string_eoi (input))
2014     {
2015       token->type = END_OF_RE;
2016       return 0;
2017     }
2018   c = re_string_peek_byte (input, 0);
2019   token->opr.c = c;
2020
2021 #ifdef RE_ENABLE_I18N
2022   if (input->mb_cur_max > 1 &&
2023       !re_string_first_byte (input, re_string_cur_idx (input)))
2024     {
2025       token->type = CHARACTER;
2026       return 1;
2027     }
2028 #endif /* RE_ENABLE_I18N */
2029
2030   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2031       && re_string_cur_idx (input) + 1 < re_string_length (input))
2032     {
2033       /* In this case, '\' escape a character.  */
2034       unsigned char c2;
2035       re_string_skip_bytes (input, 1);
2036       c2 = re_string_peek_byte (input, 0);
2037       token->opr.c = c2;
2038       token->type = CHARACTER;
2039       return 1;
2040     }
2041   if (c == '[') /* '[' is a special char in a bracket exps.  */
2042     {
2043       unsigned char c2;
2044       int token_len;
2045       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2046         c2 = re_string_peek_byte (input, 1);
2047       else
2048         c2 = 0;
2049       token->opr.c = c2;
2050       token_len = 2;
2051       switch (c2)
2052         {
2053         case '.':
2054           token->type = OP_OPEN_COLL_ELEM;
2055           break;
2056         case '=':
2057           token->type = OP_OPEN_EQUIV_CLASS;
2058           break;
2059         case ':':
2060           if (syntax & RE_CHAR_CLASSES)
2061             {
2062               token->type = OP_OPEN_CHAR_CLASS;
2063               break;
2064             }
2065           /* else fall through.  */
2066         default:
2067           token->type = CHARACTER;
2068           token->opr.c = c;
2069           token_len = 1;
2070           break;
2071         }
2072       return token_len;
2073     }
2074   switch (c)
2075     {
2076     case '-':
2077       token->type = OP_CHARSET_RANGE;
2078       break;
2079     case ']':
2080       token->type = OP_CLOSE_BRACKET;
2081       break;
2082     case '^':
2083       token->type = OP_NON_MATCH_LIST;
2084       break;
2085     default:
2086       token->type = CHARACTER;
2087     }
2088   return 1;
2089 }
2090 \f
2091 /* Functions for parser.  */
2092
2093 /* Entry point of the parser.
2094    Parse the regular expression REGEXP and return the structure tree.
2095    If an error is occured, ERR is set by error code, and return NULL.
2096    This function build the following tree, from regular expression <reg_exp>:
2097            CAT
2098            / \
2099           /   \
2100    <reg_exp>  EOR
2101
2102    CAT means concatenation.
2103    EOR means end of regular expression.  */
2104
2105 static bin_tree_t *
2106 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2107        reg_errcode_t *err)
2108 {
2109   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2110   bin_tree_t *tree, *eor, *root;
2111   re_token_t current_token;
2112   dfa->syntax = syntax;
2113   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2114   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2115   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2116     return NULL;
2117   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2118   if (tree != NULL)
2119     root = create_tree (dfa, tree, eor, CONCAT);
2120   else
2121     root = eor;
2122   if (BE (eor == NULL || root == NULL, 0))
2123     {
2124       *err = REG_ESPACE;
2125       return NULL;
2126     }
2127   return root;
2128 }
2129
2130 /* This function build the following tree, from regular expression
2131    <branch1>|<branch2>:
2132            ALT
2133            / \
2134           /   \
2135    <branch1> <branch2>
2136
2137    ALT means alternative, which represents the operator `|'.  */
2138
2139 static bin_tree_t *
2140 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2141                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2142 {
2143   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2144   bin_tree_t *tree, *branch = NULL;
2145   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2146   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2147     return NULL;
2148
2149   while (token->type == OP_ALT)
2150     {
2151       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2152       if (token->type != OP_ALT && token->type != END_OF_RE
2153           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2154         {
2155           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2156           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2157             {
2158               if (tree != NULL)
2159                 postorder (tree, free_tree, NULL);
2160               return NULL;
2161             }
2162         }
2163       else
2164         branch = NULL;
2165       tree = create_tree (dfa, tree, branch, OP_ALT);
2166       if (BE (tree == NULL, 0))
2167         {
2168           *err = REG_ESPACE;
2169           return NULL;
2170         }
2171     }
2172   return tree;
2173 }
2174
2175 /* This function build the following tree, from regular expression
2176    <exp1><exp2>:
2177         CAT
2178         / \
2179        /   \
2180    <exp1> <exp2>
2181
2182    CAT means concatenation.  */
2183
2184 static bin_tree_t *
2185 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2186               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2187 {
2188   bin_tree_t *tree, *exp;
2189   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2190   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2191   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2192     return NULL;
2193
2194   while (token->type != OP_ALT && token->type != END_OF_RE
2195          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2196     {
2197       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2198       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2199         {
2200           if (tree != NULL)
2201             postorder (tree, free_tree, NULL);
2202           return NULL;
2203         }
2204       if (tree != NULL && exp != NULL)
2205         {
2206           bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2207           if (newtree == NULL)
2208             {
2209               postorder (exp, free_tree, NULL);
2210               postorder (tree, free_tree, NULL);
2211               *err = REG_ESPACE;
2212               return NULL;
2213             }
2214           tree = newtree;
2215         }
2216       else if (tree == NULL)
2217         tree = exp;
2218       /* Otherwise exp == NULL, we don't need to create new tree.  */
2219     }
2220   return tree;
2221 }
2222
2223 /* This function build the following tree, from regular expression a*:
2224          *
2225          |
2226          a
2227 */
2228
2229 static bin_tree_t *
2230 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2231                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2232 {
2233   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2234   bin_tree_t *tree;
2235   switch (token->type)
2236     {
2237     case CHARACTER:
2238       tree = create_token_tree (dfa, NULL, NULL, token);
2239       if (BE (tree == NULL, 0))
2240         {
2241           *err = REG_ESPACE;
2242           return NULL;
2243         }
2244 #ifdef RE_ENABLE_I18N
2245       if (dfa->mb_cur_max > 1)
2246         {
2247           while (!re_string_eoi (regexp)
2248                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2249             {
2250               bin_tree_t *mbc_remain;
2251               fetch_token (token, regexp, syntax);
2252               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2253               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2254               if (BE (mbc_remain == NULL || tree == NULL, 0))
2255                 {
2256                   *err = REG_ESPACE;
2257                   return NULL;
2258                 }
2259             }
2260         }
2261 #endif
2262       break;
2263     case OP_OPEN_SUBEXP:
2264       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2265       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2266         return NULL;
2267       break;
2268     case OP_OPEN_BRACKET:
2269       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2270       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2271         return NULL;
2272       break;
2273     case OP_BACK_REF:
2274       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2275         {
2276           *err = REG_ESUBREG;
2277           return NULL;
2278         }
2279       dfa->used_bkref_map |= 1 << token->opr.idx;
2280       tree = create_token_tree (dfa, NULL, NULL, token);
2281       if (BE (tree == NULL, 0))
2282         {
2283           *err = REG_ESPACE;
2284           return NULL;
2285         }
2286       ++dfa->nbackref;
2287       dfa->has_mb_node = 1;
2288       break;
2289     case OP_OPEN_DUP_NUM:
2290       if (syntax & RE_CONTEXT_INVALID_DUP)
2291         {
2292           *err = REG_BADRPT;
2293           return NULL;
2294         }
2295       /* FALLTHROUGH */
2296     case OP_DUP_ASTERISK:
2297     case OP_DUP_PLUS:
2298     case OP_DUP_QUESTION:
2299       if (syntax & RE_CONTEXT_INVALID_OPS)
2300         {
2301           *err = REG_BADRPT;
2302           return NULL;
2303         }
2304       else if (syntax & RE_CONTEXT_INDEP_OPS)
2305         {
2306           fetch_token (token, regexp, syntax);
2307           return parse_expression (regexp, preg, token, syntax, nest, err);
2308         }
2309       /* else fall through  */
2310     case OP_CLOSE_SUBEXP:
2311       if ((token->type == OP_CLOSE_SUBEXP) &&
2312           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2313         {
2314           *err = REG_ERPAREN;
2315           return NULL;
2316         }
2317       /* else fall through  */
2318     case OP_CLOSE_DUP_NUM:
2319       /* We treat it as a normal character.  */
2320
2321       /* Then we can these characters as normal characters.  */
2322       token->type = CHARACTER;
2323       /* mb_partial and word_char bits should be initialized already
2324          by peek_token.  */
2325       tree = create_token_tree (dfa, NULL, NULL, token);
2326       if (BE (tree == NULL, 0))
2327         {
2328           *err = REG_ESPACE;
2329           return NULL;
2330         }
2331       break;
2332     case ANCHOR:
2333       if ((token->opr.ctx_type
2334            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2335           && dfa->word_ops_used == 0)
2336         init_word_char (dfa);
2337       if (token->opr.ctx_type == WORD_DELIM
2338           || token->opr.ctx_type == NOT_WORD_DELIM)
2339         {
2340           bin_tree_t *tree_first, *tree_last;
2341           if (token->opr.ctx_type == WORD_DELIM)
2342             {
2343               token->opr.ctx_type = WORD_FIRST;
2344               tree_first = create_token_tree (dfa, NULL, NULL, token);
2345               token->opr.ctx_type = WORD_LAST;
2346             }
2347           else
2348             {
2349               token->opr.ctx_type = INSIDE_WORD;
2350               tree_first = create_token_tree (dfa, NULL, NULL, token);
2351               token->opr.ctx_type = INSIDE_NOTWORD;
2352             }
2353           tree_last = create_token_tree (dfa, NULL, NULL, token);
2354           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2355           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2356             {
2357               *err = REG_ESPACE;
2358               return NULL;
2359             }
2360         }
2361       else
2362         {
2363           tree = create_token_tree (dfa, NULL, NULL, token);
2364           if (BE (tree == NULL, 0))
2365             {
2366               *err = REG_ESPACE;
2367               return NULL;
2368             }
2369         }
2370       /* We must return here, since ANCHORs can't be followed
2371          by repetition operators.
2372          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2373              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2374       fetch_token (token, regexp, syntax);
2375       return tree;
2376     case OP_PERIOD:
2377       tree = create_token_tree (dfa, NULL, NULL, token);
2378       if (BE (tree == NULL, 0))
2379         {
2380           *err = REG_ESPACE;
2381           return NULL;
2382         }
2383       if (dfa->mb_cur_max > 1)
2384         dfa->has_mb_node = 1;
2385       break;
2386     case OP_WORD:
2387     case OP_NOTWORD:
2388       tree = build_charclass_op (dfa, regexp->trans,
2389                                  (const unsigned char *) "alnum",
2390                                  (const unsigned char *) "_",
2391                                  token->type == OP_NOTWORD, err);
2392       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2393         return NULL;
2394       break;
2395     case OP_SPACE:
2396     case OP_NOTSPACE:
2397       tree = build_charclass_op (dfa, regexp->trans,
2398                                  (const unsigned char *) "space",
2399                                  (const unsigned char *) "",
2400                                  token->type == OP_NOTSPACE, err);
2401       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2402         return NULL;
2403       break;
2404     case OP_ALT:
2405     case END_OF_RE:
2406       return NULL;
2407     case BACK_SLASH:
2408       *err = REG_EESCAPE;
2409       return NULL;
2410     default:
2411       /* Must not happen?  */
2412 #ifdef DEBUG
2413       assert (0);
2414 #endif
2415       return NULL;
2416     }
2417   fetch_token (token, regexp, syntax);
2418
2419   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2420          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2421     {
2422       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2423       if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2424         {
2425           if (tree != NULL)
2426             postorder (tree, free_tree, NULL);
2427           return NULL;
2428         }
2429       tree = dup_tree;
2430       /* In BRE consecutive duplications are not allowed.  */
2431       if ((syntax & RE_CONTEXT_INVALID_DUP)
2432           && (token->type == OP_DUP_ASTERISK
2433               || token->type == OP_OPEN_DUP_NUM))
2434         {
2435           if (tree != NULL)
2436             postorder (tree, free_tree, NULL);
2437           *err = REG_BADRPT;
2438           return NULL;
2439         }
2440     }
2441
2442   return tree;
2443 }
2444
2445 /* This function build the following tree, from regular expression
2446    (<reg_exp>):
2447          SUBEXP
2448             |
2449         <reg_exp>
2450 */
2451
2452 static bin_tree_t *
2453 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2454                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2455 {
2456   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2457   bin_tree_t *tree;
2458   size_t cur_nsub;
2459   cur_nsub = preg->re_nsub++;
2460
2461   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2462
2463   /* The subexpression may be a null string.  */
2464   if (token->type == OP_CLOSE_SUBEXP)
2465     tree = NULL;
2466   else
2467     {
2468       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2469       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2470         {
2471           if (tree != NULL)
2472             postorder (tree, free_tree, NULL);
2473           *err = REG_EPAREN;
2474         }
2475       if (BE (*err != REG_NOERROR, 0))
2476         return NULL;
2477     }
2478
2479   if (cur_nsub <= '9' - '1')
2480     dfa->completed_bkref_map |= 1 << cur_nsub;
2481
2482   tree = create_tree (dfa, tree, NULL, SUBEXP);
2483   if (BE (tree == NULL, 0))
2484     {
2485       *err = REG_ESPACE;
2486       return NULL;
2487     }
2488   tree->token.opr.idx = cur_nsub;
2489   return tree;
2490 }
2491
2492 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2493
2494 static bin_tree_t *
2495 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2496               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2497 {
2498   bin_tree_t *tree = NULL, *old_tree = NULL;
2499   int i, start, end, start_idx = re_string_cur_idx (regexp);
2500   re_token_t start_token = *token;
2501
2502   if (token->type == OP_OPEN_DUP_NUM)
2503     {
2504       end = 0;
2505       start = fetch_number (regexp, token, syntax);
2506       if (start == -1)
2507         {
2508           if (token->type == CHARACTER && token->opr.c == ',')
2509             start = 0; /* We treat "{,m}" as "{0,m}".  */
2510           else
2511             {
2512               *err = REG_BADBR; /* <re>{} is invalid.  */
2513               return NULL;
2514             }
2515         }
2516       if (BE (start != -2, 1))
2517         {
2518           /* We treat "{n}" as "{n,n}".  */
2519           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2520                  : ((token->type == CHARACTER && token->opr.c == ',')
2521                     ? fetch_number (regexp, token, syntax) : -2));
2522         }
2523       if (BE (start == -2 || end == -2, 0))
2524         {
2525           /* Invalid sequence.  */
2526           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2527             {
2528               if (token->type == END_OF_RE)
2529                 *err = REG_EBRACE;
2530               else
2531                 *err = REG_BADBR;
2532
2533               return NULL;
2534             }
2535
2536           /* If the syntax bit is set, rollback.  */
2537           re_string_set_index (regexp, start_idx);
2538           *token = start_token;
2539           token->type = CHARACTER;
2540           /* mb_partial and word_char bits should be already initialized by
2541              peek_token.  */
2542           return elem;
2543         }
2544
2545       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2546         {
2547           /* First number greater than second.  */
2548           *err = REG_BADBR;
2549           return NULL;
2550         }
2551     }
2552   else
2553     {
2554       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2555       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2556     }
2557
2558   fetch_token (token, regexp, syntax);
2559
2560   if (BE (elem == NULL, 0))
2561     return NULL;
2562   if (BE (start == 0 && end == 0, 0))
2563     {
2564       postorder (elem, free_tree, NULL);
2565       return NULL;
2566     }
2567
2568   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2569   if (BE (start > 0, 0))
2570     {
2571       tree = elem;
2572       for (i = 2; i <= start; ++i)
2573         {
2574           elem = duplicate_tree (elem, dfa);
2575           tree = create_tree (dfa, tree, elem, CONCAT);
2576           if (BE (elem == NULL || tree == NULL, 0))
2577             goto parse_dup_op_espace;
2578         }
2579
2580       if (start == end)
2581         return tree;
2582
2583       /* Duplicate ELEM before it is marked optional.  */
2584       elem = duplicate_tree (elem, dfa);
2585       if (BE (elem == NULL, 0))
2586         goto parse_dup_op_espace;
2587       old_tree = tree;
2588     }
2589   else
2590     old_tree = NULL;
2591
2592   if (elem->token.type == SUBEXP)
2593     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2594
2595   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2596   if (BE (tree == NULL, 0))
2597     goto parse_dup_op_espace;
2598
2599   /* This loop is actually executed only when end != -1,
2600      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2601      already created the start+1-th copy.  */
2602   for (i = start + 2; i <= end; ++i)
2603     {
2604       elem = duplicate_tree (elem, dfa);
2605       tree = create_tree (dfa, tree, elem, CONCAT);
2606       if (BE (elem == NULL || tree == NULL, 0))
2607         goto parse_dup_op_espace;
2608
2609       tree = create_tree (dfa, tree, NULL, OP_ALT);
2610       if (BE (tree == NULL, 0))
2611         goto parse_dup_op_espace;
2612     }
2613
2614   if (old_tree)
2615     tree = create_tree (dfa, old_tree, tree, CONCAT);
2616
2617   return tree;
2618
2619  parse_dup_op_espace:
2620   *err = REG_ESPACE;
2621   return NULL;
2622 }
2623
2624 /* Size of the names for collating symbol/equivalence_class/character_class.
2625    I'm not sure, but maybe enough.  */
2626 #define BRACKET_NAME_BUF_SIZE 32
2627
2628 #ifndef _LIBC
2629   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2630      Build the range expression which starts from START_ELEM, and ends
2631      at END_ELEM.  The result are written to MBCSET and SBCSET.
2632      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2633      mbcset->range_ends, is a pointer argument sinse we may
2634      update it.  */
2635
2636 static reg_errcode_t
2637 internal_function
2638 # ifdef RE_ENABLE_I18N
2639 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2640                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2641 # else /* not RE_ENABLE_I18N */
2642 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2643                  bracket_elem_t *end_elem)
2644 # endif /* not RE_ENABLE_I18N */
2645 {
2646   unsigned int start_ch, end_ch;
2647   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2648   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2649           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2650           0))
2651     return REG_ERANGE;
2652
2653   /* We can handle no multi character collating elements without libc
2654      support.  */
2655   if (BE ((start_elem->type == COLL_SYM
2656            && strlen ((char *) start_elem->opr.name) > 1)
2657           || (end_elem->type == COLL_SYM
2658               && strlen ((char *) end_elem->opr.name) > 1), 0))
2659     return REG_ECOLLATE;
2660
2661 # ifdef RE_ENABLE_I18N
2662   {
2663     wchar_t wc;
2664     wint_t start_wc;
2665     wint_t end_wc;
2666     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2667
2668     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2669                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2670                    : 0));
2671     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2672               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2673                  : 0));
2674     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2675                 ? __btowc (start_ch) : start_elem->opr.wch);
2676     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2677               ? __btowc (end_ch) : end_elem->opr.wch);
2678     if (start_wc == WEOF || end_wc == WEOF)
2679       return REG_ECOLLATE;
2680     cmp_buf[0] = start_wc;
2681     cmp_buf[4] = end_wc;
2682     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2683       return REG_ERANGE;
2684
2685     /* Got valid collation sequence values, add them as a new entry.
2686        However, for !_LIBC we have no collation elements: if the
2687        character set is single byte, the single byte character set
2688        that we build below suffices.  parse_bracket_exp passes
2689        no MBCSET if dfa->mb_cur_max == 1.  */
2690     if (mbcset)
2691       {
2692         /* Check the space of the arrays.  */
2693         if (BE (*range_alloc == mbcset->nranges, 0))
2694           {
2695             /* There is not enough space, need realloc.  */
2696             wchar_t *new_array_start, *new_array_end;
2697             int new_nranges;
2698
2699             /* +1 in case of mbcset->nranges is 0.  */
2700             new_nranges = 2 * mbcset->nranges + 1;
2701             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2702                are NULL if *range_alloc == 0.  */
2703             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2704                                           new_nranges);
2705             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2706                                         new_nranges);
2707
2708             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2709               return REG_ESPACE;
2710
2711             mbcset->range_starts = new_array_start;
2712             mbcset->range_ends = new_array_end;
2713             *range_alloc = new_nranges;
2714           }
2715
2716         mbcset->range_starts[mbcset->nranges] = start_wc;
2717         mbcset->range_ends[mbcset->nranges++] = end_wc;
2718       }
2719
2720     /* Build the table for single byte characters.  */
2721     for (wc = 0; wc < SBC_MAX; ++wc)
2722       {
2723         cmp_buf[2] = wc;
2724         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2725             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2726           bitset_set (sbcset, wc);
2727       }
2728   }
2729 # else /* not RE_ENABLE_I18N */
2730   {
2731     unsigned int ch;
2732     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2733                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2734                    : 0));
2735     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2736               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2737                  : 0));
2738     if (start_ch > end_ch)
2739       return REG_ERANGE;
2740     /* Build the table for single byte characters.  */
2741     for (ch = 0; ch < SBC_MAX; ++ch)
2742       if (start_ch <= ch  && ch <= end_ch)
2743         bitset_set (sbcset, ch);
2744   }
2745 # endif /* not RE_ENABLE_I18N */
2746   return REG_NOERROR;
2747 }
2748 #endif /* not _LIBC */
2749
2750 #ifndef _LIBC
2751 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2752    Build the collating element which is represented by NAME.
2753    The result are written to MBCSET and SBCSET.
2754    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2755    pointer argument since we may update it.  */
2756
2757 static reg_errcode_t
2758 internal_function
2759 # ifdef RE_ENABLE_I18N
2760 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2761                         int *coll_sym_alloc, const unsigned char *name)
2762 # else /* not RE_ENABLE_I18N */
2763 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2764 # endif /* not RE_ENABLE_I18N */
2765 {
2766   size_t name_len = strlen ((const char *) name);
2767   if (BE (name_len != 1, 0))
2768     return REG_ECOLLATE;
2769   else
2770     {
2771       bitset_set (sbcset, name[0]);
2772       return REG_NOERROR;
2773     }
2774 }
2775 #endif /* not _LIBC */
2776
2777 /* This function parse bracket expression like "[abc]", "[a-c]",
2778    "[[.a-a.]]" etc.  */
2779
2780 static bin_tree_t *
2781 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2782                    reg_syntax_t syntax, reg_errcode_t *err)
2783 {
2784 #ifdef _LIBC
2785   const unsigned char *collseqmb;
2786   const char *collseqwc;
2787   uint32_t nrules;
2788   int32_t table_size;
2789   const int32_t *symb_table;
2790   const unsigned char *extra;
2791
2792   /* Local function for parse_bracket_exp used in _LIBC environement.
2793      Seek the collating symbol entry correspondings to NAME.
2794      Return the index of the symbol in the SYMB_TABLE,
2795      or -1 if not found.  */
2796
2797   auto inline int32_t
2798   __attribute ((always_inline))
2799   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2800     {
2801       int32_t elem;
2802
2803       for (elem = 0; elem < table_size; elem++)
2804         if (symb_table[2 * elem] != 0)
2805           {
2806             int32_t idx = symb_table[2 * elem + 1];
2807             /* Skip the name of collating element name.  */
2808             idx += 1 + extra[idx];
2809             if (/* Compare the length of the name.  */
2810                 name_len == extra[idx]
2811                 /* Compare the name.  */
2812                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2813               /* Yep, this is the entry.  */
2814               return elem;
2815           }
2816       return -1;
2817     }
2818
2819   /* Local function for parse_bracket_exp used in _LIBC environment.
2820      Look up the collation sequence value of BR_ELEM.
2821      Return the value if succeeded, UINT_MAX otherwise.  */
2822
2823   auto inline unsigned int
2824   __attribute ((always_inline))
2825   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2826     {
2827       if (br_elem->type == SB_CHAR)
2828         {
2829           /*
2830           if (MB_CUR_MAX == 1)
2831           */
2832           if (nrules == 0)
2833             return collseqmb[br_elem->opr.ch];
2834           else
2835             {
2836               wint_t wc = __btowc (br_elem->opr.ch);
2837               return __collseq_table_lookup (collseqwc, wc);
2838             }
2839         }
2840       else if (br_elem->type == MB_CHAR)
2841         {
2842           if (nrules != 0)
2843             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2844         }
2845       else if (br_elem->type == COLL_SYM)
2846         {
2847           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2848           if (nrules != 0)
2849             {
2850               int32_t elem, idx;
2851               elem = seek_collating_symbol_entry (br_elem->opr.name,
2852                                                   sym_name_len);
2853               if (elem != -1)
2854                 {
2855                   /* We found the entry.  */
2856                   idx = symb_table[2 * elem + 1];
2857                   /* Skip the name of collating element name.  */
2858                   idx += 1 + extra[idx];
2859                   /* Skip the byte sequence of the collating element.  */
2860                   idx += 1 + extra[idx];
2861                   /* Adjust for the alignment.  */
2862                   idx = (idx + 3) & ~3;
2863                   /* Skip the multibyte collation sequence value.  */
2864                   idx += sizeof (unsigned int);
2865                   /* Skip the wide char sequence of the collating element.  */
2866                   idx += sizeof (unsigned int) *
2867                     (1 + *(unsigned int *) (extra + idx));
2868                   /* Return the collation sequence value.  */
2869                   return *(unsigned int *) (extra + idx);
2870                 }
2871               else if (sym_name_len == 1)
2872                 {
2873                   /* No valid character.  Match it as a single byte
2874                      character.  */
2875                   return collseqmb[br_elem->opr.name[0]];
2876                 }
2877             }
2878           else if (sym_name_len == 1)
2879             return collseqmb[br_elem->opr.name[0]];
2880         }
2881       return UINT_MAX;
2882     }
2883
2884   /* Local function for parse_bracket_exp used in _LIBC environement.
2885      Build the range expression which starts from START_ELEM, and ends
2886      at END_ELEM.  The result are written to MBCSET and SBCSET.
2887      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2888      mbcset->range_ends, is a pointer argument sinse we may
2889      update it.  */
2890
2891   auto inline reg_errcode_t
2892   __attribute ((always_inline))
2893   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2894                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2895     {
2896       unsigned int ch;
2897       uint32_t start_collseq;
2898       uint32_t end_collseq;
2899
2900       /* Equivalence Classes and Character Classes can't be a range
2901          start/end.  */
2902       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2903               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2904               0))
2905         return REG_ERANGE;
2906
2907       start_collseq = lookup_collation_sequence_value (start_elem);
2908       end_collseq = lookup_collation_sequence_value (end_elem);
2909       /* Check start/end collation sequence values.  */
2910       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2911         return REG_ECOLLATE;
2912       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2913         return REG_ERANGE;
2914
2915       /* Got valid collation sequence values, add them as a new entry.
2916          However, if we have no collation elements, and the character set
2917          is single byte, the single byte character set that we
2918          build below suffices. */
2919       if (nrules > 0 || dfa->mb_cur_max > 1)
2920         {
2921           /* Check the space of the arrays.  */
2922           if (BE (*range_alloc == mbcset->nranges, 0))
2923             {
2924               /* There is not enough space, need realloc.  */
2925               uint32_t *new_array_start;
2926               uint32_t *new_array_end;
2927               int new_nranges;
2928
2929               /* +1 in case of mbcset->nranges is 0.  */
2930               new_nranges = 2 * mbcset->nranges + 1;
2931               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2932                                             new_nranges);
2933               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2934                                           new_nranges);
2935
2936               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2937                 return REG_ESPACE;
2938
2939               mbcset->range_starts = new_array_start;
2940               mbcset->range_ends = new_array_end;
2941               *range_alloc = new_nranges;
2942             }
2943
2944           mbcset->range_starts[mbcset->nranges] = start_collseq;
2945           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2946         }
2947
2948       /* Build the table for single byte characters.  */
2949       for (ch = 0; ch < SBC_MAX; ch++)
2950         {
2951           uint32_t ch_collseq;
2952           /*
2953           if (MB_CUR_MAX == 1)
2954           */
2955           if (nrules == 0)
2956             ch_collseq = collseqmb[ch];
2957           else
2958             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2959           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2960             bitset_set (sbcset, ch);
2961         }
2962       return REG_NOERROR;
2963     }
2964
2965   /* Local function for parse_bracket_exp used in _LIBC environement.
2966      Build the collating element which is represented by NAME.
2967      The result are written to MBCSET and SBCSET.
2968      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2969      pointer argument sinse we may update it.  */
2970
2971   auto inline reg_errcode_t
2972   __attribute ((always_inline))
2973   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2974                           int *coll_sym_alloc, const unsigned char *name)
2975     {
2976       int32_t elem, idx;
2977       size_t name_len = strlen ((const char *) name);
2978       if (nrules != 0)
2979         {
2980           elem = seek_collating_symbol_entry (name, name_len);
2981           if (elem != -1)
2982             {
2983               /* We found the entry.  */
2984               idx = symb_table[2 * elem + 1];
2985               /* Skip the name of collating element name.  */
2986               idx += 1 + extra[idx];
2987             }
2988           else if (name_len == 1)
2989             {
2990               /* No valid character, treat it as a normal
2991                  character.  */
2992               bitset_set (sbcset, name[0]);
2993               return REG_NOERROR;
2994             }
2995           else
2996             return REG_ECOLLATE;
2997
2998           /* Got valid collation sequence, add it as a new entry.  */
2999           /* Check the space of the arrays.  */
3000           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3001             {
3002               /* Not enough, realloc it.  */
3003               /* +1 in case of mbcset->ncoll_syms is 0.  */
3004               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3005               /* Use realloc since mbcset->coll_syms is NULL
3006                  if *alloc == 0.  */
3007               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3008                                                    new_coll_sym_alloc);
3009               if (BE (new_coll_syms == NULL, 0))
3010                 return REG_ESPACE;
3011               mbcset->coll_syms = new_coll_syms;
3012               *coll_sym_alloc = new_coll_sym_alloc;
3013             }
3014           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3015           return REG_NOERROR;
3016         }
3017       else
3018         {
3019           if (BE (name_len != 1, 0))
3020             return REG_ECOLLATE;
3021           else
3022             {
3023               bitset_set (sbcset, name[0]);
3024               return REG_NOERROR;
3025             }
3026         }
3027     }
3028 #endif
3029
3030   re_token_t br_token;
3031   re_bitset_ptr_t sbcset;
3032 #ifdef RE_ENABLE_I18N
3033   re_charset_t *mbcset;
3034   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3035   int equiv_class_alloc = 0, char_class_alloc = 0;
3036 #endif /* not RE_ENABLE_I18N */
3037   int non_match = 0;
3038   bin_tree_t *work_tree;
3039   int token_len;
3040   int first_round = 1;
3041 #ifdef _LIBC
3042   collseqmb = (const unsigned char *)
3043     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3044   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3045   if (nrules)
3046     {
3047       /*
3048       if (MB_CUR_MAX > 1)
3049       */
3050       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3051       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3052       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3053                                                   _NL_COLLATE_SYMB_TABLEMB);
3054       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3055                                                    _NL_COLLATE_SYMB_EXTRAMB);
3056     }
3057 #endif
3058   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3059 #ifdef RE_ENABLE_I18N
3060   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3061 #endif /* RE_ENABLE_I18N */
3062 #ifdef RE_ENABLE_I18N
3063   if (BE (sbcset == NULL || mbcset == NULL, 0))
3064 #else
3065   if (BE (sbcset == NULL, 0))
3066 #endif /* RE_ENABLE_I18N */
3067     {
3068       re_free (sbcset);
3069 #ifdef RE_ENABLE_I18N
3070       re_free (mbcset);
3071 #endif
3072       *err = REG_ESPACE;
3073       return NULL;
3074     }
3075
3076   token_len = peek_token_bracket (token, regexp, syntax);
3077   if (BE (token->type == END_OF_RE, 0))
3078     {
3079       *err = REG_BADPAT;
3080       goto parse_bracket_exp_free_return;
3081     }
3082   if (token->type == OP_NON_MATCH_LIST)
3083     {
3084 #ifdef RE_ENABLE_I18N
3085       mbcset->non_match = 1;
3086 #endif /* not RE_ENABLE_I18N */
3087       non_match = 1;
3088       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3089         bitset_set (sbcset, '\n');
3090       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3091       token_len = peek_token_bracket (token, regexp, syntax);
3092       if (BE (token->type == END_OF_RE, 0))
3093         {
3094           *err = REG_BADPAT;
3095           goto parse_bracket_exp_free_return;
3096         }
3097     }
3098
3099   /* We treat the first ']' as a normal character.  */
3100   if (token->type == OP_CLOSE_BRACKET)
3101     token->type = CHARACTER;
3102
3103   while (1)
3104     {
3105       bracket_elem_t start_elem, end_elem;
3106       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3107       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3108       reg_errcode_t ret;
3109       int token_len2 = 0, is_range_exp = 0;
3110       re_token_t token2;
3111
3112       start_elem.opr.name = start_name_buf;
3113       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3114                                    syntax, first_round);
3115       if (BE (ret != REG_NOERROR, 0))
3116         {
3117           *err = ret;
3118           goto parse_bracket_exp_free_return;
3119         }
3120       first_round = 0;
3121
3122       /* Get information about the next token.  We need it in any case.  */
3123       token_len = peek_token_bracket (token, regexp, syntax);
3124
3125       /* Do not check for ranges if we know they are not allowed.  */
3126       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3127         {
3128           if (BE (token->type == END_OF_RE, 0))
3129             {
3130               *err = REG_EBRACK;
3131               goto parse_bracket_exp_free_return;
3132             }
3133           if (token->type == OP_CHARSET_RANGE)
3134             {
3135               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3136               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3137               if (BE (token2.type == END_OF_RE, 0))
3138                 {
3139                   *err = REG_EBRACK;
3140                   goto parse_bracket_exp_free_return;
3141                 }
3142               if (token2.type == OP_CLOSE_BRACKET)
3143                 {
3144                   /* We treat the last '-' as a normal character.  */
3145                   re_string_skip_bytes (regexp, -token_len);
3146                   token->type = CHARACTER;
3147                 }
3148               else
3149                 is_range_exp = 1;
3150             }
3151         }
3152
3153       if (is_range_exp == 1)
3154         {
3155           end_elem.opr.name = end_name_buf;
3156           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3157                                        dfa, syntax, 1);
3158           if (BE (ret != REG_NOERROR, 0))
3159             {
3160               *err = ret;
3161               goto parse_bracket_exp_free_return;
3162             }
3163
3164           token_len = peek_token_bracket (token, regexp, syntax);
3165
3166 #ifdef _LIBC
3167           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3168                                   &start_elem, &end_elem);
3169 #else
3170 # ifdef RE_ENABLE_I18N
3171           *err = build_range_exp (sbcset,
3172                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3173                                   &range_alloc, &start_elem, &end_elem);
3174 # else
3175           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3176 # endif
3177 #endif /* RE_ENABLE_I18N */
3178           if (BE (*err != REG_NOERROR, 0))
3179             goto parse_bracket_exp_free_return;
3180         }
3181       else
3182         {
3183           switch (start_elem.type)
3184             {
3185             case SB_CHAR:
3186               bitset_set (sbcset, start_elem.opr.ch);
3187               break;
3188 #ifdef RE_ENABLE_I18N
3189             case MB_CHAR:
3190               /* Check whether the array has enough space.  */
3191               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3192                 {
3193                   wchar_t *new_mbchars;
3194                   /* Not enough, realloc it.  */
3195                   /* +1 in case of mbcset->nmbchars is 0.  */
3196                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3197                   /* Use realloc since array is NULL if *alloc == 0.  */
3198                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3199                                             mbchar_alloc);
3200                   if (BE (new_mbchars == NULL, 0))
3201                     goto parse_bracket_exp_espace;
3202                   mbcset->mbchars = new_mbchars;
3203                 }
3204               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3205               break;
3206 #endif /* RE_ENABLE_I18N */
3207             case EQUIV_CLASS:
3208               *err = build_equiv_class (sbcset,
3209 #ifdef RE_ENABLE_I18N
3210                                         mbcset, &equiv_class_alloc,
3211 #endif /* RE_ENABLE_I18N */
3212                                         start_elem.opr.name);
3213               if (BE (*err != REG_NOERROR, 0))
3214                 goto parse_bracket_exp_free_return;
3215               break;
3216             case COLL_SYM:
3217               *err = build_collating_symbol (sbcset,
3218 #ifdef RE_ENABLE_I18N
3219                                              mbcset, &coll_sym_alloc,
3220 #endif /* RE_ENABLE_I18N */
3221                                              start_elem.opr.name);
3222               if (BE (*err != REG_NOERROR, 0))
3223                 goto parse_bracket_exp_free_return;
3224               break;
3225             case CHAR_CLASS:
3226               *err = build_charclass (regexp->trans, sbcset,
3227 #ifdef RE_ENABLE_I18N
3228                                       mbcset, &char_class_alloc,
3229 #endif /* RE_ENABLE_I18N */
3230                                       start_elem.opr.name, syntax);
3231               if (BE (*err != REG_NOERROR, 0))
3232                goto parse_bracket_exp_free_return;
3233               break;
3234             default:
3235               assert (0);
3236               break;
3237             }
3238         }
3239       if (BE (token->type == END_OF_RE, 0))
3240         {
3241           *err = REG_EBRACK;
3242           goto parse_bracket_exp_free_return;
3243         }
3244       if (token->type == OP_CLOSE_BRACKET)
3245         break;
3246     }
3247
3248   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3249
3250   /* If it is non-matching list.  */
3251   if (non_match)
3252     bitset_not (sbcset);
3253
3254 #ifdef RE_ENABLE_I18N
3255   /* Ensure only single byte characters are set.  */
3256   if (dfa->mb_cur_max > 1)
3257     bitset_mask (sbcset, dfa->sb_char);
3258
3259   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3260       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3261                                                      || mbcset->non_match)))
3262     {
3263       bin_tree_t *mbc_tree;
3264       int sbc_idx;
3265       /* Build a tree for complex bracket.  */
3266       dfa->has_mb_node = 1;
3267       br_token.type = COMPLEX_BRACKET;
3268       br_token.opr.mbcset = mbcset;
3269       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3270       if (BE (mbc_tree == NULL, 0))
3271         goto parse_bracket_exp_espace;
3272       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3273         if (sbcset[sbc_idx])
3274           break;
3275       /* If there are no bits set in sbcset, there is no point
3276          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3277       if (sbc_idx < BITSET_WORDS)
3278         {
3279           /* Build a tree for simple bracket.  */
3280           br_token.type = SIMPLE_BRACKET;
3281           br_token.opr.sbcset = sbcset;
3282           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3283           if (BE (work_tree == NULL, 0))
3284             goto parse_bracket_exp_espace;
3285
3286           /* Then join them by ALT node.  */
3287           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3288           if (BE (work_tree == NULL, 0))
3289             goto parse_bracket_exp_espace;
3290         }
3291       else
3292         {
3293           re_free (sbcset);
3294           work_tree = mbc_tree;
3295         }
3296     }
3297   else
3298 #endif /* not RE_ENABLE_I18N */
3299     {
3300 #ifdef RE_ENABLE_I18N
3301       free_charset (mbcset);
3302 #endif
3303       /* Build a tree for simple bracket.  */
3304       br_token.type = SIMPLE_BRACKET;
3305       br_token.opr.sbcset = sbcset;
3306       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3307       if (BE (work_tree == NULL, 0))
3308         goto parse_bracket_exp_espace;
3309     }
3310   return work_tree;
3311
3312  parse_bracket_exp_espace:
3313   *err = REG_ESPACE;
3314  parse_bracket_exp_free_return:
3315   re_free (sbcset);
3316 #ifdef RE_ENABLE_I18N
3317   free_charset (mbcset);
3318 #endif /* RE_ENABLE_I18N */
3319   return NULL;
3320 }
3321
3322 /* Parse an element in the bracket expression.  */
3323
3324 static reg_errcode_t
3325 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3326                        re_token_t *token, int token_len, re_dfa_t *dfa,
3327                        reg_syntax_t syntax, int accept_hyphen)
3328 {
3329 #ifdef RE_ENABLE_I18N
3330   int cur_char_size;
3331   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3332   if (cur_char_size > 1)
3333     {
3334       elem->type = MB_CHAR;
3335       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3336       re_string_skip_bytes (regexp, cur_char_size);
3337       return REG_NOERROR;
3338     }
3339 #endif /* RE_ENABLE_I18N */
3340   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3341   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3342       || token->type == OP_OPEN_EQUIV_CLASS)
3343     return parse_bracket_symbol (elem, regexp, token);
3344   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3345     {
3346       /* A '-' must only appear as anything but a range indicator before
3347          the closing bracket.  Everything else is an error.  */
3348       re_token_t token2;
3349       (void) peek_token_bracket (&token2, regexp, syntax);
3350       if (token2.type != OP_CLOSE_BRACKET)
3351         /* The actual error value is not standardized since this whole
3352            case is undefined.  But ERANGE makes good sense.  */
3353         return REG_ERANGE;
3354     }
3355   elem->type = SB_CHAR;
3356   elem->opr.ch = token->opr.c;
3357   return REG_NOERROR;
3358 }
3359
3360 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3361    such as [:<character_class>:], [.<collating_element>.], and
3362    [=<equivalent_class>=].  */
3363
3364 static reg_errcode_t
3365 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3366                       re_token_t *token)
3367 {
3368   unsigned char ch, delim = token->opr.c;
3369   int i = 0;
3370   if (re_string_eoi(regexp))
3371     return REG_EBRACK;
3372   for (;; ++i)
3373     {
3374       if (i >= BRACKET_NAME_BUF_SIZE)
3375         return REG_EBRACK;
3376       if (token->type == OP_OPEN_CHAR_CLASS)
3377         ch = re_string_fetch_byte_case (regexp);
3378       else
3379         ch = re_string_fetch_byte (regexp);
3380       if (re_string_eoi(regexp))
3381         return REG_EBRACK;
3382       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3383         break;
3384       elem->opr.name[i] = ch;
3385     }
3386   re_string_skip_bytes (regexp, 1);
3387   elem->opr.name[i] = '\0';
3388   switch (token->type)
3389     {
3390     case OP_OPEN_COLL_ELEM:
3391       elem->type = COLL_SYM;
3392       break;
3393     case OP_OPEN_EQUIV_CLASS:
3394       elem->type = EQUIV_CLASS;
3395       break;
3396     case OP_OPEN_CHAR_CLASS:
3397       elem->type = CHAR_CLASS;
3398       break;
3399     default:
3400       break;
3401     }
3402   return REG_NOERROR;
3403 }
3404
3405   /* Helper function for parse_bracket_exp.
3406      Build the equivalence class which is represented by NAME.
3407      The result are written to MBCSET and SBCSET.
3408      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3409      is a pointer argument sinse we may update it.  */
3410
3411 static reg_errcode_t
3412 #ifdef RE_ENABLE_I18N
3413 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3414                    int *equiv_class_alloc, const unsigned char *name)
3415 #else /* not RE_ENABLE_I18N */
3416 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3417 #endif /* not RE_ENABLE_I18N */
3418 {
3419 #ifdef _LIBC
3420   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3421   if (nrules != 0)
3422     {
3423       const int32_t *table, *indirect;
3424       const unsigned char *weights, *extra, *cp;
3425       unsigned char char_buf[2];
3426       int32_t idx1, idx2;
3427       unsigned int ch;
3428       size_t len;
3429       /* This #include defines a local function!  */
3430 # include <locale/weight.h>
3431       /* Calculate the index for equivalence class.  */
3432       cp = name;
3433       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3434       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3435                                                _NL_COLLATE_WEIGHTMB);
3436       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3437                                                    _NL_COLLATE_EXTRAMB);
3438       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3439                                                 _NL_COLLATE_INDIRECTMB);
3440       idx1 = findidx (&cp, -1);
3441       if (BE (idx1 == 0 || *cp != '\0', 0))
3442         /* This isn't a valid character.  */
3443         return REG_ECOLLATE;
3444
3445       /* Build single byte matcing table for this equivalence class.  */
3446       len = weights[idx1 & 0xffffff];
3447       for (ch = 0; ch < SBC_MAX; ++ch)
3448         {
3449           char_buf[0] = ch;
3450           cp = char_buf;
3451           idx2 = findidx (&cp, 1);
3452 /*
3453           idx2 = table[ch];
3454 */
3455           if (idx2 == 0)
3456             /* This isn't a valid character.  */
3457             continue;
3458           /* Compare only if the length matches and the collation rule
3459              index is the same.  */
3460           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3461             {
3462               int cnt = 0;
3463
3464               while (cnt <= len &&
3465                      weights[(idx1 & 0xffffff) + 1 + cnt]
3466                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3467                 ++cnt;
3468
3469               if (cnt > len)
3470                 bitset_set (sbcset, ch);
3471             }
3472         }
3473       /* Check whether the array has enough space.  */
3474       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3475         {
3476           /* Not enough, realloc it.  */
3477           /* +1 in case of mbcset->nequiv_classes is 0.  */
3478           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3479           /* Use realloc since the array is NULL if *alloc == 0.  */
3480           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3481                                                    int32_t,
3482                                                    new_equiv_class_alloc);
3483           if (BE (new_equiv_classes == NULL, 0))
3484             return REG_ESPACE;
3485           mbcset->equiv_classes = new_equiv_classes;
3486           *equiv_class_alloc = new_equiv_class_alloc;
3487         }
3488       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3489     }
3490   else
3491 #endif /* _LIBC */
3492     {
3493       if (BE (strlen ((const char *) name) != 1, 0))
3494         return REG_ECOLLATE;
3495       bitset_set (sbcset, *name);
3496     }
3497   return REG_NOERROR;
3498 }
3499
3500   /* Helper function for parse_bracket_exp.
3501      Build the character class which is represented by NAME.
3502      The result are written to MBCSET and SBCSET.
3503      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3504      is a pointer argument sinse we may update it.  */
3505
3506 static reg_errcode_t
3507 #ifdef RE_ENABLE_I18N
3508 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3509                  re_charset_t *mbcset, int *char_class_alloc,
3510                  const unsigned char *class_name, reg_syntax_t syntax)
3511 #else /* not RE_ENABLE_I18N */
3512 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3513                  const unsigned char *class_name, reg_syntax_t syntax)
3514 #endif /* not RE_ENABLE_I18N */
3515 {
3516   int i;
3517   const char *name = (const char *) class_name;
3518
3519   /* In case of REG_ICASE "upper" and "lower" match the both of
3520      upper and lower cases.  */
3521   if ((syntax & RE_ICASE)
3522       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3523     name = "alpha";
3524
3525 #ifdef RE_ENABLE_I18N
3526   /* Check the space of the arrays.  */
3527   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3528     {
3529       /* Not enough, realloc it.  */
3530       /* +1 in case of mbcset->nchar_classes is 0.  */
3531       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3532       /* Use realloc since array is NULL if *alloc == 0.  */
3533       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3534                                                new_char_class_alloc);
3535       if (BE (new_char_classes == NULL, 0))
3536         return REG_ESPACE;
3537       mbcset->char_classes = new_char_classes;
3538       *char_class_alloc = new_char_class_alloc;
3539     }
3540   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3541 #endif /* RE_ENABLE_I18N */
3542
3543 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3544   do {                                          \
3545     if (BE (trans != NULL, 0))                  \
3546       {                                         \
3547         for (i = 0; i < SBC_MAX; ++i)           \
3548           if (ctype_func (i))                   \
3549             bitset_set (sbcset, trans[i]);      \
3550       }                                         \
3551     else                                        \
3552       {                                         \
3553         for (i = 0; i < SBC_MAX; ++i)           \
3554           if (ctype_func (i))                   \
3555             bitset_set (sbcset, i);             \
3556       }                                         \
3557   } while (0)
3558
3559   if (strcmp (name, "alnum") == 0)
3560     BUILD_CHARCLASS_LOOP (isalnum);
3561   else if (strcmp (name, "cntrl") == 0)
3562     BUILD_CHARCLASS_LOOP (iscntrl);
3563   else if (strcmp (name, "lower") == 0)
3564     BUILD_CHARCLASS_LOOP (islower);
3565   else if (strcmp (name, "space") == 0)
3566     BUILD_CHARCLASS_LOOP (isspace);
3567   else if (strcmp (name, "alpha") == 0)
3568     BUILD_CHARCLASS_LOOP (isalpha);
3569   else if (strcmp (name, "digit") == 0)
3570     BUILD_CHARCLASS_LOOP (isdigit);
3571   else if (strcmp (name, "print") == 0)
3572     BUILD_CHARCLASS_LOOP (isprint);
3573   else if (strcmp (name, "upper") == 0)
3574     BUILD_CHARCLASS_LOOP (isupper);
3575   else if (strcmp (name, "blank") == 0)
3576     BUILD_CHARCLASS_LOOP (isblank);
3577   else if (strcmp (name, "graph") == 0)
3578     BUILD_CHARCLASS_LOOP (isgraph);
3579   else if (strcmp (name, "punct") == 0)
3580     BUILD_CHARCLASS_LOOP (ispunct);
3581   else if (strcmp (name, "xdigit") == 0)
3582     BUILD_CHARCLASS_LOOP (isxdigit);
3583   else
3584     return REG_ECTYPE;
3585
3586   return REG_NOERROR;
3587 }
3588
3589 static bin_tree_t *
3590 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3591                     const unsigned char *class_name,
3592                     const unsigned char *extra, int non_match,
3593                     reg_errcode_t *err)
3594 {
3595   re_bitset_ptr_t sbcset;
3596 #ifdef RE_ENABLE_I18N
3597   re_charset_t *mbcset;
3598   int alloc = 0;
3599 #endif /* not RE_ENABLE_I18N */
3600   reg_errcode_t ret;
3601   re_token_t br_token;
3602   bin_tree_t *tree;
3603
3604   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3605 #ifdef RE_ENABLE_I18N
3606   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3607 #endif /* RE_ENABLE_I18N */
3608
3609 #ifdef RE_ENABLE_I18N
3610   if (BE (sbcset == NULL || mbcset == NULL, 0))
3611 #else /* not RE_ENABLE_I18N */
3612   if (BE (sbcset == NULL, 0))
3613 #endif /* not RE_ENABLE_I18N */
3614     {
3615       *err = REG_ESPACE;
3616       return NULL;
3617     }
3618
3619   if (non_match)
3620     {
3621 #ifdef RE_ENABLE_I18N
3622       mbcset->non_match = 1;
3623 #endif /* not RE_ENABLE_I18N */
3624     }
3625
3626   /* We don't care the syntax in this case.  */
3627   ret = build_charclass (trans, sbcset,
3628 #ifdef RE_ENABLE_I18N
3629                          mbcset, &alloc,
3630 #endif /* RE_ENABLE_I18N */
3631                          class_name, 0);
3632
3633   if (BE (ret != REG_NOERROR, 0))
3634     {
3635       re_free (sbcset);
3636 #ifdef RE_ENABLE_I18N
3637       free_charset (mbcset);
3638 #endif /* RE_ENABLE_I18N */
3639       *err = ret;
3640       return NULL;
3641     }
3642   /* \w match '_' also.  */
3643   for (; *extra; extra++)
3644     bitset_set (sbcset, *extra);
3645
3646   /* If it is non-matching list.  */
3647   if (non_match)
3648     bitset_not (sbcset);
3649
3650 #ifdef RE_ENABLE_I18N
3651   /* Ensure only single byte characters are set.  */
3652   if (dfa->mb_cur_max > 1)
3653     bitset_mask (sbcset, dfa->sb_char);
3654 #endif
3655
3656   /* Build a tree for simple bracket.  */
3657   br_token.type = SIMPLE_BRACKET;
3658   br_token.opr.sbcset = sbcset;
3659   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3660   if (BE (tree == NULL, 0))
3661     goto build_word_op_espace;
3662
3663 #ifdef RE_ENABLE_I18N
3664   if (dfa->mb_cur_max > 1)
3665     {
3666       bin_tree_t *mbc_tree;
3667       /* Build a tree for complex bracket.  */
3668       br_token.type = COMPLEX_BRACKET;
3669       br_token.opr.mbcset = mbcset;
3670       dfa->has_mb_node = 1;
3671       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3672       if (BE (mbc_tree == NULL, 0))
3673         goto build_word_op_espace;
3674       /* Then join them by ALT node.  */
3675       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3676       if (BE (mbc_tree != NULL, 1))
3677         return tree;
3678     }
3679   else
3680     {
3681       free_charset (mbcset);
3682       return tree;
3683     }
3684 #else /* not RE_ENABLE_I18N */
3685   return tree;
3686 #endif /* not RE_ENABLE_I18N */
3687
3688  build_word_op_espace:
3689   re_free (sbcset);
3690 #ifdef RE_ENABLE_I18N
3691   free_charset (mbcset);
3692 #endif /* RE_ENABLE_I18N */
3693   *err = REG_ESPACE;
3694   return NULL;
3695 }
3696
3697 /* This is intended for the expressions like "a{1,3}".
3698    Fetch a number from `input', and return the number.
3699    Return -1, if the number field is empty like "{,1}".
3700    Return -2, If an error is occured.  */
3701
3702 static int
3703 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3704 {
3705   int num = -1;
3706   unsigned char c;
3707   while (1)
3708     {
3709       fetch_token (token, input, syntax);
3710       c = token->opr.c;
3711       if (BE (token->type == END_OF_RE, 0))
3712         return -2;
3713       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3714         break;
3715       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3716              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3717       num = (num > RE_DUP_MAX) ? -2 : num;
3718     }
3719   return num;
3720 }
3721 \f
3722 #ifdef RE_ENABLE_I18N
3723 static void
3724 free_charset (re_charset_t *cset)
3725 {
3726   re_free (cset->mbchars);
3727 # ifdef _LIBC
3728   re_free (cset->coll_syms);
3729   re_free (cset->equiv_classes);
3730   re_free (cset->range_starts);
3731   re_free (cset->range_ends);
3732 # endif
3733   re_free (cset->char_classes);
3734   re_free (cset);
3735 }
3736 #endif /* RE_ENABLE_I18N */
3737 \f
3738 /* Functions for binary tree operation.  */
3739
3740 /* Create a tree node.  */
3741
3742 static bin_tree_t *
3743 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3744              re_token_type_t type)
3745 {
3746   re_token_t t;
3747   t.type = type;
3748   return create_token_tree (dfa, left, right, &t);
3749 }
3750
3751 static bin_tree_t *
3752 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3753                    const re_token_t *token)
3754 {
3755   bin_tree_t *tree;
3756   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3757     {
3758       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3759
3760       if (storage == NULL)
3761         return NULL;
3762       storage->next = dfa->str_tree_storage;
3763       dfa->str_tree_storage = storage;
3764       dfa->str_tree_storage_idx = 0;
3765     }
3766   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3767
3768   tree->parent = NULL;
3769   tree->left = left;
3770   tree->right = right;
3771   tree->token = *token;
3772   tree->token.duplicated = 0;
3773   tree->token.opt_subexp = 0;
3774   tree->first = NULL;
3775   tree->next = NULL;
3776   tree->node_idx = -1;
3777
3778   if (left != NULL)
3779     left->parent = tree;
3780   if (right != NULL)
3781     right->parent = tree;
3782   return tree;
3783 }
3784
3785 /* Mark the tree SRC as an optional subexpression.
3786    To be called from preorder or postorder.  */
3787
3788 static reg_errcode_t
3789 mark_opt_subexp (void *extra, bin_tree_t *node)
3790 {
3791   int idx = (int) (long) extra;
3792   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3793     node->token.opt_subexp = 1;
3794
3795   return REG_NOERROR;
3796 }
3797
3798 /* Free the allocated memory inside NODE. */
3799
3800 static void
3801 free_token (re_token_t *node)
3802 {
3803 #ifdef RE_ENABLE_I18N
3804   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3805     free_charset (node->opr.mbcset);
3806   else
3807 #endif /* RE_ENABLE_I18N */
3808     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3809       re_free (node->opr.sbcset);
3810 }
3811
3812 /* Worker function for tree walking.  Free the allocated memory inside NODE
3813    and its children. */
3814
3815 static reg_errcode_t
3816 free_tree (void *extra, bin_tree_t *node)
3817 {
3818   free_token (&node->token);
3819   return REG_NOERROR;
3820 }
3821
3822
3823 /* Duplicate the node SRC, and return new node.  This is a preorder
3824    visit similar to the one implemented by the generic visitor, but
3825    we need more infrastructure to maintain two parallel trees --- so,
3826    it's easier to duplicate.  */
3827
3828 static bin_tree_t *
3829 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3830 {
3831   const bin_tree_t *node;
3832   bin_tree_t *dup_root;
3833   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3834
3835   for (node = root; ; )
3836     {
3837       /* Create a new tree and link it back to the current parent.  */
3838       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3839       if (*p_new == NULL)
3840         return NULL;
3841       (*p_new)->parent = dup_node;
3842       (*p_new)->token.duplicated = 1;
3843       dup_node = *p_new;
3844
3845       /* Go to the left node, or up and to the right.  */
3846       if (node->left)
3847         {
3848           node = node->left;
3849           p_new = &dup_node->left;
3850         }
3851       else
3852         {
3853           const bin_tree_t *prev = NULL;
3854           while (node->right == prev || node->right == NULL)
3855             {
3856               prev = node;
3857               node = node->parent;
3858               dup_node = dup_node->parent;
3859               if (!node)
3860                 return dup_root;
3861             }
3862           node = node->right;
3863           p_new = &dup_node->right;
3864         }
3865     }
3866 }