regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           size_t length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  49 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          int node, int root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static int fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax) internal_function;
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   int nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  int nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      int nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             int accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  90                                         re_charset_t *mbcset,
  91                                         int *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  94                                       bitset_t sbcset,
  95                                       re_charset_t *mbcset,
  96                                       int *char_class_alloc,
  97                                       const char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 103                                       bitset_t sbcset,
 104                                       const char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        RE_TRANSLATE_TYPE trans,
 109                                        const char *class_name,
 110                                        const char *extra,
 111                                        int non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 const char __re_error_msgid[] attribute_hidden =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 const size_t __re_error_msgid_idx[] attribute_hidden =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205
 206 #ifdef ZOS_USS
 207
 208 /* For ZOS USS we must define btowc */
 209
 210 wchar_t
 211 btowc (int c)
 212 {
 213    wchar_t wtmp[2];
 214    char tmp[2];
 215    mbstate_t mbs;
 216
 217    memset(& mbs, 0, sizeof(mbs));
 218    tmp[0] = c;
 219    tmp[1] = 0;
 220
 221    mbrtowc (wtmp, tmp, 1, & mbs);
 222    return wtmp[0];
 223 }
 224 #endif
 225
 226 /* re_compile_pattern is the GNU regular expression compiler: it
 227    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 228    Returns 0 if the pattern was valid, otherwise an error string.
 229
 230    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 231    are set in BUFP on entry.  */
 232
 233 const char *
 234 re_compile_pattern (pattern, length, bufp)
 235     const char *pattern;
 236     size_t length;
 237     struct re_pattern_buffer *bufp;
 238 {
 239   reg_errcode_t ret;
 240
 241   /* And GNU code determines whether or not to get register information
 242      by passing null for the REGS argument to re_match, etc., not by
 243      setting no_sub, unless RE_NO_SUB is set.  */
 244   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 245
 246   /* Match anchors at newline.  */
 247   bufp->newline_anchor = 1;
 248
 249   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 250
 251   if (!ret)
 252     return NULL;
 253   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 254 }
 255 #ifdef _LIBC
 256 weak_alias (__re_compile_pattern, re_compile_pattern)
 257 #endif
 258
 259 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 260    also be assigned to arbitrarily: each pattern buffer stores its own
 261    syntax, so it can be changed between regex compilations.  */
 262 /* This has no initializer because initialized variables in Emacs
 263    become read-only after dumping.  */
 264 reg_syntax_t re_syntax_options;
 265
 266
 267 /* Specify the precise syntax of regexps for compilation.  This provides
 268    for compatibility for various utilities which historically have
 269    different, incompatible syntaxes.
 270
 271    The argument SYNTAX is a bit mask comprised of the various bits
 272    defined in regex.h.  We return the old syntax.  */
 273
 274 reg_syntax_t
 275 re_set_syntax (syntax)
 276     reg_syntax_t syntax;
 277 {
 278   reg_syntax_t ret = re_syntax_options;
 279
 280   re_syntax_options = syntax;
 281   return ret;
 282 }
 283 #ifdef _LIBC
 284 weak_alias (__re_set_syntax, re_set_syntax)
 285 #endif
 286
 287 int
 288 re_compile_fastmap (bufp)
 289     struct re_pattern_buffer *bufp;
 290 {
 291   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 292   char *fastmap = bufp->fastmap;
 293
 294   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 295   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 296   if (dfa->init_state != dfa->init_state_word)
 297     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 298   if (dfa->init_state != dfa->init_state_nl)
 299     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 300   if (dfa->init_state != dfa->init_state_begbuf)
 301     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 302   bufp->fastmap_accurate = 1;
 303   return 0;
 304 }
 305 #ifdef _LIBC
 306 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 307 #endif
 308
 309 static inline void
 310 __attribute ((always_inline))
 311 re_set_fastmap (char *fastmap, int icase, int ch)
 312 {
 313   fastmap[ch] = 1;
 314   if (icase)
 315     fastmap[tolower (ch)] = 1;
 316 }
 317
 318 /* Helper function for re_compile_fastmap.
 319    Compile fastmap for the initial_state INIT_STATE.  */
 320
 321 static void
 322 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 323                          char *fastmap)
 324 {
 325   volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 326   int node_cnt;
 327   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 328   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 329     {
 330       int node = init_state->nodes.elems[node_cnt];
 331       re_token_type_t type = dfa->nodes[node].type;
 332
 333       if (type == CHARACTER)
 334         {
 335           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 336 #ifdef RE_ENABLE_I18N
 337           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 338             {
 339               unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
 340               wchar_t wc;
 341               mbstate_t state;
 342
 343               p = buf;
 344               *p++ = dfa->nodes[node].opr.c;
 345               while (++node < dfa->nodes_len
 346                      && dfa->nodes[node].type == CHARACTER
 347                      && dfa->nodes[node].mb_partial)
 348                 *p++ = dfa->nodes[node].opr.c;
 349               memset (&state, '\0', sizeof (state));
 350               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 351                              &state) == p - buf
 352                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 353                       != (size_t) -1))
 354                 re_set_fastmap (fastmap, 0, buf[0]);
 355               re_free (buf);
 356             }
 357 #endif
 358         }
 359       else if (type == SIMPLE_BRACKET)
 360         {
 361           int i, ch;
 362           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 363             {
 364               int j;
 365               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 366               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 367                 if (w & ((bitset_word_t) 1 << j))
 368                   re_set_fastmap (fastmap, icase, ch);
 369             }
 370         }
 371 #ifdef RE_ENABLE_I18N
 372       else if (type == COMPLEX_BRACKET)
 373         {
 374           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 375           int i;
 376
 377 # ifdef _LIBC
 378           /* See if we have to try all bytes which start multiple collation
 379              elements.
 380              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 381                   collation element, and don't catch 'b' since 'b' is
 382                   the only collation element which starts from 'b' (and
 383                   it is caught by SIMPLE_BRACKET).  */
 384               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 385                   && (cset->ncoll_syms || cset->nranges))
 386                 {
 387                   const int32_t *table = (const int32_t *)
 388                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 389                   for (i = 0; i < SBC_MAX; ++i)
 390                     if (table[i] < 0)
 391                       re_set_fastmap (fastmap, icase, i);
 392                 }
 393 # endif /* _LIBC */
 394
 395           /* See if we have to start the match at all multibyte characters,
 396              i.e. where we would not find an invalid sequence.  This only
 397              applies to multibyte character sets; for single byte character
 398              sets, the SIMPLE_BRACKET again suffices.  */
 399           if (dfa->mb_cur_max > 1
 400               && (cset->nchar_classes || cset->non_match || cset->nranges
 401 # ifdef _LIBC
 402                   || cset->nequiv_classes
 403 # endif /* _LIBC */
 404                  ))
 405             {
 406               unsigned char c = 0;
 407               do
 408                 {
 409                   mbstate_t mbs;
 410                   memset (&mbs, 0, sizeof (mbs));
 411                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 412                     re_set_fastmap (fastmap, false, (int) c);
 413                 }
 414               while (++c != 0);
 415             }
 416
 417           else
 418             {
 419               /* ... Else catch all bytes which can start the mbchars.  */
 420               for (i = 0; i < cset->nmbchars; ++i)
 421                 {
 422                   char buf[256];
 423                   mbstate_t state;
 424                   memset (&state, '\0', sizeof (state));
 425                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 426                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 427                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 428                     {
 429                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 430                           != (size_t) -1)
 431                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 432                     }
 433                 }
 434             }
 435         }
 436 #endif /* RE_ENABLE_I18N */
 437       else if (type == OP_PERIOD
 438 #ifdef RE_ENABLE_I18N
 439                || type == OP_UTF8_PERIOD
 440 #endif /* RE_ENABLE_I18N */
 441                || type == END_OF_RE)
 442         {
 443           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 444           if (type == END_OF_RE)
 445             bufp->can_be_null = 1;
 446           return;
 447         }
 448     }
 449 }
 450 \f
 451 /* Entry point for POSIX code.  */
 452 /* regcomp takes a regular expression as a string and compiles it.
 453
 454    PREG is a regex_t *.  We do not expect any fields to be initialized,
 455    since POSIX says we shouldn't.  Thus, we set
 456
 457      `buffer' to the compiled pattern;
 458      `used' to the length of the compiled pattern;
 459      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 460        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 461        RE_SYNTAX_POSIX_BASIC;
 462      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 463      `fastmap' to an allocated space for the fastmap;
 464      `fastmap_accurate' to zero;
 465      `re_nsub' to the number of subexpressions in PATTERN.
 466
 467    PATTERN is the address of the pattern string.
 468
 469    CFLAGS is a series of bits which affect compilation.
 470
 471      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 472      use POSIX basic syntax.
 473
 474      If REG_NEWLINE is set, then . and [^...] don't match newline.
 475      Also, regexec will try a match beginning after every newline.
 476
 477      If REG_ICASE is set, then we considers upper- and lowercase
 478      versions of letters to be equivalent when matching.
 479
 480      If REG_NOSUB is set, then when PREG is passed to regexec, that
 481      routine will report only success or failure, and nothing about the
 482      registers.
 483
 484    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 485    the return codes and their meanings.)  */
 486
 487 int
 488 regcomp (preg, pattern, cflags)
 489     regex_t *__restrict preg;
 490     const char *__restrict pattern;
 491     int cflags;
 492 {
 493   reg_errcode_t ret;
 494   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 495                          : RE_SYNTAX_POSIX_BASIC);
 496
 497   preg->buffer = NULL;
 498   preg->allocated = 0;
 499   preg->used = 0;
 500
 501   /* Try to allocate space for the fastmap.  */
 502   preg->fastmap = re_malloc (char, SBC_MAX);
 503   if (BE (preg->fastmap == NULL, 0))
 504     return REG_ESPACE;
 505
 506   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 507
 508   /* If REG_NEWLINE is set, newlines are treated differently.  */
 509   if (cflags & REG_NEWLINE)
 510     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 511       syntax &= ~RE_DOT_NEWLINE;
 512       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 513       /* It also changes the matching behavior.  */
 514       preg->newline_anchor = 1;
 515     }
 516   else
 517     preg->newline_anchor = 0;
 518   preg->no_sub = !!(cflags & REG_NOSUB);
 519   preg->translate = NULL;
 520
 521   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 522
 523   /* POSIX doesn't distinguish between an unmatched open-group and an
 524      unmatched close-group: both are REG_EPAREN.  */
 525   if (ret == REG_ERPAREN)
 526     ret = REG_EPAREN;
 527
 528   /* We have already checked preg->fastmap != NULL.  */
 529   if (BE (ret == REG_NOERROR, 1))
 530     /* Compute the fastmap now, since regexec cannot modify the pattern
 531        buffer.  This function never fails in this implementation.  */
 532     (void) re_compile_fastmap (preg);
 533   else
 534     {
 535       /* Some error occurred while compiling the expression.  */
 536       re_free (preg->fastmap);
 537       preg->fastmap = NULL;
 538     }
 539
 540   return (int) ret;
 541 }
 542 #ifdef _LIBC
 543 weak_alias (__regcomp, regcomp)
 544 #endif
 545
 546 /* Returns a message corresponding to an error code, ERRCODE, returned
 547    from either regcomp or regexec.   We don't use PREG here.  */
 548
 549 size_t
 550 regerror (errcode, preg, errbuf, errbuf_size)
 551     int errcode;
 552     const regex_t *__restrict preg;
 553     char *__restrict errbuf;
 554     size_t errbuf_size;
 555 {
 556   const char *msg;
 557   size_t msg_size;
 558
 559   if (BE (errcode < 0
 560           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 561                                / sizeof (__re_error_msgid_idx[0])), 0))
 562     /* Only error codes returned by the rest of the code should be passed
 563        to this routine.  If we are given anything else, or if other regex
 564        code generates an invalid error code, then the program has a bug.
 565        Dump core so we can fix it.  */
 566     abort ();
 567
 568   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 569
 570   msg_size = strlen (msg) + 1; /* Includes the null.  */
 571
 572   if (BE (errbuf_size != 0, 1))
 573     {
 574       if (BE (msg_size > errbuf_size, 0))
 575         {
 576           memcpy (errbuf, msg, errbuf_size - 1);
 577           errbuf[errbuf_size - 1] = 0;
 578         }
 579       else
 580         memcpy (errbuf, msg, msg_size);
 581     }
 582
 583   return msg_size;
 584 }
 585 #ifdef _LIBC
 586 weak_alias (__regerror, regerror)
 587 #endif
 588
 589
 590 #ifdef RE_ENABLE_I18N
 591 /* This static array is used for the map to single-byte characters when
 592    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 593    it the same all the time.  UTF-8 is the preferred encoding so this is
 594    a worthwhile optimization.  */
 595 #if __GNUC__ >= 3
 596 static const bitset_t utf8_sb_map = {
 597   /* Set the first 128 bits.  */
 598   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 599 };
 600 #else /* ! (__GNUC__ >= 3) */
 601 static bitset_t utf8_sb_map;
 602 #endif /* __GNUC__ >= 3 */
 603 #endif /* RE_ENABLE_I18N */
 604
 605
 606 static void
 607 free_dfa_content (re_dfa_t *dfa)
 608 {
 609   int i, j;
 610
 611   if (dfa->nodes)
 612     for (i = 0; i < dfa->nodes_len; ++i)
 613       free_token (dfa->nodes + i);
 614   re_free (dfa->nexts);
 615   for (i = 0; i < dfa->nodes_len; ++i)
 616     {
 617       if (dfa->eclosures != NULL)
 618         re_node_set_free (dfa->eclosures + i);
 619       if (dfa->inveclosures != NULL)
 620         re_node_set_free (dfa->inveclosures + i);
 621       if (dfa->edests != NULL)
 622         re_node_set_free (dfa->edests + i);
 623     }
 624   re_free (dfa->edests);
 625   re_free (dfa->eclosures);
 626   re_free (dfa->inveclosures);
 627   re_free (dfa->nodes);
 628
 629   if (dfa->state_table)
 630     for (i = 0; i <= dfa->state_hash_mask; ++i)
 631       {
 632         struct re_state_table_entry *entry = dfa->state_table + i;
 633         for (j = 0; j < entry->num; ++j)
 634           {
 635             re_dfastate_t *state = entry->array[j];
 636             free_state (state);
 637           }
 638         re_free (entry->array);
 639       }
 640   re_free (dfa->state_table);
 641 #ifdef RE_ENABLE_I18N
 642   if (dfa->sb_char != utf8_sb_map)
 643     re_free (dfa->sb_char);
 644 #endif
 645   re_free (dfa->subexp_map);
 646 #ifdef DEBUG
 647   re_free (dfa->re_str);
 648 #endif
 649
 650   re_free (dfa);
 651 }
 652
 653
 654 /* Free dynamically allocated space used by PREG.  */
 655
 656 void
 657 regfree (preg)
 658     regex_t *preg;
 659 {
 660   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 661   if (BE (dfa != NULL, 1))
 662     free_dfa_content (dfa);
 663   preg->buffer = NULL;
 664   preg->allocated = 0;
 665
 666   re_free (preg->fastmap);
 667   preg->fastmap = NULL;
 668
 669   re_free (preg->translate);
 670   preg->translate = NULL;
 671 }
 672 #ifdef _LIBC
 673 weak_alias (__regfree, regfree)
 674 #endif
 675 \f
 676 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 677    them unless specifically requested.  */
 678
 679 #if defined _REGEX_RE_COMP || defined _LIBC
 680
 681 /* BSD has one and only one pattern buffer.  */
 682 static struct re_pattern_buffer re_comp_buf;
 683
 684 char *
 685 # ifdef _LIBC
 686 /* Make these definitions weak in libc, so POSIX programs can redefine
 687    these names if they don't use our functions, and still use
 688    regcomp/regexec above without link errors.  */
 689 weak_function
 690 # endif
 691 re_comp (s)
 692      const char *s;
 693 {
 694   reg_errcode_t ret;
 695   char *fastmap;
 696
 697   if (!s)
 698     {
 699       if (!re_comp_buf.buffer)
 700         return gettext ("No previous regular expression");
 701       return 0;
 702     }
 703
 704   if (re_comp_buf.buffer)
 705     {
 706       fastmap = re_comp_buf.fastmap;
 707       re_comp_buf.fastmap = NULL;
 708       __regfree (&re_comp_buf);
 709       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 710       re_comp_buf.fastmap = fastmap;
 711     }
 712
 713   if (re_comp_buf.fastmap == NULL)
 714     {
 715       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 716       if (re_comp_buf.fastmap == NULL)
 717         return (char *) gettext (__re_error_msgid
 718                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 719     }
 720
 721   /* Since `re_exec' always passes NULL for the `regs' argument, we
 722      don't need to initialize the pattern buffer fields which affect it.  */
 723
 724   /* Match anchors at newlines.  */
 725   re_comp_buf.newline_anchor = 1;
 726
 727   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 728
 729   if (!ret)
 730     return NULL;
 731
 732   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 733   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 734 }
 735
 736 #ifdef _LIBC
 737 libc_freeres_fn (free_mem)
 738 {
 739   __regfree (&re_comp_buf);
 740 }
 741 #endif
 742
 743 #endif /* _REGEX_RE_COMP */
 744 \f
 745 /* Internal entry point.
 746    Compile the regular expression PATTERN, whose length is LENGTH.
 747    SYNTAX indicate regular expression's syntax.  */
 748
 749 static reg_errcode_t
 750 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 751                      reg_syntax_t syntax)
 752 {
 753   reg_errcode_t err = REG_NOERROR;
 754   re_dfa_t *dfa;
 755   re_string_t regexp;
 756
 757   /* Initialize the pattern buffer.  */
 758   preg->fastmap_accurate = 0;
 759   preg->syntax = syntax;
 760   preg->not_bol = preg->not_eol = 0;
 761   preg->used = 0;
 762   preg->re_nsub = 0;
 763   preg->can_be_null = 0;
 764   preg->regs_allocated = REGS_UNALLOCATED;
 765
 766   /* Initialize the dfa.  */
 767   dfa = (re_dfa_t *) preg->buffer;
 768   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 769     {
 770       /* If zero allocated, but buffer is non-null, try to realloc
 771          enough space.  This loses if buffer's address is bogus, but
 772          that is the user's responsibility.  If ->buffer is NULL this
 773          is a simple allocation.  */
 774       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 775       if (dfa == NULL)
 776         return REG_ESPACE;
 777       preg->allocated = sizeof (re_dfa_t);
 778       preg->buffer = (unsigned char *) dfa;
 779     }
 780   preg->used = sizeof (re_dfa_t);
 781
 782   err = init_dfa (dfa, length);
 783   if (BE (err != REG_NOERROR, 0))
 784     {
 785       free_dfa_content (dfa);
 786       preg->buffer = NULL;
 787       preg->allocated = 0;
 788       return err;
 789     }
 790 #ifdef DEBUG
 791   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 792   dfa->re_str = re_malloc (char, length + 1);
 793   strncpy (dfa->re_str, pattern, length + 1);
 794 #endif
 795
 796   __libc_lock_init (dfa->lock);
 797
 798   err = re_string_construct (&regexp, pattern, length, preg->translate,
 799                              syntax & RE_ICASE, dfa);
 800   if (BE (err != REG_NOERROR, 0))
 801     {
 802     re_compile_internal_free_return:
 803       free_workarea_compile (preg);
 804       re_string_destruct (&regexp);
 805       free_dfa_content (dfa);
 806       preg->buffer = NULL;
 807       preg->allocated = 0;
 808       return err;
 809     }
 810
 811   /* Parse the regular expression, and build a structure tree.  */
 812   preg->re_nsub = 0;
 813   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 814   if (BE (dfa->str_tree == NULL, 0))
 815     goto re_compile_internal_free_return;
 816
 817   /* Analyze the tree and create the nfa.  */
 818   err = analyze (preg);
 819   if (BE (err != REG_NOERROR, 0))
 820     goto re_compile_internal_free_return;
 821
 822 #ifdef RE_ENABLE_I18N
 823   /* If possible, do searching in single byte encoding to speed things up.  */
 824   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 825     optimize_utf8 (dfa);
 826 #endif
 827
 828   /* Then create the initial state of the dfa.  */
 829   err = create_initial_state (dfa);
 830
 831   /* Release work areas.  */
 832   free_workarea_compile (preg);
 833   re_string_destruct (&regexp);
 834
 835   if (BE (err != REG_NOERROR, 0))
 836     {
 837       free_dfa_content (dfa);
 838       preg->buffer = NULL;
 839       preg->allocated = 0;
 840     }
 841
 842   return err;
 843 }
 844
 845 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 846    as the initial length of some arrays.  */
 847
 848 static reg_errcode_t
 849 init_dfa (re_dfa_t *dfa, size_t pat_len)
 850 {
 851   unsigned int table_size;
 852 #ifndef _LIBC
 853   char *codeset_name;
 854 #endif
 855
 856   memset (dfa, '\0', sizeof (re_dfa_t));
 857
 858   /* Force allocation of str_tree_storage the first time.  */
 859   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 860
 861   /* Avoid overflows.  */
 862   if (pat_len == SIZE_MAX)
 863     return REG_ESPACE;
 864
 865   dfa->nodes_alloc = pat_len + 1;
 866   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 867
 868   /*  table_size = 2 ^ ceil(log pat_len) */
 869   for (table_size = 1; ; table_size <<= 1)
 870     if (table_size > pat_len)
 871       break;
 872
 873   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 874   dfa->state_hash_mask = table_size - 1;
 875
 876   dfa->mb_cur_max = MB_CUR_MAX;
 877 #ifdef _LIBC
 878   if (dfa->mb_cur_max == 6
 879       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 880     dfa->is_utf8 = 1;
 881   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 882                        != 0);
 883 #else
 884 # ifdef HAVE_LANGINFO_CODESET
 885   codeset_name = nl_langinfo (CODESET);
 886 # else
 887   codeset_name = getenv ("LC_ALL");
 888   if (codeset_name == NULL || codeset_name[0] == '\0')
 889     codeset_name = getenv ("LC_CTYPE");
 890   if (codeset_name == NULL || codeset_name[0] == '\0')
 891     codeset_name = getenv ("LANG");
 892   if (codeset_name == NULL)
 893     codeset_name = "";
 894   else if (strchr (codeset_name, '.') !=  NULL)
 895     codeset_name = strchr (codeset_name, '.') + 1;
 896 # endif
 897
 898   /* strcasecmp isn't a standard interface. brute force check */
 899 #if 0
 900   if (strcasecmp (codeset_name, "UTF-8") == 0
 901       || strcasecmp (codeset_name, "UTF8") == 0)
 902     dfa->is_utf8 = 1;
 903 #else
 904   if (   (codeset_name[0] == 'U' || codeset_name[0] == 'u')
 905       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 906       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 907       && (codeset_name[3] == '-'
 908           ? codeset_name[4] == '8' && codeset_name[5] == '\0'
 909           : codeset_name[3] == '8' && codeset_name[4] == '\0'))
 910     dfa->is_utf8 = 1;
 911 #endif
 912
 913   /* We check exhaustively in the loop below if this charset is a
 914      superset of ASCII.  */
 915   dfa->map_notascii = 0;
 916 #endif
 917
 918 #ifdef RE_ENABLE_I18N
 919   if (dfa->mb_cur_max > 1)
 920     {
 921       if (dfa->is_utf8)
 922         {
 923 #if !defined(__GNUC__) || __GNUC__ < 3
 924           static short utf8_sb_map_inited = 0;
 925
 926           if (! utf8_sb_map_inited)
 927             {
 928                 int i;
 929
 930                 utf8_sb_map_inited = 0;
 931                 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
 932                   utf8_sb_map[i] = BITSET_WORD_MAX;
 933             }
 934 #endif
 935           dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 936         }
 937       else
 938         {
 939           int i, j, ch;
 940
 941           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 942           if (BE (dfa->sb_char == NULL, 0))
 943             return REG_ESPACE;
 944
 945           /* Set the bits corresponding to single byte chars.  */
 946           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 947             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 948               {
 949                 wint_t wch = __btowc (ch);
 950                 if (wch != WEOF)
 951                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 952 # ifndef _LIBC
 953                 if (isascii (ch) && wch != ch)
 954                   dfa->map_notascii = 1;
 955 # endif
 956               }
 957         }
 958     }
 959 #endif
 960
 961   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 962     return REG_ESPACE;
 963   return REG_NOERROR;
 964 }
 965
 966 /* Initialize WORD_CHAR table, which indicate which character is
 967    "word".  In this case "word" means that it is the word construction
 968    character used by some operators like "\<", "\>", etc.  */
 969
 970 static void
 971 internal_function
 972 init_word_char (re_dfa_t *dfa)
 973 {
 974   int i, j, ch;
 975   dfa->word_ops_used = 1;
 976 #ifndef GAWK
 977   if (BE (dfa->map_notascii == 0, 1))
 978     {
 979       if (sizeof (dfa->word_char[0]) == 8)
 980         {
 981           dfa->word_char[0] = UINT64_C (0x03ff000000000000);
 982           dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe);
 983           i = 2;
 984         }
 985       else if (sizeof (dfa->word_char[0]) == 4)
 986         {
 987           dfa->word_char[0] = UINT32_C (0x00000000);
 988           dfa->word_char[1] = UINT32_C (0x03ff0000);
 989           dfa->word_char[2] = UINT32_C (0x87fffffe);
 990           dfa->word_char[3] = UINT32_C (0x07fffffe);
 991           i = 4;
 992         }
 993       else
 994         abort ();
 995       ch = 128;
 996
 997       if (BE (dfa->is_utf8, 1))
 998         {
 999           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
1000           return;
1001         }
1002     }
1003 #endif
1004
1005   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
1006     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
1007       if (isalnum (ch) || ch == '_')
1008         dfa->word_char[i] |= (bitset_word_t) 1 << j;
1009 }
1010
1011 /* Free the work area which are only used while compiling.  */
1012
1013 static void
1014 free_workarea_compile (regex_t *preg)
1015 {
1016   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1017   bin_tree_storage_t *storage, *next;
1018   for (storage = dfa->str_tree_storage; storage; storage = next)
1019     {
1020       next = storage->next;
1021       re_free (storage);
1022     }
1023   dfa->str_tree_storage = NULL;
1024   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1025   dfa->str_tree = NULL;
1026   re_free (dfa->org_indices);
1027   dfa->org_indices = NULL;
1028 }
1029
1030 /* Create initial states for all contexts.  */
1031
1032 static reg_errcode_t
1033 create_initial_state (re_dfa_t *dfa)
1034 {
1035   int first, i;
1036   reg_errcode_t err;
1037   re_node_set init_nodes;
1038
1039   /* Initial states have the epsilon closure of the node which is
1040      the first node of the regular expression.  */
1041   first = dfa->str_tree->first->node_idx;
1042   dfa->init_node = first;
1043   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1044   if (BE (err != REG_NOERROR, 0))
1045     return err;
1046
1047   /* The back-references which are in initial states can epsilon transit,
1048      since in this case all of the subexpressions can be null.
1049      Then we add epsilon closures of the nodes which are the next nodes of
1050      the back-references.  */
1051   if (dfa->nbackref > 0)
1052     for (i = 0; i < init_nodes.nelem; ++i)
1053       {
1054         int node_idx = init_nodes.elems[i];
1055         re_token_type_t type = dfa->nodes[node_idx].type;
1056
1057         int clexp_idx;
1058         if (type != OP_BACK_REF)
1059           continue;
1060         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1061           {
1062             re_token_t *clexp_node;
1063             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1064             if (clexp_node->type == OP_CLOSE_SUBEXP
1065                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1066               break;
1067           }
1068         if (clexp_idx == init_nodes.nelem)
1069           continue;
1070
1071         if (type == OP_BACK_REF)
1072           {
1073             int dest_idx = dfa->edests[node_idx].elems[0];
1074             if (!re_node_set_contains (&init_nodes, dest_idx))
1075               {
1076                 reg_errcode_t err = re_node_set_merge (&init_nodes,
1077                                                        dfa->eclosures
1078                                                        + dest_idx);
1079                 if (err != REG_NOERROR)
1080                   return err;
1081                 i = 0;
1082               }
1083           }
1084       }
1085
1086   /* It must be the first time to invoke acquire_state.  */
1087   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1088   /* We don't check ERR here, since the initial state must not be NULL.  */
1089   if (BE (dfa->init_state == NULL, 0))
1090     return err;
1091   if (dfa->init_state->has_constraint)
1092     {
1093       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1094                                                        CONTEXT_WORD);
1095       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1096                                                      CONTEXT_NEWLINE);
1097       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1098                                                          &init_nodes,
1099                                                          CONTEXT_NEWLINE
1100                                                          | CONTEXT_BEGBUF);
1101       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1102               || dfa->init_state_begbuf == NULL, 0))
1103         return err;
1104     }
1105   else
1106     dfa->init_state_word = dfa->init_state_nl
1107       = dfa->init_state_begbuf = dfa->init_state;
1108
1109   re_node_set_free (&init_nodes);
1110   return REG_NOERROR;
1111 }
1112 \f
1113 #ifdef RE_ENABLE_I18N
1114 /* If it is possible to do searching in single byte encoding instead of UTF-8
1115    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1116    DFA nodes where needed.  */
1117
1118 static void
1119 optimize_utf8 (re_dfa_t *dfa)
1120 {
1121   int node, i, mb_chars = 0, has_period = 0;
1122
1123   for (node = 0; node < dfa->nodes_len; ++node)
1124     switch (dfa->nodes[node].type)
1125       {
1126       case CHARACTER:
1127         if (dfa->nodes[node].opr.c >= 0x80)
1128           mb_chars = 1;
1129         break;
1130       case ANCHOR:
1131         switch (dfa->nodes[node].opr.ctx_type)
1132           {
1133           case LINE_FIRST:
1134           case LINE_LAST:
1135           case BUF_FIRST:
1136           case BUF_LAST:
1137             break;
1138           default:
1139             /* Word anchors etc. cannot be handled.  It's okay to test
1140                opr.ctx_type since constraints (for all DFA nodes) are
1141                created by ORing one or more opr.ctx_type values.  */
1142             return;
1143           }
1144         break;
1145       case OP_PERIOD:
1146         has_period = 1;
1147         break;
1148       case OP_BACK_REF:
1149       case OP_ALT:
1150       case END_OF_RE:
1151       case OP_DUP_ASTERISK:
1152       case OP_OPEN_SUBEXP:
1153       case OP_CLOSE_SUBEXP:
1154         break;
1155       case COMPLEX_BRACKET:
1156         return;
1157       case SIMPLE_BRACKET:
1158         /* Just double check.  The non-ASCII range starts at 0x80.  */
1159         assert (0x80 % BITSET_WORD_BITS == 0);
1160         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1161           if (dfa->nodes[node].opr.sbcset[i])
1162             return;
1163         break;
1164       default:
1165         abort ();
1166       }
1167
1168   if (mb_chars || has_period)
1169     for (node = 0; node < dfa->nodes_len; ++node)
1170       {
1171         if (dfa->nodes[node].type == CHARACTER
1172             && dfa->nodes[node].opr.c >= 0x80)
1173           dfa->nodes[node].mb_partial = 0;
1174         else if (dfa->nodes[node].type == OP_PERIOD)
1175           dfa->nodes[node].type = OP_UTF8_PERIOD;
1176       }
1177
1178   /* The search can be in single byte locale.  */
1179   dfa->mb_cur_max = 1;
1180   dfa->is_utf8 = 0;
1181   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1182 }
1183 #endif
1184 \f
1185 /* Analyze the structure tree, and calculate "first", "next", "edest",
1186    "eclosure", and "inveclosure".  */
1187
1188 static reg_errcode_t
1189 analyze (regex_t *preg)
1190 {
1191   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1192   reg_errcode_t ret;
1193
1194   /* Allocate arrays.  */
1195   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1196   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1197   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1198   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1199   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1200           || dfa->eclosures == NULL, 0))
1201     return REG_ESPACE;
1202
1203   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1204   if (dfa->subexp_map != NULL)
1205     {
1206       int i;
1207       for (i = 0; i < preg->re_nsub; i++)
1208         dfa->subexp_map[i] = i;
1209       preorder (dfa->str_tree, optimize_subexps, dfa);
1210       for (i = 0; i < preg->re_nsub; i++)
1211         if (dfa->subexp_map[i] != i)
1212           break;
1213       if (i == preg->re_nsub)
1214         {
1215           free (dfa->subexp_map);
1216           dfa->subexp_map = NULL;
1217         }
1218     }
1219
1220   ret = postorder (dfa->str_tree, lower_subexps, preg);
1221   if (BE (ret != REG_NOERROR, 0))
1222     return ret;
1223   ret = postorder (dfa->str_tree, calc_first, dfa);
1224   if (BE (ret != REG_NOERROR, 0))
1225     return ret;
1226   preorder (dfa->str_tree, calc_next, dfa);
1227   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1228   if (BE (ret != REG_NOERROR, 0))
1229     return ret;
1230   ret = calc_eclosure (dfa);
1231   if (BE (ret != REG_NOERROR, 0))
1232     return ret;
1233
1234   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1235      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1236   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1237       || dfa->nbackref)
1238     {
1239       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1240       if (BE (dfa->inveclosures == NULL, 0))
1241         return REG_ESPACE;
1242       ret = calc_inveclosure (dfa);
1243     }
1244
1245   return ret;
1246 }
1247
1248 /* Our parse trees are very unbalanced, so we cannot use a stack to
1249    implement parse tree visits.  Instead, we use parent pointers and
1250    some hairy code in these two functions.  */
1251 static reg_errcode_t
1252 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1253            void *extra)
1254 {
1255   bin_tree_t *node, *prev;
1256
1257   for (node = root; ; )
1258     {
1259       /* Descend down the tree, preferably to the left (or to the right
1260          if that's the only child).  */
1261       while (node->left || node->right)
1262         if (node->left)
1263           node = node->left;
1264         else
1265           node = node->right;
1266
1267       do
1268         {
1269           reg_errcode_t err = fn (extra, node);
1270           if (BE (err != REG_NOERROR, 0))
1271             return err;
1272           if (node->parent == NULL)
1273             return REG_NOERROR;
1274           prev = node;
1275           node = node->parent;
1276         }
1277       /* Go up while we have a node that is reached from the right.  */
1278       while (node->right == prev || node->right == NULL);
1279       node = node->right;
1280     }
1281 }
1282
1283 static reg_errcode_t
1284 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1285           void *extra)
1286 {
1287   bin_tree_t *node;
1288
1289   for (node = root; ; )
1290     {
1291       reg_errcode_t err = fn (extra, node);
1292       if (BE (err != REG_NOERROR, 0))
1293         return err;
1294
1295       /* Go to the left node, or up and to the right.  */
1296       if (node->left)
1297         node = node->left;
1298       else
1299         {
1300           bin_tree_t *prev = NULL;
1301           while (node->right == prev || node->right == NULL)
1302             {
1303               prev = node;
1304               node = node->parent;
1305               if (!node)
1306                 return REG_NOERROR;
1307             }
1308           node = node->right;
1309         }
1310     }
1311 }
1312
1313 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1314    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1315    backreferences as well.  Requires a preorder visit.  */
1316 static reg_errcode_t
1317 optimize_subexps (void *extra, bin_tree_t *node)
1318 {
1319   re_dfa_t *dfa = (re_dfa_t *) extra;
1320
1321   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1322     {
1323       int idx = node->token.opr.idx;
1324       node->token.opr.idx = dfa->subexp_map[idx];
1325       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1326     }
1327
1328   else if (node->token.type == SUBEXP
1329            && node->left && node->left->token.type == SUBEXP)
1330     {
1331       int other_idx = node->left->token.opr.idx;
1332
1333       node->left = node->left->left;
1334       if (node->left)
1335         node->left->parent = node;
1336
1337       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1338       if (other_idx < BITSET_WORD_BITS)
1339           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1340     }
1341
1342   return REG_NOERROR;
1343 }
1344
1345 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1346    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1347 static reg_errcode_t
1348 lower_subexps (void *extra, bin_tree_t *node)
1349 {
1350   regex_t *preg = (regex_t *) extra;
1351   reg_errcode_t err = REG_NOERROR;
1352
1353   if (node->left && node->left->token.type == SUBEXP)
1354     {
1355       node->left = lower_subexp (&err, preg, node->left);
1356       if (node->left)
1357         node->left->parent = node;
1358     }
1359   if (node->right && node->right->token.type == SUBEXP)
1360     {
1361       node->right = lower_subexp (&err, preg, node->right);
1362       if (node->right)
1363         node->right->parent = node;
1364     }
1365
1366   return err;
1367 }
1368
1369 static bin_tree_t *
1370 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1371 {
1372   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1373   bin_tree_t *body = node->left;
1374   bin_tree_t *op, *cls, *tree1, *tree;
1375
1376   if (preg->no_sub
1377       /* We do not optimize empty subexpressions, because otherwise we may
1378          have bad CONCAT nodes with NULL children.  This is obviously not
1379          very common, so we do not lose much.  An example that triggers
1380          this case is the sed "script" /\(\)/x.  */
1381       && node->left != NULL
1382       && (node->token.opr.idx >= BITSET_WORD_BITS
1383           || !(dfa->used_bkref_map
1384                & ((bitset_word_t) 1 << node->token.opr.idx))))
1385     return node->left;
1386
1387   /* Convert the SUBEXP node to the concatenation of an
1388      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1389   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1390   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1391   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1392   tree = create_tree (dfa, op, tree1, CONCAT);
1393   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1394     {
1395       *err = REG_ESPACE;
1396       return NULL;
1397     }
1398
1399   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1400   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1401   return tree;
1402 }
1403
1404 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1405    nodes.  Requires a postorder visit.  */
1406 static reg_errcode_t
1407 calc_first (void *extra, bin_tree_t *node)
1408 {
1409   re_dfa_t *dfa = (re_dfa_t *) extra;
1410   if (node->token.type == CONCAT)
1411     {
1412       node->first = node->left->first;
1413       node->node_idx = node->left->node_idx;
1414     }
1415   else
1416     {
1417       node->first = node;
1418       node->node_idx = re_dfa_add_node (dfa, node->token);
1419       if (BE (node->node_idx == -1, 0))
1420         return REG_ESPACE;
1421       if (node->token.type == ANCHOR)
1422         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1423     }
1424   return REG_NOERROR;
1425 }
1426
1427 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1428 static reg_errcode_t
1429 calc_next (void *extra, bin_tree_t *node)
1430 {
1431   switch (node->token.type)
1432     {
1433     case OP_DUP_ASTERISK:
1434       node->left->next = node;
1435       break;
1436     case CONCAT:
1437       node->left->next = node->right->first;
1438       node->right->next = node->next;
1439       break;
1440     default:
1441       if (node->left)
1442         node->left->next = node->next;
1443       if (node->right)
1444         node->right->next = node->next;
1445       break;
1446     }
1447   return REG_NOERROR;
1448 }
1449
1450 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1451 static reg_errcode_t
1452 link_nfa_nodes (void *extra, bin_tree_t *node)
1453 {
1454   re_dfa_t *dfa = (re_dfa_t *) extra;
1455   int idx = node->node_idx;
1456   reg_errcode_t err = REG_NOERROR;
1457
1458   switch (node->token.type)
1459     {
1460     case CONCAT:
1461       break;
1462
1463     case END_OF_RE:
1464       assert (node->next == NULL);
1465       break;
1466
1467     case OP_DUP_ASTERISK:
1468     case OP_ALT:
1469       {
1470         int left, right;
1471         dfa->has_plural_match = 1;
1472         if (node->left != NULL)
1473           left = node->left->first->node_idx;
1474         else
1475           left = node->next->node_idx;
1476         if (node->right != NULL)
1477           right = node->right->first->node_idx;
1478         else
1479           right = node->next->node_idx;
1480         assert (left > -1);
1481         assert (right > -1);
1482         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1483       }
1484       break;
1485
1486     case ANCHOR:
1487     case OP_OPEN_SUBEXP:
1488     case OP_CLOSE_SUBEXP:
1489       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1490       break;
1491
1492     case OP_BACK_REF:
1493       dfa->nexts[idx] = node->next->node_idx;
1494       if (node->token.type == OP_BACK_REF)
1495         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1496       break;
1497
1498     default:
1499       assert (!IS_EPSILON_NODE (node->token.type));
1500       dfa->nexts[idx] = node->next->node_idx;
1501       break;
1502     }
1503
1504   return err;
1505 }
1506
1507 /* Duplicate the epsilon closure of the node ROOT_NODE.
1508    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1509    to their own constraint.  */
1510
1511 static reg_errcode_t
1512 internal_function
1513 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1514                         int root_node, unsigned int init_constraint)
1515 {
1516   int org_node, clone_node, ret;
1517   unsigned int constraint = init_constraint;
1518   for (org_node = top_org_node, clone_node = top_clone_node;;)
1519     {
1520       int org_dest, clone_dest;
1521       if (dfa->nodes[org_node].type == OP_BACK_REF)
1522         {
1523           /* If the back reference epsilon-transit, its destination must
1524              also have the constraint.  Then duplicate the epsilon closure
1525              of the destination of the back reference, and store it in
1526              edests of the back reference.  */
1527           org_dest = dfa->nexts[org_node];
1528           re_node_set_empty (dfa->edests + clone_node);
1529           clone_dest = duplicate_node (dfa, org_dest, constraint);
1530           if (BE (clone_dest == -1, 0))
1531             return REG_ESPACE;
1532           dfa->nexts[clone_node] = dfa->nexts[org_node];
1533           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1534           if (BE (ret < 0, 0))
1535             return REG_ESPACE;
1536         }
1537       else if (dfa->edests[org_node].nelem == 0)
1538         {
1539           /* In case of the node can't epsilon-transit, don't duplicate the
1540              destination and store the original destination as the
1541              destination of the node.  */
1542           dfa->nexts[clone_node] = dfa->nexts[org_node];
1543           break;
1544         }
1545       else if (dfa->edests[org_node].nelem == 1)
1546         {
1547           /* In case of the node can epsilon-transit, and it has only one
1548              destination.  */
1549           org_dest = dfa->edests[org_node].elems[0];
1550           re_node_set_empty (dfa->edests + clone_node);
1551           /* If the node is root_node itself, it means the epsilon clsoure
1552              has a loop.   Then tie it to the destination of the root_node.  */
1553           if (org_node == root_node && clone_node != org_node)
1554             {
1555               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1556               if (BE (ret < 0, 0))
1557                 return REG_ESPACE;
1558               break;
1559             }
1560           /* In case of the node has another constraint, add it.  */
1561           constraint |= dfa->nodes[org_node].constraint;
1562           clone_dest = duplicate_node (dfa, org_dest, constraint);
1563           if (BE (clone_dest == -1, 0))
1564             return REG_ESPACE;
1565           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1566           if (BE (ret < 0, 0))
1567             return REG_ESPACE;
1568         }
1569       else /* dfa->edests[org_node].nelem == 2 */
1570         {
1571           /* In case of the node can epsilon-transit, and it has two
1572              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1573           org_dest = dfa->edests[org_node].elems[0];
1574           re_node_set_empty (dfa->edests + clone_node);
1575           /* Search for a duplicated node which satisfies the constraint.  */
1576           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1577           if (clone_dest == -1)
1578             {
1579               /* There is no such duplicated node, create a new one.  */
1580               reg_errcode_t err;
1581               clone_dest = duplicate_node (dfa, org_dest, constraint);
1582               if (BE (clone_dest == -1, 0))
1583                 return REG_ESPACE;
1584               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1585               if (BE (ret < 0, 0))
1586                 return REG_ESPACE;
1587               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1588                                             root_node, constraint);
1589               if (BE (err != REG_NOERROR, 0))
1590                 return err;
1591             }
1592           else
1593             {
1594               /* There is a duplicated node which satisfies the constraint,
1595                  use it to avoid infinite loop.  */
1596               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1597               if (BE (ret < 0, 0))
1598                 return REG_ESPACE;
1599             }
1600
1601           org_dest = dfa->edests[org_node].elems[1];
1602           clone_dest = duplicate_node (dfa, org_dest, constraint);
1603           if (BE (clone_dest == -1, 0))
1604             return REG_ESPACE;
1605           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1606           if (BE (ret < 0, 0))
1607             return REG_ESPACE;
1608         }
1609       org_node = org_dest;
1610       clone_node = clone_dest;
1611     }
1612   return REG_NOERROR;
1613 }
1614
1615 /* Search for a node which is duplicated from the node ORG_NODE, and
1616    satisfies the constraint CONSTRAINT.  */
1617
1618 static int
1619 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1620                         unsigned int constraint)
1621 {
1622   int idx;
1623   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1624     {
1625       if (org_node == dfa->org_indices[idx]
1626           && constraint == dfa->nodes[idx].constraint)
1627         return idx; /* Found.  */
1628     }
1629   return -1; /* Not found.  */
1630 }
1631
1632 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1633    Return the index of the new node, or -1 if insufficient storage is
1634    available.  */
1635
1636 static int
1637 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1638 {
1639   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1640   if (BE (dup_idx != -1, 1))
1641     {
1642       dfa->nodes[dup_idx].constraint = constraint;
1643       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1644       dfa->nodes[dup_idx].duplicated = 1;
1645
1646       /* Store the index of the original node.  */
1647       dfa->org_indices[dup_idx] = org_idx;
1648     }
1649   return dup_idx;
1650 }
1651
1652 static reg_errcode_t
1653 calc_inveclosure (re_dfa_t *dfa)
1654 {
1655   int src, idx, ret;
1656   for (idx = 0; idx < dfa->nodes_len; ++idx)
1657     re_node_set_init_empty (dfa->inveclosures + idx);
1658
1659   for (src = 0; src < dfa->nodes_len; ++src)
1660     {
1661       int *elems = dfa->eclosures[src].elems;
1662       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1663         {
1664           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1665           if (BE (ret == -1, 0))
1666             return REG_ESPACE;
1667         }
1668     }
1669
1670   return REG_NOERROR;
1671 }
1672
1673 /* Calculate "eclosure" for all the node in DFA.  */
1674
1675 static reg_errcode_t
1676 calc_eclosure (re_dfa_t *dfa)
1677 {
1678   int node_idx, incomplete;
1679 #ifdef DEBUG
1680   assert (dfa->nodes_len > 0);
1681 #endif
1682   incomplete = 0;
1683   /* For each nodes, calculate epsilon closure.  */
1684   for (node_idx = 0; ; ++node_idx)
1685     {
1686       reg_errcode_t err;
1687       re_node_set eclosure_elem;
1688       if (node_idx == dfa->nodes_len)
1689         {
1690           if (!incomplete)
1691             break;
1692           incomplete = 0;
1693           node_idx = 0;
1694         }
1695
1696 #ifdef DEBUG
1697       assert (dfa->eclosures[node_idx].nelem != -1);
1698 #endif
1699
1700       /* If we have already calculated, skip it.  */
1701       if (dfa->eclosures[node_idx].nelem != 0)
1702         continue;
1703       /* Calculate epsilon closure of `node_idx'.  */
1704       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1705       if (BE (err != REG_NOERROR, 0))
1706         return err;
1707
1708       if (dfa->eclosures[node_idx].nelem == 0)
1709         {
1710           incomplete = 1;
1711           re_node_set_free (&eclosure_elem);
1712         }
1713     }
1714   return REG_NOERROR;
1715 }
1716
1717 /* Calculate epsilon closure of NODE.  */
1718
1719 static reg_errcode_t
1720 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1721 {
1722   reg_errcode_t err;
1723   int i;
1724   re_node_set eclosure;
1725   int ret;
1726   int incomplete = 0;
1727   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1728   if (BE (err != REG_NOERROR, 0))
1729     return err;
1730
1731   /* This indicates that we are calculating this node now.
1732      We reference this value to avoid infinite loop.  */
1733   dfa->eclosures[node].nelem = -1;
1734
1735   /* If the current node has constraints, duplicate all nodes
1736      since they must inherit the constraints.  */
1737   if (dfa->nodes[node].constraint
1738       && dfa->edests[node].nelem
1739       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1740     {
1741       err = duplicate_node_closure (dfa, node, node, node,
1742                                     dfa->nodes[node].constraint);
1743       if (BE (err != REG_NOERROR, 0))
1744         return err;
1745     }
1746
1747   /* Expand each epsilon destination nodes.  */
1748   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1749     for (i = 0; i < dfa->edests[node].nelem; ++i)
1750       {
1751         re_node_set eclosure_elem;
1752         int edest = dfa->edests[node].elems[i];
1753         /* If calculating the epsilon closure of `edest' is in progress,
1754            return intermediate result.  */
1755         if (dfa->eclosures[edest].nelem == -1)
1756           {
1757             incomplete = 1;
1758             continue;
1759           }
1760         /* If we haven't calculated the epsilon closure of `edest' yet,
1761            calculate now. Otherwise use calculated epsilon closure.  */
1762         if (dfa->eclosures[edest].nelem == 0)
1763           {
1764             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1765             if (BE (err != REG_NOERROR, 0))
1766               return err;
1767           }
1768         else
1769           eclosure_elem = dfa->eclosures[edest];
1770         /* Merge the epsilon closure of `edest'.  */
1771         err = re_node_set_merge (&eclosure, &eclosure_elem);
1772         if (BE (err != REG_NOERROR, 0))
1773           return err;
1774         /* If the epsilon closure of `edest' is incomplete,
1775            the epsilon closure of this node is also incomplete.  */
1776         if (dfa->eclosures[edest].nelem == 0)
1777           {
1778             incomplete = 1;
1779             re_node_set_free (&eclosure_elem);
1780           }
1781       }
1782
1783   /* An epsilon closure includes itself.  */
1784   ret = re_node_set_insert (&eclosure, node);
1785   if (BE (ret < 0, 0))
1786     return REG_ESPACE;
1787   if (incomplete && !root)
1788     dfa->eclosures[node].nelem = 0;
1789   else
1790     dfa->eclosures[node] = eclosure;
1791   *new_set = eclosure;
1792   return REG_NOERROR;
1793 }
1794 \f
1795 /* Functions for token which are used in the parser.  */
1796
1797 /* Fetch a token from INPUT.
1798    We must not use this function inside bracket expressions.  */
1799
1800 static void
1801 internal_function
1802 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1803 {
1804   re_string_skip_bytes (input, peek_token (result, input, syntax));
1805 }
1806
1807 /* Peek a token from INPUT, and return the length of the token.
1808    We must not use this function inside bracket expressions.  */
1809
1810 static int
1811 internal_function
1812 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1813 {
1814   unsigned char c;
1815
1816   if (re_string_eoi (input))
1817     {
1818       token->type = END_OF_RE;
1819       return 0;
1820     }
1821
1822   c = re_string_peek_byte (input, 0);
1823   token->opr.c = c;
1824
1825   token->word_char = 0;
1826 #ifdef RE_ENABLE_I18N
1827   token->mb_partial = 0;
1828   if (input->mb_cur_max > 1 &&
1829       !re_string_first_byte (input, re_string_cur_idx (input)))
1830     {
1831       token->type = CHARACTER;
1832       token->mb_partial = 1;
1833       return 1;
1834     }
1835 #endif
1836   if (c == '\\')
1837     {
1838       unsigned char c2;
1839       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1840         {
1841           token->type = BACK_SLASH;
1842           return 1;
1843         }
1844
1845       c2 = re_string_peek_byte_case (input, 1);
1846       token->opr.c = c2;
1847       token->type = CHARACTER;
1848 #ifdef RE_ENABLE_I18N
1849       if (input->mb_cur_max > 1)
1850         {
1851           wint_t wc = re_string_wchar_at (input,
1852                                           re_string_cur_idx (input) + 1);
1853           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1854         }
1855       else
1856 #endif
1857         token->word_char = IS_WORD_CHAR (c2) != 0;
1858
1859       switch (c2)
1860         {
1861         case '|':
1862           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1863             token->type = OP_ALT;
1864           break;
1865         case '1': case '2': case '3': case '4': case '5':
1866         case '6': case '7': case '8': case '9':
1867           if (!(syntax & RE_NO_BK_REFS))
1868             {
1869               token->type = OP_BACK_REF;
1870               token->opr.idx = c2 - '1';
1871             }
1872           break;
1873         case '<':
1874           if (!(syntax & RE_NO_GNU_OPS))
1875             {
1876               token->type = ANCHOR;
1877               token->opr.ctx_type = WORD_FIRST;
1878             }
1879           break;
1880         case '>':
1881           if (!(syntax & RE_NO_GNU_OPS))
1882             {
1883               token->type = ANCHOR;
1884               token->opr.ctx_type = WORD_LAST;
1885             }
1886           break;
1887         case 'b':
1888           if (!(syntax & RE_NO_GNU_OPS))
1889             {
1890               token->type = ANCHOR;
1891               token->opr.ctx_type = WORD_DELIM;
1892             }
1893           break;
1894         case 'B':
1895           if (!(syntax & RE_NO_GNU_OPS))
1896             {
1897               token->type = ANCHOR;
1898               token->opr.ctx_type = NOT_WORD_DELIM;
1899             }
1900           break;
1901         case 'w':
1902           if (!(syntax & RE_NO_GNU_OPS))
1903             token->type = OP_WORD;
1904           break;
1905         case 'W':
1906           if (!(syntax & RE_NO_GNU_OPS))
1907             token->type = OP_NOTWORD;
1908           break;
1909         case 's':
1910           if (!(syntax & RE_NO_GNU_OPS))
1911             token->type = OP_SPACE;
1912           break;
1913         case 'S':
1914           if (!(syntax & RE_NO_GNU_OPS))
1915             token->type = OP_NOTSPACE;
1916           break;
1917         case '`':
1918           if (!(syntax & RE_NO_GNU_OPS))
1919             {
1920               token->type = ANCHOR;
1921               token->opr.ctx_type = BUF_FIRST;
1922             }
1923           break;
1924         case '\'':
1925           if (!(syntax & RE_NO_GNU_OPS))
1926             {
1927               token->type = ANCHOR;
1928               token->opr.ctx_type = BUF_LAST;
1929             }
1930           break;
1931         case '(':
1932           if (!(syntax & RE_NO_BK_PARENS))
1933             token->type = OP_OPEN_SUBEXP;
1934           break;
1935         case ')':
1936           if (!(syntax & RE_NO_BK_PARENS))
1937             token->type = OP_CLOSE_SUBEXP;
1938           break;
1939         case '+':
1940           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1941             token->type = OP_DUP_PLUS;
1942           break;
1943         case '?':
1944           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1945             token->type = OP_DUP_QUESTION;
1946           break;
1947         case '{':
1948           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1949             token->type = OP_OPEN_DUP_NUM;
1950           break;
1951         case '}':
1952           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1953             token->type = OP_CLOSE_DUP_NUM;
1954           break;
1955         default:
1956           break;
1957         }
1958       return 2;
1959     }
1960
1961   token->type = CHARACTER;
1962 #ifdef RE_ENABLE_I18N
1963   if (input->mb_cur_max > 1)
1964     {
1965       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1966       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1967     }
1968   else
1969 #endif
1970     token->word_char = IS_WORD_CHAR (token->opr.c);
1971
1972   switch (c)
1973     {
1974     case '\n':
1975       if (syntax & RE_NEWLINE_ALT)
1976         token->type = OP_ALT;
1977       break;
1978     case '|':
1979       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1980         token->type = OP_ALT;
1981       break;
1982     case '*':
1983       token->type = OP_DUP_ASTERISK;
1984       break;
1985     case '+':
1986       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1987         token->type = OP_DUP_PLUS;
1988       break;
1989     case '?':
1990       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1991         token->type = OP_DUP_QUESTION;
1992       break;
1993     case '{':
1994       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1995         token->type = OP_OPEN_DUP_NUM;
1996       break;
1997     case '}':
1998       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1999         token->type = OP_CLOSE_DUP_NUM;
2000       break;
2001     case '(':
2002       if (syntax & RE_NO_BK_PARENS)
2003         token->type = OP_OPEN_SUBEXP;
2004       break;
2005     case ')':
2006       if (syntax & RE_NO_BK_PARENS)
2007         token->type = OP_CLOSE_SUBEXP;
2008       break;
2009     case '[':
2010       token->type = OP_OPEN_BRACKET;
2011       break;
2012     case '.':
2013       token->type = OP_PERIOD;
2014       break;
2015     case '^':
2016       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2017           re_string_cur_idx (input) != 0)
2018         {
2019           char prev = re_string_peek_byte (input, -1);
2020           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2021             break;
2022         }
2023       token->type = ANCHOR;
2024       token->opr.ctx_type = LINE_FIRST;
2025       break;
2026     case '$':
2027       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2028           re_string_cur_idx (input) + 1 != re_string_length (input))
2029         {
2030           re_token_t next;
2031           re_string_skip_bytes (input, 1);
2032           peek_token (&next, input, syntax);
2033           re_string_skip_bytes (input, -1);
2034           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2035             break;
2036         }
2037       token->type = ANCHOR;
2038       token->opr.ctx_type = LINE_LAST;
2039       break;
2040     default:
2041       break;
2042     }
2043   return 1;
2044 }
2045
2046 /* Peek a token from INPUT, and return the length of the token.
2047    We must not use this function out of bracket expressions.  */
2048
2049 static int
2050 internal_function
2051 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2052 {
2053   unsigned char c;
2054   if (re_string_eoi (input))
2055     {
2056       token->type = END_OF_RE;
2057       return 0;
2058     }
2059   c = re_string_peek_byte (input, 0);
2060   token->opr.c = c;
2061
2062 #ifdef RE_ENABLE_I18N
2063   if (input->mb_cur_max > 1 &&
2064       !re_string_first_byte (input, re_string_cur_idx (input)))
2065     {
2066       token->type = CHARACTER;
2067       return 1;
2068     }
2069 #endif /* RE_ENABLE_I18N */
2070
2071   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2072       && re_string_cur_idx (input) + 1 < re_string_length (input))
2073     {
2074       /* In this case, '\' escape a character.  */
2075       unsigned char c2;
2076       re_string_skip_bytes (input, 1);
2077       c2 = re_string_peek_byte (input, 0);
2078       token->opr.c = c2;
2079       token->type = CHARACTER;
2080       return 1;
2081     }
2082   if (c == '[') /* '[' is a special char in a bracket exps.  */
2083     {
2084       unsigned char c2;
2085       int token_len;
2086       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2087         c2 = re_string_peek_byte (input, 1);
2088       else
2089         c2 = 0;
2090       token->opr.c = c2;
2091       token_len = 2;
2092       switch (c2)
2093         {
2094         case '.':
2095           token->type = OP_OPEN_COLL_ELEM;
2096           break;
2097         case '=':
2098           token->type = OP_OPEN_EQUIV_CLASS;
2099           break;
2100         case ':':
2101           if (syntax & RE_CHAR_CLASSES)
2102             {
2103               token->type = OP_OPEN_CHAR_CLASS;
2104               break;
2105             }
2106           /* else fall through.  */
2107         default:
2108           token->type = CHARACTER;
2109           token->opr.c = c;
2110           token_len = 1;
2111           break;
2112         }
2113       return token_len;
2114     }
2115   switch (c)
2116     {
2117     case '-':
2118       token->type = OP_CHARSET_RANGE;
2119       break;
2120     case ']':
2121       token->type = OP_CLOSE_BRACKET;
2122       break;
2123     case '^':
2124       token->type = OP_NON_MATCH_LIST;
2125       break;
2126     default:
2127       token->type = CHARACTER;
2128     }
2129   return 1;
2130 }
2131 \f
2132 /* Functions for parser.  */
2133
2134 /* Entry point of the parser.
2135    Parse the regular expression REGEXP and return the structure tree.
2136    If an error is occured, ERR is set by error code, and return NULL.
2137    This function build the following tree, from regular expression <reg_exp>:
2138            CAT
2139            / \
2140           /   \
2141    <reg_exp>  EOR
2142
2143    CAT means concatenation.
2144    EOR means end of regular expression.  */
2145
2146 static bin_tree_t *
2147 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2148        reg_errcode_t *err)
2149 {
2150   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2151   bin_tree_t *tree, *eor, *root;
2152   re_token_t current_token;
2153   dfa->syntax = syntax;
2154   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2155   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2156   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2157     return NULL;
2158   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2159   if (tree != NULL)
2160     root = create_tree (dfa, tree, eor, CONCAT);
2161   else
2162     root = eor;
2163   if (BE (eor == NULL || root == NULL, 0))
2164     {
2165       *err = REG_ESPACE;
2166       return NULL;
2167     }
2168   return root;
2169 }
2170
2171 /* This function build the following tree, from regular expression
2172    <branch1>|<branch2>:
2173            ALT
2174            / \
2175           /   \
2176    <branch1> <branch2>
2177
2178    ALT means alternative, which represents the operator `|'.  */
2179
2180 static bin_tree_t *
2181 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2182                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2183 {
2184   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2185   bin_tree_t *tree, *branch = NULL;
2186   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2187   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2188     return NULL;
2189
2190   while (token->type == OP_ALT)
2191     {
2192       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2193       if (token->type != OP_ALT && token->type != END_OF_RE
2194           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2195         {
2196           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2197           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2198             return NULL;
2199         }
2200       else
2201         branch = NULL;
2202       tree = create_tree (dfa, tree, branch, OP_ALT);
2203       if (BE (tree == NULL, 0))
2204         {
2205           *err = REG_ESPACE;
2206           return NULL;
2207         }
2208     }
2209   return tree;
2210 }
2211
2212 /* This function build the following tree, from regular expression
2213    <exp1><exp2>:
2214         CAT
2215         / \
2216        /   \
2217    <exp1> <exp2>
2218
2219    CAT means concatenation.  */
2220
2221 static bin_tree_t *
2222 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2223               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2224 {
2225   bin_tree_t *tree, *exp;
2226   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2227   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2228   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2229     return NULL;
2230
2231   while (token->type != OP_ALT && token->type != END_OF_RE
2232          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2233     {
2234       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2235       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2236         {
2237           if (tree != NULL)
2238             postorder (tree, free_tree, NULL);
2239           return NULL;
2240         }
2241       if (tree != NULL && exp != NULL)
2242         {
2243           bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2244           if (newtree == NULL)
2245             {
2246               postorder (exp, free_tree, NULL);
2247               postorder (tree, free_tree, NULL);
2248               *err = REG_ESPACE;
2249               return NULL;
2250             }
2251           tree = newtree;
2252         }
2253       else if (tree == NULL)
2254         tree = exp;
2255       /* Otherwise exp == NULL, we don't need to create new tree.  */
2256     }
2257   return tree;
2258 }
2259
2260 /* This function build the following tree, from regular expression a*:
2261          *
2262          |
2263          a
2264 */
2265
2266 static bin_tree_t *
2267 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2268                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2269 {
2270   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2271   bin_tree_t *tree;
2272   switch (token->type)
2273     {
2274     case CHARACTER:
2275       tree = create_token_tree (dfa, NULL, NULL, token);
2276       if (BE (tree == NULL, 0))
2277         {
2278           *err = REG_ESPACE;
2279           return NULL;
2280         }
2281 #ifdef RE_ENABLE_I18N
2282       if (dfa->mb_cur_max > 1)
2283         {
2284           while (!re_string_eoi (regexp)
2285                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2286             {
2287               bin_tree_t *mbc_remain;
2288               fetch_token (token, regexp, syntax);
2289               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2290               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2291               if (BE (mbc_remain == NULL || tree == NULL, 0))
2292                 {
2293                   *err = REG_ESPACE;
2294                   return NULL;
2295                 }
2296             }
2297         }
2298 #endif
2299       break;
2300     case OP_OPEN_SUBEXP:
2301       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2302       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2303         return NULL;
2304       break;
2305     case OP_OPEN_BRACKET:
2306       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2307       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2308         return NULL;
2309       break;
2310     case OP_BACK_REF:
2311       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2312         {
2313           *err = REG_ESUBREG;
2314           return NULL;
2315         }
2316       dfa->used_bkref_map |= 1 << token->opr.idx;
2317       tree = create_token_tree (dfa, NULL, NULL, token);
2318       if (BE (tree == NULL, 0))
2319         {
2320           *err = REG_ESPACE;
2321           return NULL;
2322         }
2323       ++dfa->nbackref;
2324       dfa->has_mb_node = 1;
2325       break;
2326     case OP_OPEN_DUP_NUM:
2327       if (syntax & RE_CONTEXT_INVALID_DUP)
2328         {
2329           *err = REG_BADRPT;
2330           return NULL;
2331         }
2332       /* FALLTHROUGH */
2333     case OP_DUP_ASTERISK:
2334     case OP_DUP_PLUS:
2335     case OP_DUP_QUESTION:
2336       if (syntax & RE_CONTEXT_INVALID_OPS)
2337         {
2338           *err = REG_BADRPT;
2339           return NULL;
2340         }
2341       else if (syntax & RE_CONTEXT_INDEP_OPS)
2342         {
2343           fetch_token (token, regexp, syntax);
2344           return parse_expression (regexp, preg, token, syntax, nest, err);
2345         }
2346       /* else fall through  */
2347     case OP_CLOSE_SUBEXP:
2348       if ((token->type == OP_CLOSE_SUBEXP) &&
2349           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2350         {
2351           *err = REG_ERPAREN;
2352           return NULL;
2353         }
2354       /* else fall through  */
2355     case OP_CLOSE_DUP_NUM:
2356       /* We treat it as a normal character.  */
2357
2358       /* Then we can these characters as normal characters.  */
2359       token->type = CHARACTER;
2360       /* mb_partial and word_char bits should be initialized already
2361          by peek_token.  */
2362       tree = create_token_tree (dfa, NULL, NULL, token);
2363       if (BE (tree == NULL, 0))
2364         {
2365           *err = REG_ESPACE;
2366           return NULL;
2367         }
2368       break;
2369     case ANCHOR:
2370       if ((token->opr.ctx_type
2371            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2372           && dfa->word_ops_used == 0)
2373         init_word_char (dfa);
2374       if (token->opr.ctx_type == WORD_DELIM
2375           || token->opr.ctx_type == NOT_WORD_DELIM)
2376         {
2377           bin_tree_t *tree_first, *tree_last;
2378           if (token->opr.ctx_type == WORD_DELIM)
2379             {
2380               token->opr.ctx_type = WORD_FIRST;
2381               tree_first = create_token_tree (dfa, NULL, NULL, token);
2382               token->opr.ctx_type = WORD_LAST;
2383             }
2384           else
2385             {
2386               token->opr.ctx_type = INSIDE_WORD;
2387               tree_first = create_token_tree (dfa, NULL, NULL, token);
2388               token->opr.ctx_type = INSIDE_NOTWORD;
2389             }
2390           tree_last = create_token_tree (dfa, NULL, NULL, token);
2391           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2392           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2393             {
2394               *err = REG_ESPACE;
2395               return NULL;
2396             }
2397         }
2398       else
2399         {
2400           tree = create_token_tree (dfa, NULL, NULL, token);
2401           if (BE (tree == NULL, 0))
2402             {
2403               *err = REG_ESPACE;
2404               return NULL;
2405             }
2406         }
2407       /* We must return here, since ANCHORs can't be followed
2408          by repetition operators.
2409          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2410              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2411       fetch_token (token, regexp, syntax);
2412       return tree;
2413     case OP_PERIOD:
2414       tree = create_token_tree (dfa, NULL, NULL, token);
2415       if (BE (tree == NULL, 0))
2416         {
2417           *err = REG_ESPACE;
2418           return NULL;
2419         }
2420       if (dfa->mb_cur_max > 1)
2421         dfa->has_mb_node = 1;
2422       break;
2423     case OP_WORD:
2424     case OP_NOTWORD:
2425       tree = build_charclass_op (dfa, regexp->trans,
2426                                  "alnum",
2427                                  "_",
2428                                  token->type == OP_NOTWORD, err);
2429       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2430         return NULL;
2431       break;
2432     case OP_SPACE:
2433     case OP_NOTSPACE:
2434       tree = build_charclass_op (dfa, regexp->trans,
2435                                  "space",
2436                                  "",
2437                                  token->type == OP_NOTSPACE, err);
2438       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2439         return NULL;
2440       break;
2441     case OP_ALT:
2442     case END_OF_RE:
2443       return NULL;
2444     case BACK_SLASH:
2445       *err = REG_EESCAPE;
2446       return NULL;
2447     default:
2448       /* Must not happen?  */
2449 #ifdef DEBUG
2450       assert (0);
2451 #endif
2452       return NULL;
2453     }
2454   fetch_token (token, regexp, syntax);
2455
2456   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2457          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2458     {
2459       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2460       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2461         return NULL;
2462       /* In BRE consecutive duplications are not allowed.  */
2463       if ((syntax & RE_CONTEXT_INVALID_DUP)
2464           && (token->type == OP_DUP_ASTERISK
2465               || token->type == OP_OPEN_DUP_NUM))
2466         {
2467           *err = REG_BADRPT;
2468           return NULL;
2469         }
2470     }
2471
2472   return tree;
2473 }
2474
2475 /* This function build the following tree, from regular expression
2476    (<reg_exp>):
2477          SUBEXP
2478             |
2479         <reg_exp>
2480 */
2481
2482 static bin_tree_t *
2483 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2484                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2485 {
2486   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2487   bin_tree_t *tree;
2488   size_t cur_nsub;
2489   cur_nsub = preg->re_nsub++;
2490
2491   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2492
2493   /* The subexpression may be a null string.  */
2494   if (token->type == OP_CLOSE_SUBEXP)
2495     tree = NULL;
2496   else
2497     {
2498       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2499       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2500         {
2501           if (tree != NULL)
2502             postorder (tree, free_tree, NULL);
2503           *err = REG_EPAREN;
2504         }
2505       if (BE (*err != REG_NOERROR, 0))
2506         return NULL;
2507     }
2508
2509   if (cur_nsub <= '9' - '1')
2510     dfa->completed_bkref_map |= 1 << cur_nsub;
2511
2512   tree = create_tree (dfa, tree, NULL, SUBEXP);
2513   if (BE (tree == NULL, 0))
2514     {
2515       *err = REG_ESPACE;
2516       return NULL;
2517     }
2518   tree->token.opr.idx = cur_nsub;
2519   return tree;
2520 }
2521
2522 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2523
2524 static bin_tree_t *
2525 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2526               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2527 {
2528   bin_tree_t *tree = NULL, *old_tree = NULL;
2529   int i, start, end, start_idx = re_string_cur_idx (regexp);
2530 #ifndef RE_TOKEN_INIT_BUG
2531   re_token_t start_token = *token;
2532 #else
2533   re_token_t start_token;
2534
2535   memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2536 #endif
2537
2538   if (token->type == OP_OPEN_DUP_NUM)
2539     {
2540       end = 0;
2541       start = fetch_number (regexp, token, syntax);
2542       if (start == -1)
2543         {
2544           if (token->type == CHARACTER && token->opr.c == ',')
2545             start = 0; /* We treat "{,m}" as "{0,m}".  */
2546           else
2547             {
2548               *err = REG_BADBR; /* <re>{} is invalid.  */
2549               return NULL;
2550             }
2551         }
2552       if (BE (start != -2, 1))
2553         {
2554           /* We treat "{n}" as "{n,n}".  */
2555           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2556                  : ((token->type == CHARACTER && token->opr.c == ',')
2557                     ? fetch_number (regexp, token, syntax) : -2));
2558         }
2559       if (BE (start == -2 || end == -2, 0))
2560         {
2561           /* Invalid sequence.  */
2562           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2563             {
2564               if (token->type == END_OF_RE)
2565                 *err = REG_EBRACE;
2566               else
2567                 *err = REG_BADBR;
2568
2569               return NULL;
2570             }
2571
2572           /* If the syntax bit is set, rollback.  */
2573           re_string_set_index (regexp, start_idx);
2574           *token = start_token;
2575           token->type = CHARACTER;
2576           /* mb_partial and word_char bits should be already initialized by
2577              peek_token.  */
2578           return elem;
2579         }
2580
2581       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2582         {
2583           /* First number greater than second.  */
2584           *err = REG_BADBR;
2585           return NULL;
2586         }
2587     }
2588   else
2589     {
2590       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2591       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2592     }
2593
2594   fetch_token (token, regexp, syntax);
2595
2596   if (BE (elem == NULL, 0))
2597     return NULL;
2598   if (BE (start == 0 && end == 0, 0))
2599     {
2600       postorder (elem, free_tree, NULL);
2601       return NULL;
2602     }
2603
2604   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2605   if (BE (start > 0, 0))
2606     {
2607       tree = elem;
2608       for (i = 2; i <= start; ++i)
2609         {
2610           elem = duplicate_tree (elem, dfa);
2611           tree = create_tree (dfa, tree, elem, CONCAT);
2612           if (BE (elem == NULL || tree == NULL, 0))
2613             goto parse_dup_op_espace;
2614         }
2615
2616       if (start == end)
2617         return tree;
2618
2619       /* Duplicate ELEM before it is marked optional.  */
2620       elem = duplicate_tree (elem, dfa);
2621       old_tree = tree;
2622     }
2623   else
2624     old_tree = NULL;
2625
2626   if (elem->token.type == SUBEXP)
2627     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2628
2629   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2630   if (BE (tree == NULL, 0))
2631     goto parse_dup_op_espace;
2632
2633   /* This loop is actually executed only when end != -1,
2634      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2635      already created the start+1-th copy.  */
2636   for (i = start + 2; i <= end; ++i)
2637     {
2638       elem = duplicate_tree (elem, dfa);
2639       tree = create_tree (dfa, tree, elem, CONCAT);
2640       if (BE (elem == NULL || tree == NULL, 0))
2641         goto parse_dup_op_espace;
2642
2643       tree = create_tree (dfa, tree, NULL, OP_ALT);
2644       if (BE (tree == NULL, 0))
2645         goto parse_dup_op_espace;
2646     }
2647
2648   if (old_tree)
2649     tree = create_tree (dfa, old_tree, tree, CONCAT);
2650
2651   return tree;
2652
2653  parse_dup_op_espace:
2654   *err = REG_ESPACE;
2655   return NULL;
2656 }
2657
2658 /* Size of the names for collating symbol/equivalence_class/character_class.
2659    I'm not sure, but maybe enough.  */
2660 #define BRACKET_NAME_BUF_SIZE 32
2661
2662 #ifndef _LIBC
2663   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2664      Build the range expression which starts from START_ELEM, and ends
2665      at END_ELEM.  The result are written to MBCSET and SBCSET.
2666      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2667      mbcset->range_ends, is a pointer argument sinse we may
2668      update it.  */
2669
2670 static reg_errcode_t
2671 internal_function
2672 # ifdef RE_ENABLE_I18N
2673 build_range_exp (reg_syntax_t syntax, bitset_t sbcset, re_charset_t *mbcset,
2674                 int *range_alloc, bracket_elem_t *start_elem,
2675                 bracket_elem_t *end_elem)
2676 # else /* not RE_ENABLE_I18N */
2677 build_range_exp (reg_syntax_t syntax, bitset_t sbcset,
2678                 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2679 # endif /* not RE_ENABLE_I18N */
2680 {
2681   unsigned int start_ch, end_ch;
2682
2683   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2684   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2685           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2686           0))
2687     return REG_ERANGE;
2688
2689   /* We can handle no multi character collating elements without libc
2690      support.  */
2691   if (BE ((start_elem->type == COLL_SYM
2692            && strlen ((char *) start_elem->opr.name) > 1)
2693           || (end_elem->type == COLL_SYM
2694               && strlen ((char *) end_elem->opr.name) > 1), 0))
2695     return REG_ECOLLATE;
2696
2697 # ifdef RE_ENABLE_I18N
2698   {
2699     wchar_t wc;
2700     wint_t start_wc;
2701     wint_t end_wc;
2702
2703     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2704                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2705                    : 0));
2706     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2707               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2708                  : 0));
2709 #ifdef GAWK
2710     /*
2711      * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2712      * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2713      * unsigned, so we don't have sign extension problems.
2714      */
2715     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2716                 ? start_ch : start_elem->opr.wch);
2717     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2718               ? end_ch : end_elem->opr.wch);
2719 #else
2720     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2721                 ? __btowc (start_ch) : start_elem->opr.wch);
2722     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2723               ? __btowc (end_ch) : end_elem->opr.wch);
2724 #endif
2725     if (start_wc == WEOF || end_wc == WEOF)
2726       return REG_ECOLLATE;
2727     else if ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc)
2728       return REG_ERANGE;
2729
2730     /* Got valid collation sequence values, add them as a new entry.
2731        However, for !_LIBC we have no collation elements: if the
2732        character set is single byte, the single byte character set
2733        that we build below suffices.  parse_bracket_exp passes
2734        no MBCSET if dfa->mb_cur_max == 1.  */
2735     if (mbcset)
2736       {
2737         /* Check the space of the arrays.  */
2738         if (BE (*range_alloc == mbcset->nranges, 0))
2739           {
2740             /* There is not enough space, need realloc.  */
2741             wchar_t *new_array_start, *new_array_end;
2742             int new_nranges;
2743
2744             /* +1 in case of mbcset->nranges is 0.  */
2745             new_nranges = 2 * mbcset->nranges + 1;
2746             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2747                are NULL if *range_alloc == 0.  */
2748             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2749                                           new_nranges);
2750             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2751                                         new_nranges);
2752
2753             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2754               return REG_ESPACE;
2755
2756             mbcset->range_starts = new_array_start;
2757             mbcset->range_ends = new_array_end;
2758             *range_alloc = new_nranges;
2759           }
2760
2761         mbcset->range_starts[mbcset->nranges] = start_wc;
2762         mbcset->range_ends[mbcset->nranges++] = end_wc;
2763       }
2764
2765     /* Build the table for single byte characters.  */
2766     for (wc = 0; wc < SBC_MAX; ++wc)
2767       {
2768          if (start_wc <= wc && wc <= end_wc)
2769            bitset_set (sbcset, wc);
2770       }
2771   }
2772 # else /* not RE_ENABLE_I18N */
2773   {
2774     unsigned int ch;
2775     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2776                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2777                    : 0));
2778     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2779               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2780                  : 0));
2781     if (start_ch > end_ch)
2782       return REG_ERANGE;
2783     /* Build the table for single byte characters.  */
2784     for (ch = 0; ch < SBC_MAX; ++ch)
2785       if (start_ch <= ch  && ch <= end_ch)
2786         bitset_set (sbcset, ch);
2787   }
2788 # endif /* not RE_ENABLE_I18N */
2789   return REG_NOERROR;
2790 }
2791 #endif /* not _LIBC */
2792
2793 #ifndef _LIBC
2794 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2795    Build the collating element which is represented by NAME.
2796    The result are written to MBCSET and SBCSET.
2797    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2798    pointer argument since we may update it.  */
2799
2800 static reg_errcode_t
2801 internal_function
2802 # ifdef RE_ENABLE_I18N
2803 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2804                         int *coll_sym_alloc, const unsigned char *name)
2805 # else /* not RE_ENABLE_I18N */
2806 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2807 # endif /* not RE_ENABLE_I18N */
2808 {
2809   size_t name_len = strlen ((const char *) name);
2810   if (BE (name_len != 1, 0))
2811     return REG_ECOLLATE;
2812   else
2813     {
2814       bitset_set (sbcset, name[0]);
2815       return REG_NOERROR;
2816     }
2817 }
2818 #endif /* not _LIBC */
2819
2820 /* This function parse bracket expression like "[abc]", "[a-c]",
2821    "[[.a-a.]]" etc.  */
2822
2823 static bin_tree_t *
2824 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2825                    reg_syntax_t syntax, reg_errcode_t *err)
2826 {
2827 #ifdef _LIBC
2828   const unsigned char *collseqmb;
2829   const char *collseqwc;
2830   uint32_t nrules;
2831   int32_t table_size;
2832   const int32_t *symb_table;
2833   const unsigned char *extra;
2834
2835   /* Local function for parse_bracket_exp used in _LIBC environement.
2836      Seek the collating symbol entry correspondings to NAME.
2837      Return the index of the symbol in the SYMB_TABLE.  */
2838
2839   auto inline int32_t
2840   __attribute ((always_inline))
2841   seek_collating_symbol_entry (name, name_len)
2842          const unsigned char *name;
2843          size_t name_len;
2844     {
2845       int32_t hash = elem_hash ((const char *) name, name_len);
2846       int32_t elem = hash % table_size;
2847       if (symb_table[2 * elem] != 0)
2848         {
2849           int32_t second = hash % (table_size - 2) + 1;
2850
2851           do
2852             {
2853               /* First compare the hashing value.  */
2854               if (symb_table[2 * elem] == hash
2855                   /* Compare the length of the name.  */
2856                   && name_len == extra[symb_table[2 * elem + 1]]
2857                   /* Compare the name.  */
2858                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2859                              name_len) == 0)
2860                 {
2861                   /* Yep, this is the entry.  */
2862                   break;
2863                 }
2864
2865               /* Next entry.  */
2866               elem += second;
2867             }
2868           while (symb_table[2 * elem] != 0);
2869         }
2870       return elem;
2871     }
2872
2873   /* Local function for parse_bracket_exp used in _LIBC environment.
2874      Look up the collation sequence value of BR_ELEM.
2875      Return the value if succeeded, UINT_MAX otherwise.  */
2876
2877   auto inline unsigned int
2878   __attribute ((always_inline))
2879   lookup_collation_sequence_value (br_elem)
2880          bracket_elem_t *br_elem;
2881     {
2882       if (br_elem->type == SB_CHAR)
2883         {
2884           /*
2885           if (MB_CUR_MAX == 1)
2886           */
2887           if (nrules == 0)
2888             return collseqmb[br_elem->opr.ch];
2889           else
2890             {
2891               wint_t wc = __btowc (br_elem->opr.ch);
2892               return __collseq_table_lookup (collseqwc, wc);
2893             }
2894         }
2895       else if (br_elem->type == MB_CHAR)
2896         {
2897           if (nrules != 0)
2898             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2899         }
2900       else if (br_elem->type == COLL_SYM)
2901         {
2902           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2903           if (nrules != 0)
2904             {
2905               int32_t elem, idx;
2906               elem = seek_collating_symbol_entry (br_elem->opr.name,
2907                                                   sym_name_len);
2908               if (symb_table[2 * elem] != 0)
2909                 {
2910                   /* We found the entry.  */
2911                   idx = symb_table[2 * elem + 1];
2912                   /* Skip the name of collating element name.  */
2913                   idx += 1 + extra[idx];
2914                   /* Skip the byte sequence of the collating element.  */
2915                   idx += 1 + extra[idx];
2916                   /* Adjust for the alignment.  */
2917                   idx = (idx + 3) & ~3;
2918                   /* Skip the multibyte collation sequence value.  */
2919                   idx += sizeof (unsigned int);
2920                   /* Skip the wide char sequence of the collating element.  */
2921                   idx += sizeof (unsigned int) *
2922                     (1 + *(unsigned int *) (extra + idx));
2923                   /* Return the collation sequence value.  */
2924                   return *(unsigned int *) (extra + idx);
2925                 }
2926               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2927                 {
2928                   /* No valid character.  Match it as a single byte
2929                      character.  */
2930                   return collseqmb[br_elem->opr.name[0]];
2931                 }
2932             }
2933           else if (sym_name_len == 1)
2934             return collseqmb[br_elem->opr.name[0]];
2935         }
2936       return UINT_MAX;
2937     }
2938
2939   /* Local function for parse_bracket_exp used in _LIBC environement.
2940      Build the range expression which starts from START_ELEM, and ends
2941      at END_ELEM.  The result are written to MBCSET and SBCSET.
2942      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2943      mbcset->range_ends, is a pointer argument sinse we may
2944      update it.  */
2945
2946   auto inline reg_errcode_t
2947   __attribute ((always_inline))
2948   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2949          re_charset_t *mbcset;
2950          int *range_alloc;
2951          bitset_t sbcset;
2952          bracket_elem_t *start_elem, *end_elem;
2953     {
2954       unsigned int ch;
2955       uint32_t start_collseq;
2956       uint32_t end_collseq;
2957
2958       /* Equivalence Classes and Character Classes can't be a range
2959          start/end.  */
2960       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2961               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2962               0))
2963         return REG_ERANGE;
2964
2965       start_collseq = lookup_collation_sequence_value (start_elem);
2966       end_collseq = lookup_collation_sequence_value (end_elem);
2967       /* Check start/end collation sequence values.  */
2968       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2969         return REG_ECOLLATE;
2970       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2971         return REG_ERANGE;
2972
2973       /* Got valid collation sequence values, add them as a new entry.
2974          However, if we have no collation elements, and the character set
2975          is single byte, the single byte character set that we
2976          build below suffices. */
2977       if (nrules > 0 || dfa->mb_cur_max > 1)
2978         {
2979           /* Check the space of the arrays.  */
2980           if (BE (*range_alloc == mbcset->nranges, 0))
2981             {
2982               /* There is not enough space, need realloc.  */
2983               uint32_t *new_array_start;
2984               uint32_t *new_array_end;
2985               int new_nranges;
2986
2987               /* +1 in case of mbcset->nranges is 0.  */
2988               new_nranges = 2 * mbcset->nranges + 1;
2989               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2990                                             new_nranges);
2991               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2992                                           new_nranges);
2993
2994               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2995                 return REG_ESPACE;
2996
2997               mbcset->range_starts = new_array_start;
2998               mbcset->range_ends = new_array_end;
2999               *range_alloc = new_nranges;
3000             }
3001
3002           mbcset->range_starts[mbcset->nranges] = start_collseq;
3003           mbcset->range_ends[mbcset->nranges++] = end_collseq;
3004         }
3005
3006       /* Build the table for single byte characters.  */
3007       for (ch = 0; ch < SBC_MAX; ch++)
3008         {
3009           uint32_t ch_collseq;
3010           /*
3011           if (MB_CUR_MAX == 1)
3012           */
3013           if (nrules == 0)
3014             ch_collseq = collseqmb[ch];
3015           else
3016             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3017           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3018             bitset_set (sbcset, ch);
3019         }
3020       return REG_NOERROR;
3021     }
3022
3023   /* Local function for parse_bracket_exp used in _LIBC environement.
3024      Build the collating element which is represented by NAME.
3025      The result are written to MBCSET and SBCSET.
3026      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3027      pointer argument sinse we may update it.  */
3028
3029   auto inline reg_errcode_t
3030   __attribute ((always_inline))
3031   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3032          re_charset_t *mbcset;
3033          int *coll_sym_alloc;
3034          bitset_t sbcset;
3035          const unsigned char *name;
3036     {
3037       int32_t elem, idx;
3038       size_t name_len = strlen ((const char *) name);
3039       if (nrules != 0)
3040         {
3041           elem = seek_collating_symbol_entry (name, name_len);
3042           if (symb_table[2 * elem] != 0)
3043             {
3044               /* We found the entry.  */
3045               idx = symb_table[2 * elem + 1];
3046               /* Skip the name of collating element name.  */
3047               idx += 1 + extra[idx];
3048             }
3049           else if (symb_table[2 * elem] == 0 && name_len == 1)
3050             {
3051               /* No valid character, treat it as a normal
3052                  character.  */
3053               bitset_set (sbcset, name[0]);
3054               return REG_NOERROR;
3055             }
3056           else
3057             return REG_ECOLLATE;
3058
3059           /* Got valid collation sequence, add it as a new entry.  */
3060           /* Check the space of the arrays.  */
3061           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3062             {
3063               /* Not enough, realloc it.  */
3064               /* +1 in case of mbcset->ncoll_syms is 0.  */
3065               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3066               /* Use realloc since mbcset->coll_syms is NULL
3067                  if *alloc == 0.  */
3068               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3069                                                    new_coll_sym_alloc);
3070               if (BE (new_coll_syms == NULL, 0))
3071                 return REG_ESPACE;
3072               mbcset->coll_syms = new_coll_syms;
3073               *coll_sym_alloc = new_coll_sym_alloc;
3074             }
3075           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3076           return REG_NOERROR;
3077         }
3078       else
3079         {
3080           if (BE (name_len != 1, 0))
3081             return REG_ECOLLATE;
3082           else
3083             {
3084               bitset_set (sbcset, name[0]);
3085               return REG_NOERROR;
3086             }
3087         }
3088     }
3089 #endif
3090
3091   re_token_t br_token;
3092   re_bitset_ptr_t sbcset;
3093 #ifdef RE_ENABLE_I18N
3094   re_charset_t *mbcset;
3095   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3096   int equiv_class_alloc = 0, char_class_alloc = 0;
3097 #endif /* not RE_ENABLE_I18N */
3098   int non_match = 0;
3099   bin_tree_t *work_tree;
3100   int token_len;
3101   int first_round = 1;
3102 #ifdef _LIBC
3103   collseqmb = (const unsigned char *)
3104     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3105   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3106   if (nrules)
3107     {
3108       /*
3109       if (MB_CUR_MAX > 1)
3110       */
3111       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3112       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3113       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3114                                                   _NL_COLLATE_SYMB_TABLEMB);
3115       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3116                                                    _NL_COLLATE_SYMB_EXTRAMB);
3117     }
3118 #endif
3119   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3120 #ifdef RE_ENABLE_I18N
3121   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3122 #endif /* RE_ENABLE_I18N */
3123 #ifdef RE_ENABLE_I18N
3124   if (BE (sbcset == NULL || mbcset == NULL, 0))
3125 #else
3126   if (BE (sbcset == NULL, 0))
3127 #endif /* RE_ENABLE_I18N */
3128     {
3129       re_free (sbcset);
3130 #ifdef RE_ENABLE_I18N
3131       re_free (mbcset);
3132 #endif
3133       *err = REG_ESPACE;
3134       return NULL;
3135     }
3136
3137   token_len = peek_token_bracket (token, regexp, syntax);
3138   if (BE (token->type == END_OF_RE, 0))
3139     {
3140       *err = REG_BADPAT;
3141       goto parse_bracket_exp_free_return;
3142     }
3143   if (token->type == OP_NON_MATCH_LIST)
3144     {
3145 #ifdef RE_ENABLE_I18N
3146       mbcset->non_match = 1;
3147 #endif /* not RE_ENABLE_I18N */
3148       non_match = 1;
3149       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3150         bitset_set (sbcset, '\n');
3151       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3152       token_len = peek_token_bracket (token, regexp, syntax);
3153       if (BE (token->type == END_OF_RE, 0))
3154         {
3155           *err = REG_BADPAT;
3156           goto parse_bracket_exp_free_return;
3157         }
3158     }
3159
3160   /* We treat the first ']' as a normal character.  */
3161   if (token->type == OP_CLOSE_BRACKET)
3162     token->type = CHARACTER;
3163
3164   while (1)
3165     {
3166       bracket_elem_t start_elem, end_elem;
3167       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3168       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3169       reg_errcode_t ret;
3170       int token_len2 = 0, is_range_exp = 0;
3171       re_token_t token2;
3172
3173       start_elem.opr.name = start_name_buf;
3174       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3175                                    syntax, first_round);
3176       if (BE (ret != REG_NOERROR, 0))
3177         {
3178           *err = ret;
3179           goto parse_bracket_exp_free_return;
3180         }
3181       first_round = 0;
3182
3183       /* Get information about the next token.  We need it in any case.  */
3184       token_len = peek_token_bracket (token, regexp, syntax);
3185
3186       /* Do not check for ranges if we know they are not allowed.  */
3187       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3188         {
3189           if (BE (token->type == END_OF_RE, 0))
3190             {
3191               *err = REG_EBRACK;
3192               goto parse_bracket_exp_free_return;
3193             }
3194           if (token->type == OP_CHARSET_RANGE)
3195             {
3196               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3197               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3198               if (BE (token2.type == END_OF_RE, 0))
3199                 {
3200                   *err = REG_EBRACK;
3201                   goto parse_bracket_exp_free_return;
3202                 }
3203               if (token2.type == OP_CLOSE_BRACKET)
3204                 {
3205                   /* We treat the last '-' as a normal character.  */
3206                   re_string_skip_bytes (regexp, -token_len);
3207                   token->type = CHARACTER;
3208                 }
3209               else
3210                 is_range_exp = 1;
3211             }
3212         }
3213
3214       if (is_range_exp == 1)
3215         {
3216           end_elem.opr.name = end_name_buf;
3217           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3218                                        dfa, syntax, 1);
3219           if (BE (ret != REG_NOERROR, 0))
3220             {
3221               *err = ret;
3222               goto parse_bracket_exp_free_return;
3223             }
3224
3225           token_len = peek_token_bracket (token, regexp, syntax);
3226
3227 #ifdef _LIBC
3228           *err = build_range_exp (syntax, sbcset, mbcset, &range_alloc,
3229                                   &start_elem, &end_elem);
3230 #else
3231 # ifdef RE_ENABLE_I18N
3232           *err = build_range_exp (syntax, sbcset,
3233                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3234                                   &range_alloc, &start_elem, &end_elem);
3235 # else
3236           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3237 # endif
3238 #endif /* RE_ENABLE_I18N */
3239           if (BE (*err != REG_NOERROR, 0))
3240             goto parse_bracket_exp_free_return;
3241         }
3242       else
3243         {
3244           switch (start_elem.type)
3245             {
3246             case SB_CHAR:
3247               bitset_set (sbcset, start_elem.opr.ch);
3248               break;
3249 #ifdef RE_ENABLE_I18N
3250             case MB_CHAR:
3251               /* Check whether the array has enough space.  */
3252               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3253                 {
3254                   wchar_t *new_mbchars;
3255                   /* Not enough, realloc it.  */
3256                   /* +1 in case of mbcset->nmbchars is 0.  */
3257                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3258                   /* Use realloc since array is NULL if *alloc == 0.  */
3259                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3260                                             mbchar_alloc);
3261                   if (BE (new_mbchars == NULL, 0))
3262                     goto parse_bracket_exp_espace;
3263                   mbcset->mbchars = new_mbchars;
3264                 }
3265               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3266               break;
3267 #endif /* RE_ENABLE_I18N */
3268             case EQUIV_CLASS:
3269               *err = build_equiv_class (sbcset,
3270 #ifdef RE_ENABLE_I18N
3271                                         mbcset, &equiv_class_alloc,
3272 #endif /* RE_ENABLE_I18N */
3273                                         start_elem.opr.name);
3274               if (BE (*err != REG_NOERROR, 0))
3275                 goto parse_bracket_exp_free_return;
3276               break;
3277             case COLL_SYM:
3278               *err = build_collating_symbol (sbcset,
3279 #ifdef RE_ENABLE_I18N
3280                                              mbcset, &coll_sym_alloc,
3281 #endif /* RE_ENABLE_I18N */
3282                                              start_elem.opr.name);
3283               if (BE (*err != REG_NOERROR, 0))
3284                 goto parse_bracket_exp_free_return;
3285               break;
3286             case CHAR_CLASS:
3287               *err = build_charclass (regexp->trans, sbcset,
3288 #ifdef RE_ENABLE_I18N
3289                                       mbcset, &char_class_alloc,
3290 #endif /* RE_ENABLE_I18N */
3291                                       (const char *) start_elem.opr.name, syntax);
3292               if (BE (*err != REG_NOERROR, 0))
3293                goto parse_bracket_exp_free_return;
3294               break;
3295             default:
3296               assert (0);
3297               break;
3298             }
3299         }
3300       if (BE (token->type == END_OF_RE, 0))
3301         {
3302           *err = REG_EBRACK;
3303           goto parse_bracket_exp_free_return;
3304         }
3305       if (token->type == OP_CLOSE_BRACKET)
3306         break;
3307     }
3308
3309   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3310
3311   /* If it is non-matching list.  */
3312   if (non_match)
3313     bitset_not (sbcset);
3314
3315 #ifdef RE_ENABLE_I18N
3316   /* Ensure only single byte characters are set.  */
3317   if (dfa->mb_cur_max > 1)
3318     bitset_mask (sbcset, dfa->sb_char);
3319
3320   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3321       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3322                                                      || mbcset->non_match)))
3323     {
3324       bin_tree_t *mbc_tree;
3325       int sbc_idx;
3326       /* Build a tree for complex bracket.  */
3327       dfa->has_mb_node = 1;
3328       br_token.type = COMPLEX_BRACKET;
3329       br_token.opr.mbcset = mbcset;
3330       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3331       if (BE (mbc_tree == NULL, 0))
3332         goto parse_bracket_exp_espace;
3333       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3334         if (sbcset[sbc_idx])
3335           break;
3336       /* If there are no bits set in sbcset, there is no point
3337          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3338       if (sbc_idx < BITSET_WORDS)
3339         {
3340           /* Build a tree for simple bracket.  */
3341           br_token.type = SIMPLE_BRACKET;
3342           br_token.opr.sbcset = sbcset;
3343           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3344           if (BE (work_tree == NULL, 0))
3345             goto parse_bracket_exp_espace;
3346
3347           /* Then join them by ALT node.  */
3348           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3349           if (BE (work_tree == NULL, 0))
3350             goto parse_bracket_exp_espace;
3351         }
3352       else
3353         {
3354           re_free (sbcset);
3355           work_tree = mbc_tree;
3356         }
3357     }
3358   else
3359 #endif /* not RE_ENABLE_I18N */
3360     {
3361 #ifdef RE_ENABLE_I18N
3362       free_charset (mbcset);
3363 #endif
3364       /* Build a tree for simple bracket.  */
3365       br_token.type = SIMPLE_BRACKET;
3366       br_token.opr.sbcset = sbcset;
3367       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3368       if (BE (work_tree == NULL, 0))
3369         goto parse_bracket_exp_espace;
3370     }
3371   return work_tree;
3372
3373  parse_bracket_exp_espace:
3374   *err = REG_ESPACE;
3375  parse_bracket_exp_free_return:
3376   re_free (sbcset);
3377 #ifdef RE_ENABLE_I18N
3378   free_charset (mbcset);
3379 #endif /* RE_ENABLE_I18N */
3380   return NULL;
3381 }
3382
3383 /* Parse an element in the bracket expression.  */
3384
3385 static reg_errcode_t
3386 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3387                        re_token_t *token, int token_len, re_dfa_t *dfa,
3388                        reg_syntax_t syntax, int accept_hyphen)
3389 {
3390 #ifdef RE_ENABLE_I18N
3391   int cur_char_size;
3392   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3393   if (cur_char_size > 1)
3394     {
3395       elem->type = MB_CHAR;
3396       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3397       re_string_skip_bytes (regexp, cur_char_size);
3398       return REG_NOERROR;
3399     }
3400 #endif /* RE_ENABLE_I18N */
3401   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3402   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3403       || token->type == OP_OPEN_EQUIV_CLASS)
3404     return parse_bracket_symbol (elem, regexp, token);
3405   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3406     {
3407       /* A '-' must only appear as anything but a range indicator before
3408          the closing bracket.  Everything else is an error.  */
3409       re_token_t token2;
3410       (void) peek_token_bracket (&token2, regexp, syntax);
3411       if (token2.type != OP_CLOSE_BRACKET)
3412         /* The actual error value is not standardized since this whole
3413            case is undefined.  But ERANGE makes good sense.  */
3414         return REG_ERANGE;
3415     }
3416   elem->type = SB_CHAR;
3417   elem->opr.ch = token->opr.c;
3418   return REG_NOERROR;
3419 }
3420
3421 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3422    such as [:<character_class>:], [.<collating_element>.], and
3423    [=<equivalent_class>=].  */
3424
3425 static reg_errcode_t
3426 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3427                       re_token_t *token)
3428 {
3429   unsigned char ch, delim = token->opr.c;
3430   int i = 0;
3431   if (re_string_eoi(regexp))
3432     return REG_EBRACK;
3433   for (;; ++i)
3434     {
3435       if (i >= BRACKET_NAME_BUF_SIZE)
3436         return REG_EBRACK;
3437       if (token->type == OP_OPEN_CHAR_CLASS)
3438         ch = re_string_fetch_byte_case (regexp);
3439       else
3440         ch = re_string_fetch_byte (regexp);
3441       if (re_string_eoi(regexp))
3442         return REG_EBRACK;
3443       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3444         break;
3445       elem->opr.name[i] = ch;
3446     }
3447   re_string_skip_bytes (regexp, 1);
3448   elem->opr.name[i] = '\0';
3449   switch (token->type)
3450     {
3451     case OP_OPEN_COLL_ELEM:
3452       elem->type = COLL_SYM;
3453       break;
3454     case OP_OPEN_EQUIV_CLASS:
3455       elem->type = EQUIV_CLASS;
3456       break;
3457     case OP_OPEN_CHAR_CLASS:
3458       elem->type = CHAR_CLASS;
3459       break;
3460     default:
3461       break;
3462     }
3463   return REG_NOERROR;
3464 }
3465
3466   /* Helper function for parse_bracket_exp.
3467      Build the equivalence class which is represented by NAME.
3468      The result are written to MBCSET and SBCSET.
3469      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3470      is a pointer argument sinse we may update it.  */
3471
3472 static reg_errcode_t
3473 #ifdef RE_ENABLE_I18N
3474 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3475                    int *equiv_class_alloc, const unsigned char *name)
3476 #else /* not RE_ENABLE_I18N */
3477 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3478 #endif /* not RE_ENABLE_I18N */
3479 {
3480 #ifdef _LIBC
3481   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3482   if (nrules != 0)
3483     {
3484       const int32_t *table, *indirect;
3485       const unsigned char *weights, *extra, *cp;
3486       unsigned char char_buf[2];
3487       int32_t idx1, idx2;
3488       unsigned int ch;
3489       size_t len;
3490       /* This #include defines a local function!  */
3491 # include <locale/weight.h>
3492       /* Calculate the index for equivalence class.  */
3493       cp = name;
3494       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3495       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3496                                                _NL_COLLATE_WEIGHTMB);
3497       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3498                                                    _NL_COLLATE_EXTRAMB);
3499       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3500                                                 _NL_COLLATE_INDIRECTMB);
3501       idx1 = findidx (&cp, -1);
3502       if (BE (idx1 == 0 || *cp != '\0', 0))
3503         /* This isn't a valid character.  */
3504         return REG_ECOLLATE;
3505
3506       /* Build single byte matcing table for this equivalence class.  */
3507       char_buf[1] = (unsigned char) '\0';
3508       len = weights[idx1 & 0xffffff];
3509       for (ch = 0; ch < SBC_MAX; ++ch)
3510         {
3511           char_buf[0] = ch;
3512           cp = char_buf;
3513           idx2 = findidx (&cp, 1);
3514 /*
3515           idx2 = table[ch];
3516 */
3517           if (idx2 == 0)
3518             /* This isn't a valid character.  */
3519             continue;
3520           /* Compare only if the length matches and the collation rule
3521              index is the same.  */
3522           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3523             {
3524               int cnt = 0;
3525
3526               while (cnt <= len &&
3527                      weights[(idx1 & 0xffffff) + 1 + cnt]
3528                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3529                 ++cnt;
3530
3531               if (cnt > len)
3532                 bitset_set (sbcset, ch);
3533             }
3534         }
3535       /* Check whether the array has enough space.  */
3536       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3537         {
3538           /* Not enough, realloc it.  */
3539           /* +1 in case of mbcset->nequiv_classes is 0.  */
3540           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3541           /* Use realloc since the array is NULL if *alloc == 0.  */
3542           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3543                                                    int32_t,
3544                                                    new_equiv_class_alloc);
3545           if (BE (new_equiv_classes == NULL, 0))
3546             return REG_ESPACE;
3547           mbcset->equiv_classes = new_equiv_classes;
3548           *equiv_class_alloc = new_equiv_class_alloc;
3549         }
3550       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3551     }
3552   else
3553 #endif /* _LIBC */
3554     {
3555       if (BE (strlen ((const char *) name) != 1, 0))
3556         return REG_ECOLLATE;
3557       bitset_set (sbcset, *name);
3558     }
3559   return REG_NOERROR;
3560 }
3561
3562   /* Helper function for parse_bracket_exp.
3563      Build the character class which is represented by NAME.
3564      The result are written to MBCSET and SBCSET.
3565      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3566      is a pointer argument sinse we may update it.  */
3567
3568 static reg_errcode_t
3569 #ifdef RE_ENABLE_I18N
3570 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3571                  re_charset_t *mbcset, int *char_class_alloc,
3572                  const char *class_name, reg_syntax_t syntax)
3573 #else /* not RE_ENABLE_I18N */
3574 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3575                  const char *class_name, reg_syntax_t syntax)
3576 #endif /* not RE_ENABLE_I18N */
3577 {
3578   int i;
3579
3580   /* In case of REG_ICASE "upper" and "lower" match the both of
3581      upper and lower cases.  */
3582   if ((syntax & RE_ICASE)
3583       && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3584     class_name = "alpha";
3585
3586 #ifdef RE_ENABLE_I18N
3587   /* Check the space of the arrays.  */
3588   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3589     {
3590       /* Not enough, realloc it.  */
3591       /* +1 in case of mbcset->nchar_classes is 0.  */
3592       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3593       /* Use realloc since array is NULL if *alloc == 0.  */
3594       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3595                                                new_char_class_alloc);
3596       if (BE (new_char_classes == NULL, 0))
3597         return REG_ESPACE;
3598       mbcset->char_classes = new_char_classes;
3599       *char_class_alloc = new_char_class_alloc;
3600     }
3601   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3602 #endif /* RE_ENABLE_I18N */
3603
3604 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3605   do {                                          \
3606     if (BE (trans != NULL, 0))                  \
3607       {                                         \
3608         for (i = 0; i < SBC_MAX; ++i)           \
3609           if (ctype_func (i))                   \
3610             bitset_set (sbcset, trans[i]);      \
3611       }                                         \
3612     else                                        \
3613       {                                         \
3614         for (i = 0; i < SBC_MAX; ++i)           \
3615           if (ctype_func (i))                   \
3616             bitset_set (sbcset, i);             \
3617       }                                         \
3618   } while (0)
3619
3620   if (strcmp (class_name, "alnum") == 0)
3621     BUILD_CHARCLASS_LOOP (isalnum);
3622   else if (strcmp (class_name, "cntrl") == 0)
3623     BUILD_CHARCLASS_LOOP (iscntrl);
3624   else if (strcmp (class_name, "lower") == 0)
3625     BUILD_CHARCLASS_LOOP (islower);
3626   else if (strcmp (class_name, "space") == 0)
3627     BUILD_CHARCLASS_LOOP (isspace);
3628   else if (strcmp (class_name, "alpha") == 0)
3629     BUILD_CHARCLASS_LOOP (isalpha);
3630   else if (strcmp (class_name, "digit") == 0)
3631     BUILD_CHARCLASS_LOOP (isdigit);
3632   else if (strcmp (class_name, "print") == 0)
3633     BUILD_CHARCLASS_LOOP (isprint);
3634   else if (strcmp (class_name, "upper") == 0)
3635     BUILD_CHARCLASS_LOOP (isupper);
3636   else if (strcmp (class_name, "blank") == 0)
3637 #ifndef GAWK
3638     BUILD_CHARCLASS_LOOP (isblank);
3639 #else
3640     /* see comments above */
3641     BUILD_CHARCLASS_LOOP (is_blank);
3642 #endif
3643   else if (strcmp (class_name, "graph") == 0)
3644     BUILD_CHARCLASS_LOOP (isgraph);
3645   else if (strcmp (class_name, "punct") == 0)
3646     BUILD_CHARCLASS_LOOP (ispunct);
3647   else if (strcmp (class_name, "xdigit") == 0)
3648     BUILD_CHARCLASS_LOOP (isxdigit);
3649   else
3650     return REG_ECTYPE;
3651
3652   return REG_NOERROR;
3653 }
3654
3655 static bin_tree_t *
3656 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3657                     const char *class_name,
3658                     const char *extra, int non_match,
3659                     reg_errcode_t *err)
3660 {
3661   re_bitset_ptr_t sbcset;
3662 #ifdef RE_ENABLE_I18N
3663   re_charset_t *mbcset;
3664   int alloc = 0;
3665 #endif /* not RE_ENABLE_I18N */
3666   reg_errcode_t ret;
3667   re_token_t br_token;
3668   bin_tree_t *tree;
3669
3670   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3671 #ifdef RE_ENABLE_I18N
3672   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3673 #endif /* RE_ENABLE_I18N */
3674
3675 #ifdef RE_ENABLE_I18N
3676   if (BE (sbcset == NULL || mbcset == NULL, 0))
3677 #else /* not RE_ENABLE_I18N */
3678   if (BE (sbcset == NULL, 0))
3679 #endif /* not RE_ENABLE_I18N */
3680     {
3681       *err = REG_ESPACE;
3682       return NULL;
3683     }
3684
3685   if (non_match)
3686     {
3687 #ifdef RE_ENABLE_I18N
3688       mbcset->non_match = 1;
3689 #endif /* not RE_ENABLE_I18N */
3690     }
3691
3692   /* We don't care the syntax in this case.  */
3693   ret = build_charclass (trans, sbcset,
3694 #ifdef RE_ENABLE_I18N
3695                          mbcset, &alloc,
3696 #endif /* RE_ENABLE_I18N */
3697                          class_name, 0);
3698
3699   if (BE (ret != REG_NOERROR, 0))
3700     {
3701       re_free (sbcset);
3702 #ifdef RE_ENABLE_I18N
3703       free_charset (mbcset);
3704 #endif /* RE_ENABLE_I18N */
3705       *err = ret;
3706       return NULL;
3707     }
3708   /* \w match '_' also.  */
3709   for (; *extra; extra++)
3710     bitset_set (sbcset, *extra);
3711
3712   /* If it is non-matching list.  */
3713   if (non_match)
3714     bitset_not (sbcset);
3715
3716 #ifdef RE_ENABLE_I18N
3717   /* Ensure only single byte characters are set.  */
3718   if (dfa->mb_cur_max > 1)
3719     bitset_mask (sbcset, dfa->sb_char);
3720 #endif
3721
3722   /* Build a tree for simple bracket.  */
3723   br_token.type = SIMPLE_BRACKET;
3724   br_token.opr.sbcset = sbcset;
3725   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3726   if (BE (tree == NULL, 0))
3727     goto build_word_op_espace;
3728
3729 #ifdef RE_ENABLE_I18N
3730   if (dfa->mb_cur_max > 1)
3731     {
3732       bin_tree_t *mbc_tree;
3733       /* Build a tree for complex bracket.  */
3734       br_token.type = COMPLEX_BRACKET;
3735       br_token.opr.mbcset = mbcset;
3736       dfa->has_mb_node = 1;
3737       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3738       if (BE (mbc_tree == NULL, 0))
3739         goto build_word_op_espace;
3740       /* Then join them by ALT node.  */
3741       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3742       if (BE (mbc_tree != NULL, 1))
3743         return tree;
3744     }
3745   else
3746     {
3747       free_charset (mbcset);
3748       return tree;
3749     }
3750 #else /* not RE_ENABLE_I18N */
3751   return tree;
3752 #endif /* not RE_ENABLE_I18N */
3753
3754  build_word_op_espace:
3755   re_free (sbcset);
3756 #ifdef RE_ENABLE_I18N
3757   free_charset (mbcset);
3758 #endif /* RE_ENABLE_I18N */
3759   *err = REG_ESPACE;
3760   return NULL;
3761 }
3762
3763 /* This is intended for the expressions like "a{1,3}".
3764    Fetch a number from `input', and return the number.
3765    Return -1, if the number field is empty like "{,1}".
3766    Return -2, If an error is occured.  */
3767
3768 static int
3769 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3770 {
3771   int num = -1;
3772   unsigned char c;
3773   while (1)
3774     {
3775       fetch_token (token, input, syntax);
3776       c = token->opr.c;
3777       if (BE (token->type == END_OF_RE, 0))
3778         return -2;
3779       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3780         break;
3781       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3782              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3783       num = (num > RE_DUP_MAX) ? -2 : num;
3784     }
3785   return num;
3786 }
3787 \f
3788 #ifdef RE_ENABLE_I18N
3789 static void
3790 free_charset (re_charset_t *cset)
3791 {
3792   re_free (cset->mbchars);
3793 # ifdef _LIBC
3794   re_free (cset->coll_syms);
3795   re_free (cset->equiv_classes);
3796   re_free (cset->range_starts);
3797   re_free (cset->range_ends);
3798 # endif
3799   re_free (cset->char_classes);
3800   re_free (cset);
3801 }
3802 #endif /* RE_ENABLE_I18N */
3803 \f
3804 /* Functions for binary tree operation.  */
3805
3806 /* Create a tree node.  */
3807
3808 static bin_tree_t *
3809 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3810              re_token_type_t type)
3811 {
3812   re_token_t t;
3813   t.type = type;
3814   return create_token_tree (dfa, left, right, &t);
3815 }
3816
3817 static bin_tree_t *
3818 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3819                    const re_token_t *token)
3820 {
3821   bin_tree_t *tree;
3822   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3823     {
3824       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3825
3826       if (storage == NULL)
3827         return NULL;
3828       storage->next = dfa->str_tree_storage;
3829       dfa->str_tree_storage = storage;
3830       dfa->str_tree_storage_idx = 0;
3831     }
3832   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3833
3834   tree->parent = NULL;
3835   tree->left = left;
3836   tree->right = right;
3837   tree->token = *token;
3838   tree->token.duplicated = 0;
3839   tree->token.opt_subexp = 0;
3840   tree->first = NULL;
3841   tree->next = NULL;
3842   tree->node_idx = -1;
3843
3844   if (left != NULL)
3845     left->parent = tree;
3846   if (right != NULL)
3847     right->parent = tree;
3848   return tree;
3849 }
3850
3851 /* Mark the tree SRC as an optional subexpression.
3852    To be called from preorder or postorder.  */
3853
3854 static reg_errcode_t
3855 mark_opt_subexp (void *extra, bin_tree_t *node)
3856 {
3857   int idx = (int) (long) extra;
3858   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3859     node->token.opt_subexp = 1;
3860
3861   return REG_NOERROR;
3862 }
3863
3864 /* Free the allocated memory inside NODE. */
3865
3866 static void
3867 free_token (re_token_t *node)
3868 {
3869 #ifdef RE_ENABLE_I18N
3870   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3871     free_charset (node->opr.mbcset);
3872   else
3873 #endif /* RE_ENABLE_I18N */
3874     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3875       re_free (node->opr.sbcset);
3876 }
3877
3878 /* Worker function for tree walking.  Free the allocated memory inside NODE
3879    and its children. */
3880
3881 static reg_errcode_t
3882 free_tree (void *extra, bin_tree_t *node)
3883 {
3884   free_token (&node->token);
3885   return REG_NOERROR;
3886 }
3887
3888
3889 /* Duplicate the node SRC, and return new node.  This is a preorder
3890    visit similar to the one implemented by the generic visitor, but
3891    we need more infrastructure to maintain two parallel trees --- so,
3892    it's easier to duplicate.  */
3893
3894 static bin_tree_t *
3895 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3896 {
3897   const bin_tree_t *node;
3898   bin_tree_t *dup_root;
3899   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3900
3901   for (node = root; ; )
3902     {
3903       /* Create a new tree and link it back to the current parent.  */
3904       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3905       if (*p_new == NULL)
3906         return NULL;
3907       (*p_new)->parent = dup_node;
3908       (*p_new)->token.duplicated = 1;
3909       dup_node = *p_new;
3910
3911       /* Go to the left node, or up and to the right.  */
3912       if (node->left)
3913         {
3914           node = node->left;
3915           p_new = &dup_node->left;
3916         }
3917       else
3918         {
3919           const bin_tree_t *prev = NULL;
3920           while (node->right == prev || node->right == NULL)
3921             {
3922               prev = node;
3923               node = node->parent;
3924               dup_node = dup_node->parent;
3925               if (!node)
3926                 return dup_root;
3927             }
3928           node = node->right;
3929           p_new = &dup_node->right;
3930         }
3931     }
3932 }