lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2016 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #ifdef _LIBC
  21 # include <locale/weight.h>
  22 #endif
  23
  24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  25                                           size_t length, reg_syntax_t syntax);
  26 static void re_compile_fastmap_iter (regex_t *bufp,
  27                                      const re_dfastate_t *init_state,
  28                                      char *fastmap);
  29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  30 #ifdef RE_ENABLE_I18N
  31 static void free_charset (re_charset_t *cset);
  32 #endif /* RE_ENABLE_I18N */
  33 static void free_workarea_compile (regex_t *preg);
  34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  35 #ifdef RE_ENABLE_I18N
  36 static void optimize_utf8 (re_dfa_t *dfa);
  37 #endif
  38 static reg_errcode_t analyze (regex_t *preg);
  39 static reg_errcode_t preorder (bin_tree_t *root,
  40                                reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                void *extra);
  42 static reg_errcode_t postorder (bin_tree_t *root,
  43                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  44                                 void *extra);
  45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  48                                  bin_tree_t *node);
  49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  54                                    unsigned int constraint);
  55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  57                                          Idx node, bool root);
  58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  59 static Idx fetch_number (re_string_t *input, re_token_t *token,
  60                          reg_syntax_t syntax);
  61 static int peek_token (re_token_t *token, re_string_t *input,
  62                         reg_syntax_t syntax) internal_function;
  63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  64                           reg_syntax_t syntax, reg_errcode_t *err);
  65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  66                                   re_token_t *token, reg_syntax_t syntax,
  67                                   Idx nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  69                                  re_token_t *token, reg_syntax_t syntax,
  70                                  Idx nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  72                                      re_token_t *token, reg_syntax_t syntax,
  73                                      Idx nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  75                                   re_token_t *token, reg_syntax_t syntax,
  76                                   Idx nest, reg_errcode_t *err);
  77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  78                                  re_dfa_t *dfa, re_token_t *token,
  79                                  reg_syntax_t syntax, reg_errcode_t *err);
  80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  81                                       re_token_t *token, reg_syntax_t syntax,
  82                                       reg_errcode_t *err);
  83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  84                                             re_string_t *regexp,
  85                                             re_token_t *token, int token_len,
  86                                             re_dfa_t *dfa,
  87                                             reg_syntax_t syntax,
  88                                             bool accept_hyphen);
  89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  90                                           re_string_t *regexp,
  91                                           re_token_t *token);
  92 #ifdef RE_ENABLE_I18N
  93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  94                                         re_charset_t *mbcset,
  95                                         Idx *equiv_class_alloc,
  96                                         const unsigned char *name);
  97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  98                                       bitset_t sbcset,
  99                                       re_charset_t *mbcset,
 100                                       Idx *char_class_alloc,
 101                                       const char *class_name,
 102                                       reg_syntax_t syntax);
 103 #else  /* not RE_ENABLE_I18N */
 104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 105                                         const unsigned char *name);
 106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 107                                       bitset_t sbcset,
 108                                       const char *class_name,
 109                                       reg_syntax_t syntax);
 110 #endif /* not RE_ENABLE_I18N */
 111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 112                                        RE_TRANSLATE_TYPE trans,
 113                                        const char *class_name,
 114                                        const char *extra,
 115                                        bool non_match, reg_errcode_t *err);
 116 static bin_tree_t *create_tree (re_dfa_t *dfa,
 117                                 bin_tree_t *left, bin_tree_t *right,
 118                                 re_token_type_t type);
 119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 120                                       bin_tree_t *left, bin_tree_t *right,
 121                                       const re_token_t *token);
 122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 123 static void free_token (re_token_t *node);
 124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 126 \f
 127 /* This table gives an error message for each of the error codes listed
 128    in regex.h.  Obviously the order here has to be same as there.
 129    POSIX doesn't require that we do anything for REG_NOERROR,
 130    but why not be nice?  */
 131
 132 static const char __re_error_msgid[] =
 133   {
 134 #define REG_NOERROR_IDX 0
 135     gettext_noop ("Success")    /* REG_NOERROR */
 136     "\0"
 137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 138     gettext_noop ("No match")   /* REG_NOMATCH */
 139     "\0"
 140 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 141     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 142     "\0"
 143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 144     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 145     "\0"
 146 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 147     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 148     "\0"
 149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 150     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 151     "\0"
 152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 153     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 154     "\0"
 155 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 156     gettext_noop ("Unmatched [, [^, [:, [., or [=")     /* REG_EBRACK */
 157     "\0"
 158 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
 159     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 160     "\0"
 161 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 162     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 163     "\0"
 164 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 165     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 166     "\0"
 167 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 168     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 169     "\0"
 170 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 171     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 172     "\0"
 173 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 174     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 175     "\0"
 176 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 177     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 178     "\0"
 179 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 180     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 181     "\0"
 182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 183     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 184   };
 185
 186 static const size_t __re_error_msgid_idx[] =
 187   {
 188     REG_NOERROR_IDX,
 189     REG_NOMATCH_IDX,
 190     REG_BADPAT_IDX,
 191     REG_ECOLLATE_IDX,
 192     REG_ECTYPE_IDX,
 193     REG_EESCAPE_IDX,
 194     REG_ESUBREG_IDX,
 195     REG_EBRACK_IDX,
 196     REG_EPAREN_IDX,
 197     REG_EBRACE_IDX,
 198     REG_BADBR_IDX,
 199     REG_ERANGE_IDX,
 200     REG_ESPACE_IDX,
 201     REG_BADRPT_IDX,
 202     REG_EEND_IDX,
 203     REG_ESIZE_IDX,
 204     REG_ERPAREN_IDX
 205   };
 206 \f
 207 /* Entry points for GNU code.  */
 208
 209 /* re_compile_pattern is the GNU regular expression compiler: it
 210    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 211    Returns 0 if the pattern was valid, otherwise an error string.
 212
 213    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 214    are set in BUFP on entry.  */
 215
 216 const char *
 217 re_compile_pattern (const char *pattern, size_t length,
 218                     struct re_pattern_buffer *bufp)
 219 {
 220   reg_errcode_t ret;
 221
 222   /* And GNU code determines whether or not to get register information
 223      by passing null for the REGS argument to re_match, etc., not by
 224      setting no_sub, unless RE_NO_SUB is set.  */
 225   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 226
 227   /* Match anchors at newline.  */
 228   bufp->newline_anchor = 1;
 229
 230   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 231
 232   if (!ret)
 233     return NULL;
 234   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 235 }
 236 #ifdef _LIBC
 237 weak_alias (__re_compile_pattern, re_compile_pattern)
 238 #endif
 239
 240 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 241    also be assigned to arbitrarily: each pattern buffer stores its own
 242    syntax, so it can be changed between regex compilations.  */
 243 /* This has no initializer because initialized variables in Emacs
 244    become read-only after dumping.  */
 245 reg_syntax_t re_syntax_options;
 246
 247
 248 /* Specify the precise syntax of regexps for compilation.  This provides
 249    for compatibility for various utilities which historically have
 250    different, incompatible syntaxes.
 251
 252    The argument SYNTAX is a bit mask comprised of the various bits
 253    defined in regex.h.  We return the old syntax.  */
 254
 255 reg_syntax_t
 256 re_set_syntax (reg_syntax_t syntax)
 257 {
 258   reg_syntax_t ret = re_syntax_options;
 259
 260   re_syntax_options = syntax;
 261   return ret;
 262 }
 263 #ifdef _LIBC
 264 weak_alias (__re_set_syntax, re_set_syntax)
 265 #endif
 266
 267 int
 268 re_compile_fastmap (struct re_pattern_buffer *bufp)
 269 {
 270   re_dfa_t *dfa = bufp->buffer;
 271   char *fastmap = bufp->fastmap;
 272
 273   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 274   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 275   if (dfa->init_state != dfa->init_state_word)
 276     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 277   if (dfa->init_state != dfa->init_state_nl)
 278     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 279   if (dfa->init_state != dfa->init_state_begbuf)
 280     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 281   bufp->fastmap_accurate = 1;
 282   return 0;
 283 }
 284 #ifdef _LIBC
 285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 286 #endif
 287
 288 static inline void
 289 __attribute__ ((always_inline))
 290 re_set_fastmap (char *fastmap, bool icase, int ch)
 291 {
 292   fastmap[ch] = 1;
 293   if (icase)
 294     fastmap[tolower (ch)] = 1;
 295 }
 296
 297 /* Helper function for re_compile_fastmap.
 298    Compile fastmap for the initial_state INIT_STATE.  */
 299
 300 static void
 301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 302                          char *fastmap)
 303 {
 304   re_dfa_t *dfa = bufp->buffer;
 305   Idx node_cnt;
 306   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 307   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 308     {
 309       Idx node = init_state->nodes.elems[node_cnt];
 310       re_token_type_t type = dfa->nodes[node].type;
 311
 312       if (type == CHARACTER)
 313         {
 314           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 315 #ifdef RE_ENABLE_I18N
 316           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 317             {
 318               unsigned char buf[MB_LEN_MAX];
 319               unsigned char *p;
 320               wchar_t wc;
 321               mbstate_t state;
 322
 323               p = buf;
 324               *p++ = dfa->nodes[node].opr.c;
 325               while (++node < dfa->nodes_len
 326                      && dfa->nodes[node].type == CHARACTER
 327                      && dfa->nodes[node].mb_partial)
 328                 *p++ = dfa->nodes[node].opr.c;
 329               memset (&state, '\0', sizeof (state));
 330               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 331                              &state) == p - buf
 332                   && (__wcrtomb ((char *) buf, __towlower (wc), &state)
 333                       != (size_t) -1))
 334                 re_set_fastmap (fastmap, false, buf[0]);
 335             }
 336 #endif
 337         }
 338       else if (type == SIMPLE_BRACKET)
 339         {
 340           int i, ch;
 341           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 342             {
 343               int j;
 344               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 345               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 346                 if (w & ((bitset_word_t) 1 << j))
 347                   re_set_fastmap (fastmap, icase, ch);
 348             }
 349         }
 350 #ifdef RE_ENABLE_I18N
 351       else if (type == COMPLEX_BRACKET)
 352         {
 353           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 354           Idx i;
 355
 356 # ifdef _LIBC
 357           /* See if we have to try all bytes which start multiple collation
 358              elements.
 359              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 360                   collation element, and don't catch 'b' since 'b' is
 361                   the only collation element which starts from 'b' (and
 362                   it is caught by SIMPLE_BRACKET).  */
 363               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 364                   && (cset->ncoll_syms || cset->nranges))
 365                 {
 366                   const int32_t *table = (const int32_t *)
 367                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 368                   for (i = 0; i < SBC_MAX; ++i)
 369                     if (table[i] < 0)
 370                       re_set_fastmap (fastmap, icase, i);
 371                 }
 372 # endif /* _LIBC */
 373
 374           /* See if we have to start the match at all multibyte characters,
 375              i.e. where we would not find an invalid sequence.  This only
 376              applies to multibyte character sets; for single byte character
 377              sets, the SIMPLE_BRACKET again suffices.  */
 378           if (dfa->mb_cur_max > 1
 379               && (cset->nchar_classes || cset->non_match || cset->nranges
 380 # ifdef _LIBC
 381                   || cset->nequiv_classes
 382 # endif /* _LIBC */
 383                  ))
 384             {
 385               unsigned char c = 0;
 386               do
 387                 {
 388                   mbstate_t mbs;
 389                   memset (&mbs, 0, sizeof (mbs));
 390                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 391                     re_set_fastmap (fastmap, false, (int) c);
 392                 }
 393               while (++c != 0);
 394             }
 395
 396           else
 397             {
 398               /* ... Else catch all bytes which can start the mbchars.  */
 399               for (i = 0; i < cset->nmbchars; ++i)
 400                 {
 401                   char buf[256];
 402                   mbstate_t state;
 403                   memset (&state, '\0', sizeof (state));
 404                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 405                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 406                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 407                     {
 408                       if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
 409                           != (size_t) -1)
 410                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 411                     }
 412                 }
 413             }
 414         }
 415 #endif /* RE_ENABLE_I18N */
 416       else if (type == OP_PERIOD
 417 #ifdef RE_ENABLE_I18N
 418                || type == OP_UTF8_PERIOD
 419 #endif /* RE_ENABLE_I18N */
 420                || type == END_OF_RE)
 421         {
 422           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 423           if (type == END_OF_RE)
 424             bufp->can_be_null = 1;
 425           return;
 426         }
 427     }
 428 }
 429 \f
 430 /* Entry point for POSIX code.  */
 431 /* regcomp takes a regular expression as a string and compiles it.
 432
 433    PREG is a regex_t *.  We do not expect any fields to be initialized,
 434    since POSIX says we shouldn't.  Thus, we set
 435
 436      'buffer' to the compiled pattern;
 437      'used' to the length of the compiled pattern;
 438      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 439        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 440        RE_SYNTAX_POSIX_BASIC;
 441      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 442      'fastmap' to an allocated space for the fastmap;
 443      'fastmap_accurate' to zero;
 444      're_nsub' to the number of subexpressions in PATTERN.
 445
 446    PATTERN is the address of the pattern string.
 447
 448    CFLAGS is a series of bits which affect compilation.
 449
 450      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 451      use POSIX basic syntax.
 452
 453      If REG_NEWLINE is set, then . and [^...] don't match newline.
 454      Also, regexec will try a match beginning after every newline.
 455
 456      If REG_ICASE is set, then we considers upper- and lowercase
 457      versions of letters to be equivalent when matching.
 458
 459      If REG_NOSUB is set, then when PREG is passed to regexec, that
 460      routine will report only success or failure, and nothing about the
 461      registers.
 462
 463    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 464    the return codes and their meanings.)  */
 465
 466 int
 467 regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern, int cflags)
 468 {
 469   reg_errcode_t ret;
 470   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 471                          : RE_SYNTAX_POSIX_BASIC);
 472
 473   preg->buffer = NULL;
 474   preg->allocated = 0;
 475   preg->used = 0;
 476
 477   /* Try to allocate space for the fastmap.  */
 478   preg->fastmap = re_malloc (char, SBC_MAX);
 479   if (BE (preg->fastmap == NULL, 0))
 480     return REG_ESPACE;
 481
 482   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 483
 484   /* If REG_NEWLINE is set, newlines are treated differently.  */
 485   if (cflags & REG_NEWLINE)
 486     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 487       syntax &= ~RE_DOT_NEWLINE;
 488       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 489       /* It also changes the matching behavior.  */
 490       preg->newline_anchor = 1;
 491     }
 492   else
 493     preg->newline_anchor = 0;
 494   preg->no_sub = !!(cflags & REG_NOSUB);
 495   preg->translate = NULL;
 496
 497   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 498
 499   /* POSIX doesn't distinguish between an unmatched open-group and an
 500      unmatched close-group: both are REG_EPAREN.  */
 501   if (ret == REG_ERPAREN)
 502     ret = REG_EPAREN;
 503
 504   /* We have already checked preg->fastmap != NULL.  */
 505   if (BE (ret == REG_NOERROR, 1))
 506     /* Compute the fastmap now, since regexec cannot modify the pattern
 507        buffer.  This function never fails in this implementation.  */
 508     (void) re_compile_fastmap (preg);
 509   else
 510     {
 511       /* Some error occurred while compiling the expression.  */
 512       re_free (preg->fastmap);
 513       preg->fastmap = NULL;
 514     }
 515
 516   return (int) ret;
 517 }
 518 #ifdef _LIBC
 519 weak_alias (__regcomp, regcomp)
 520 #endif
 521
 522 /* Returns a message corresponding to an error code, ERRCODE, returned
 523    from either regcomp or regexec.   We don't use PREG here.  */
 524
 525 size_t
 526 regerror (int errcode, const regex_t *_Restrict_ preg, char *_Restrict_ errbuf,
 527           size_t errbuf_size)
 528 {
 529   const char *msg;
 530   size_t msg_size;
 531
 532   if (BE (errcode < 0
 533           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 534                                / sizeof (__re_error_msgid_idx[0])), 0))
 535     /* Only error codes returned by the rest of the code should be passed
 536        to this routine.  If we are given anything else, or if other regex
 537        code generates an invalid error code, then the program has a bug.
 538        Dump core so we can fix it.  */
 539     abort ();
 540
 541   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 542
 543   msg_size = strlen (msg) + 1; /* Includes the null.  */
 544
 545   if (BE (errbuf_size != 0, 1))
 546     {
 547       size_t cpy_size = msg_size;
 548       if (BE (msg_size > errbuf_size, 0))
 549         {
 550           cpy_size = errbuf_size - 1;
 551           errbuf[cpy_size] = '\0';
 552         }
 553       memcpy (errbuf, msg, cpy_size);
 554     }
 555
 556   return msg_size;
 557 }
 558 #ifdef _LIBC
 559 weak_alias (__regerror, regerror)
 560 #endif
 561
 562
 563 #ifdef RE_ENABLE_I18N
 564 /* This static array is used for the map to single-byte characters when
 565    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 566    it the same all the time.  UTF-8 is the preferred encoding so this is
 567    a worthwhile optimization.  */
 568 static const bitset_t utf8_sb_map =
 569 {
 570   /* Set the first 128 bits.  */
 571 # if defined __GNUC__ && !defined __STRICT_ANSI__
 572   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 573 # else
 574 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
 575 #   error "bitset_word_t is narrower than 32 bits"
 576 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 577   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 578 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 579   BITSET_WORD_MAX, BITSET_WORD_MAX,
 580 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 581   BITSET_WORD_MAX,
 582 #  endif
 583   (BITSET_WORD_MAX
 584    >> (SBC_MAX % BITSET_WORD_BITS == 0
 585        ? 0
 586        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 587 # endif
 588 };
 589 #endif
 590
 591
 592 static void
 593 free_dfa_content (re_dfa_t *dfa)
 594 {
 595   Idx i, j;
 596
 597   if (dfa->nodes)
 598     for (i = 0; i < dfa->nodes_len; ++i)
 599       free_token (dfa->nodes + i);
 600   re_free (dfa->nexts);
 601   for (i = 0; i < dfa->nodes_len; ++i)
 602     {
 603       if (dfa->eclosures != NULL)
 604         re_node_set_free (dfa->eclosures + i);
 605       if (dfa->inveclosures != NULL)
 606         re_node_set_free (dfa->inveclosures + i);
 607       if (dfa->edests != NULL)
 608         re_node_set_free (dfa->edests + i);
 609     }
 610   re_free (dfa->edests);
 611   re_free (dfa->eclosures);
 612   re_free (dfa->inveclosures);
 613   re_free (dfa->nodes);
 614
 615   if (dfa->state_table)
 616     for (i = 0; i <= dfa->state_hash_mask; ++i)
 617       {
 618         struct re_state_table_entry *entry = dfa->state_table + i;
 619         for (j = 0; j < entry->num; ++j)
 620           {
 621             re_dfastate_t *state = entry->array[j];
 622             free_state (state);
 623           }
 624         re_free (entry->array);
 625       }
 626   re_free (dfa->state_table);
 627 #ifdef RE_ENABLE_I18N
 628   if (dfa->sb_char != utf8_sb_map)
 629     re_free (dfa->sb_char);
 630 #endif
 631   re_free (dfa->subexp_map);
 632 #ifdef DEBUG
 633   re_free (dfa->re_str);
 634 #endif
 635
 636   re_free (dfa);
 637 }
 638
 639
 640 /* Free dynamically allocated space used by PREG.  */
 641
 642 void
 643 regfree (regex_t *preg)
 644 {
 645   re_dfa_t *dfa = preg->buffer;
 646   if (BE (dfa != NULL, 1))
 647     {
 648       lock_fini (dfa->lock);
 649       free_dfa_content (dfa);
 650     }
 651   preg->buffer = NULL;
 652   preg->allocated = 0;
 653
 654   re_free (preg->fastmap);
 655   preg->fastmap = NULL;
 656
 657   re_free (preg->translate);
 658   preg->translate = NULL;
 659 }
 660 #ifdef _LIBC
 661 weak_alias (__regfree, regfree)
 662 #endif
 663 \f
 664 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 665    them unless specifically requested.  */
 666
 667 #if defined _REGEX_RE_COMP || defined _LIBC
 668
 669 /* BSD has one and only one pattern buffer.  */
 670 static struct re_pattern_buffer re_comp_buf;
 671
 672 char *
 673 # ifdef _LIBC
 674 /* Make these definitions weak in libc, so POSIX programs can redefine
 675    these names if they don't use our functions, and still use
 676    regcomp/regexec above without link errors.  */
 677 weak_function
 678 # endif
 679 re_comp (const char *s)
 680 {
 681   reg_errcode_t ret;
 682   char *fastmap;
 683
 684   if (!s)
 685     {
 686       if (!re_comp_buf.buffer)
 687         return gettext ("No previous regular expression");
 688       return 0;
 689     }
 690
 691   if (re_comp_buf.buffer)
 692     {
 693       fastmap = re_comp_buf.fastmap;
 694       re_comp_buf.fastmap = NULL;
 695       __regfree (&re_comp_buf);
 696       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 697       re_comp_buf.fastmap = fastmap;
 698     }
 699
 700   if (re_comp_buf.fastmap == NULL)
 701     {
 702       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 703       if (re_comp_buf.fastmap == NULL)
 704         return (char *) gettext (__re_error_msgid
 705                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 706     }
 707
 708   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 709      don't need to initialize the pattern buffer fields which affect it.  */
 710
 711   /* Match anchors at newlines.  */
 712   re_comp_buf.newline_anchor = 1;
 713
 714   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 715
 716   if (!ret)
 717     return NULL;
 718
 719   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
 720   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 721 }
 722
 723 #ifdef _LIBC
 724 libc_freeres_fn (free_mem)
 725 {
 726   __regfree (&re_comp_buf);
 727 }
 728 #endif
 729
 730 #endif /* _REGEX_RE_COMP */
 731 \f
 732 /* Internal entry point.
 733    Compile the regular expression PATTERN, whose length is LENGTH.
 734    SYNTAX indicate regular expression's syntax.  */
 735
 736 static reg_errcode_t
 737 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 738                      reg_syntax_t syntax)
 739 {
 740   reg_errcode_t err = REG_NOERROR;
 741   re_dfa_t *dfa;
 742   re_string_t regexp;
 743
 744   /* Initialize the pattern buffer.  */
 745   preg->fastmap_accurate = 0;
 746   preg->syntax = syntax;
 747   preg->not_bol = preg->not_eol = 0;
 748   preg->used = 0;
 749   preg->re_nsub = 0;
 750   preg->can_be_null = 0;
 751   preg->regs_allocated = REGS_UNALLOCATED;
 752
 753   /* Initialize the dfa.  */
 754   dfa = preg->buffer;
 755   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 756     {
 757       /* If zero allocated, but buffer is non-null, try to realloc
 758          enough space.  This loses if buffer's address is bogus, but
 759          that is the user's responsibility.  If ->buffer is NULL this
 760          is a simple allocation.  */
 761       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 762       if (dfa == NULL)
 763         return REG_ESPACE;
 764       preg->allocated = sizeof (re_dfa_t);
 765       preg->buffer = dfa;
 766     }
 767   preg->used = sizeof (re_dfa_t);
 768
 769   err = init_dfa (dfa, length);
 770   if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
 771     err = REG_ESPACE;
 772   if (BE (err != REG_NOERROR, 0))
 773     {
 774       free_dfa_content (dfa);
 775       preg->buffer = NULL;
 776       preg->allocated = 0;
 777       return err;
 778     }
 779 #ifdef DEBUG
 780   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 781   dfa->re_str = re_malloc (char, length + 1);
 782   strncpy (dfa->re_str, pattern, length + 1);
 783 #endif
 784
 785   err = re_string_construct (&regexp, pattern, length, preg->translate,
 786                              (syntax & RE_ICASE) != 0, dfa);
 787   if (BE (err != REG_NOERROR, 0))
 788     {
 789     re_compile_internal_free_return:
 790       free_workarea_compile (preg);
 791       re_string_destruct (&regexp);
 792       lock_fini (dfa->lock);
 793       free_dfa_content (dfa);
 794       preg->buffer = NULL;
 795       preg->allocated = 0;
 796       return err;
 797     }
 798
 799   /* Parse the regular expression, and build a structure tree.  */
 800   preg->re_nsub = 0;
 801   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 802   if (BE (dfa->str_tree == NULL, 0))
 803     goto re_compile_internal_free_return;
 804
 805   /* Analyze the tree and create the nfa.  */
 806   err = analyze (preg);
 807   if (BE (err != REG_NOERROR, 0))
 808     goto re_compile_internal_free_return;
 809
 810 #ifdef RE_ENABLE_I18N
 811   /* If possible, do searching in single byte encoding to speed things up.  */
 812   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 813     optimize_utf8 (dfa);
 814 #endif
 815
 816   /* Then create the initial state of the dfa.  */
 817   err = create_initial_state (dfa);
 818
 819   /* Release work areas.  */
 820   free_workarea_compile (preg);
 821   re_string_destruct (&regexp);
 822
 823   if (BE (err != REG_NOERROR, 0))
 824     {
 825       lock_fini (dfa->lock);
 826       free_dfa_content (dfa);
 827       preg->buffer = NULL;
 828       preg->allocated = 0;
 829     }
 830
 831   return err;
 832 }
 833
 834 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 835    as the initial length of some arrays.  */
 836
 837 static reg_errcode_t
 838 init_dfa (re_dfa_t *dfa, size_t pat_len)
 839 {
 840   __re_size_t table_size;
 841 #ifndef _LIBC
 842   const char *codeset_name;
 843 #endif
 844 #ifdef RE_ENABLE_I18N
 845   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 846 #else
 847   size_t max_i18n_object_size = 0;
 848 #endif
 849   size_t max_object_size =
 850     MAX (sizeof (struct re_state_table_entry),
 851          MAX (sizeof (re_token_t),
 852               MAX (sizeof (re_node_set),
 853                    MAX (sizeof (regmatch_t),
 854                         max_i18n_object_size))));
 855
 856   memset (dfa, '\0', sizeof (re_dfa_t));
 857
 858   /* Force allocation of str_tree_storage the first time.  */
 859   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 860
 861   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 862      calculation below, and for similar doubling calculations
 863      elsewhere.  And it's <= rather than <, because some of the
 864      doubling calculations add 1 afterwards.  */
 865   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
 866     return REG_ESPACE;
 867
 868   dfa->nodes_alloc = pat_len + 1;
 869   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 870
 871   /*  table_size = 2 ^ ceil(log pat_len) */
 872   for (table_size = 1; ; table_size <<= 1)
 873     if (table_size > pat_len)
 874       break;
 875
 876   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 877   dfa->state_hash_mask = table_size - 1;
 878
 879   dfa->mb_cur_max = MB_CUR_MAX;
 880 #ifdef _LIBC
 881   if (dfa->mb_cur_max == 6
 882       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 883     dfa->is_utf8 = 1;
 884   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 885                        != 0);
 886 #else
 887   codeset_name = nl_langinfo (CODESET);
 888   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
 889       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 890       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 891       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
 892     dfa->is_utf8 = 1;
 893
 894   /* We check exhaustively in the loop below if this charset is a
 895      superset of ASCII.  */
 896   dfa->map_notascii = 0;
 897 #endif
 898
 899 #ifdef RE_ENABLE_I18N
 900   if (dfa->mb_cur_max > 1)
 901     {
 902       if (dfa->is_utf8)
 903         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 904       else
 905         {
 906           int i, j, ch;
 907
 908           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 909           if (BE (dfa->sb_char == NULL, 0))
 910             return REG_ESPACE;
 911
 912           /* Set the bits corresponding to single byte chars.  */
 913           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 914             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 915               {
 916                 wint_t wch = __btowc (ch);
 917                 if (wch != WEOF)
 918                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 919 # ifndef _LIBC
 920                 if (isascii (ch) && wch != ch)
 921                   dfa->map_notascii = 1;
 922 # endif
 923               }
 924         }
 925     }
 926 #endif
 927
 928   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 929     return REG_ESPACE;
 930   return REG_NOERROR;
 931 }
 932
 933 /* Initialize WORD_CHAR table, which indicate which character is
 934    "word".  In this case "word" means that it is the word construction
 935    character used by some operators like "\<", "\>", etc.  */
 936
 937 static void
 938 internal_function
 939 init_word_char (re_dfa_t *dfa)
 940 {
 941   int i = 0;
 942   int j;
 943   int ch = 0;
 944   dfa->word_ops_used = 1;
 945   if (BE (dfa->map_notascii == 0, 1))
 946     {
 947       bitset_word_t bits0 = 0x00000000;
 948       bitset_word_t bits1 = 0x03ff0000;
 949       bitset_word_t bits2 = 0x87fffffe;
 950       bitset_word_t bits3 = 0x07fffffe;
 951       if (BITSET_WORD_BITS == 64)
 952         {
 953           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 954           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 955           i = 2;
 956         }
 957       else if (BITSET_WORD_BITS == 32)
 958         {
 959           dfa->word_char[0] = bits0;
 960           dfa->word_char[1] = bits1;
 961           dfa->word_char[2] = bits2;
 962           dfa->word_char[3] = bits3;
 963           i = 4;
 964         }
 965       else
 966         goto general_case;
 967       ch = 128;
 968
 969       if (BE (dfa->is_utf8, 1))
 970         {
 971           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 972           return;
 973         }
 974     }
 975
 976  general_case:
 977   for (; i < BITSET_WORDS; ++i)
 978     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 979       if (isalnum (ch) || ch == '_')
 980         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 981 }
 982
 983 /* Free the work area which are only used while compiling.  */
 984
 985 static void
 986 free_workarea_compile (regex_t *preg)
 987 {
 988   re_dfa_t *dfa = preg->buffer;
 989   bin_tree_storage_t *storage, *next;
 990   for (storage = dfa->str_tree_storage; storage; storage = next)
 991     {
 992       next = storage->next;
 993       re_free (storage);
 994     }
 995   dfa->str_tree_storage = NULL;
 996   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 997   dfa->str_tree = NULL;
 998   re_free (dfa->org_indices);
 999   dfa->org_indices = NULL;
1000 }
1001
1002 /* Create initial states for all contexts.  */
1003
1004 static reg_errcode_t
1005 create_initial_state (re_dfa_t *dfa)
1006 {
1007   Idx first, i;
1008   reg_errcode_t err;
1009   re_node_set init_nodes;
1010
1011   /* Initial states have the epsilon closure of the node which is
1012      the first node of the regular expression.  */
1013   first = dfa->str_tree->first->node_idx;
1014   dfa->init_node = first;
1015   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1016   if (BE (err != REG_NOERROR, 0))
1017     return err;
1018
1019   /* The back-references which are in initial states can epsilon transit,
1020      since in this case all of the subexpressions can be null.
1021      Then we add epsilon closures of the nodes which are the next nodes of
1022      the back-references.  */
1023   if (dfa->nbackref > 0)
1024     for (i = 0; i < init_nodes.nelem; ++i)
1025       {
1026         Idx node_idx = init_nodes.elems[i];
1027         re_token_type_t type = dfa->nodes[node_idx].type;
1028
1029         Idx clexp_idx;
1030         if (type != OP_BACK_REF)
1031           continue;
1032         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1033           {
1034             re_token_t *clexp_node;
1035             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1036             if (clexp_node->type == OP_CLOSE_SUBEXP
1037                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1038               break;
1039           }
1040         if (clexp_idx == init_nodes.nelem)
1041           continue;
1042
1043         if (type == OP_BACK_REF)
1044           {
1045             Idx dest_idx = dfa->edests[node_idx].elems[0];
1046             if (!re_node_set_contains (&init_nodes, dest_idx))
1047               {
1048                 reg_errcode_t merge_err
1049                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1050                 if (merge_err != REG_NOERROR)
1051                   return merge_err;
1052                 i = 0;
1053               }
1054           }
1055       }
1056
1057   /* It must be the first time to invoke acquire_state.  */
1058   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1059   /* We don't check ERR here, since the initial state must not be NULL.  */
1060   if (BE (dfa->init_state == NULL, 0))
1061     return err;
1062   if (dfa->init_state->has_constraint)
1063     {
1064       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1065                                                        CONTEXT_WORD);
1066       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1067                                                      CONTEXT_NEWLINE);
1068       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1069                                                          &init_nodes,
1070                                                          CONTEXT_NEWLINE
1071                                                          | CONTEXT_BEGBUF);
1072       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1073               || dfa->init_state_begbuf == NULL, 0))
1074         return err;
1075     }
1076   else
1077     dfa->init_state_word = dfa->init_state_nl
1078       = dfa->init_state_begbuf = dfa->init_state;
1079
1080   re_node_set_free (&init_nodes);
1081   return REG_NOERROR;
1082 }
1083 \f
1084 #ifdef RE_ENABLE_I18N
1085 /* If it is possible to do searching in single byte encoding instead of UTF-8
1086    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1087    DFA nodes where needed.  */
1088
1089 static void
1090 optimize_utf8 (re_dfa_t *dfa)
1091 {
1092   Idx node;
1093   int i;
1094   bool mb_chars = false;
1095   bool has_period = false;
1096
1097   for (node = 0; node < dfa->nodes_len; ++node)
1098     switch (dfa->nodes[node].type)
1099       {
1100       case CHARACTER:
1101         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1102           mb_chars = true;
1103         break;
1104       case ANCHOR:
1105         switch (dfa->nodes[node].opr.ctx_type)
1106           {
1107           case LINE_FIRST:
1108           case LINE_LAST:
1109           case BUF_FIRST:
1110           case BUF_LAST:
1111             break;
1112           default:
1113             /* Word anchors etc. cannot be handled.  It's okay to test
1114                opr.ctx_type since constraints (for all DFA nodes) are
1115                created by ORing one or more opr.ctx_type values.  */
1116             return;
1117           }
1118         break;
1119       case OP_PERIOD:
1120         has_period = true;
1121         break;
1122       case OP_BACK_REF:
1123       case OP_ALT:
1124       case END_OF_RE:
1125       case OP_DUP_ASTERISK:
1126       case OP_OPEN_SUBEXP:
1127       case OP_CLOSE_SUBEXP:
1128         break;
1129       case COMPLEX_BRACKET:
1130         return;
1131       case SIMPLE_BRACKET:
1132         /* Just double check.  */
1133         {
1134           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1135                         ? 0
1136                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1137           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1138             {
1139               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1140                 return;
1141               rshift = 0;
1142             }
1143         }
1144         break;
1145       default:
1146         abort ();
1147       }
1148
1149   if (mb_chars || has_period)
1150     for (node = 0; node < dfa->nodes_len; ++node)
1151       {
1152         if (dfa->nodes[node].type == CHARACTER
1153             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1154           dfa->nodes[node].mb_partial = 0;
1155         else if (dfa->nodes[node].type == OP_PERIOD)
1156           dfa->nodes[node].type = OP_UTF8_PERIOD;
1157       }
1158
1159   /* The search can be in single byte locale.  */
1160   dfa->mb_cur_max = 1;
1161   dfa->is_utf8 = 0;
1162   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1163 }
1164 #endif
1165 \f
1166 /* Analyze the structure tree, and calculate "first", "next", "edest",
1167    "eclosure", and "inveclosure".  */
1168
1169 static reg_errcode_t
1170 analyze (regex_t *preg)
1171 {
1172   re_dfa_t *dfa = preg->buffer;
1173   reg_errcode_t ret;
1174
1175   /* Allocate arrays.  */
1176   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1177   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1178   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1179   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1180   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1181           || dfa->eclosures == NULL, 0))
1182     return REG_ESPACE;
1183
1184   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1185   if (dfa->subexp_map != NULL)
1186     {
1187       Idx i;
1188       for (i = 0; i < preg->re_nsub; i++)
1189         dfa->subexp_map[i] = i;
1190       preorder (dfa->str_tree, optimize_subexps, dfa);
1191       for (i = 0; i < preg->re_nsub; i++)
1192         if (dfa->subexp_map[i] != i)
1193           break;
1194       if (i == preg->re_nsub)
1195         {
1196           free (dfa->subexp_map);
1197           dfa->subexp_map = NULL;
1198         }
1199     }
1200
1201   ret = postorder (dfa->str_tree, lower_subexps, preg);
1202   if (BE (ret != REG_NOERROR, 0))
1203     return ret;
1204   ret = postorder (dfa->str_tree, calc_first, dfa);
1205   if (BE (ret != REG_NOERROR, 0))
1206     return ret;
1207   preorder (dfa->str_tree, calc_next, dfa);
1208   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1209   if (BE (ret != REG_NOERROR, 0))
1210     return ret;
1211   ret = calc_eclosure (dfa);
1212   if (BE (ret != REG_NOERROR, 0))
1213     return ret;
1214
1215   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1216      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1217   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1218       || dfa->nbackref)
1219     {
1220       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1221       if (BE (dfa->inveclosures == NULL, 0))
1222         return REG_ESPACE;
1223       ret = calc_inveclosure (dfa);
1224     }
1225
1226   return ret;
1227 }
1228
1229 /* Our parse trees are very unbalanced, so we cannot use a stack to
1230    implement parse tree visits.  Instead, we use parent pointers and
1231    some hairy code in these two functions.  */
1232 static reg_errcode_t
1233 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1234            void *extra)
1235 {
1236   bin_tree_t *node, *prev;
1237
1238   for (node = root; ; )
1239     {
1240       /* Descend down the tree, preferably to the left (or to the right
1241          if that's the only child).  */
1242       while (node->left || node->right)
1243         if (node->left)
1244           node = node->left;
1245         else
1246           node = node->right;
1247
1248       do
1249         {
1250           reg_errcode_t err = fn (extra, node);
1251           if (BE (err != REG_NOERROR, 0))
1252             return err;
1253           if (node->parent == NULL)
1254             return REG_NOERROR;
1255           prev = node;
1256           node = node->parent;
1257         }
1258       /* Go up while we have a node that is reached from the right.  */
1259       while (node->right == prev || node->right == NULL);
1260       node = node->right;
1261     }
1262 }
1263
1264 static reg_errcode_t
1265 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1266           void *extra)
1267 {
1268   bin_tree_t *node;
1269
1270   for (node = root; ; )
1271     {
1272       reg_errcode_t err = fn (extra, node);
1273       if (BE (err != REG_NOERROR, 0))
1274         return err;
1275
1276       /* Go to the left node, or up and to the right.  */
1277       if (node->left)
1278         node = node->left;
1279       else
1280         {
1281           bin_tree_t *prev = NULL;
1282           while (node->right == prev || node->right == NULL)
1283             {
1284               prev = node;
1285               node = node->parent;
1286               if (!node)
1287                 return REG_NOERROR;
1288             }
1289           node = node->right;
1290         }
1291     }
1292 }
1293
1294 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1295    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1296    backreferences as well.  Requires a preorder visit.  */
1297 static reg_errcode_t
1298 optimize_subexps (void *extra, bin_tree_t *node)
1299 {
1300   re_dfa_t *dfa = (re_dfa_t *) extra;
1301
1302   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1303     {
1304       int idx = node->token.opr.idx;
1305       node->token.opr.idx = dfa->subexp_map[idx];
1306       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1307     }
1308
1309   else if (node->token.type == SUBEXP
1310            && node->left && node->left->token.type == SUBEXP)
1311     {
1312       Idx other_idx = node->left->token.opr.idx;
1313
1314       node->left = node->left->left;
1315       if (node->left)
1316         node->left->parent = node;
1317
1318       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1319       if (other_idx < BITSET_WORD_BITS)
1320         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1321     }
1322
1323   return REG_NOERROR;
1324 }
1325
1326 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1327    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1328 static reg_errcode_t
1329 lower_subexps (void *extra, bin_tree_t *node)
1330 {
1331   regex_t *preg = (regex_t *) extra;
1332   reg_errcode_t err = REG_NOERROR;
1333
1334   if (node->left && node->left->token.type == SUBEXP)
1335     {
1336       node->left = lower_subexp (&err, preg, node->left);
1337       if (node->left)
1338         node->left->parent = node;
1339     }
1340   if (node->right && node->right->token.type == SUBEXP)
1341     {
1342       node->right = lower_subexp (&err, preg, node->right);
1343       if (node->right)
1344         node->right->parent = node;
1345     }
1346
1347   return err;
1348 }
1349
1350 static bin_tree_t *
1351 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1352 {
1353   re_dfa_t *dfa = preg->buffer;
1354   bin_tree_t *body = node->left;
1355   bin_tree_t *op, *cls, *tree1, *tree;
1356
1357   if (preg->no_sub
1358       /* We do not optimize empty subexpressions, because otherwise we may
1359          have bad CONCAT nodes with NULL children.  This is obviously not
1360          very common, so we do not lose much.  An example that triggers
1361          this case is the sed "script" /\(\)/x.  */
1362       && node->left != NULL
1363       && (node->token.opr.idx >= BITSET_WORD_BITS
1364           || !(dfa->used_bkref_map
1365                & ((bitset_word_t) 1 << node->token.opr.idx))))
1366     return node->left;
1367
1368   /* Convert the SUBEXP node to the concatenation of an
1369      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1370   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1371   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1372   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1373   tree = create_tree (dfa, op, tree1, CONCAT);
1374   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1375     {
1376       *err = REG_ESPACE;
1377       return NULL;
1378     }
1379
1380   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1381   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1382   return tree;
1383 }
1384
1385 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1386    nodes.  Requires a postorder visit.  */
1387 static reg_errcode_t
1388 calc_first (void *extra, bin_tree_t *node)
1389 {
1390   re_dfa_t *dfa = (re_dfa_t *) extra;
1391   if (node->token.type == CONCAT)
1392     {
1393       node->first = node->left->first;
1394       node->node_idx = node->left->node_idx;
1395     }
1396   else
1397     {
1398       node->first = node;
1399       node->node_idx = re_dfa_add_node (dfa, node->token);
1400       if (BE (node->node_idx == -1, 0))
1401         return REG_ESPACE;
1402       if (node->token.type == ANCHOR)
1403         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1404     }
1405   return REG_NOERROR;
1406 }
1407
1408 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1409 static reg_errcode_t
1410 calc_next (void *extra, bin_tree_t *node)
1411 {
1412   switch (node->token.type)
1413     {
1414     case OP_DUP_ASTERISK:
1415       node->left->next = node;
1416       break;
1417     case CONCAT:
1418       node->left->next = node->right->first;
1419       node->right->next = node->next;
1420       break;
1421     default:
1422       if (node->left)
1423         node->left->next = node->next;
1424       if (node->right)
1425         node->right->next = node->next;
1426       break;
1427     }
1428   return REG_NOERROR;
1429 }
1430
1431 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1432 static reg_errcode_t
1433 link_nfa_nodes (void *extra, bin_tree_t *node)
1434 {
1435   re_dfa_t *dfa = (re_dfa_t *) extra;
1436   Idx idx = node->node_idx;
1437   reg_errcode_t err = REG_NOERROR;
1438
1439   switch (node->token.type)
1440     {
1441     case CONCAT:
1442       break;
1443
1444     case END_OF_RE:
1445       assert (node->next == NULL);
1446       break;
1447
1448     case OP_DUP_ASTERISK:
1449     case OP_ALT:
1450       {
1451         Idx left, right;
1452         dfa->has_plural_match = 1;
1453         if (node->left != NULL)
1454           left = node->left->first->node_idx;
1455         else
1456           left = node->next->node_idx;
1457         if (node->right != NULL)
1458           right = node->right->first->node_idx;
1459         else
1460           right = node->next->node_idx;
1461         assert (left > -1);
1462         assert (right > -1);
1463         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1464       }
1465       break;
1466
1467     case ANCHOR:
1468     case OP_OPEN_SUBEXP:
1469     case OP_CLOSE_SUBEXP:
1470       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1471       break;
1472
1473     case OP_BACK_REF:
1474       dfa->nexts[idx] = node->next->node_idx;
1475       if (node->token.type == OP_BACK_REF)
1476         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1477       break;
1478
1479     default:
1480       assert (!IS_EPSILON_NODE (node->token.type));
1481       dfa->nexts[idx] = node->next->node_idx;
1482       break;
1483     }
1484
1485   return err;
1486 }
1487
1488 /* Duplicate the epsilon closure of the node ROOT_NODE.
1489    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1490    to their own constraint.  */
1491
1492 static reg_errcode_t
1493 internal_function
1494 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1495                         Idx root_node, unsigned int init_constraint)
1496 {
1497   Idx org_node, clone_node;
1498   bool ok;
1499   unsigned int constraint = init_constraint;
1500   for (org_node = top_org_node, clone_node = top_clone_node;;)
1501     {
1502       Idx org_dest, clone_dest;
1503       if (dfa->nodes[org_node].type == OP_BACK_REF)
1504         {
1505           /* If the back reference epsilon-transit, its destination must
1506              also have the constraint.  Then duplicate the epsilon closure
1507              of the destination of the back reference, and store it in
1508              edests of the back reference.  */
1509           org_dest = dfa->nexts[org_node];
1510           re_node_set_empty (dfa->edests + clone_node);
1511           clone_dest = duplicate_node (dfa, org_dest, constraint);
1512           if (BE (clone_dest == -1, 0))
1513             return REG_ESPACE;
1514           dfa->nexts[clone_node] = dfa->nexts[org_node];
1515           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1516           if (BE (! ok, 0))
1517             return REG_ESPACE;
1518         }
1519       else if (dfa->edests[org_node].nelem == 0)
1520         {
1521           /* In case of the node can't epsilon-transit, don't duplicate the
1522              destination and store the original destination as the
1523              destination of the node.  */
1524           dfa->nexts[clone_node] = dfa->nexts[org_node];
1525           break;
1526         }
1527       else if (dfa->edests[org_node].nelem == 1)
1528         {
1529           /* In case of the node can epsilon-transit, and it has only one
1530              destination.  */
1531           org_dest = dfa->edests[org_node].elems[0];
1532           re_node_set_empty (dfa->edests + clone_node);
1533           /* If the node is root_node itself, it means the epsilon closure
1534              has a loop.  Then tie it to the destination of the root_node.  */
1535           if (org_node == root_node && clone_node != org_node)
1536             {
1537               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1538               if (BE (! ok, 0))
1539                 return REG_ESPACE;
1540               break;
1541             }
1542           /* In case the node has another constraint, append it.  */
1543           constraint |= dfa->nodes[org_node].constraint;
1544           clone_dest = duplicate_node (dfa, org_dest, constraint);
1545           if (BE (clone_dest == -1, 0))
1546             return REG_ESPACE;
1547           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548           if (BE (! ok, 0))
1549             return REG_ESPACE;
1550         }
1551       else /* dfa->edests[org_node].nelem == 2 */
1552         {
1553           /* In case of the node can epsilon-transit, and it has two
1554              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1555           org_dest = dfa->edests[org_node].elems[0];
1556           re_node_set_empty (dfa->edests + clone_node);
1557           /* Search for a duplicated node which satisfies the constraint.  */
1558           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1559           if (clone_dest == -1)
1560             {
1561               /* There is no such duplicated node, create a new one.  */
1562               reg_errcode_t err;
1563               clone_dest = duplicate_node (dfa, org_dest, constraint);
1564               if (BE (clone_dest == -1, 0))
1565                 return REG_ESPACE;
1566               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1567               if (BE (! ok, 0))
1568                 return REG_ESPACE;
1569               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1570                                             root_node, constraint);
1571               if (BE (err != REG_NOERROR, 0))
1572                 return err;
1573             }
1574           else
1575             {
1576               /* There is a duplicated node which satisfies the constraint,
1577                  use it to avoid infinite loop.  */
1578               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1579               if (BE (! ok, 0))
1580                 return REG_ESPACE;
1581             }
1582
1583           org_dest = dfa->edests[org_node].elems[1];
1584           clone_dest = duplicate_node (dfa, org_dest, constraint);
1585           if (BE (clone_dest == -1, 0))
1586             return REG_ESPACE;
1587           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1588           if (BE (! ok, 0))
1589             return REG_ESPACE;
1590         }
1591       org_node = org_dest;
1592       clone_node = clone_dest;
1593     }
1594   return REG_NOERROR;
1595 }
1596
1597 /* Search for a node which is duplicated from the node ORG_NODE, and
1598    satisfies the constraint CONSTRAINT.  */
1599
1600 static Idx
1601 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1602                         unsigned int constraint)
1603 {
1604   Idx idx;
1605   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1606     {
1607       if (org_node == dfa->org_indices[idx]
1608           && constraint == dfa->nodes[idx].constraint)
1609         return idx; /* Found.  */
1610     }
1611   return -1; /* Not found.  */
1612 }
1613
1614 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1615    Return the index of the new node, or -1 if insufficient storage is
1616    available.  */
1617
1618 static Idx
1619 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1620 {
1621   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1622   if (BE (dup_idx != -1, 1))
1623     {
1624       dfa->nodes[dup_idx].constraint = constraint;
1625       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1626       dfa->nodes[dup_idx].duplicated = 1;
1627
1628       /* Store the index of the original node.  */
1629       dfa->org_indices[dup_idx] = org_idx;
1630     }
1631   return dup_idx;
1632 }
1633
1634 static reg_errcode_t
1635 calc_inveclosure (re_dfa_t *dfa)
1636 {
1637   Idx src, idx;
1638   bool ok;
1639   for (idx = 0; idx < dfa->nodes_len; ++idx)
1640     re_node_set_init_empty (dfa->inveclosures + idx);
1641
1642   for (src = 0; src < dfa->nodes_len; ++src)
1643     {
1644       Idx *elems = dfa->eclosures[src].elems;
1645       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1646         {
1647           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1648           if (BE (! ok, 0))
1649             return REG_ESPACE;
1650         }
1651     }
1652
1653   return REG_NOERROR;
1654 }
1655
1656 /* Calculate "eclosure" for all the node in DFA.  */
1657
1658 static reg_errcode_t
1659 calc_eclosure (re_dfa_t *dfa)
1660 {
1661   Idx node_idx;
1662   bool incomplete;
1663 #ifdef DEBUG
1664   assert (dfa->nodes_len > 0);
1665 #endif
1666   incomplete = false;
1667   /* For each nodes, calculate epsilon closure.  */
1668   for (node_idx = 0; ; ++node_idx)
1669     {
1670       reg_errcode_t err;
1671       re_node_set eclosure_elem;
1672       if (node_idx == dfa->nodes_len)
1673         {
1674           if (!incomplete)
1675             break;
1676           incomplete = false;
1677           node_idx = 0;
1678         }
1679
1680 #ifdef DEBUG
1681       assert (dfa->eclosures[node_idx].nelem != -1);
1682 #endif
1683
1684       /* If we have already calculated, skip it.  */
1685       if (dfa->eclosures[node_idx].nelem != 0)
1686         continue;
1687       /* Calculate epsilon closure of 'node_idx'.  */
1688       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1689       if (BE (err != REG_NOERROR, 0))
1690         return err;
1691
1692       if (dfa->eclosures[node_idx].nelem == 0)
1693         {
1694           incomplete = true;
1695           re_node_set_free (&eclosure_elem);
1696         }
1697     }
1698   return REG_NOERROR;
1699 }
1700
1701 /* Calculate epsilon closure of NODE.  */
1702
1703 static reg_errcode_t
1704 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1705 {
1706   reg_errcode_t err;
1707   Idx i;
1708   re_node_set eclosure;
1709   bool ok;
1710   bool incomplete = false;
1711   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1712   if (BE (err != REG_NOERROR, 0))
1713     return err;
1714
1715   /* This indicates that we are calculating this node now.
1716      We reference this value to avoid infinite loop.  */
1717   dfa->eclosures[node].nelem = -1;
1718
1719   /* If the current node has constraints, duplicate all nodes
1720      since they must inherit the constraints.  */
1721   if (dfa->nodes[node].constraint
1722       && dfa->edests[node].nelem
1723       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1724     {
1725       err = duplicate_node_closure (dfa, node, node, node,
1726                                     dfa->nodes[node].constraint);
1727       if (BE (err != REG_NOERROR, 0))
1728         return err;
1729     }
1730
1731   /* Expand each epsilon destination nodes.  */
1732   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1733     for (i = 0; i < dfa->edests[node].nelem; ++i)
1734       {
1735         re_node_set eclosure_elem;
1736         Idx edest = dfa->edests[node].elems[i];
1737         /* If calculating the epsilon closure of 'edest' is in progress,
1738            return intermediate result.  */
1739         if (dfa->eclosures[edest].nelem == -1)
1740           {
1741             incomplete = true;
1742             continue;
1743           }
1744         /* If we haven't calculated the epsilon closure of 'edest' yet,
1745            calculate now. Otherwise use calculated epsilon closure.  */
1746         if (dfa->eclosures[edest].nelem == 0)
1747           {
1748             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1749             if (BE (err != REG_NOERROR, 0))
1750               return err;
1751           }
1752         else
1753           eclosure_elem = dfa->eclosures[edest];
1754         /* Merge the epsilon closure of 'edest'.  */
1755         err = re_node_set_merge (&eclosure, &eclosure_elem);
1756         if (BE (err != REG_NOERROR, 0))
1757           return err;
1758         /* If the epsilon closure of 'edest' is incomplete,
1759            the epsilon closure of this node is also incomplete.  */
1760         if (dfa->eclosures[edest].nelem == 0)
1761           {
1762             incomplete = true;
1763             re_node_set_free (&eclosure_elem);
1764           }
1765       }
1766
1767   /* An epsilon closure includes itself.  */
1768   ok = re_node_set_insert (&eclosure, node);
1769   if (BE (! ok, 0))
1770     return REG_ESPACE;
1771   if (incomplete && !root)
1772     dfa->eclosures[node].nelem = 0;
1773   else
1774     dfa->eclosures[node] = eclosure;
1775   *new_set = eclosure;
1776   return REG_NOERROR;
1777 }
1778 \f
1779 /* Functions for token which are used in the parser.  */
1780
1781 /* Fetch a token from INPUT.
1782    We must not use this function inside bracket expressions.  */
1783
1784 static void
1785 internal_function
1786 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1787 {
1788   re_string_skip_bytes (input, peek_token (result, input, syntax));
1789 }
1790
1791 /* Peek a token from INPUT, and return the length of the token.
1792    We must not use this function inside bracket expressions.  */
1793
1794 static int
1795 internal_function
1796 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1797 {
1798   unsigned char c;
1799
1800   if (re_string_eoi (input))
1801     {
1802       token->type = END_OF_RE;
1803       return 0;
1804     }
1805
1806   c = re_string_peek_byte (input, 0);
1807   token->opr.c = c;
1808
1809   token->word_char = 0;
1810 #ifdef RE_ENABLE_I18N
1811   token->mb_partial = 0;
1812   if (input->mb_cur_max > 1 &&
1813       !re_string_first_byte (input, re_string_cur_idx (input)))
1814     {
1815       token->type = CHARACTER;
1816       token->mb_partial = 1;
1817       return 1;
1818     }
1819 #endif
1820   if (c == '\\')
1821     {
1822       unsigned char c2;
1823       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1824         {
1825           token->type = BACK_SLASH;
1826           return 1;
1827         }
1828
1829       c2 = re_string_peek_byte_case (input, 1);
1830       token->opr.c = c2;
1831       token->type = CHARACTER;
1832 #ifdef RE_ENABLE_I18N
1833       if (input->mb_cur_max > 1)
1834         {
1835           wint_t wc = re_string_wchar_at (input,
1836                                           re_string_cur_idx (input) + 1);
1837           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1838         }
1839       else
1840 #endif
1841         token->word_char = IS_WORD_CHAR (c2) != 0;
1842
1843       switch (c2)
1844         {
1845         case '|':
1846           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1847             token->type = OP_ALT;
1848           break;
1849         case '1': case '2': case '3': case '4': case '5':
1850         case '6': case '7': case '8': case '9':
1851           if (!(syntax & RE_NO_BK_REFS))
1852             {
1853               token->type = OP_BACK_REF;
1854               token->opr.idx = c2 - '1';
1855             }
1856           break;
1857         case '<':
1858           if (!(syntax & RE_NO_GNU_OPS))
1859             {
1860               token->type = ANCHOR;
1861               token->opr.ctx_type = WORD_FIRST;
1862             }
1863           break;
1864         case '>':
1865           if (!(syntax & RE_NO_GNU_OPS))
1866             {
1867               token->type = ANCHOR;
1868               token->opr.ctx_type = WORD_LAST;
1869             }
1870           break;
1871         case 'b':
1872           if (!(syntax & RE_NO_GNU_OPS))
1873             {
1874               token->type = ANCHOR;
1875               token->opr.ctx_type = WORD_DELIM;
1876             }
1877           break;
1878         case 'B':
1879           if (!(syntax & RE_NO_GNU_OPS))
1880             {
1881               token->type = ANCHOR;
1882               token->opr.ctx_type = NOT_WORD_DELIM;
1883             }
1884           break;
1885         case 'w':
1886           if (!(syntax & RE_NO_GNU_OPS))
1887             token->type = OP_WORD;
1888           break;
1889         case 'W':
1890           if (!(syntax & RE_NO_GNU_OPS))
1891             token->type = OP_NOTWORD;
1892           break;
1893         case 's':
1894           if (!(syntax & RE_NO_GNU_OPS))
1895             token->type = OP_SPACE;
1896           break;
1897         case 'S':
1898           if (!(syntax & RE_NO_GNU_OPS))
1899             token->type = OP_NOTSPACE;
1900           break;
1901         case '`':
1902           if (!(syntax & RE_NO_GNU_OPS))
1903             {
1904               token->type = ANCHOR;
1905               token->opr.ctx_type = BUF_FIRST;
1906             }
1907           break;
1908         case '\'':
1909           if (!(syntax & RE_NO_GNU_OPS))
1910             {
1911               token->type = ANCHOR;
1912               token->opr.ctx_type = BUF_LAST;
1913             }
1914           break;
1915         case '(':
1916           if (!(syntax & RE_NO_BK_PARENS))
1917             token->type = OP_OPEN_SUBEXP;
1918           break;
1919         case ')':
1920           if (!(syntax & RE_NO_BK_PARENS))
1921             token->type = OP_CLOSE_SUBEXP;
1922           break;
1923         case '+':
1924           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1925             token->type = OP_DUP_PLUS;
1926           break;
1927         case '?':
1928           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1929             token->type = OP_DUP_QUESTION;
1930           break;
1931         case '{':
1932           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1933             token->type = OP_OPEN_DUP_NUM;
1934           break;
1935         case '}':
1936           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1937             token->type = OP_CLOSE_DUP_NUM;
1938           break;
1939         default:
1940           break;
1941         }
1942       return 2;
1943     }
1944
1945   token->type = CHARACTER;
1946 #ifdef RE_ENABLE_I18N
1947   if (input->mb_cur_max > 1)
1948     {
1949       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1950       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1951     }
1952   else
1953 #endif
1954     token->word_char = IS_WORD_CHAR (token->opr.c);
1955
1956   switch (c)
1957     {
1958     case '\n':
1959       if (syntax & RE_NEWLINE_ALT)
1960         token->type = OP_ALT;
1961       break;
1962     case '|':
1963       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1964         token->type = OP_ALT;
1965       break;
1966     case '*':
1967       token->type = OP_DUP_ASTERISK;
1968       break;
1969     case '+':
1970       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1971         token->type = OP_DUP_PLUS;
1972       break;
1973     case '?':
1974       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1975         token->type = OP_DUP_QUESTION;
1976       break;
1977     case '{':
1978       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1979         token->type = OP_OPEN_DUP_NUM;
1980       break;
1981     case '}':
1982       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1983         token->type = OP_CLOSE_DUP_NUM;
1984       break;
1985     case '(':
1986       if (syntax & RE_NO_BK_PARENS)
1987         token->type = OP_OPEN_SUBEXP;
1988       break;
1989     case ')':
1990       if (syntax & RE_NO_BK_PARENS)
1991         token->type = OP_CLOSE_SUBEXP;
1992       break;
1993     case '[':
1994       token->type = OP_OPEN_BRACKET;
1995       break;
1996     case '.':
1997       token->type = OP_PERIOD;
1998       break;
1999     case '^':
2000       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2001           re_string_cur_idx (input) != 0)
2002         {
2003           char prev = re_string_peek_byte (input, -1);
2004           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2005             break;
2006         }
2007       token->type = ANCHOR;
2008       token->opr.ctx_type = LINE_FIRST;
2009       break;
2010     case '$':
2011       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2012           re_string_cur_idx (input) + 1 != re_string_length (input))
2013         {
2014           re_token_t next;
2015           re_string_skip_bytes (input, 1);
2016           peek_token (&next, input, syntax);
2017           re_string_skip_bytes (input, -1);
2018           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2019             break;
2020         }
2021       token->type = ANCHOR;
2022       token->opr.ctx_type = LINE_LAST;
2023       break;
2024     default:
2025       break;
2026     }
2027   return 1;
2028 }
2029
2030 /* Peek a token from INPUT, and return the length of the token.
2031    We must not use this function out of bracket expressions.  */
2032
2033 static int
2034 internal_function
2035 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2036 {
2037   unsigned char c;
2038   if (re_string_eoi (input))
2039     {
2040       token->type = END_OF_RE;
2041       return 0;
2042     }
2043   c = re_string_peek_byte (input, 0);
2044   token->opr.c = c;
2045
2046 #ifdef RE_ENABLE_I18N
2047   if (input->mb_cur_max > 1 &&
2048       !re_string_first_byte (input, re_string_cur_idx (input)))
2049     {
2050       token->type = CHARACTER;
2051       return 1;
2052     }
2053 #endif /* RE_ENABLE_I18N */
2054
2055   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2056       && re_string_cur_idx (input) + 1 < re_string_length (input))
2057     {
2058       /* In this case, '\' escape a character.  */
2059       unsigned char c2;
2060       re_string_skip_bytes (input, 1);
2061       c2 = re_string_peek_byte (input, 0);
2062       token->opr.c = c2;
2063       token->type = CHARACTER;
2064       return 1;
2065     }
2066   if (c == '[') /* '[' is a special char in a bracket exps.  */
2067     {
2068       unsigned char c2;
2069       int token_len;
2070       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2071         c2 = re_string_peek_byte (input, 1);
2072       else
2073         c2 = 0;
2074       token->opr.c = c2;
2075       token_len = 2;
2076       switch (c2)
2077         {
2078         case '.':
2079           token->type = OP_OPEN_COLL_ELEM;
2080           break;
2081         case '=':
2082           token->type = OP_OPEN_EQUIV_CLASS;
2083           break;
2084         case ':':
2085           if (syntax & RE_CHAR_CLASSES)
2086             {
2087               token->type = OP_OPEN_CHAR_CLASS;
2088               break;
2089             }
2090           /* else fall through.  */
2091         default:
2092           token->type = CHARACTER;
2093           token->opr.c = c;
2094           token_len = 1;
2095           break;
2096         }
2097       return token_len;
2098     }
2099   switch (c)
2100     {
2101     case '-':
2102       token->type = OP_CHARSET_RANGE;
2103       break;
2104     case ']':
2105       token->type = OP_CLOSE_BRACKET;
2106       break;
2107     case '^':
2108       token->type = OP_NON_MATCH_LIST;
2109       break;
2110     default:
2111       token->type = CHARACTER;
2112     }
2113   return 1;
2114 }
2115 \f
2116 /* Functions for parser.  */
2117
2118 /* Entry point of the parser.
2119    Parse the regular expression REGEXP and return the structure tree.
2120    If an error occurs, ERR is set by error code, and return NULL.
2121    This function build the following tree, from regular expression <reg_exp>:
2122            CAT
2123            / \
2124           /   \
2125    <reg_exp>  EOR
2126
2127    CAT means concatenation.
2128    EOR means end of regular expression.  */
2129
2130 static bin_tree_t *
2131 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2132        reg_errcode_t *err)
2133 {
2134   re_dfa_t *dfa = preg->buffer;
2135   bin_tree_t *tree, *eor, *root;
2136   re_token_t current_token;
2137   dfa->syntax = syntax;
2138   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2139   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2140   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2141     return NULL;
2142   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2143   if (tree != NULL)
2144     root = create_tree (dfa, tree, eor, CONCAT);
2145   else
2146     root = eor;
2147   if (BE (eor == NULL || root == NULL, 0))
2148     {
2149       *err = REG_ESPACE;
2150       return NULL;
2151     }
2152   return root;
2153 }
2154
2155 /* This function build the following tree, from regular expression
2156    <branch1>|<branch2>:
2157            ALT
2158            / \
2159           /   \
2160    <branch1> <branch2>
2161
2162    ALT means alternative, which represents the operator '|'.  */
2163
2164 static bin_tree_t *
2165 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2166                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2167 {
2168   re_dfa_t *dfa = preg->buffer;
2169   bin_tree_t *tree, *branch = NULL;
2170   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2171   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2172   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2173     return NULL;
2174
2175   while (token->type == OP_ALT)
2176     {
2177       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2178       if (token->type != OP_ALT && token->type != END_OF_RE
2179           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2180         {
2181           bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2182           dfa->completed_bkref_map = initial_bkref_map;
2183           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2184           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2185             {
2186               if (tree != NULL)
2187                 postorder (tree, free_tree, NULL);
2188               return NULL;
2189             }
2190           dfa->completed_bkref_map |= accumulated_bkref_map;
2191         }
2192       else
2193         branch = NULL;
2194       tree = create_tree (dfa, tree, branch, OP_ALT);
2195       if (BE (tree == NULL, 0))
2196         {
2197           *err = REG_ESPACE;
2198           return NULL;
2199         }
2200     }
2201   return tree;
2202 }
2203
2204 /* This function build the following tree, from regular expression
2205    <exp1><exp2>:
2206         CAT
2207         / \
2208        /   \
2209    <exp1> <exp2>
2210
2211    CAT means concatenation.  */
2212
2213 static bin_tree_t *
2214 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2215               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2216 {
2217   bin_tree_t *tree, *expr;
2218   re_dfa_t *dfa = preg->buffer;
2219   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2220   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2221     return NULL;
2222
2223   while (token->type != OP_ALT && token->type != END_OF_RE
2224          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2225     {
2226       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2227       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2228         {
2229           if (tree != NULL)
2230             postorder (tree, free_tree, NULL);
2231           return NULL;
2232         }
2233       if (tree != NULL && expr != NULL)
2234         {
2235           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2236           if (newtree == NULL)
2237             {
2238               postorder (expr, free_tree, NULL);
2239               postorder (tree, free_tree, NULL);
2240               *err = REG_ESPACE;
2241               return NULL;
2242             }
2243           tree = newtree;
2244         }
2245       else if (tree == NULL)
2246         tree = expr;
2247       /* Otherwise expr == NULL, we don't need to create new tree.  */
2248     }
2249   return tree;
2250 }
2251
2252 /* This function build the following tree, from regular expression a*:
2253          *
2254          |
2255          a
2256 */
2257
2258 static bin_tree_t *
2259 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2260                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2261 {
2262   re_dfa_t *dfa = preg->buffer;
2263   bin_tree_t *tree;
2264   switch (token->type)
2265     {
2266     case CHARACTER:
2267       tree = create_token_tree (dfa, NULL, NULL, token);
2268       if (BE (tree == NULL, 0))
2269         {
2270           *err = REG_ESPACE;
2271           return NULL;
2272         }
2273 #ifdef RE_ENABLE_I18N
2274       if (dfa->mb_cur_max > 1)
2275         {
2276           while (!re_string_eoi (regexp)
2277                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2278             {
2279               bin_tree_t *mbc_remain;
2280               fetch_token (token, regexp, syntax);
2281               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2282               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2283               if (BE (mbc_remain == NULL || tree == NULL, 0))
2284                 {
2285                   *err = REG_ESPACE;
2286                   return NULL;
2287                 }
2288             }
2289         }
2290 #endif
2291       break;
2292     case OP_OPEN_SUBEXP:
2293       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2294       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2295         return NULL;
2296       break;
2297     case OP_OPEN_BRACKET:
2298       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2299       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2300         return NULL;
2301       break;
2302     case OP_BACK_REF:
2303       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2304         {
2305           *err = REG_ESUBREG;
2306           return NULL;
2307         }
2308       dfa->used_bkref_map |= 1 << token->opr.idx;
2309       tree = create_token_tree (dfa, NULL, NULL, token);
2310       if (BE (tree == NULL, 0))
2311         {
2312           *err = REG_ESPACE;
2313           return NULL;
2314         }
2315       ++dfa->nbackref;
2316       dfa->has_mb_node = 1;
2317       break;
2318     case OP_OPEN_DUP_NUM:
2319       if (syntax & RE_CONTEXT_INVALID_DUP)
2320         {
2321           *err = REG_BADRPT;
2322           return NULL;
2323         }
2324       /* FALLTHROUGH */
2325     case OP_DUP_ASTERISK:
2326     case OP_DUP_PLUS:
2327     case OP_DUP_QUESTION:
2328       if (syntax & RE_CONTEXT_INVALID_OPS)
2329         {
2330           *err = REG_BADRPT;
2331           return NULL;
2332         }
2333       else if (syntax & RE_CONTEXT_INDEP_OPS)
2334         {
2335           fetch_token (token, regexp, syntax);
2336           return parse_expression (regexp, preg, token, syntax, nest, err);
2337         }
2338       /* else fall through  */
2339     case OP_CLOSE_SUBEXP:
2340       if ((token->type == OP_CLOSE_SUBEXP) &&
2341           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2342         {
2343           *err = REG_ERPAREN;
2344           return NULL;
2345         }
2346       /* else fall through  */
2347     case OP_CLOSE_DUP_NUM:
2348       /* We treat it as a normal character.  */
2349
2350       /* Then we can these characters as normal characters.  */
2351       token->type = CHARACTER;
2352       /* mb_partial and word_char bits should be initialized already
2353          by peek_token.  */
2354       tree = create_token_tree (dfa, NULL, NULL, token);
2355       if (BE (tree == NULL, 0))
2356         {
2357           *err = REG_ESPACE;
2358           return NULL;
2359         }
2360       break;
2361     case ANCHOR:
2362       if ((token->opr.ctx_type
2363            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2364           && dfa->word_ops_used == 0)
2365         init_word_char (dfa);
2366       if (token->opr.ctx_type == WORD_DELIM
2367           || token->opr.ctx_type == NOT_WORD_DELIM)
2368         {
2369           bin_tree_t *tree_first, *tree_last;
2370           if (token->opr.ctx_type == WORD_DELIM)
2371             {
2372               token->opr.ctx_type = WORD_FIRST;
2373               tree_first = create_token_tree (dfa, NULL, NULL, token);
2374               token->opr.ctx_type = WORD_LAST;
2375             }
2376           else
2377             {
2378               token->opr.ctx_type = INSIDE_WORD;
2379               tree_first = create_token_tree (dfa, NULL, NULL, token);
2380               token->opr.ctx_type = INSIDE_NOTWORD;
2381             }
2382           tree_last = create_token_tree (dfa, NULL, NULL, token);
2383           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2384           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2385             {
2386               *err = REG_ESPACE;
2387               return NULL;
2388             }
2389         }
2390       else
2391         {
2392           tree = create_token_tree (dfa, NULL, NULL, token);
2393           if (BE (tree == NULL, 0))
2394             {
2395               *err = REG_ESPACE;
2396               return NULL;
2397             }
2398         }
2399       /* We must return here, since ANCHORs can't be followed
2400          by repetition operators.
2401          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2402              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2403       fetch_token (token, regexp, syntax);
2404       return tree;
2405     case OP_PERIOD:
2406       tree = create_token_tree (dfa, NULL, NULL, token);
2407       if (BE (tree == NULL, 0))
2408         {
2409           *err = REG_ESPACE;
2410           return NULL;
2411         }
2412       if (dfa->mb_cur_max > 1)
2413         dfa->has_mb_node = 1;
2414       break;
2415     case OP_WORD:
2416     case OP_NOTWORD:
2417       tree = build_charclass_op (dfa, regexp->trans,
2418                                  "alnum",
2419                                  "_",
2420                                  token->type == OP_NOTWORD, err);
2421       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2422         return NULL;
2423       break;
2424     case OP_SPACE:
2425     case OP_NOTSPACE:
2426       tree = build_charclass_op (dfa, regexp->trans,
2427                                  "space",
2428                                  "",
2429                                  token->type == OP_NOTSPACE, err);
2430       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2431         return NULL;
2432       break;
2433     case OP_ALT:
2434     case END_OF_RE:
2435       return NULL;
2436     case BACK_SLASH:
2437       *err = REG_EESCAPE;
2438       return NULL;
2439     default:
2440       /* Must not happen?  */
2441 #ifdef DEBUG
2442       assert (0);
2443 #endif
2444       return NULL;
2445     }
2446   fetch_token (token, regexp, syntax);
2447
2448   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2449          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2450     {
2451       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2452                                            syntax, err);
2453       if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2454         {
2455           if (tree != NULL)
2456             postorder (tree, free_tree, NULL);
2457           return NULL;
2458         }
2459       tree = dup_tree;
2460       /* In BRE consecutive duplications are not allowed.  */
2461       if ((syntax & RE_CONTEXT_INVALID_DUP)
2462           && (token->type == OP_DUP_ASTERISK
2463               || token->type == OP_OPEN_DUP_NUM))
2464         {
2465           if (tree != NULL)
2466             postorder (tree, free_tree, NULL);
2467           *err = REG_BADRPT;
2468           return NULL;
2469         }
2470     }
2471
2472   return tree;
2473 }
2474
2475 /* This function build the following tree, from regular expression
2476    (<reg_exp>):
2477          SUBEXP
2478             |
2479         <reg_exp>
2480 */
2481
2482 static bin_tree_t *
2483 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2484                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2485 {
2486   re_dfa_t *dfa = preg->buffer;
2487   bin_tree_t *tree;
2488   size_t cur_nsub;
2489   cur_nsub = preg->re_nsub++;
2490
2491   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2492
2493   /* The subexpression may be a null string.  */
2494   if (token->type == OP_CLOSE_SUBEXP)
2495     tree = NULL;
2496   else
2497     {
2498       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2499       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2500         {
2501           if (tree != NULL)
2502             postorder (tree, free_tree, NULL);
2503           *err = REG_EPAREN;
2504         }
2505       if (BE (*err != REG_NOERROR, 0))
2506         return NULL;
2507     }
2508
2509   if (cur_nsub <= '9' - '1')
2510     dfa->completed_bkref_map |= 1 << cur_nsub;
2511
2512   tree = create_tree (dfa, tree, NULL, SUBEXP);
2513   if (BE (tree == NULL, 0))
2514     {
2515       *err = REG_ESPACE;
2516       return NULL;
2517     }
2518   tree->token.opr.idx = cur_nsub;
2519   return tree;
2520 }
2521
2522 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2523
2524 static bin_tree_t *
2525 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2526               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2527 {
2528   bin_tree_t *tree = NULL, *old_tree = NULL;
2529   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2530   re_token_t start_token = *token;
2531
2532   if (token->type == OP_OPEN_DUP_NUM)
2533     {
2534       end = 0;
2535       start = fetch_number (regexp, token, syntax);
2536       if (start == -1)
2537         {
2538           if (token->type == CHARACTER && token->opr.c == ',')
2539             start = 0; /* We treat "{,m}" as "{0,m}".  */
2540           else
2541             {
2542               *err = REG_BADBR; /* <re>{} is invalid.  */
2543               return NULL;
2544             }
2545         }
2546       if (BE (start != -2, 1))
2547         {
2548           /* We treat "{n}" as "{n,n}".  */
2549           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2550                  : ((token->type == CHARACTER && token->opr.c == ',')
2551                     ? fetch_number (regexp, token, syntax) : -2));
2552         }
2553       if (BE (start == -2 || end == -2, 0))
2554         {
2555           /* Invalid sequence.  */
2556           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2557             {
2558               if (token->type == END_OF_RE)
2559                 *err = REG_EBRACE;
2560               else
2561                 *err = REG_BADBR;
2562
2563               return NULL;
2564             }
2565
2566           /* If the syntax bit is set, rollback.  */
2567           re_string_set_index (regexp, start_idx);
2568           *token = start_token;
2569           token->type = CHARACTER;
2570           /* mb_partial and word_char bits should be already initialized by
2571              peek_token.  */
2572           return elem;
2573         }
2574
2575       if (BE ((end != -1 && start > end)
2576               || token->type != OP_CLOSE_DUP_NUM, 0))
2577         {
2578           /* First number greater than second.  */
2579           *err = REG_BADBR;
2580           return NULL;
2581         }
2582
2583       if (BE (RE_DUP_MAX < (end == -1 ? start : end), 0))
2584         {
2585           *err = REG_ESIZE;
2586           return NULL;
2587         }
2588     }
2589   else
2590     {
2591       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2592       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2593     }
2594
2595   fetch_token (token, regexp, syntax);
2596
2597   if (BE (elem == NULL, 0))
2598     return NULL;
2599   if (BE (start == 0 && end == 0, 0))
2600     {
2601       postorder (elem, free_tree, NULL);
2602       return NULL;
2603     }
2604
2605   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2606   if (BE (start > 0, 0))
2607     {
2608       tree = elem;
2609       for (i = 2; i <= start; ++i)
2610         {
2611           elem = duplicate_tree (elem, dfa);
2612           tree = create_tree (dfa, tree, elem, CONCAT);
2613           if (BE (elem == NULL || tree == NULL, 0))
2614             goto parse_dup_op_espace;
2615         }
2616
2617       if (start == end)
2618         return tree;
2619
2620       /* Duplicate ELEM before it is marked optional.  */
2621       elem = duplicate_tree (elem, dfa);
2622       if (BE (elem == NULL, 0))
2623         goto parse_dup_op_espace;
2624       old_tree = tree;
2625     }
2626   else
2627     old_tree = NULL;
2628
2629   if (elem->token.type == SUBEXP)
2630     {
2631       uintptr_t subidx = elem->token.opr.idx;
2632       postorder (elem, mark_opt_subexp, (void *) subidx);
2633     }
2634
2635   tree = create_tree (dfa, elem, NULL,
2636                       (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2637   if (BE (tree == NULL, 0))
2638     goto parse_dup_op_espace;
2639
2640 /* From gnulib's "intprops.h":
2641    True if the arithmetic type T is signed.  */
2642 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2643
2644   /* This loop is actually executed only when end != -1,
2645      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2646      already created the start+1-th copy.  */
2647   if (TYPE_SIGNED (Idx) || end != -1)
2648     for (i = start + 2; i <= end; ++i)
2649       {
2650         elem = duplicate_tree (elem, dfa);
2651         tree = create_tree (dfa, tree, elem, CONCAT);
2652         if (BE (elem == NULL || tree == NULL, 0))
2653           goto parse_dup_op_espace;
2654
2655         tree = create_tree (dfa, tree, NULL, OP_ALT);
2656         if (BE (tree == NULL, 0))
2657           goto parse_dup_op_espace;
2658       }
2659
2660   if (old_tree)
2661     tree = create_tree (dfa, old_tree, tree, CONCAT);
2662
2663   return tree;
2664
2665  parse_dup_op_espace:
2666   *err = REG_ESPACE;
2667   return NULL;
2668 }
2669
2670 /* Size of the names for collating symbol/equivalence_class/character_class.
2671    I'm not sure, but maybe enough.  */
2672 #define BRACKET_NAME_BUF_SIZE 32
2673
2674 #ifndef _LIBC
2675
2676 # ifdef RE_ENABLE_I18N
2677 /* Convert the byte B to the corresponding wide character.  In a
2678    unibyte locale, treat B as itself if it is an encoding error.
2679    In a multibyte locale, return WEOF if B is an encoding error.  */
2680 static wint_t
2681 parse_byte (unsigned char b, re_charset_t *mbcset)
2682 {
2683   wint_t wc = __btowc (b);
2684   return wc == WEOF && !mbcset ? b : wc;
2685 }
2686 #endif
2687
2688   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2689      Build the range expression which starts from START_ELEM, and ends
2690      at END_ELEM.  The result are written to MBCSET and SBCSET.
2691      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2692      mbcset->range_ends, is a pointer argument since we may
2693      update it.  */
2694
2695 static reg_errcode_t
2696 internal_function
2697 # ifdef RE_ENABLE_I18N
2698 build_range_exp (const reg_syntax_t syntax,
2699                  bitset_t sbcset,
2700                  re_charset_t *mbcset,
2701                  Idx *range_alloc,
2702                  const bracket_elem_t *start_elem,
2703                  const bracket_elem_t *end_elem)
2704 # else /* not RE_ENABLE_I18N */
2705 build_range_exp (const reg_syntax_t syntax,
2706                  bitset_t sbcset,
2707                  const bracket_elem_t *start_elem,
2708                  const bracket_elem_t *end_elem)
2709 # endif /* not RE_ENABLE_I18N */
2710 {
2711   unsigned int start_ch, end_ch;
2712   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2713   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2714           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2715           0))
2716     return REG_ERANGE;
2717
2718   /* We can handle no multi character collating elements without libc
2719      support.  */
2720   if (BE ((start_elem->type == COLL_SYM
2721            && strlen ((char *) start_elem->opr.name) > 1)
2722           || (end_elem->type == COLL_SYM
2723               && strlen ((char *) end_elem->opr.name) > 1), 0))
2724     return REG_ECOLLATE;
2725
2726 # ifdef RE_ENABLE_I18N
2727   {
2728     wchar_t wc;
2729     wint_t start_wc;
2730     wint_t end_wc;
2731
2732     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2733                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2734                    : 0));
2735     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2736               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2737                  : 0));
2738     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2739                 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2740     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2741               ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2742     if (start_wc == WEOF || end_wc == WEOF)
2743       return REG_ECOLLATE;
2744     else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2745       return REG_ERANGE;
2746
2747     /* Got valid collation sequence values, add them as a new entry.
2748        However, for !_LIBC we have no collation elements: if the
2749        character set is single byte, the single byte character set
2750        that we build below suffices.  parse_bracket_exp passes
2751        no MBCSET if dfa->mb_cur_max == 1.  */
2752     if (mbcset)
2753       {
2754         /* Check the space of the arrays.  */
2755         if (BE (*range_alloc == mbcset->nranges, 0))
2756           {
2757             /* There is not enough space, need realloc.  */
2758             wchar_t *new_array_start, *new_array_end;
2759             Idx new_nranges;
2760
2761             /* +1 in case of mbcset->nranges is 0.  */
2762             new_nranges = 2 * mbcset->nranges + 1;
2763             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2764                are NULL if *range_alloc == 0.  */
2765             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2766                                           new_nranges);
2767             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2768                                         new_nranges);
2769
2770             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2771               {
2772                 re_free (new_array_start);
2773                 re_free (new_array_end);
2774                 return REG_ESPACE;
2775               }
2776
2777             mbcset->range_starts = new_array_start;
2778             mbcset->range_ends = new_array_end;
2779             *range_alloc = new_nranges;
2780           }
2781
2782         mbcset->range_starts[mbcset->nranges] = start_wc;
2783         mbcset->range_ends[mbcset->nranges++] = end_wc;
2784       }
2785
2786     /* Build the table for single byte characters.  */
2787     for (wc = 0; wc < SBC_MAX; ++wc)
2788       {
2789         if (start_wc <= wc && wc <= end_wc)
2790           bitset_set (sbcset, wc);
2791       }
2792   }
2793 # else /* not RE_ENABLE_I18N */
2794   {
2795     unsigned int ch;
2796     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2797                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2798                    : 0));
2799     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2800               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2801                  : 0));
2802     if (start_ch > end_ch)
2803       return REG_ERANGE;
2804     /* Build the table for single byte characters.  */
2805     for (ch = 0; ch < SBC_MAX; ++ch)
2806       if (start_ch <= ch  && ch <= end_ch)
2807         bitset_set (sbcset, ch);
2808   }
2809 # endif /* not RE_ENABLE_I18N */
2810   return REG_NOERROR;
2811 }
2812 #endif /* not _LIBC */
2813
2814 #ifndef _LIBC
2815 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2816    Build the collating element which is represented by NAME.
2817    The result are written to MBCSET and SBCSET.
2818    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2819    pointer argument since we may update it.  */
2820
2821 static reg_errcode_t
2822 internal_function
2823 # ifdef RE_ENABLE_I18N
2824 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2825                         Idx *coll_sym_alloc, const unsigned char *name)
2826 # else /* not RE_ENABLE_I18N */
2827 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2828 # endif /* not RE_ENABLE_I18N */
2829 {
2830   size_t name_len = strlen ((const char *) name);
2831   if (BE (name_len != 1, 0))
2832     return REG_ECOLLATE;
2833   else
2834     {
2835       bitset_set (sbcset, name[0]);
2836       return REG_NOERROR;
2837     }
2838 }
2839 #endif /* not _LIBC */
2840
2841 /* This function parse bracket expression like "[abc]", "[a-c]",
2842    "[[.a-a.]]" etc.  */
2843
2844 static bin_tree_t *
2845 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2846                    reg_syntax_t syntax, reg_errcode_t *err)
2847 {
2848 #ifdef _LIBC
2849   const unsigned char *collseqmb;
2850   const char *collseqwc;
2851   uint32_t nrules;
2852   int32_t table_size;
2853   const int32_t *symb_table;
2854   const unsigned char *extra;
2855
2856   /* Local function for parse_bracket_exp used in _LIBC environment.
2857      Seek the collating symbol entry corresponding to NAME.
2858      Return the index of the symbol in the SYMB_TABLE,
2859      or -1 if not found.  */
2860
2861   auto inline int32_t
2862   __attribute__ ((always_inline))
2863   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2864     {
2865       int32_t elem;
2866
2867       for (elem = 0; elem < table_size; elem++)
2868         if (symb_table[2 * elem] != 0)
2869           {
2870             int32_t idx = symb_table[2 * elem + 1];
2871             /* Skip the name of collating element name.  */
2872             idx += 1 + extra[idx];
2873             if (/* Compare the length of the name.  */
2874                 name_len == extra[idx]
2875                 /* Compare the name.  */
2876                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2877               /* Yep, this is the entry.  */
2878               return elem;
2879           }
2880       return -1;
2881     }
2882
2883   /* Local function for parse_bracket_exp used in _LIBC environment.
2884      Look up the collation sequence value of BR_ELEM.
2885      Return the value if succeeded, UINT_MAX otherwise.  */
2886
2887   auto inline unsigned int
2888   __attribute__ ((always_inline))
2889   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2890     {
2891       if (br_elem->type == SB_CHAR)
2892         {
2893           /*
2894           if (MB_CUR_MAX == 1)
2895           */
2896           if (nrules == 0)
2897             return collseqmb[br_elem->opr.ch];
2898           else
2899             {
2900               wint_t wc = __btowc (br_elem->opr.ch);
2901               return __collseq_table_lookup (collseqwc, wc);
2902             }
2903         }
2904       else if (br_elem->type == MB_CHAR)
2905         {
2906           if (nrules != 0)
2907             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2908         }
2909       else if (br_elem->type == COLL_SYM)
2910         {
2911           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2912           if (nrules != 0)
2913             {
2914               int32_t elem, idx;
2915               elem = seek_collating_symbol_entry (br_elem->opr.name,
2916                                                   sym_name_len);
2917               if (elem != -1)
2918                 {
2919                   /* We found the entry.  */
2920                   idx = symb_table[2 * elem + 1];
2921                   /* Skip the name of collating element name.  */
2922                   idx += 1 + extra[idx];
2923                   /* Skip the byte sequence of the collating element.  */
2924                   idx += 1 + extra[idx];
2925                   /* Adjust for the alignment.  */
2926                   idx = (idx + 3) & ~3;
2927                   /* Skip the multibyte collation sequence value.  */
2928                   idx += sizeof (unsigned int);
2929                   /* Skip the wide char sequence of the collating element.  */
2930                   idx += sizeof (unsigned int) *
2931                     (1 + *(unsigned int *) (extra + idx));
2932                   /* Return the collation sequence value.  */
2933                   return *(unsigned int *) (extra + idx);
2934                 }
2935               else if (sym_name_len == 1)
2936                 {
2937                   /* No valid character.  Match it as a single byte
2938                      character.  */
2939                   return collseqmb[br_elem->opr.name[0]];
2940                 }
2941             }
2942           else if (sym_name_len == 1)
2943             return collseqmb[br_elem->opr.name[0]];
2944         }
2945       return UINT_MAX;
2946     }
2947
2948   /* Local function for parse_bracket_exp used in _LIBC environment.
2949      Build the range expression which starts from START_ELEM, and ends
2950      at END_ELEM.  The result are written to MBCSET and SBCSET.
2951      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2952      mbcset->range_ends, is a pointer argument since we may
2953      update it.  */
2954
2955   auto inline reg_errcode_t
2956   __attribute__ ((always_inline))
2957   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2958                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2959     {
2960       unsigned int ch;
2961       uint32_t start_collseq;
2962       uint32_t end_collseq;
2963
2964       /* Equivalence Classes and Character Classes can't be a range
2965          start/end.  */
2966       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2967               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2968               0))
2969         return REG_ERANGE;
2970
2971       /* FIXME: Implement rational ranges here, too.  */
2972       start_collseq = lookup_collation_sequence_value (start_elem);
2973       end_collseq = lookup_collation_sequence_value (end_elem);
2974       /* Check start/end collation sequence values.  */
2975       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2976         return REG_ECOLLATE;
2977       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2978         return REG_ERANGE;
2979
2980       /* Got valid collation sequence values, add them as a new entry.
2981          However, if we have no collation elements, and the character set
2982          is single byte, the single byte character set that we
2983          build below suffices. */
2984       if (nrules > 0 || dfa->mb_cur_max > 1)
2985         {
2986           /* Check the space of the arrays.  */
2987           if (BE (*range_alloc == mbcset->nranges, 0))
2988             {
2989               /* There is not enough space, need realloc.  */
2990               uint32_t *new_array_start;
2991               uint32_t *new_array_end;
2992               Idx new_nranges;
2993
2994               /* +1 in case of mbcset->nranges is 0.  */
2995               new_nranges = 2 * mbcset->nranges + 1;
2996               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2997                                             new_nranges);
2998               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2999                                           new_nranges);
3000
3001               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3002                 return REG_ESPACE;
3003
3004               mbcset->range_starts = new_array_start;
3005               mbcset->range_ends = new_array_end;
3006               *range_alloc = new_nranges;
3007             }
3008
3009           mbcset->range_starts[mbcset->nranges] = start_collseq;
3010           mbcset->range_ends[mbcset->nranges++] = end_collseq;
3011         }
3012
3013       /* Build the table for single byte characters.  */
3014       for (ch = 0; ch < SBC_MAX; ch++)
3015         {
3016           uint32_t ch_collseq;
3017           /*
3018           if (MB_CUR_MAX == 1)
3019           */
3020           if (nrules == 0)
3021             ch_collseq = collseqmb[ch];
3022           else
3023             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3024           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3025             bitset_set (sbcset, ch);
3026         }
3027       return REG_NOERROR;
3028     }
3029
3030   /* Local function for parse_bracket_exp used in _LIBC environment.
3031      Build the collating element which is represented by NAME.
3032      The result are written to MBCSET and SBCSET.
3033      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3034      pointer argument since we may update it.  */
3035
3036   auto inline reg_errcode_t
3037   __attribute__ ((always_inline))
3038   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3039                           Idx *coll_sym_alloc, const unsigned char *name)
3040     {
3041       int32_t elem, idx;
3042       size_t name_len = strlen ((const char *) name);
3043       if (nrules != 0)
3044         {
3045           elem = seek_collating_symbol_entry (name, name_len);
3046           if (elem != -1)
3047             {
3048               /* We found the entry.  */
3049               idx = symb_table[2 * elem + 1];
3050               /* Skip the name of collating element name.  */
3051               idx += 1 + extra[idx];
3052             }
3053           else if (name_len == 1)
3054             {
3055               /* No valid character, treat it as a normal
3056                  character.  */
3057               bitset_set (sbcset, name[0]);
3058               return REG_NOERROR;
3059             }
3060           else
3061             return REG_ECOLLATE;
3062
3063           /* Got valid collation sequence, add it as a new entry.  */
3064           /* Check the space of the arrays.  */
3065           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3066             {
3067               /* Not enough, realloc it.  */
3068               /* +1 in case of mbcset->ncoll_syms is 0.  */
3069               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3070               /* Use realloc since mbcset->coll_syms is NULL
3071                  if *alloc == 0.  */
3072               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3073                                                    new_coll_sym_alloc);
3074               if (BE (new_coll_syms == NULL, 0))
3075                 return REG_ESPACE;
3076               mbcset->coll_syms = new_coll_syms;
3077               *coll_sym_alloc = new_coll_sym_alloc;
3078             }
3079           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3080           return REG_NOERROR;
3081         }
3082       else
3083         {
3084           if (BE (name_len != 1, 0))
3085             return REG_ECOLLATE;
3086           else
3087             {
3088               bitset_set (sbcset, name[0]);
3089               return REG_NOERROR;
3090             }
3091         }
3092     }
3093 #endif
3094
3095   re_token_t br_token;
3096   re_bitset_ptr_t sbcset;
3097 #ifdef RE_ENABLE_I18N
3098   re_charset_t *mbcset;
3099   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3100   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3101 #endif /* not RE_ENABLE_I18N */
3102   bool non_match = false;
3103   bin_tree_t *work_tree;
3104   int token_len;
3105   bool first_round = true;
3106 #ifdef _LIBC
3107   collseqmb = (const unsigned char *)
3108     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3109   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3110   if (nrules)
3111     {
3112       /*
3113       if (MB_CUR_MAX > 1)
3114       */
3115       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3116       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3117       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3118                                                   _NL_COLLATE_SYMB_TABLEMB);
3119       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3120                                                    _NL_COLLATE_SYMB_EXTRAMB);
3121     }
3122 #endif
3123   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3124 #ifdef RE_ENABLE_I18N
3125   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3126 #endif /* RE_ENABLE_I18N */
3127 #ifdef RE_ENABLE_I18N
3128   if (BE (sbcset == NULL || mbcset == NULL, 0))
3129 #else
3130   if (BE (sbcset == NULL, 0))
3131 #endif /* RE_ENABLE_I18N */
3132     {
3133       re_free (sbcset);
3134 #ifdef RE_ENABLE_I18N
3135       re_free (mbcset);
3136 #endif
3137       *err = REG_ESPACE;
3138       return NULL;
3139     }
3140
3141   token_len = peek_token_bracket (token, regexp, syntax);
3142   if (BE (token->type == END_OF_RE, 0))
3143     {
3144       *err = REG_BADPAT;
3145       goto parse_bracket_exp_free_return;
3146     }
3147   if (token->type == OP_NON_MATCH_LIST)
3148     {
3149 #ifdef RE_ENABLE_I18N
3150       mbcset->non_match = 1;
3151 #endif /* not RE_ENABLE_I18N */
3152       non_match = true;
3153       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3154         bitset_set (sbcset, '\n');
3155       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3156       token_len = peek_token_bracket (token, regexp, syntax);
3157       if (BE (token->type == END_OF_RE, 0))
3158         {
3159           *err = REG_BADPAT;
3160           goto parse_bracket_exp_free_return;
3161         }
3162     }
3163
3164   /* We treat the first ']' as a normal character.  */
3165   if (token->type == OP_CLOSE_BRACKET)
3166     token->type = CHARACTER;
3167
3168   while (1)
3169     {
3170       bracket_elem_t start_elem, end_elem;
3171       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3172       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3173       reg_errcode_t ret;
3174       int token_len2 = 0;
3175       bool is_range_exp = false;
3176       re_token_t token2;
3177
3178       start_elem.opr.name = start_name_buf;
3179       start_elem.type = COLL_SYM;
3180       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3181                                    syntax, first_round);
3182       if (BE (ret != REG_NOERROR, 0))
3183         {
3184           *err = ret;
3185           goto parse_bracket_exp_free_return;
3186         }
3187       first_round = false;
3188
3189       /* Get information about the next token.  We need it in any case.  */
3190       token_len = peek_token_bracket (token, regexp, syntax);
3191
3192       /* Do not check for ranges if we know they are not allowed.  */
3193       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3194         {
3195           if (BE (token->type == END_OF_RE, 0))
3196             {
3197               *err = REG_EBRACK;
3198               goto parse_bracket_exp_free_return;
3199             }
3200           if (token->type == OP_CHARSET_RANGE)
3201             {
3202               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3203               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3204               if (BE (token2.type == END_OF_RE, 0))
3205                 {
3206                   *err = REG_EBRACK;
3207                   goto parse_bracket_exp_free_return;
3208                 }
3209               if (token2.type == OP_CLOSE_BRACKET)
3210                 {
3211                   /* We treat the last '-' as a normal character.  */
3212                   re_string_skip_bytes (regexp, -token_len);
3213                   token->type = CHARACTER;
3214                 }
3215               else
3216                 is_range_exp = true;
3217             }
3218         }
3219
3220       if (is_range_exp == true)
3221         {
3222           end_elem.opr.name = end_name_buf;
3223           end_elem.type = COLL_SYM;
3224           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3225                                        dfa, syntax, true);
3226           if (BE (ret != REG_NOERROR, 0))
3227             {
3228               *err = ret;
3229               goto parse_bracket_exp_free_return;
3230             }
3231
3232           token_len = peek_token_bracket (token, regexp, syntax);
3233
3234 #ifdef _LIBC
3235           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3236                                   &start_elem, &end_elem);
3237 #else
3238 # ifdef RE_ENABLE_I18N
3239           *err = build_range_exp (syntax, sbcset,
3240                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3241                                   &range_alloc, &start_elem, &end_elem);
3242 # else
3243           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3244 # endif
3245 #endif /* RE_ENABLE_I18N */
3246           if (BE (*err != REG_NOERROR, 0))
3247             goto parse_bracket_exp_free_return;
3248         }
3249       else
3250         {
3251           switch (start_elem.type)
3252             {
3253             case SB_CHAR:
3254               bitset_set (sbcset, start_elem.opr.ch);
3255               break;
3256 #ifdef RE_ENABLE_I18N
3257             case MB_CHAR:
3258               /* Check whether the array has enough space.  */
3259               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3260                 {
3261                   wchar_t *new_mbchars;
3262                   /* Not enough, realloc it.  */
3263                   /* +1 in case of mbcset->nmbchars is 0.  */
3264                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3265                   /* Use realloc since array is NULL if *alloc == 0.  */
3266                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3267                                             mbchar_alloc);
3268                   if (BE (new_mbchars == NULL, 0))
3269                     goto parse_bracket_exp_espace;
3270                   mbcset->mbchars = new_mbchars;
3271                 }
3272               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3273               break;
3274 #endif /* RE_ENABLE_I18N */
3275             case EQUIV_CLASS:
3276               *err = build_equiv_class (sbcset,
3277 #ifdef RE_ENABLE_I18N
3278                                         mbcset, &equiv_class_alloc,
3279 #endif /* RE_ENABLE_I18N */
3280                                         start_elem.opr.name);
3281               if (BE (*err != REG_NOERROR, 0))
3282                 goto parse_bracket_exp_free_return;
3283               break;
3284             case COLL_SYM:
3285               *err = build_collating_symbol (sbcset,
3286 #ifdef RE_ENABLE_I18N
3287                                              mbcset, &coll_sym_alloc,
3288 #endif /* RE_ENABLE_I18N */
3289                                              start_elem.opr.name);
3290               if (BE (*err != REG_NOERROR, 0))
3291                 goto parse_bracket_exp_free_return;
3292               break;
3293             case CHAR_CLASS:
3294               *err = build_charclass (regexp->trans, sbcset,
3295 #ifdef RE_ENABLE_I18N
3296                                       mbcset, &char_class_alloc,
3297 #endif /* RE_ENABLE_I18N */
3298                                       (const char *) start_elem.opr.name,
3299                                       syntax);
3300               if (BE (*err != REG_NOERROR, 0))
3301                goto parse_bracket_exp_free_return;
3302               break;
3303             default:
3304               assert (0);
3305               break;
3306             }
3307         }
3308       if (BE (token->type == END_OF_RE, 0))
3309         {
3310           *err = REG_EBRACK;
3311           goto parse_bracket_exp_free_return;
3312         }
3313       if (token->type == OP_CLOSE_BRACKET)
3314         break;
3315     }
3316
3317   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3318
3319   /* If it is non-matching list.  */
3320   if (non_match)
3321     bitset_not (sbcset);
3322
3323 #ifdef RE_ENABLE_I18N
3324   /* Ensure only single byte characters are set.  */
3325   if (dfa->mb_cur_max > 1)
3326     bitset_mask (sbcset, dfa->sb_char);
3327
3328   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3329       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3330                                                      || mbcset->non_match)))
3331     {
3332       bin_tree_t *mbc_tree;
3333       int sbc_idx;
3334       /* Build a tree for complex bracket.  */
3335       dfa->has_mb_node = 1;
3336       br_token.type = COMPLEX_BRACKET;
3337       br_token.opr.mbcset = mbcset;
3338       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3339       if (BE (mbc_tree == NULL, 0))
3340         goto parse_bracket_exp_espace;
3341       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3342         if (sbcset[sbc_idx])
3343           break;
3344       /* If there are no bits set in sbcset, there is no point
3345          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3346       if (sbc_idx < BITSET_WORDS)
3347         {
3348           /* Build a tree for simple bracket.  */
3349           br_token.type = SIMPLE_BRACKET;
3350           br_token.opr.sbcset = sbcset;
3351           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3352           if (BE (work_tree == NULL, 0))
3353             goto parse_bracket_exp_espace;
3354
3355           /* Then join them by ALT node.  */
3356           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3357           if (BE (work_tree == NULL, 0))
3358             goto parse_bracket_exp_espace;
3359         }
3360       else
3361         {
3362           re_free (sbcset);
3363           work_tree = mbc_tree;
3364         }
3365     }
3366   else
3367 #endif /* not RE_ENABLE_I18N */
3368     {
3369 #ifdef RE_ENABLE_I18N
3370       free_charset (mbcset);
3371 #endif
3372       /* Build a tree for simple bracket.  */
3373       br_token.type = SIMPLE_BRACKET;
3374       br_token.opr.sbcset = sbcset;
3375       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3376       if (BE (work_tree == NULL, 0))
3377         goto parse_bracket_exp_espace;
3378     }
3379   return work_tree;
3380
3381  parse_bracket_exp_espace:
3382   *err = REG_ESPACE;
3383  parse_bracket_exp_free_return:
3384   re_free (sbcset);
3385 #ifdef RE_ENABLE_I18N
3386   free_charset (mbcset);
3387 #endif /* RE_ENABLE_I18N */
3388   return NULL;
3389 }
3390
3391 /* Parse an element in the bracket expression.  */
3392
3393 static reg_errcode_t
3394 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3395                        re_token_t *token, int token_len, re_dfa_t *dfa,
3396                        reg_syntax_t syntax, bool accept_hyphen)
3397 {
3398 #ifdef RE_ENABLE_I18N
3399   int cur_char_size;
3400   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3401   if (cur_char_size > 1)
3402     {
3403       elem->type = MB_CHAR;
3404       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3405       re_string_skip_bytes (regexp, cur_char_size);
3406       return REG_NOERROR;
3407     }
3408 #endif /* RE_ENABLE_I18N */
3409   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3410   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3411       || token->type == OP_OPEN_EQUIV_CLASS)
3412     return parse_bracket_symbol (elem, regexp, token);
3413   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3414     {
3415       /* A '-' must only appear as anything but a range indicator before
3416          the closing bracket.  Everything else is an error.  */
3417       re_token_t token2;
3418       (void) peek_token_bracket (&token2, regexp, syntax);
3419       if (token2.type != OP_CLOSE_BRACKET)
3420         /* The actual error value is not standardized since this whole
3421            case is undefined.  But ERANGE makes good sense.  */
3422         return REG_ERANGE;
3423     }
3424   elem->type = SB_CHAR;
3425   elem->opr.ch = token->opr.c;
3426   return REG_NOERROR;
3427 }
3428
3429 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3430    such as [:<character_class>:], [.<collating_element>.], and
3431    [=<equivalent_class>=].  */
3432
3433 static reg_errcode_t
3434 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3435                       re_token_t *token)
3436 {
3437   unsigned char ch, delim = token->opr.c;
3438   int i = 0;
3439   if (re_string_eoi(regexp))
3440     return REG_EBRACK;
3441   for (;; ++i)
3442     {
3443       if (i >= BRACKET_NAME_BUF_SIZE)
3444         return REG_EBRACK;
3445       if (token->type == OP_OPEN_CHAR_CLASS)
3446         ch = re_string_fetch_byte_case (regexp);
3447       else
3448         ch = re_string_fetch_byte (regexp);
3449       if (re_string_eoi(regexp))
3450         return REG_EBRACK;
3451       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3452         break;
3453       elem->opr.name[i] = ch;
3454     }
3455   re_string_skip_bytes (regexp, 1);
3456   elem->opr.name[i] = '\0';
3457   switch (token->type)
3458     {
3459     case OP_OPEN_COLL_ELEM:
3460       elem->type = COLL_SYM;
3461       break;
3462     case OP_OPEN_EQUIV_CLASS:
3463       elem->type = EQUIV_CLASS;
3464       break;
3465     case OP_OPEN_CHAR_CLASS:
3466       elem->type = CHAR_CLASS;
3467       break;
3468     default:
3469       break;
3470     }
3471   return REG_NOERROR;
3472 }
3473
3474   /* Helper function for parse_bracket_exp.
3475      Build the equivalence class which is represented by NAME.
3476      The result are written to MBCSET and SBCSET.
3477      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3478      is a pointer argument since we may update it.  */
3479
3480 static reg_errcode_t
3481 #ifdef RE_ENABLE_I18N
3482 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3483                    Idx *equiv_class_alloc, const unsigned char *name)
3484 #else /* not RE_ENABLE_I18N */
3485 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3486 #endif /* not RE_ENABLE_I18N */
3487 {
3488 #ifdef _LIBC
3489   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3490   if (nrules != 0)
3491     {
3492       const int32_t *table, *indirect;
3493       const unsigned char *weights, *extra, *cp;
3494       unsigned char char_buf[2];
3495       int32_t idx1, idx2;
3496       unsigned int ch;
3497       size_t len;
3498       /* Calculate the index for equivalence class.  */
3499       cp = name;
3500       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3501       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3502                                                _NL_COLLATE_WEIGHTMB);
3503       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3504                                                    _NL_COLLATE_EXTRAMB);
3505       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3506                                                 _NL_COLLATE_INDIRECTMB);
3507       idx1 = findidx (table, indirect, extra, &cp, -1);
3508       if (BE (idx1 == 0 || *cp != '\0', 0))
3509         /* This isn't a valid character.  */
3510         return REG_ECOLLATE;
3511
3512       /* Build single byte matching table for this equivalence class.  */
3513       len = weights[idx1 & 0xffffff];
3514       for (ch = 0; ch < SBC_MAX; ++ch)
3515         {
3516           char_buf[0] = ch;
3517           cp = char_buf;
3518           idx2 = findidx (table, indirect, extra, &cp, 1);
3519 /*
3520           idx2 = table[ch];
3521 */
3522           if (idx2 == 0)
3523             /* This isn't a valid character.  */
3524             continue;
3525           /* Compare only if the length matches and the collation rule
3526              index is the same.  */
3527           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3528             {
3529               int cnt = 0;
3530
3531               while (cnt <= len &&
3532                      weights[(idx1 & 0xffffff) + 1 + cnt]
3533                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3534                 ++cnt;
3535
3536               if (cnt > len)
3537                 bitset_set (sbcset, ch);
3538             }
3539         }
3540       /* Check whether the array has enough space.  */
3541       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3542         {
3543           /* Not enough, realloc it.  */
3544           /* +1 in case of mbcset->nequiv_classes is 0.  */
3545           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3546           /* Use realloc since the array is NULL if *alloc == 0.  */
3547           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3548                                                    int32_t,
3549                                                    new_equiv_class_alloc);
3550           if (BE (new_equiv_classes == NULL, 0))
3551             return REG_ESPACE;
3552           mbcset->equiv_classes = new_equiv_classes;
3553           *equiv_class_alloc = new_equiv_class_alloc;
3554         }
3555       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3556     }
3557   else
3558 #endif /* _LIBC */
3559     {
3560       if (BE (strlen ((const char *) name) != 1, 0))
3561         return REG_ECOLLATE;
3562       bitset_set (sbcset, *name);
3563     }
3564   return REG_NOERROR;
3565 }
3566
3567   /* Helper function for parse_bracket_exp.
3568      Build the character class which is represented by NAME.
3569      The result are written to MBCSET and SBCSET.
3570      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3571      is a pointer argument since we may update it.  */
3572
3573 static reg_errcode_t
3574 #ifdef RE_ENABLE_I18N
3575 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3576                  re_charset_t *mbcset, Idx *char_class_alloc,
3577                  const char *class_name, reg_syntax_t syntax)
3578 #else /* not RE_ENABLE_I18N */
3579 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3580                  const char *class_name, reg_syntax_t syntax)
3581 #endif /* not RE_ENABLE_I18N */
3582 {
3583   int i;
3584   const char *name = class_name;
3585
3586   /* In case of REG_ICASE "upper" and "lower" match the both of
3587      upper and lower cases.  */
3588   if ((syntax & RE_ICASE)
3589       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3590     name = "alpha";
3591
3592 #ifdef RE_ENABLE_I18N
3593   /* Check the space of the arrays.  */
3594   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3595     {
3596       /* Not enough, realloc it.  */
3597       /* +1 in case of mbcset->nchar_classes is 0.  */
3598       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3599       /* Use realloc since array is NULL if *alloc == 0.  */
3600       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3601                                                new_char_class_alloc);
3602       if (BE (new_char_classes == NULL, 0))
3603         return REG_ESPACE;
3604       mbcset->char_classes = new_char_classes;
3605       *char_class_alloc = new_char_class_alloc;
3606     }
3607   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3608 #endif /* RE_ENABLE_I18N */
3609
3610 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3611   do {                                          \
3612     if (BE (trans != NULL, 0))                  \
3613       {                                         \
3614         for (i = 0; i < SBC_MAX; ++i)           \
3615           if (ctype_func (i))                   \
3616             bitset_set (sbcset, trans[i]);      \
3617       }                                         \
3618     else                                        \
3619       {                                         \
3620         for (i = 0; i < SBC_MAX; ++i)           \
3621           if (ctype_func (i))                   \
3622             bitset_set (sbcset, i);             \
3623       }                                         \
3624   } while (0)
3625
3626   if (strcmp (name, "alnum") == 0)
3627     BUILD_CHARCLASS_LOOP (isalnum);
3628   else if (strcmp (name, "cntrl") == 0)
3629     BUILD_CHARCLASS_LOOP (iscntrl);
3630   else if (strcmp (name, "lower") == 0)
3631     BUILD_CHARCLASS_LOOP (islower);
3632   else if (strcmp (name, "space") == 0)
3633     BUILD_CHARCLASS_LOOP (isspace);
3634   else if (strcmp (name, "alpha") == 0)
3635     BUILD_CHARCLASS_LOOP (isalpha);
3636   else if (strcmp (name, "digit") == 0)
3637     BUILD_CHARCLASS_LOOP (isdigit);
3638   else if (strcmp (name, "print") == 0)
3639     BUILD_CHARCLASS_LOOP (isprint);
3640   else if (strcmp (name, "upper") == 0)
3641     BUILD_CHARCLASS_LOOP (isupper);
3642   else if (strcmp (name, "blank") == 0)
3643     BUILD_CHARCLASS_LOOP (isblank);
3644   else if (strcmp (name, "graph") == 0)
3645     BUILD_CHARCLASS_LOOP (isgraph);
3646   else if (strcmp (name, "punct") == 0)
3647     BUILD_CHARCLASS_LOOP (ispunct);
3648   else if (strcmp (name, "xdigit") == 0)
3649     BUILD_CHARCLASS_LOOP (isxdigit);
3650   else
3651     return REG_ECTYPE;
3652
3653   return REG_NOERROR;
3654 }
3655
3656 static bin_tree_t *
3657 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3658                     const char *class_name,
3659                     const char *extra, bool non_match,
3660                     reg_errcode_t *err)
3661 {
3662   re_bitset_ptr_t sbcset;
3663 #ifdef RE_ENABLE_I18N
3664   re_charset_t *mbcset;
3665   Idx alloc = 0;
3666 #endif /* not RE_ENABLE_I18N */
3667   reg_errcode_t ret;
3668   re_token_t br_token;
3669   bin_tree_t *tree;
3670
3671   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3672   if (BE (sbcset == NULL, 0))
3673     {
3674       *err = REG_ESPACE;
3675       return NULL;
3676     }
3677 #ifdef RE_ENABLE_I18N
3678   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3679   if (BE (mbcset == NULL, 0))
3680     {
3681       re_free (sbcset);
3682       *err = REG_ESPACE;
3683       return NULL;
3684     }
3685   mbcset->non_match = non_match;
3686 #endif /* RE_ENABLE_I18N */
3687
3688   /* We don't care the syntax in this case.  */
3689   ret = build_charclass (trans, sbcset,
3690 #ifdef RE_ENABLE_I18N
3691                          mbcset, &alloc,
3692 #endif /* RE_ENABLE_I18N */
3693                          class_name, 0);
3694
3695   if (BE (ret != REG_NOERROR, 0))
3696     {
3697       re_free (sbcset);
3698 #ifdef RE_ENABLE_I18N
3699       free_charset (mbcset);
3700 #endif /* RE_ENABLE_I18N */
3701       *err = ret;
3702       return NULL;
3703     }
3704   /* \w match '_' also.  */
3705   for (; *extra; extra++)
3706     bitset_set (sbcset, *extra);
3707
3708   /* If it is non-matching list.  */
3709   if (non_match)
3710     bitset_not (sbcset);
3711
3712 #ifdef RE_ENABLE_I18N
3713   /* Ensure only single byte characters are set.  */
3714   if (dfa->mb_cur_max > 1)
3715     bitset_mask (sbcset, dfa->sb_char);
3716 #endif
3717
3718   /* Build a tree for simple bracket.  */
3719 #if defined GCC_LINT || defined lint
3720   memset (&br_token, 0, sizeof br_token);
3721 #endif
3722   br_token.type = SIMPLE_BRACKET;
3723   br_token.opr.sbcset = sbcset;
3724   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3725   if (BE (tree == NULL, 0))
3726     goto build_word_op_espace;
3727
3728 #ifdef RE_ENABLE_I18N
3729   if (dfa->mb_cur_max > 1)
3730     {
3731       bin_tree_t *mbc_tree;
3732       /* Build a tree for complex bracket.  */
3733       br_token.type = COMPLEX_BRACKET;
3734       br_token.opr.mbcset = mbcset;
3735       dfa->has_mb_node = 1;
3736       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3737       if (BE (mbc_tree == NULL, 0))
3738         goto build_word_op_espace;
3739       /* Then join them by ALT node.  */
3740       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3741       if (BE (mbc_tree != NULL, 1))
3742         return tree;
3743     }
3744   else
3745     {
3746       free_charset (mbcset);
3747       return tree;
3748     }
3749 #else /* not RE_ENABLE_I18N */
3750   return tree;
3751 #endif /* not RE_ENABLE_I18N */
3752
3753  build_word_op_espace:
3754   re_free (sbcset);
3755 #ifdef RE_ENABLE_I18N
3756   free_charset (mbcset);
3757 #endif /* RE_ENABLE_I18N */
3758   *err = REG_ESPACE;
3759   return NULL;
3760 }
3761
3762 /* This is intended for the expressions like "a{1,3}".
3763    Fetch a number from 'input', and return the number.
3764    Return -1 if the number field is empty like "{,1}".
3765    Return RE_DUP_MAX + 1 if the number field is too large.
3766    Return -2 if an error occurred.  */
3767
3768 static Idx
3769 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3770 {
3771   Idx num = -1;
3772   unsigned char c;
3773   while (1)
3774     {
3775       fetch_token (token, input, syntax);
3776       c = token->opr.c;
3777       if (BE (token->type == END_OF_RE, 0))
3778         return -2;
3779       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3780         break;
3781       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3782              ? -2
3783              : num == -1
3784              ? c - '0'
3785              : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3786     }
3787   return num;
3788 }
3789 \f
3790 #ifdef RE_ENABLE_I18N
3791 static void
3792 free_charset (re_charset_t *cset)
3793 {
3794   re_free (cset->mbchars);
3795 # ifdef _LIBC
3796   re_free (cset->coll_syms);
3797   re_free (cset->equiv_classes);
3798   re_free (cset->range_starts);
3799   re_free (cset->range_ends);
3800 # endif
3801   re_free (cset->char_classes);
3802   re_free (cset);
3803 }
3804 #endif /* RE_ENABLE_I18N */
3805 \f
3806 /* Functions for binary tree operation.  */
3807
3808 /* Create a tree node.  */
3809
3810 static bin_tree_t *
3811 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3812              re_token_type_t type)
3813 {
3814   re_token_t t;
3815 #if defined GCC_LINT || defined lint
3816   memset (&t, 0, sizeof t);
3817 #endif
3818   t.type = type;
3819   return create_token_tree (dfa, left, right, &t);
3820 }
3821
3822 static bin_tree_t *
3823 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3824                    const re_token_t *token)
3825 {
3826   bin_tree_t *tree;
3827   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3828     {
3829       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3830
3831       if (storage == NULL)
3832         return NULL;
3833       storage->next = dfa->str_tree_storage;
3834       dfa->str_tree_storage = storage;
3835       dfa->str_tree_storage_idx = 0;
3836     }
3837   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3838
3839   tree->parent = NULL;
3840   tree->left = left;
3841   tree->right = right;
3842   tree->token = *token;
3843   tree->token.duplicated = 0;
3844   tree->token.opt_subexp = 0;
3845   tree->first = NULL;
3846   tree->next = NULL;
3847   tree->node_idx = -1;
3848
3849   if (left != NULL)
3850     left->parent = tree;
3851   if (right != NULL)
3852     right->parent = tree;
3853   return tree;
3854 }
3855
3856 /* Mark the tree SRC as an optional subexpression.
3857    To be called from preorder or postorder.  */
3858
3859 static reg_errcode_t
3860 mark_opt_subexp (void *extra, bin_tree_t *node)
3861 {
3862   Idx idx = (uintptr_t) extra;
3863   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3864     node->token.opt_subexp = 1;
3865
3866   return REG_NOERROR;
3867 }
3868
3869 /* Free the allocated memory inside NODE. */
3870
3871 static void
3872 free_token (re_token_t *node)
3873 {
3874 #ifdef RE_ENABLE_I18N
3875   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3876     free_charset (node->opr.mbcset);
3877   else
3878 #endif /* RE_ENABLE_I18N */
3879     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3880       re_free (node->opr.sbcset);
3881 }
3882
3883 /* Worker function for tree walking.  Free the allocated memory inside NODE
3884    and its children. */
3885
3886 static reg_errcode_t
3887 free_tree (void *extra, bin_tree_t *node)
3888 {
3889   free_token (&node->token);
3890   return REG_NOERROR;
3891 }
3892
3893
3894 /* Duplicate the node SRC, and return new node.  This is a preorder
3895    visit similar to the one implemented by the generic visitor, but
3896    we need more infrastructure to maintain two parallel trees --- so,
3897    it's easier to duplicate.  */
3898
3899 static bin_tree_t *
3900 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3901 {
3902   const bin_tree_t *node;
3903   bin_tree_t *dup_root;
3904   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3905
3906   for (node = root; ; )
3907     {
3908       /* Create a new tree and link it back to the current parent.  */
3909       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3910       if (*p_new == NULL)
3911         return NULL;
3912       (*p_new)->parent = dup_node;
3913       (*p_new)->token.duplicated = 1;
3914       dup_node = *p_new;
3915
3916       /* Go to the left node, or up and to the right.  */
3917       if (node->left)
3918         {
3919           node = node->left;
3920           p_new = &dup_node->left;
3921         }
3922       else
3923         {
3924           const bin_tree_t *prev = NULL;
3925           while (node->right == prev || node->right == NULL)
3926             {
3927               prev = node;
3928               node = node->parent;
3929               dup_node = dup_node->parent;
3930               if (!node)
3931                 return dup_root;
3932             }
3933           node = node->right;
3934           p_new = &dup_node->right;
3935         }
3936     }
3937 }