lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU General Public
   8    License as published by the Free Software Foundation; either
   9    version 3 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           size_t length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          Idx node, bool root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static Idx fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax) internal_function;
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   Idx nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  Idx nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      Idx nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   Idx nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             bool accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  90                                         re_charset_t *mbcset,
  91                                         Idx *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  94                                       bitset_t sbcset,
  95                                       re_charset_t *mbcset,
  96                                       Idx *char_class_alloc,
  97                                       const char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 103                                       bitset_t sbcset,
 104                                       const char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        RE_TRANSLATE_TYPE trans,
 109                                        const char *class_name,
 110                                        const char *extra,
 111                                        bool non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 static const char __re_error_msgid[] =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 static const size_t __re_error_msgid_idx[] =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205 /* re_compile_pattern is the GNU regular expression compiler: it
 206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 207    Returns 0 if the pattern was valid, otherwise an error string.
 208
 209    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 210    are set in BUFP on entry.  */
 211
 212 #ifdef _LIBC
 213 const char *
 214 re_compile_pattern (pattern, length, bufp)
 215     const char *pattern;
 216     size_t length;
 217     struct re_pattern_buffer *bufp;
 218 #else /* size_t might promote */
 219 const char *
 220 re_compile_pattern (const char *pattern, size_t length,
 221                     struct re_pattern_buffer *bufp)
 222 #endif
 223 {
 224   reg_errcode_t ret;
 225
 226   /* And GNU code determines whether or not to get register information
 227      by passing null for the REGS argument to re_match, etc., not by
 228      setting no_sub, unless RE_NO_SUB is set.  */
 229   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 230
 231   /* Match anchors at newline.  */
 232   bufp->newline_anchor = 1;
 233
 234   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 235
 236   if (!ret)
 237     return NULL;
 238   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 239 }
 240 #ifdef _LIBC
 241 weak_alias (__re_compile_pattern, re_compile_pattern)
 242 #endif
 243
 244 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 245    also be assigned to arbitrarily: each pattern buffer stores its own
 246    syntax, so it can be changed between regex compilations.  */
 247 /* This has no initializer because initialized variables in Emacs
 248    become read-only after dumping.  */
 249 reg_syntax_t re_syntax_options;
 250
 251
 252 /* Specify the precise syntax of regexps for compilation.  This provides
 253    for compatibility for various utilities which historically have
 254    different, incompatible syntaxes.
 255
 256    The argument SYNTAX is a bit mask comprised of the various bits
 257    defined in regex.h.  We return the old syntax.  */
 258
 259 reg_syntax_t
 260 re_set_syntax (syntax)
 261     reg_syntax_t syntax;
 262 {
 263   reg_syntax_t ret = re_syntax_options;
 264
 265   re_syntax_options = syntax;
 266   return ret;
 267 }
 268 #ifdef _LIBC
 269 weak_alias (__re_set_syntax, re_set_syntax)
 270 #endif
 271
 272 int
 273 re_compile_fastmap (bufp)
 274     struct re_pattern_buffer *bufp;
 275 {
 276   re_dfa_t *dfa = bufp->buffer;
 277   char *fastmap = bufp->fastmap;
 278
 279   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 280   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 281   if (dfa->init_state != dfa->init_state_word)
 282     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 283   if (dfa->init_state != dfa->init_state_nl)
 284     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 285   if (dfa->init_state != dfa->init_state_begbuf)
 286     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 287   bufp->fastmap_accurate = 1;
 288   return 0;
 289 }
 290 #ifdef _LIBC
 291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 292 #endif
 293
 294 static inline void
 295 __attribute__ ((always_inline))
 296 re_set_fastmap (char *fastmap, bool icase, int ch)
 297 {
 298   fastmap[ch] = 1;
 299   if (icase)
 300     fastmap[tolower (ch)] = 1;
 301 }
 302
 303 /* Helper function for re_compile_fastmap.
 304    Compile fastmap for the initial_state INIT_STATE.  */
 305
 306 static void
 307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 308                          char *fastmap)
 309 {
 310   re_dfa_t *dfa = bufp->buffer;
 311   Idx node_cnt;
 312   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 313   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 314     {
 315       Idx node = init_state->nodes.elems[node_cnt];
 316       re_token_type_t type = dfa->nodes[node].type;
 317
 318       if (type == CHARACTER)
 319         {
 320           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 321 #ifdef RE_ENABLE_I18N
 322           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 323             {
 324               unsigned char buf[MB_LEN_MAX];
 325               unsigned char *p;
 326               wchar_t wc;
 327               mbstate_t state;
 328
 329               p = buf;
 330               *p++ = dfa->nodes[node].opr.c;
 331               while (++node < dfa->nodes_len
 332                      && dfa->nodes[node].type == CHARACTER
 333                      && dfa->nodes[node].mb_partial)
 334                 *p++ = dfa->nodes[node].opr.c;
 335               memset (&state, '\0', sizeof (state));
 336               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 337                              &state) == p - buf
 338                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 339                       != (size_t) -1))
 340                 re_set_fastmap (fastmap, false, buf[0]);
 341             }
 342 #endif
 343         }
 344       else if (type == SIMPLE_BRACKET)
 345         {
 346           int i, ch;
 347           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 348             {
 349               int j;
 350               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 351               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 352                 if (w & ((bitset_word_t) 1 << j))
 353                   re_set_fastmap (fastmap, icase, ch);
 354             }
 355         }
 356 #ifdef RE_ENABLE_I18N
 357       else if (type == COMPLEX_BRACKET)
 358         {
 359           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 360           Idx i;
 361
 362 # ifdef _LIBC
 363           /* See if we have to try all bytes which start multiple collation
 364              elements.
 365              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 366                   collation element, and don't catch 'b' since 'b' is
 367                   the only collation element which starts from 'b' (and
 368                   it is caught by SIMPLE_BRACKET).  */
 369               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 370                   && (cset->ncoll_syms || cset->nranges))
 371                 {
 372                   const int32_t *table = (const int32_t *)
 373                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 374                   for (i = 0; i < SBC_MAX; ++i)
 375                     if (table[i] < 0)
 376                       re_set_fastmap (fastmap, icase, i);
 377                 }
 378 # endif /* _LIBC */
 379
 380           /* See if we have to start the match at all multibyte characters,
 381              i.e. where we would not find an invalid sequence.  This only
 382              applies to multibyte character sets; for single byte character
 383              sets, the SIMPLE_BRACKET again suffices.  */
 384           if (dfa->mb_cur_max > 1
 385               && (cset->nchar_classes || cset->non_match || cset->nranges
 386 # ifdef _LIBC
 387                   || cset->nequiv_classes
 388 # endif /* _LIBC */
 389                  ))
 390             {
 391               unsigned char c = 0;
 392               do
 393                 {
 394                   mbstate_t mbs;
 395                   memset (&mbs, 0, sizeof (mbs));
 396                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 397                     re_set_fastmap (fastmap, false, (int) c);
 398                 }
 399               while (++c != 0);
 400             }
 401
 402           else
 403             {
 404               /* ... Else catch all bytes which can start the mbchars.  */
 405               for (i = 0; i < cset->nmbchars; ++i)
 406                 {
 407                   char buf[256];
 408                   mbstate_t state;
 409                   memset (&state, '\0', sizeof (state));
 410                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 411                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 412                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 413                     {
 414                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 415                           != (size_t) -1)
 416                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 417                     }
 418                 }
 419             }
 420         }
 421 #endif /* RE_ENABLE_I18N */
 422       else if (type == OP_PERIOD
 423 #ifdef RE_ENABLE_I18N
 424                || type == OP_UTF8_PERIOD
 425 #endif /* RE_ENABLE_I18N */
 426                || type == END_OF_RE)
 427         {
 428           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 429           if (type == END_OF_RE)
 430             bufp->can_be_null = 1;
 431           return;
 432         }
 433     }
 434 }
 435 \f
 436 /* Entry point for POSIX code.  */
 437 /* regcomp takes a regular expression as a string and compiles it.
 438
 439    PREG is a regex_t *.  We do not expect any fields to be initialized,
 440    since POSIX says we shouldn't.  Thus, we set
 441
 442      'buffer' to the compiled pattern;
 443      'used' to the length of the compiled pattern;
 444      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 445        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 446        RE_SYNTAX_POSIX_BASIC;
 447      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 448      'fastmap' to an allocated space for the fastmap;
 449      'fastmap_accurate' to zero;
 450      're_nsub' to the number of subexpressions in PATTERN.
 451
 452    PATTERN is the address of the pattern string.
 453
 454    CFLAGS is a series of bits which affect compilation.
 455
 456      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 457      use POSIX basic syntax.
 458
 459      If REG_NEWLINE is set, then . and [^...] don't match newline.
 460      Also, regexec will try a match beginning after every newline.
 461
 462      If REG_ICASE is set, then we considers upper- and lowercase
 463      versions of letters to be equivalent when matching.
 464
 465      If REG_NOSUB is set, then when PREG is passed to regexec, that
 466      routine will report only success or failure, and nothing about the
 467      registers.
 468
 469    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 470    the return codes and their meanings.)  */
 471
 472 int
 473 regcomp (preg, pattern, cflags)
 474     regex_t *_Restrict_ preg;
 475     const char *_Restrict_ pattern;
 476     int cflags;
 477 {
 478   reg_errcode_t ret;
 479   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 480                          : RE_SYNTAX_POSIX_BASIC);
 481
 482   preg->buffer = NULL;
 483   preg->allocated = 0;
 484   preg->used = 0;
 485
 486   /* Try to allocate space for the fastmap.  */
 487   preg->fastmap = re_malloc (char, SBC_MAX);
 488   if (BE (preg->fastmap == NULL, 0))
 489     return REG_ESPACE;
 490
 491   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 492
 493   /* If REG_NEWLINE is set, newlines are treated differently.  */
 494   if (cflags & REG_NEWLINE)
 495     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 496       syntax &= ~RE_DOT_NEWLINE;
 497       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 498       /* It also changes the matching behavior.  */
 499       preg->newline_anchor = 1;
 500     }
 501   else
 502     preg->newline_anchor = 0;
 503   preg->no_sub = !!(cflags & REG_NOSUB);
 504   preg->translate = NULL;
 505
 506   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 507
 508   /* POSIX doesn't distinguish between an unmatched open-group and an
 509      unmatched close-group: both are REG_EPAREN.  */
 510   if (ret == REG_ERPAREN)
 511     ret = REG_EPAREN;
 512
 513   /* We have already checked preg->fastmap != NULL.  */
 514   if (BE (ret == REG_NOERROR, 1))
 515     /* Compute the fastmap now, since regexec cannot modify the pattern
 516        buffer.  This function never fails in this implementation.  */
 517     (void) re_compile_fastmap (preg);
 518   else
 519     {
 520       /* Some error occurred while compiling the expression.  */
 521       re_free (preg->fastmap);
 522       preg->fastmap = NULL;
 523     }
 524
 525   return (int) ret;
 526 }
 527 #ifdef _LIBC
 528 weak_alias (__regcomp, regcomp)
 529 #endif
 530
 531 /* Returns a message corresponding to an error code, ERRCODE, returned
 532    from either regcomp or regexec.   We don't use PREG here.  */
 533
 534 #ifdef _LIBC
 535 size_t
 536 regerror (errcode, preg, errbuf, errbuf_size)
 537     int errcode;
 538     const regex_t *_Restrict_ preg;
 539     char *_Restrict_ errbuf;
 540     size_t errbuf_size;
 541 #else /* size_t might promote */
 542 size_t
 543 regerror (int errcode, const regex_t *_Restrict_ preg,
 544           char *_Restrict_ errbuf, size_t errbuf_size)
 545 #endif
 546 {
 547   const char *msg;
 548   size_t msg_size;
 549
 550   if (BE (errcode < 0
 551           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 552                                / sizeof (__re_error_msgid_idx[0])), 0))
 553     /* Only error codes returned by the rest of the code should be passed
 554        to this routine.  If we are given anything else, or if other regex
 555        code generates an invalid error code, then the program has a bug.
 556        Dump core so we can fix it.  */
 557     abort ();
 558
 559   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 560
 561   msg_size = strlen (msg) + 1; /* Includes the null.  */
 562
 563   if (BE (errbuf_size != 0, 1))
 564     {
 565       size_t cpy_size = msg_size;
 566       if (BE (msg_size > errbuf_size, 0))
 567         {
 568           cpy_size = errbuf_size - 1;
 569           errbuf[cpy_size] = '\0';
 570         }
 571       memcpy (errbuf, msg, cpy_size);
 572     }
 573
 574   return msg_size;
 575 }
 576 #ifdef _LIBC
 577 weak_alias (__regerror, regerror)
 578 #endif
 579
 580
 581 #ifdef RE_ENABLE_I18N
 582 /* This static array is used for the map to single-byte characters when
 583    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 584    it the same all the time.  UTF-8 is the preferred encoding so this is
 585    a worthwhile optimization.  */
 586 static const bitset_t utf8_sb_map =
 587 {
 588   /* Set the first 128 bits.  */
 589 # if defined __GNUC__ && !defined __STRICT_ANSI__
 590   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 591 # else
 592 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
 593 #   error "bitset_word_t is narrower than 32 bits"
 594 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 595   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 596 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 597   BITSET_WORD_MAX, BITSET_WORD_MAX,
 598 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 599   BITSET_WORD_MAX,
 600 #  endif
 601   (BITSET_WORD_MAX
 602    >> (SBC_MAX % BITSET_WORD_BITS == 0
 603        ? 0
 604        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 605 # endif
 606 };
 607 #endif
 608
 609
 610 static void
 611 free_dfa_content (re_dfa_t *dfa)
 612 {
 613   Idx i, j;
 614
 615   if (dfa->nodes)
 616     for (i = 0; i < dfa->nodes_len; ++i)
 617       free_token (dfa->nodes + i);
 618   re_free (dfa->nexts);
 619   for (i = 0; i < dfa->nodes_len; ++i)
 620     {
 621       if (dfa->eclosures != NULL)
 622         re_node_set_free (dfa->eclosures + i);
 623       if (dfa->inveclosures != NULL)
 624         re_node_set_free (dfa->inveclosures + i);
 625       if (dfa->edests != NULL)
 626         re_node_set_free (dfa->edests + i);
 627     }
 628   re_free (dfa->edests);
 629   re_free (dfa->eclosures);
 630   re_free (dfa->inveclosures);
 631   re_free (dfa->nodes);
 632
 633   if (dfa->state_table)
 634     for (i = 0; i <= dfa->state_hash_mask; ++i)
 635       {
 636         struct re_state_table_entry *entry = dfa->state_table + i;
 637         for (j = 0; j < entry->num; ++j)
 638           {
 639             re_dfastate_t *state = entry->array[j];
 640             free_state (state);
 641           }
 642         re_free (entry->array);
 643       }
 644   re_free (dfa->state_table);
 645 #ifdef RE_ENABLE_I18N
 646   if (dfa->sb_char != utf8_sb_map)
 647     re_free (dfa->sb_char);
 648 #endif
 649   re_free (dfa->subexp_map);
 650 #ifdef DEBUG
 651   re_free (dfa->re_str);
 652 #endif
 653
 654   re_free (dfa);
 655 }
 656
 657
 658 /* Free dynamically allocated space used by PREG.  */
 659
 660 void
 661 regfree (preg)
 662     regex_t *preg;
 663 {
 664   re_dfa_t *dfa = preg->buffer;
 665   if (BE (dfa != NULL, 1))
 666     {
 667       lock_fini (dfa->lock);
 668       free_dfa_content (dfa);
 669     }
 670   preg->buffer = NULL;
 671   preg->allocated = 0;
 672
 673   re_free (preg->fastmap);
 674   preg->fastmap = NULL;
 675
 676   re_free (preg->translate);
 677   preg->translate = NULL;
 678 }
 679 #ifdef _LIBC
 680 weak_alias (__regfree, regfree)
 681 #endif
 682 \f
 683 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 684    them unless specifically requested.  */
 685
 686 #if defined _REGEX_RE_COMP || defined _LIBC
 687
 688 /* BSD has one and only one pattern buffer.  */
 689 static struct re_pattern_buffer re_comp_buf;
 690
 691 char *
 692 # ifdef _LIBC
 693 /* Make these definitions weak in libc, so POSIX programs can redefine
 694    these names if they don't use our functions, and still use
 695    regcomp/regexec above without link errors.  */
 696 weak_function
 697 # endif
 698 re_comp (s)
 699      const char *s;
 700 {
 701   reg_errcode_t ret;
 702   char *fastmap;
 703
 704   if (!s)
 705     {
 706       if (!re_comp_buf.buffer)
 707         return gettext ("No previous regular expression");
 708       return 0;
 709     }
 710
 711   if (re_comp_buf.buffer)
 712     {
 713       fastmap = re_comp_buf.fastmap;
 714       re_comp_buf.fastmap = NULL;
 715       __regfree (&re_comp_buf);
 716       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 717       re_comp_buf.fastmap = fastmap;
 718     }
 719
 720   if (re_comp_buf.fastmap == NULL)
 721     {
 722       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 723       if (re_comp_buf.fastmap == NULL)
 724         return (char *) gettext (__re_error_msgid
 725                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 726     }
 727
 728   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 729      don't need to initialize the pattern buffer fields which affect it.  */
 730
 731   /* Match anchors at newlines.  */
 732   re_comp_buf.newline_anchor = 1;
 733
 734   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 735
 736   if (!ret)
 737     return NULL;
 738
 739   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
 740   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 741 }
 742
 743 #ifdef _LIBC
 744 libc_freeres_fn (free_mem)
 745 {
 746   __regfree (&re_comp_buf);
 747 }
 748 #endif
 749
 750 #endif /* _REGEX_RE_COMP */
 751 \f
 752 /* Internal entry point.
 753    Compile the regular expression PATTERN, whose length is LENGTH.
 754    SYNTAX indicate regular expression's syntax.  */
 755
 756 static reg_errcode_t
 757 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 758                      reg_syntax_t syntax)
 759 {
 760   reg_errcode_t err = REG_NOERROR;
 761   re_dfa_t *dfa;
 762   re_string_t regexp;
 763
 764   /* Initialize the pattern buffer.  */
 765   preg->fastmap_accurate = 0;
 766   preg->syntax = syntax;
 767   preg->not_bol = preg->not_eol = 0;
 768   preg->used = 0;
 769   preg->re_nsub = 0;
 770   preg->can_be_null = 0;
 771   preg->regs_allocated = REGS_UNALLOCATED;
 772
 773   /* Initialize the dfa.  */
 774   dfa = preg->buffer;
 775   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 776     {
 777       /* If zero allocated, but buffer is non-null, try to realloc
 778          enough space.  This loses if buffer's address is bogus, but
 779          that is the user's responsibility.  If ->buffer is NULL this
 780          is a simple allocation.  */
 781       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 782       if (dfa == NULL)
 783         return REG_ESPACE;
 784       preg->allocated = sizeof (re_dfa_t);
 785       preg->buffer = dfa;
 786     }
 787   preg->used = sizeof (re_dfa_t);
 788
 789   err = init_dfa (dfa, length);
 790   if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
 791     err = REG_ESPACE;
 792   if (BE (err != REG_NOERROR, 0))
 793     {
 794       free_dfa_content (dfa);
 795       preg->buffer = NULL;
 796       preg->allocated = 0;
 797       return err;
 798     }
 799 #ifdef DEBUG
 800   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 801   dfa->re_str = re_malloc (char, length + 1);
 802   strncpy (dfa->re_str, pattern, length + 1);
 803 #endif
 804
 805   err = re_string_construct (&regexp, pattern, length, preg->translate,
 806                              (syntax & RE_ICASE) != 0, dfa);
 807   if (BE (err != REG_NOERROR, 0))
 808     {
 809     re_compile_internal_free_return:
 810       free_workarea_compile (preg);
 811       re_string_destruct (&regexp);
 812       lock_fini (dfa->lock);
 813       free_dfa_content (dfa);
 814       preg->buffer = NULL;
 815       preg->allocated = 0;
 816       return err;
 817     }
 818
 819   /* Parse the regular expression, and build a structure tree.  */
 820   preg->re_nsub = 0;
 821   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 822   if (BE (dfa->str_tree == NULL, 0))
 823     goto re_compile_internal_free_return;
 824
 825   /* Analyze the tree and create the nfa.  */
 826   err = analyze (preg);
 827   if (BE (err != REG_NOERROR, 0))
 828     goto re_compile_internal_free_return;
 829
 830 #ifdef RE_ENABLE_I18N
 831   /* If possible, do searching in single byte encoding to speed things up.  */
 832   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 833     optimize_utf8 (dfa);
 834 #endif
 835
 836   /* Then create the initial state of the dfa.  */
 837   err = create_initial_state (dfa);
 838
 839   /* Release work areas.  */
 840   free_workarea_compile (preg);
 841   re_string_destruct (&regexp);
 842
 843   if (BE (err != REG_NOERROR, 0))
 844     {
 845       lock_fini (dfa->lock);
 846       free_dfa_content (dfa);
 847       preg->buffer = NULL;
 848       preg->allocated = 0;
 849     }
 850
 851   return err;
 852 }
 853
 854 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 855    as the initial length of some arrays.  */
 856
 857 static reg_errcode_t
 858 init_dfa (re_dfa_t *dfa, size_t pat_len)
 859 {
 860   __re_size_t table_size;
 861 #ifndef _LIBC
 862   const char *codeset_name;
 863 #endif
 864 #ifdef RE_ENABLE_I18N
 865   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 866 #else
 867   size_t max_i18n_object_size = 0;
 868 #endif
 869   size_t max_object_size =
 870     MAX (sizeof (struct re_state_table_entry),
 871          MAX (sizeof (re_token_t),
 872               MAX (sizeof (re_node_set),
 873                    MAX (sizeof (regmatch_t),
 874                         max_i18n_object_size))));
 875
 876   memset (dfa, '\0', sizeof (re_dfa_t));
 877
 878   /* Force allocation of str_tree_storage the first time.  */
 879   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 880
 881   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 882      calculation below, and for similar doubling calculations
 883      elsewhere.  And it's <= rather than <, because some of the
 884      doubling calculations add 1 afterwards.  */
 885   if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
 886     return REG_ESPACE;
 887
 888   dfa->nodes_alloc = pat_len + 1;
 889   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 890
 891   /*  table_size = 2 ^ ceil(log pat_len) */
 892   for (table_size = 1; ; table_size <<= 1)
 893     if (table_size > pat_len)
 894       break;
 895
 896   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 897   dfa->state_hash_mask = table_size - 1;
 898
 899   dfa->mb_cur_max = MB_CUR_MAX;
 900 #ifdef _LIBC
 901   if (dfa->mb_cur_max == 6
 902       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 903     dfa->is_utf8 = 1;
 904   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 905                        != 0);
 906 #else
 907   codeset_name = nl_langinfo (CODESET);
 908   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
 909       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 910       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 911       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
 912     dfa->is_utf8 = 1;
 913
 914   /* We check exhaustively in the loop below if this charset is a
 915      superset of ASCII.  */
 916   dfa->map_notascii = 0;
 917 #endif
 918
 919 #ifdef RE_ENABLE_I18N
 920   if (dfa->mb_cur_max > 1)
 921     {
 922       if (dfa->is_utf8)
 923         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 924       else
 925         {
 926           int i, j, ch;
 927
 928           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 929           if (BE (dfa->sb_char == NULL, 0))
 930             return REG_ESPACE;
 931
 932           /* Set the bits corresponding to single byte chars.  */
 933           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 934             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 935               {
 936                 wint_t wch = __btowc (ch);
 937                 if (wch != WEOF)
 938                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 939 # ifndef _LIBC
 940                 if (isascii (ch) && wch != ch)
 941                   dfa->map_notascii = 1;
 942 # endif
 943               }
 944         }
 945     }
 946 #endif
 947
 948   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 949     return REG_ESPACE;
 950   return REG_NOERROR;
 951 }
 952
 953 /* Initialize WORD_CHAR table, which indicate which character is
 954    "word".  In this case "word" means that it is the word construction
 955    character used by some operators like "\<", "\>", etc.  */
 956
 957 static void
 958 internal_function
 959 init_word_char (re_dfa_t *dfa)
 960 {
 961   int i = 0;
 962   int j;
 963   int ch = 0;
 964   dfa->word_ops_used = 1;
 965   if (BE (dfa->map_notascii == 0, 1))
 966     {
 967       bitset_word_t bits0 = 0x00000000;
 968       bitset_word_t bits1 = 0x03ff0000;
 969       bitset_word_t bits2 = 0x87fffffe;
 970       bitset_word_t bits3 = 0x07fffffe;
 971       if (BITSET_WORD_BITS == 64)
 972         {
 973           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 974           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 975           i = 2;
 976         }
 977       else if (BITSET_WORD_BITS == 32)
 978         {
 979           dfa->word_char[0] = bits0;
 980           dfa->word_char[1] = bits1;
 981           dfa->word_char[2] = bits2;
 982           dfa->word_char[3] = bits3;
 983           i = 4;
 984         }
 985       else
 986         goto general_case;
 987       ch = 128;
 988
 989       if (BE (dfa->is_utf8, 1))
 990         {
 991           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 992           return;
 993         }
 994     }
 995
 996  general_case:
 997   for (; i < BITSET_WORDS; ++i)
 998     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 999       if (isalnum (ch) || ch == '_')
1000         dfa->word_char[i] |= (bitset_word_t) 1 << j;
1001 }
1002
1003 /* Free the work area which are only used while compiling.  */
1004
1005 static void
1006 free_workarea_compile (regex_t *preg)
1007 {
1008   re_dfa_t *dfa = preg->buffer;
1009   bin_tree_storage_t *storage, *next;
1010   for (storage = dfa->str_tree_storage; storage; storage = next)
1011     {
1012       next = storage->next;
1013       re_free (storage);
1014     }
1015   dfa->str_tree_storage = NULL;
1016   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1017   dfa->str_tree = NULL;
1018   re_free (dfa->org_indices);
1019   dfa->org_indices = NULL;
1020 }
1021
1022 /* Create initial states for all contexts.  */
1023
1024 static reg_errcode_t
1025 create_initial_state (re_dfa_t *dfa)
1026 {
1027   Idx first, i;
1028   reg_errcode_t err;
1029   re_node_set init_nodes;
1030
1031   /* Initial states have the epsilon closure of the node which is
1032      the first node of the regular expression.  */
1033   first = dfa->str_tree->first->node_idx;
1034   dfa->init_node = first;
1035   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1036   if (BE (err != REG_NOERROR, 0))
1037     return err;
1038
1039   /* The back-references which are in initial states can epsilon transit,
1040      since in this case all of the subexpressions can be null.
1041      Then we add epsilon closures of the nodes which are the next nodes of
1042      the back-references.  */
1043   if (dfa->nbackref > 0)
1044     for (i = 0; i < init_nodes.nelem; ++i)
1045       {
1046         Idx node_idx = init_nodes.elems[i];
1047         re_token_type_t type = dfa->nodes[node_idx].type;
1048
1049         Idx clexp_idx;
1050         if (type != OP_BACK_REF)
1051           continue;
1052         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1053           {
1054             re_token_t *clexp_node;
1055             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1056             if (clexp_node->type == OP_CLOSE_SUBEXP
1057                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1058               break;
1059           }
1060         if (clexp_idx == init_nodes.nelem)
1061           continue;
1062
1063         if (type == OP_BACK_REF)
1064           {
1065             Idx dest_idx = dfa->edests[node_idx].elems[0];
1066             if (!re_node_set_contains (&init_nodes, dest_idx))
1067               {
1068                 reg_errcode_t merge_err
1069                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1070                 if (merge_err != REG_NOERROR)
1071                   return merge_err;
1072                 i = 0;
1073               }
1074           }
1075       }
1076
1077   /* It must be the first time to invoke acquire_state.  */
1078   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1079   /* We don't check ERR here, since the initial state must not be NULL.  */
1080   if (BE (dfa->init_state == NULL, 0))
1081     return err;
1082   if (dfa->init_state->has_constraint)
1083     {
1084       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1085                                                        CONTEXT_WORD);
1086       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1087                                                      CONTEXT_NEWLINE);
1088       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1089                                                          &init_nodes,
1090                                                          CONTEXT_NEWLINE
1091                                                          | CONTEXT_BEGBUF);
1092       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1093               || dfa->init_state_begbuf == NULL, 0))
1094         return err;
1095     }
1096   else
1097     dfa->init_state_word = dfa->init_state_nl
1098       = dfa->init_state_begbuf = dfa->init_state;
1099
1100   re_node_set_free (&init_nodes);
1101   return REG_NOERROR;
1102 }
1103 \f
1104 #ifdef RE_ENABLE_I18N
1105 /* If it is possible to do searching in single byte encoding instead of UTF-8
1106    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1107    DFA nodes where needed.  */
1108
1109 static void
1110 optimize_utf8 (re_dfa_t *dfa)
1111 {
1112   Idx node;
1113   int i;
1114   bool mb_chars = false;
1115   bool has_period = false;
1116
1117   for (node = 0; node < dfa->nodes_len; ++node)
1118     switch (dfa->nodes[node].type)
1119       {
1120       case CHARACTER:
1121         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1122           mb_chars = true;
1123         break;
1124       case ANCHOR:
1125         switch (dfa->nodes[node].opr.ctx_type)
1126           {
1127           case LINE_FIRST:
1128           case LINE_LAST:
1129           case BUF_FIRST:
1130           case BUF_LAST:
1131             break;
1132           default:
1133             /* Word anchors etc. cannot be handled.  It's okay to test
1134                opr.ctx_type since constraints (for all DFA nodes) are
1135                created by ORing one or more opr.ctx_type values.  */
1136             return;
1137           }
1138         break;
1139       case OP_PERIOD:
1140         has_period = true;
1141         break;
1142       case OP_BACK_REF:
1143       case OP_ALT:
1144       case END_OF_RE:
1145       case OP_DUP_ASTERISK:
1146       case OP_OPEN_SUBEXP:
1147       case OP_CLOSE_SUBEXP:
1148         break;
1149       case COMPLEX_BRACKET:
1150         return;
1151       case SIMPLE_BRACKET:
1152         /* Just double check.  */
1153         {
1154           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1155                         ? 0
1156                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1157           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1158             {
1159               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1160                 return;
1161               rshift = 0;
1162             }
1163         }
1164         break;
1165       default:
1166         abort ();
1167       }
1168
1169   if (mb_chars || has_period)
1170     for (node = 0; node < dfa->nodes_len; ++node)
1171       {
1172         if (dfa->nodes[node].type == CHARACTER
1173             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1174           dfa->nodes[node].mb_partial = 0;
1175         else if (dfa->nodes[node].type == OP_PERIOD)
1176           dfa->nodes[node].type = OP_UTF8_PERIOD;
1177       }
1178
1179   /* The search can be in single byte locale.  */
1180   dfa->mb_cur_max = 1;
1181   dfa->is_utf8 = 0;
1182   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1183 }
1184 #endif
1185 \f
1186 /* Analyze the structure tree, and calculate "first", "next", "edest",
1187    "eclosure", and "inveclosure".  */
1188
1189 static reg_errcode_t
1190 analyze (regex_t *preg)
1191 {
1192   re_dfa_t *dfa = preg->buffer;
1193   reg_errcode_t ret;
1194
1195   /* Allocate arrays.  */
1196   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1197   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1198   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1199   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1200   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1201           || dfa->eclosures == NULL, 0))
1202     return REG_ESPACE;
1203
1204   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1205   if (dfa->subexp_map != NULL)
1206     {
1207       Idx i;
1208       for (i = 0; i < preg->re_nsub; i++)
1209         dfa->subexp_map[i] = i;
1210       preorder (dfa->str_tree, optimize_subexps, dfa);
1211       for (i = 0; i < preg->re_nsub; i++)
1212         if (dfa->subexp_map[i] != i)
1213           break;
1214       if (i == preg->re_nsub)
1215         {
1216           free (dfa->subexp_map);
1217           dfa->subexp_map = NULL;
1218         }
1219     }
1220
1221   ret = postorder (dfa->str_tree, lower_subexps, preg);
1222   if (BE (ret != REG_NOERROR, 0))
1223     return ret;
1224   ret = postorder (dfa->str_tree, calc_first, dfa);
1225   if (BE (ret != REG_NOERROR, 0))
1226     return ret;
1227   preorder (dfa->str_tree, calc_next, dfa);
1228   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1229   if (BE (ret != REG_NOERROR, 0))
1230     return ret;
1231   ret = calc_eclosure (dfa);
1232   if (BE (ret != REG_NOERROR, 0))
1233     return ret;
1234
1235   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1236      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1237   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1238       || dfa->nbackref)
1239     {
1240       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1241       if (BE (dfa->inveclosures == NULL, 0))
1242         return REG_ESPACE;
1243       ret = calc_inveclosure (dfa);
1244     }
1245
1246   return ret;
1247 }
1248
1249 /* Our parse trees are very unbalanced, so we cannot use a stack to
1250    implement parse tree visits.  Instead, we use parent pointers and
1251    some hairy code in these two functions.  */
1252 static reg_errcode_t
1253 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1254            void *extra)
1255 {
1256   bin_tree_t *node, *prev;
1257
1258   for (node = root; ; )
1259     {
1260       /* Descend down the tree, preferably to the left (or to the right
1261          if that's the only child).  */
1262       while (node->left || node->right)
1263         if (node->left)
1264           node = node->left;
1265         else
1266           node = node->right;
1267
1268       do
1269         {
1270           reg_errcode_t err = fn (extra, node);
1271           if (BE (err != REG_NOERROR, 0))
1272             return err;
1273           if (node->parent == NULL)
1274             return REG_NOERROR;
1275           prev = node;
1276           node = node->parent;
1277         }
1278       /* Go up while we have a node that is reached from the right.  */
1279       while (node->right == prev || node->right == NULL);
1280       node = node->right;
1281     }
1282 }
1283
1284 static reg_errcode_t
1285 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1286           void *extra)
1287 {
1288   bin_tree_t *node;
1289
1290   for (node = root; ; )
1291     {
1292       reg_errcode_t err = fn (extra, node);
1293       if (BE (err != REG_NOERROR, 0))
1294         return err;
1295
1296       /* Go to the left node, or up and to the right.  */
1297       if (node->left)
1298         node = node->left;
1299       else
1300         {
1301           bin_tree_t *prev = NULL;
1302           while (node->right == prev || node->right == NULL)
1303             {
1304               prev = node;
1305               node = node->parent;
1306               if (!node)
1307                 return REG_NOERROR;
1308             }
1309           node = node->right;
1310         }
1311     }
1312 }
1313
1314 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1315    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1316    backreferences as well.  Requires a preorder visit.  */
1317 static reg_errcode_t
1318 optimize_subexps (void *extra, bin_tree_t *node)
1319 {
1320   re_dfa_t *dfa = (re_dfa_t *) extra;
1321
1322   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1323     {
1324       int idx = node->token.opr.idx;
1325       node->token.opr.idx = dfa->subexp_map[idx];
1326       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1327     }
1328
1329   else if (node->token.type == SUBEXP
1330            && node->left && node->left->token.type == SUBEXP)
1331     {
1332       Idx other_idx = node->left->token.opr.idx;
1333
1334       node->left = node->left->left;
1335       if (node->left)
1336         node->left->parent = node;
1337
1338       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1339       if (other_idx < BITSET_WORD_BITS)
1340         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1341     }
1342
1343   return REG_NOERROR;
1344 }
1345
1346 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1347    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1348 static reg_errcode_t
1349 lower_subexps (void *extra, bin_tree_t *node)
1350 {
1351   regex_t *preg = (regex_t *) extra;
1352   reg_errcode_t err = REG_NOERROR;
1353
1354   if (node->left && node->left->token.type == SUBEXP)
1355     {
1356       node->left = lower_subexp (&err, preg, node->left);
1357       if (node->left)
1358         node->left->parent = node;
1359     }
1360   if (node->right && node->right->token.type == SUBEXP)
1361     {
1362       node->right = lower_subexp (&err, preg, node->right);
1363       if (node->right)
1364         node->right->parent = node;
1365     }
1366
1367   return err;
1368 }
1369
1370 static bin_tree_t *
1371 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1372 {
1373   re_dfa_t *dfa = preg->buffer;
1374   bin_tree_t *body = node->left;
1375   bin_tree_t *op, *cls, *tree1, *tree;
1376
1377   if (preg->no_sub
1378       /* We do not optimize empty subexpressions, because otherwise we may
1379          have bad CONCAT nodes with NULL children.  This is obviously not
1380          very common, so we do not lose much.  An example that triggers
1381          this case is the sed "script" /\(\)/x.  */
1382       && node->left != NULL
1383       && (node->token.opr.idx >= BITSET_WORD_BITS
1384           || !(dfa->used_bkref_map
1385                & ((bitset_word_t) 1 << node->token.opr.idx))))
1386     return node->left;
1387
1388   /* Convert the SUBEXP node to the concatenation of an
1389      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1390   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1391   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1392   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1393   tree = create_tree (dfa, op, tree1, CONCAT);
1394   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1395     {
1396       *err = REG_ESPACE;
1397       return NULL;
1398     }
1399
1400   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1401   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1402   return tree;
1403 }
1404
1405 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1406    nodes.  Requires a postorder visit.  */
1407 static reg_errcode_t
1408 calc_first (void *extra, bin_tree_t *node)
1409 {
1410   re_dfa_t *dfa = (re_dfa_t *) extra;
1411   if (node->token.type == CONCAT)
1412     {
1413       node->first = node->left->first;
1414       node->node_idx = node->left->node_idx;
1415     }
1416   else
1417     {
1418       node->first = node;
1419       node->node_idx = re_dfa_add_node (dfa, node->token);
1420       if (BE (node->node_idx == REG_MISSING, 0))
1421         return REG_ESPACE;
1422       if (node->token.type == ANCHOR)
1423         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1424     }
1425   return REG_NOERROR;
1426 }
1427
1428 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1429 static reg_errcode_t
1430 calc_next (void *extra, bin_tree_t *node)
1431 {
1432   switch (node->token.type)
1433     {
1434     case OP_DUP_ASTERISK:
1435       node->left->next = node;
1436       break;
1437     case CONCAT:
1438       node->left->next = node->right->first;
1439       node->right->next = node->next;
1440       break;
1441     default:
1442       if (node->left)
1443         node->left->next = node->next;
1444       if (node->right)
1445         node->right->next = node->next;
1446       break;
1447     }
1448   return REG_NOERROR;
1449 }
1450
1451 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1452 static reg_errcode_t
1453 link_nfa_nodes (void *extra, bin_tree_t *node)
1454 {
1455   re_dfa_t *dfa = (re_dfa_t *) extra;
1456   Idx idx = node->node_idx;
1457   reg_errcode_t err = REG_NOERROR;
1458
1459   switch (node->token.type)
1460     {
1461     case CONCAT:
1462       break;
1463
1464     case END_OF_RE:
1465       assert (node->next == NULL);
1466       break;
1467
1468     case OP_DUP_ASTERISK:
1469     case OP_ALT:
1470       {
1471         Idx left, right;
1472         dfa->has_plural_match = 1;
1473         if (node->left != NULL)
1474           left = node->left->first->node_idx;
1475         else
1476           left = node->next->node_idx;
1477         if (node->right != NULL)
1478           right = node->right->first->node_idx;
1479         else
1480           right = node->next->node_idx;
1481         assert (REG_VALID_INDEX (left));
1482         assert (REG_VALID_INDEX (right));
1483         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1484       }
1485       break;
1486
1487     case ANCHOR:
1488     case OP_OPEN_SUBEXP:
1489     case OP_CLOSE_SUBEXP:
1490       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1491       break;
1492
1493     case OP_BACK_REF:
1494       dfa->nexts[idx] = node->next->node_idx;
1495       if (node->token.type == OP_BACK_REF)
1496         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1497       break;
1498
1499     default:
1500       assert (!IS_EPSILON_NODE (node->token.type));
1501       dfa->nexts[idx] = node->next->node_idx;
1502       break;
1503     }
1504
1505   return err;
1506 }
1507
1508 /* Duplicate the epsilon closure of the node ROOT_NODE.
1509    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1510    to their own constraint.  */
1511
1512 static reg_errcode_t
1513 internal_function
1514 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1515                         Idx root_node, unsigned int init_constraint)
1516 {
1517   Idx org_node, clone_node;
1518   bool ok;
1519   unsigned int constraint = init_constraint;
1520   for (org_node = top_org_node, clone_node = top_clone_node;;)
1521     {
1522       Idx org_dest, clone_dest;
1523       if (dfa->nodes[org_node].type == OP_BACK_REF)
1524         {
1525           /* If the back reference epsilon-transit, its destination must
1526              also have the constraint.  Then duplicate the epsilon closure
1527              of the destination of the back reference, and store it in
1528              edests of the back reference.  */
1529           org_dest = dfa->nexts[org_node];
1530           re_node_set_empty (dfa->edests + clone_node);
1531           clone_dest = duplicate_node (dfa, org_dest, constraint);
1532           if (BE (clone_dest == REG_MISSING, 0))
1533             return REG_ESPACE;
1534           dfa->nexts[clone_node] = dfa->nexts[org_node];
1535           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536           if (BE (! ok, 0))
1537             return REG_ESPACE;
1538         }
1539       else if (dfa->edests[org_node].nelem == 0)
1540         {
1541           /* In case of the node can't epsilon-transit, don't duplicate the
1542              destination and store the original destination as the
1543              destination of the node.  */
1544           dfa->nexts[clone_node] = dfa->nexts[org_node];
1545           break;
1546         }
1547       else if (dfa->edests[org_node].nelem == 1)
1548         {
1549           /* In case of the node can epsilon-transit, and it has only one
1550              destination.  */
1551           org_dest = dfa->edests[org_node].elems[0];
1552           re_node_set_empty (dfa->edests + clone_node);
1553           /* If the node is root_node itself, it means the epsilon closure
1554              has a loop.  Then tie it to the destination of the root_node.  */
1555           if (org_node == root_node && clone_node != org_node)
1556             {
1557               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1558               if (BE (! ok, 0))
1559                 return REG_ESPACE;
1560               break;
1561             }
1562           /* In case the node has another constraint, append it.  */
1563           constraint |= dfa->nodes[org_node].constraint;
1564           clone_dest = duplicate_node (dfa, org_dest, constraint);
1565           if (BE (clone_dest == REG_MISSING, 0))
1566             return REG_ESPACE;
1567           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1568           if (BE (! ok, 0))
1569             return REG_ESPACE;
1570         }
1571       else /* dfa->edests[org_node].nelem == 2 */
1572         {
1573           /* In case of the node can epsilon-transit, and it has two
1574              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1575           org_dest = dfa->edests[org_node].elems[0];
1576           re_node_set_empty (dfa->edests + clone_node);
1577           /* Search for a duplicated node which satisfies the constraint.  */
1578           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1579           if (clone_dest == REG_MISSING)
1580             {
1581               /* There is no such duplicated node, create a new one.  */
1582               reg_errcode_t err;
1583               clone_dest = duplicate_node (dfa, org_dest, constraint);
1584               if (BE (clone_dest == REG_MISSING, 0))
1585                 return REG_ESPACE;
1586               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1587               if (BE (! ok, 0))
1588                 return REG_ESPACE;
1589               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1590                                             root_node, constraint);
1591               if (BE (err != REG_NOERROR, 0))
1592                 return err;
1593             }
1594           else
1595             {
1596               /* There is a duplicated node which satisfies the constraint,
1597                  use it to avoid infinite loop.  */
1598               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1599               if (BE (! ok, 0))
1600                 return REG_ESPACE;
1601             }
1602
1603           org_dest = dfa->edests[org_node].elems[1];
1604           clone_dest = duplicate_node (dfa, org_dest, constraint);
1605           if (BE (clone_dest == REG_MISSING, 0))
1606             return REG_ESPACE;
1607           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1608           if (BE (! ok, 0))
1609             return REG_ESPACE;
1610         }
1611       org_node = org_dest;
1612       clone_node = clone_dest;
1613     }
1614   return REG_NOERROR;
1615 }
1616
1617 /* Search for a node which is duplicated from the node ORG_NODE, and
1618    satisfies the constraint CONSTRAINT.  */
1619
1620 static Idx
1621 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1622                         unsigned int constraint)
1623 {
1624   Idx idx;
1625   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1626     {
1627       if (org_node == dfa->org_indices[idx]
1628           && constraint == dfa->nodes[idx].constraint)
1629         return idx; /* Found.  */
1630     }
1631   return REG_MISSING; /* Not found.  */
1632 }
1633
1634 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1635    Return the index of the new node, or REG_MISSING if insufficient storage is
1636    available.  */
1637
1638 static Idx
1639 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1640 {
1641   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1642   if (BE (dup_idx != REG_MISSING, 1))
1643     {
1644       dfa->nodes[dup_idx].constraint = constraint;
1645       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1646       dfa->nodes[dup_idx].duplicated = 1;
1647
1648       /* Store the index of the original node.  */
1649       dfa->org_indices[dup_idx] = org_idx;
1650     }
1651   return dup_idx;
1652 }
1653
1654 static reg_errcode_t
1655 calc_inveclosure (re_dfa_t *dfa)
1656 {
1657   Idx src, idx;
1658   bool ok;
1659   for (idx = 0; idx < dfa->nodes_len; ++idx)
1660     re_node_set_init_empty (dfa->inveclosures + idx);
1661
1662   for (src = 0; src < dfa->nodes_len; ++src)
1663     {
1664       Idx *elems = dfa->eclosures[src].elems;
1665       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1666         {
1667           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1668           if (BE (! ok, 0))
1669             return REG_ESPACE;
1670         }
1671     }
1672
1673   return REG_NOERROR;
1674 }
1675
1676 /* Calculate "eclosure" for all the node in DFA.  */
1677
1678 static reg_errcode_t
1679 calc_eclosure (re_dfa_t *dfa)
1680 {
1681   Idx node_idx;
1682   bool incomplete;
1683 #ifdef DEBUG
1684   assert (dfa->nodes_len > 0);
1685 #endif
1686   incomplete = false;
1687   /* For each nodes, calculate epsilon closure.  */
1688   for (node_idx = 0; ; ++node_idx)
1689     {
1690       reg_errcode_t err;
1691       re_node_set eclosure_elem;
1692       if (node_idx == dfa->nodes_len)
1693         {
1694           if (!incomplete)
1695             break;
1696           incomplete = false;
1697           node_idx = 0;
1698         }
1699
1700 #ifdef DEBUG
1701       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1702 #endif
1703
1704       /* If we have already calculated, skip it.  */
1705       if (dfa->eclosures[node_idx].nelem != 0)
1706         continue;
1707       /* Calculate epsilon closure of 'node_idx'.  */
1708       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1709       if (BE (err != REG_NOERROR, 0))
1710         return err;
1711
1712       if (dfa->eclosures[node_idx].nelem == 0)
1713         {
1714           incomplete = true;
1715           re_node_set_free (&eclosure_elem);
1716         }
1717     }
1718   return REG_NOERROR;
1719 }
1720
1721 /* Calculate epsilon closure of NODE.  */
1722
1723 static reg_errcode_t
1724 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1725 {
1726   reg_errcode_t err;
1727   Idx i;
1728   re_node_set eclosure;
1729   bool ok;
1730   bool incomplete = false;
1731   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1732   if (BE (err != REG_NOERROR, 0))
1733     return err;
1734
1735   /* This indicates that we are calculating this node now.
1736      We reference this value to avoid infinite loop.  */
1737   dfa->eclosures[node].nelem = REG_MISSING;
1738
1739   /* If the current node has constraints, duplicate all nodes
1740      since they must inherit the constraints.  */
1741   if (dfa->nodes[node].constraint
1742       && dfa->edests[node].nelem
1743       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1744     {
1745       err = duplicate_node_closure (dfa, node, node, node,
1746                                     dfa->nodes[node].constraint);
1747       if (BE (err != REG_NOERROR, 0))
1748         return err;
1749     }
1750
1751   /* Expand each epsilon destination nodes.  */
1752   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1753     for (i = 0; i < dfa->edests[node].nelem; ++i)
1754       {
1755         re_node_set eclosure_elem;
1756         Idx edest = dfa->edests[node].elems[i];
1757         /* If calculating the epsilon closure of 'edest' is in progress,
1758            return intermediate result.  */
1759         if (dfa->eclosures[edest].nelem == REG_MISSING)
1760           {
1761             incomplete = true;
1762             continue;
1763           }
1764         /* If we haven't calculated the epsilon closure of 'edest' yet,
1765            calculate now. Otherwise use calculated epsilon closure.  */
1766         if (dfa->eclosures[edest].nelem == 0)
1767           {
1768             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1769             if (BE (err != REG_NOERROR, 0))
1770               return err;
1771           }
1772         else
1773           eclosure_elem = dfa->eclosures[edest];
1774         /* Merge the epsilon closure of 'edest'.  */
1775         err = re_node_set_merge (&eclosure, &eclosure_elem);
1776         if (BE (err != REG_NOERROR, 0))
1777           return err;
1778         /* If the epsilon closure of 'edest' is incomplete,
1779            the epsilon closure of this node is also incomplete.  */
1780         if (dfa->eclosures[edest].nelem == 0)
1781           {
1782             incomplete = true;
1783             re_node_set_free (&eclosure_elem);
1784           }
1785       }
1786
1787   /* An epsilon closure includes itself.  */
1788   ok = re_node_set_insert (&eclosure, node);
1789   if (BE (! ok, 0))
1790     return REG_ESPACE;
1791   if (incomplete && !root)
1792     dfa->eclosures[node].nelem = 0;
1793   else
1794     dfa->eclosures[node] = eclosure;
1795   *new_set = eclosure;
1796   return REG_NOERROR;
1797 }
1798 \f
1799 /* Functions for token which are used in the parser.  */
1800
1801 /* Fetch a token from INPUT.
1802    We must not use this function inside bracket expressions.  */
1803
1804 static void
1805 internal_function
1806 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1807 {
1808   re_string_skip_bytes (input, peek_token (result, input, syntax));
1809 }
1810
1811 /* Peek a token from INPUT, and return the length of the token.
1812    We must not use this function inside bracket expressions.  */
1813
1814 static int
1815 internal_function
1816 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1817 {
1818   unsigned char c;
1819
1820   if (re_string_eoi (input))
1821     {
1822       token->type = END_OF_RE;
1823       return 0;
1824     }
1825
1826   c = re_string_peek_byte (input, 0);
1827   token->opr.c = c;
1828
1829   token->word_char = 0;
1830 #ifdef RE_ENABLE_I18N
1831   token->mb_partial = 0;
1832   if (input->mb_cur_max > 1 &&
1833       !re_string_first_byte (input, re_string_cur_idx (input)))
1834     {
1835       token->type = CHARACTER;
1836       token->mb_partial = 1;
1837       return 1;
1838     }
1839 #endif
1840   if (c == '\\')
1841     {
1842       unsigned char c2;
1843       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1844         {
1845           token->type = BACK_SLASH;
1846           return 1;
1847         }
1848
1849       c2 = re_string_peek_byte_case (input, 1);
1850       token->opr.c = c2;
1851       token->type = CHARACTER;
1852 #ifdef RE_ENABLE_I18N
1853       if (input->mb_cur_max > 1)
1854         {
1855           wint_t wc = re_string_wchar_at (input,
1856                                           re_string_cur_idx (input) + 1);
1857           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1858         }
1859       else
1860 #endif
1861         token->word_char = IS_WORD_CHAR (c2) != 0;
1862
1863       switch (c2)
1864         {
1865         case '|':
1866           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1867             token->type = OP_ALT;
1868           break;
1869         case '1': case '2': case '3': case '4': case '5':
1870         case '6': case '7': case '8': case '9':
1871           if (!(syntax & RE_NO_BK_REFS))
1872             {
1873               token->type = OP_BACK_REF;
1874               token->opr.idx = c2 - '1';
1875             }
1876           break;
1877         case '<':
1878           if (!(syntax & RE_NO_GNU_OPS))
1879             {
1880               token->type = ANCHOR;
1881               token->opr.ctx_type = WORD_FIRST;
1882             }
1883           break;
1884         case '>':
1885           if (!(syntax & RE_NO_GNU_OPS))
1886             {
1887               token->type = ANCHOR;
1888               token->opr.ctx_type = WORD_LAST;
1889             }
1890           break;
1891         case 'b':
1892           if (!(syntax & RE_NO_GNU_OPS))
1893             {
1894               token->type = ANCHOR;
1895               token->opr.ctx_type = WORD_DELIM;
1896             }
1897           break;
1898         case 'B':
1899           if (!(syntax & RE_NO_GNU_OPS))
1900             {
1901               token->type = ANCHOR;
1902               token->opr.ctx_type = NOT_WORD_DELIM;
1903             }
1904           break;
1905         case 'w':
1906           if (!(syntax & RE_NO_GNU_OPS))
1907             token->type = OP_WORD;
1908           break;
1909         case 'W':
1910           if (!(syntax & RE_NO_GNU_OPS))
1911             token->type = OP_NOTWORD;
1912           break;
1913         case 's':
1914           if (!(syntax & RE_NO_GNU_OPS))
1915             token->type = OP_SPACE;
1916           break;
1917         case 'S':
1918           if (!(syntax & RE_NO_GNU_OPS))
1919             token->type = OP_NOTSPACE;
1920           break;
1921         case '`':
1922           if (!(syntax & RE_NO_GNU_OPS))
1923             {
1924               token->type = ANCHOR;
1925               token->opr.ctx_type = BUF_FIRST;
1926             }
1927           break;
1928         case '\'':
1929           if (!(syntax & RE_NO_GNU_OPS))
1930             {
1931               token->type = ANCHOR;
1932               token->opr.ctx_type = BUF_LAST;
1933             }
1934           break;
1935         case '(':
1936           if (!(syntax & RE_NO_BK_PARENS))
1937             token->type = OP_OPEN_SUBEXP;
1938           break;
1939         case ')':
1940           if (!(syntax & RE_NO_BK_PARENS))
1941             token->type = OP_CLOSE_SUBEXP;
1942           break;
1943         case '+':
1944           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1945             token->type = OP_DUP_PLUS;
1946           break;
1947         case '?':
1948           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1949             token->type = OP_DUP_QUESTION;
1950           break;
1951         case '{':
1952           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1953             token->type = OP_OPEN_DUP_NUM;
1954           break;
1955         case '}':
1956           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1957             token->type = OP_CLOSE_DUP_NUM;
1958           break;
1959         default:
1960           break;
1961         }
1962       return 2;
1963     }
1964
1965   token->type = CHARACTER;
1966 #ifdef RE_ENABLE_I18N
1967   if (input->mb_cur_max > 1)
1968     {
1969       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1970       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1971     }
1972   else
1973 #endif
1974     token->word_char = IS_WORD_CHAR (token->opr.c);
1975
1976   switch (c)
1977     {
1978     case '\n':
1979       if (syntax & RE_NEWLINE_ALT)
1980         token->type = OP_ALT;
1981       break;
1982     case '|':
1983       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1984         token->type = OP_ALT;
1985       break;
1986     case '*':
1987       token->type = OP_DUP_ASTERISK;
1988       break;
1989     case '+':
1990       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1991         token->type = OP_DUP_PLUS;
1992       break;
1993     case '?':
1994       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1995         token->type = OP_DUP_QUESTION;
1996       break;
1997     case '{':
1998       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1999         token->type = OP_OPEN_DUP_NUM;
2000       break;
2001     case '}':
2002       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2003         token->type = OP_CLOSE_DUP_NUM;
2004       break;
2005     case '(':
2006       if (syntax & RE_NO_BK_PARENS)
2007         token->type = OP_OPEN_SUBEXP;
2008       break;
2009     case ')':
2010       if (syntax & RE_NO_BK_PARENS)
2011         token->type = OP_CLOSE_SUBEXP;
2012       break;
2013     case '[':
2014       token->type = OP_OPEN_BRACKET;
2015       break;
2016     case '.':
2017       token->type = OP_PERIOD;
2018       break;
2019     case '^':
2020       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2021           re_string_cur_idx (input) != 0)
2022         {
2023           char prev = re_string_peek_byte (input, -1);
2024           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2025             break;
2026         }
2027       token->type = ANCHOR;
2028       token->opr.ctx_type = LINE_FIRST;
2029       break;
2030     case '$':
2031       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2032           re_string_cur_idx (input) + 1 != re_string_length (input))
2033         {
2034           re_token_t next;
2035           re_string_skip_bytes (input, 1);
2036           peek_token (&next, input, syntax);
2037           re_string_skip_bytes (input, -1);
2038           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2039             break;
2040         }
2041       token->type = ANCHOR;
2042       token->opr.ctx_type = LINE_LAST;
2043       break;
2044     default:
2045       break;
2046     }
2047   return 1;
2048 }
2049
2050 /* Peek a token from INPUT, and return the length of the token.
2051    We must not use this function out of bracket expressions.  */
2052
2053 static int
2054 internal_function
2055 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2056 {
2057   unsigned char c;
2058   if (re_string_eoi (input))
2059     {
2060       token->type = END_OF_RE;
2061       return 0;
2062     }
2063   c = re_string_peek_byte (input, 0);
2064   token->opr.c = c;
2065
2066 #ifdef RE_ENABLE_I18N
2067   if (input->mb_cur_max > 1 &&
2068       !re_string_first_byte (input, re_string_cur_idx (input)))
2069     {
2070       token->type = CHARACTER;
2071       return 1;
2072     }
2073 #endif /* RE_ENABLE_I18N */
2074
2075   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2076       && re_string_cur_idx (input) + 1 < re_string_length (input))
2077     {
2078       /* In this case, '\' escape a character.  */
2079       unsigned char c2;
2080       re_string_skip_bytes (input, 1);
2081       c2 = re_string_peek_byte (input, 0);
2082       token->opr.c = c2;
2083       token->type = CHARACTER;
2084       return 1;
2085     }
2086   if (c == '[') /* '[' is a special char in a bracket exps.  */
2087     {
2088       unsigned char c2;
2089       int token_len;
2090       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2091         c2 = re_string_peek_byte (input, 1);
2092       else
2093         c2 = 0;
2094       token->opr.c = c2;
2095       token_len = 2;
2096       switch (c2)
2097         {
2098         case '.':
2099           token->type = OP_OPEN_COLL_ELEM;
2100           break;
2101         case '=':
2102           token->type = OP_OPEN_EQUIV_CLASS;
2103           break;
2104         case ':':
2105           if (syntax & RE_CHAR_CLASSES)
2106             {
2107               token->type = OP_OPEN_CHAR_CLASS;
2108               break;
2109             }
2110           /* else fall through.  */
2111         default:
2112           token->type = CHARACTER;
2113           token->opr.c = c;
2114           token_len = 1;
2115           break;
2116         }
2117       return token_len;
2118     }
2119   switch (c)
2120     {
2121     case '-':
2122       token->type = OP_CHARSET_RANGE;
2123       break;
2124     case ']':
2125       token->type = OP_CLOSE_BRACKET;
2126       break;
2127     case '^':
2128       token->type = OP_NON_MATCH_LIST;
2129       break;
2130     default:
2131       token->type = CHARACTER;
2132     }
2133   return 1;
2134 }
2135 \f
2136 /* Functions for parser.  */
2137
2138 /* Entry point of the parser.
2139    Parse the regular expression REGEXP and return the structure tree.
2140    If an error occurs, ERR is set by error code, and return NULL.
2141    This function build the following tree, from regular expression <reg_exp>:
2142            CAT
2143            / \
2144           /   \
2145    <reg_exp>  EOR
2146
2147    CAT means concatenation.
2148    EOR means end of regular expression.  */
2149
2150 static bin_tree_t *
2151 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2152        reg_errcode_t *err)
2153 {
2154   re_dfa_t *dfa = preg->buffer;
2155   bin_tree_t *tree, *eor, *root;
2156   re_token_t current_token;
2157   dfa->syntax = syntax;
2158   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2159   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2160   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2161     return NULL;
2162   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2163   if (tree != NULL)
2164     root = create_tree (dfa, tree, eor, CONCAT);
2165   else
2166     root = eor;
2167   if (BE (eor == NULL || root == NULL, 0))
2168     {
2169       *err = REG_ESPACE;
2170       return NULL;
2171     }
2172   return root;
2173 }
2174
2175 /* This function build the following tree, from regular expression
2176    <branch1>|<branch2>:
2177            ALT
2178            / \
2179           /   \
2180    <branch1> <branch2>
2181
2182    ALT means alternative, which represents the operator '|'.  */
2183
2184 static bin_tree_t *
2185 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2186                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2187 {
2188   re_dfa_t *dfa = preg->buffer;
2189   bin_tree_t *tree, *branch = NULL;
2190   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2191   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2192   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2193     return NULL;
2194
2195   while (token->type == OP_ALT)
2196     {
2197       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2198       if (token->type != OP_ALT && token->type != END_OF_RE
2199           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2200         {
2201       bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2202       dfa->completed_bkref_map = initial_bkref_map;
2203           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2204           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2205           {
2206             if (tree != NULL)
2207                   postorder (tree, free_tree, NULL);
2208             return NULL;
2209           }
2210           dfa->completed_bkref_map |= accumulated_bkref_map;
2211         }
2212       else
2213         branch = NULL;
2214       tree = create_tree (dfa, tree, branch, OP_ALT);
2215       if (BE (tree == NULL, 0))
2216         {
2217           *err = REG_ESPACE;
2218           return NULL;
2219         }
2220     }
2221   return tree;
2222 }
2223
2224 /* This function build the following tree, from regular expression
2225    <exp1><exp2>:
2226         CAT
2227         / \
2228        /   \
2229    <exp1> <exp2>
2230
2231    CAT means concatenation.  */
2232
2233 static bin_tree_t *
2234 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2235               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2236 {
2237   bin_tree_t *tree, *expr;
2238   re_dfa_t *dfa = preg->buffer;
2239   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2240   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2241     return NULL;
2242
2243   while (token->type != OP_ALT && token->type != END_OF_RE
2244          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2245     {
2246       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2247       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2248         {
2249           if (tree != NULL)
2250             postorder (tree, free_tree, NULL);
2251           return NULL;
2252         }
2253       if (tree != NULL && expr != NULL)
2254         {
2255           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2256           if (newtree == NULL)
2257             {
2258               postorder (expr, free_tree, NULL);
2259               postorder (tree, free_tree, NULL);
2260               *err = REG_ESPACE;
2261               return NULL;
2262             }
2263           tree = newtree;
2264         }
2265       else if (tree == NULL)
2266         tree = expr;
2267       /* Otherwise expr == NULL, we don't need to create new tree.  */
2268     }
2269   return tree;
2270 }
2271
2272 /* This function build the following tree, from regular expression a*:
2273          *
2274          |
2275          a
2276 */
2277
2278 static bin_tree_t *
2279 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2280                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2281 {
2282   re_dfa_t *dfa = preg->buffer;
2283   bin_tree_t *tree;
2284   switch (token->type)
2285     {
2286     case CHARACTER:
2287       tree = create_token_tree (dfa, NULL, NULL, token);
2288       if (BE (tree == NULL, 0))
2289         {
2290           *err = REG_ESPACE;
2291           return NULL;
2292         }
2293 #ifdef RE_ENABLE_I18N
2294       if (dfa->mb_cur_max > 1)
2295         {
2296           while (!re_string_eoi (regexp)
2297                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2298             {
2299               bin_tree_t *mbc_remain;
2300               fetch_token (token, regexp, syntax);
2301               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2302               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2303               if (BE (mbc_remain == NULL || tree == NULL, 0))
2304                 {
2305                   *err = REG_ESPACE;
2306                   return NULL;
2307                 }
2308             }
2309         }
2310 #endif
2311       break;
2312     case OP_OPEN_SUBEXP:
2313       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2314       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2315         return NULL;
2316       break;
2317     case OP_OPEN_BRACKET:
2318       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2319       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2320         return NULL;
2321       break;
2322     case OP_BACK_REF:
2323       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2324         {
2325           *err = REG_ESUBREG;
2326           return NULL;
2327         }
2328       dfa->used_bkref_map |= 1 << token->opr.idx;
2329       tree = create_token_tree (dfa, NULL, NULL, token);
2330       if (BE (tree == NULL, 0))
2331         {
2332           *err = REG_ESPACE;
2333           return NULL;
2334         }
2335       ++dfa->nbackref;
2336       dfa->has_mb_node = 1;
2337       break;
2338     case OP_OPEN_DUP_NUM:
2339       if (syntax & RE_CONTEXT_INVALID_DUP)
2340         {
2341           *err = REG_BADRPT;
2342           return NULL;
2343         }
2344       /* FALLTHROUGH */
2345     case OP_DUP_ASTERISK:
2346     case OP_DUP_PLUS:
2347     case OP_DUP_QUESTION:
2348       if (syntax & RE_CONTEXT_INVALID_OPS)
2349         {
2350           *err = REG_BADRPT;
2351           return NULL;
2352         }
2353       else if (syntax & RE_CONTEXT_INDEP_OPS)
2354         {
2355           fetch_token (token, regexp, syntax);
2356           return parse_expression (regexp, preg, token, syntax, nest, err);
2357         }
2358       /* else fall through  */
2359     case OP_CLOSE_SUBEXP:
2360       if ((token->type == OP_CLOSE_SUBEXP) &&
2361           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2362         {
2363           *err = REG_ERPAREN;
2364           return NULL;
2365         }
2366       /* else fall through  */
2367     case OP_CLOSE_DUP_NUM:
2368       /* We treat it as a normal character.  */
2369
2370       /* Then we can these characters as normal characters.  */
2371       token->type = CHARACTER;
2372       /* mb_partial and word_char bits should be initialized already
2373          by peek_token.  */
2374       tree = create_token_tree (dfa, NULL, NULL, token);
2375       if (BE (tree == NULL, 0))
2376         {
2377           *err = REG_ESPACE;
2378           return NULL;
2379         }
2380       break;
2381     case ANCHOR:
2382       if ((token->opr.ctx_type
2383            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2384           && dfa->word_ops_used == 0)
2385         init_word_char (dfa);
2386       if (token->opr.ctx_type == WORD_DELIM
2387           || token->opr.ctx_type == NOT_WORD_DELIM)
2388         {
2389           bin_tree_t *tree_first, *tree_last;
2390           if (token->opr.ctx_type == WORD_DELIM)
2391             {
2392               token->opr.ctx_type = WORD_FIRST;
2393               tree_first = create_token_tree (dfa, NULL, NULL, token);
2394               token->opr.ctx_type = WORD_LAST;
2395             }
2396           else
2397             {
2398               token->opr.ctx_type = INSIDE_WORD;
2399               tree_first = create_token_tree (dfa, NULL, NULL, token);
2400               token->opr.ctx_type = INSIDE_NOTWORD;
2401             }
2402           tree_last = create_token_tree (dfa, NULL, NULL, token);
2403           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2404           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2405             {
2406               *err = REG_ESPACE;
2407               return NULL;
2408             }
2409         }
2410       else
2411         {
2412           tree = create_token_tree (dfa, NULL, NULL, token);
2413           if (BE (tree == NULL, 0))
2414             {
2415               *err = REG_ESPACE;
2416               return NULL;
2417             }
2418         }
2419       /* We must return here, since ANCHORs can't be followed
2420          by repetition operators.
2421          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2422              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2423       fetch_token (token, regexp, syntax);
2424       return tree;
2425     case OP_PERIOD:
2426       tree = create_token_tree (dfa, NULL, NULL, token);
2427       if (BE (tree == NULL, 0))
2428         {
2429           *err = REG_ESPACE;
2430           return NULL;
2431         }
2432       if (dfa->mb_cur_max > 1)
2433         dfa->has_mb_node = 1;
2434       break;
2435     case OP_WORD:
2436     case OP_NOTWORD:
2437       tree = build_charclass_op (dfa, regexp->trans,
2438                                  "alnum",
2439                                  "_",
2440                                  token->type == OP_NOTWORD, err);
2441       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2442         return NULL;
2443       break;
2444     case OP_SPACE:
2445     case OP_NOTSPACE:
2446       tree = build_charclass_op (dfa, regexp->trans,
2447                                  "space",
2448                                  "",
2449                                  token->type == OP_NOTSPACE, err);
2450       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2451         return NULL;
2452       break;
2453     case OP_ALT:
2454     case END_OF_RE:
2455       return NULL;
2456     case BACK_SLASH:
2457       *err = REG_EESCAPE;
2458       return NULL;
2459     default:
2460       /* Must not happen?  */
2461 #ifdef DEBUG
2462       assert (0);
2463 #endif
2464       return NULL;
2465     }
2466   fetch_token (token, regexp, syntax);
2467
2468   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2469          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2470     {
2471       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2472       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2473         return NULL;
2474       /* In BRE consecutive duplications are not allowed.  */
2475       if ((syntax & RE_CONTEXT_INVALID_DUP)
2476           && (token->type == OP_DUP_ASTERISK
2477               || token->type == OP_OPEN_DUP_NUM))
2478         {
2479           *err = REG_BADRPT;
2480           return NULL;
2481         }
2482     }
2483
2484   return tree;
2485 }
2486
2487 /* This function build the following tree, from regular expression
2488    (<reg_exp>):
2489          SUBEXP
2490             |
2491         <reg_exp>
2492 */
2493
2494 static bin_tree_t *
2495 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2496                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2497 {
2498   re_dfa_t *dfa = preg->buffer;
2499   bin_tree_t *tree;
2500   size_t cur_nsub;
2501   cur_nsub = preg->re_nsub++;
2502
2503   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2504
2505   /* The subexpression may be a null string.  */
2506   if (token->type == OP_CLOSE_SUBEXP)
2507     tree = NULL;
2508   else
2509     {
2510       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2511       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2512         {
2513           if (tree != NULL)
2514             postorder (tree, free_tree, NULL);
2515           *err = REG_EPAREN;
2516         }
2517       if (BE (*err != REG_NOERROR, 0))
2518         return NULL;
2519     }
2520
2521   if (cur_nsub <= '9' - '1')
2522     dfa->completed_bkref_map |= 1 << cur_nsub;
2523
2524   tree = create_tree (dfa, tree, NULL, SUBEXP);
2525   if (BE (tree == NULL, 0))
2526     {
2527       *err = REG_ESPACE;
2528       return NULL;
2529     }
2530   tree->token.opr.idx = cur_nsub;
2531   return tree;
2532 }
2533
2534 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2535
2536 static bin_tree_t *
2537 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2538               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2539 {
2540   bin_tree_t *tree = NULL, *old_tree = NULL;
2541   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2542   re_token_t start_token = *token;
2543
2544   if (token->type == OP_OPEN_DUP_NUM)
2545     {
2546       end = 0;
2547       start = fetch_number (regexp, token, syntax);
2548       if (start == REG_MISSING)
2549         {
2550           if (token->type == CHARACTER && token->opr.c == ',')
2551             start = 0; /* We treat "{,m}" as "{0,m}".  */
2552           else
2553             {
2554               *err = REG_BADBR; /* <re>{} is invalid.  */
2555               return NULL;
2556             }
2557         }
2558       if (BE (start != REG_ERROR, 1))
2559         {
2560           /* We treat "{n}" as "{n,n}".  */
2561           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2562                  : ((token->type == CHARACTER && token->opr.c == ',')
2563                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2564         }
2565       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2566         {
2567           /* Invalid sequence.  */
2568           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2569             {
2570               if (token->type == END_OF_RE)
2571                 *err = REG_EBRACE;
2572               else
2573                 *err = REG_BADBR;
2574
2575               return NULL;
2576             }
2577
2578           /* If the syntax bit is set, rollback.  */
2579           re_string_set_index (regexp, start_idx);
2580           *token = start_token;
2581           token->type = CHARACTER;
2582           /* mb_partial and word_char bits should be already initialized by
2583              peek_token.  */
2584           return elem;
2585         }
2586
2587       if (BE ((end != REG_MISSING && start > end)
2588               || token->type != OP_CLOSE_DUP_NUM, 0))
2589         {
2590           /* First number greater than second.  */
2591           *err = REG_BADBR;
2592           return NULL;
2593         }
2594
2595       if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2596         {
2597           *err = REG_ESIZE;
2598           return NULL;
2599         }
2600     }
2601   else
2602     {
2603       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2604       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2605     }
2606
2607   fetch_token (token, regexp, syntax);
2608
2609   if (BE (elem == NULL, 0))
2610     return NULL;
2611   if (BE (start == 0 && end == 0, 0))
2612     {
2613       postorder (elem, free_tree, NULL);
2614       return NULL;
2615     }
2616
2617   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2618   if (BE (start > 0, 0))
2619     {
2620       tree = elem;
2621       for (i = 2; i <= start; ++i)
2622         {
2623           elem = duplicate_tree (elem, dfa);
2624           tree = create_tree (dfa, tree, elem, CONCAT);
2625           if (BE (elem == NULL || tree == NULL, 0))
2626             goto parse_dup_op_espace;
2627         }
2628
2629       if (start == end)
2630         return tree;
2631
2632       /* Duplicate ELEM before it is marked optional.  */
2633       elem = duplicate_tree (elem, dfa);
2634       old_tree = tree;
2635     }
2636   else
2637     old_tree = NULL;
2638
2639   if (elem->token.type == SUBEXP)
2640     {
2641       uintptr_t subidx = elem->token.opr.idx;
2642       postorder (elem, mark_opt_subexp, (void *) subidx);
2643     }
2644
2645   tree = create_tree (dfa, elem, NULL,
2646                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2647   if (BE (tree == NULL, 0))
2648     goto parse_dup_op_espace;
2649
2650 /* From gnulib's "intprops.h":
2651    True if the arithmetic type T is signed.  */
2652 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2653
2654   /* This loop is actually executed only when end != REG_MISSING,
2655      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2656      already created the start+1-th copy.  */
2657   if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2658     for (i = start + 2; i <= end; ++i)
2659       {
2660         elem = duplicate_tree (elem, dfa);
2661         tree = create_tree (dfa, tree, elem, CONCAT);
2662         if (BE (elem == NULL || tree == NULL, 0))
2663           goto parse_dup_op_espace;
2664
2665         tree = create_tree (dfa, tree, NULL, OP_ALT);
2666         if (BE (tree == NULL, 0))
2667           goto parse_dup_op_espace;
2668       }
2669
2670   if (old_tree)
2671     tree = create_tree (dfa, old_tree, tree, CONCAT);
2672
2673   return tree;
2674
2675  parse_dup_op_espace:
2676   *err = REG_ESPACE;
2677   return NULL;
2678 }
2679
2680 /* Size of the names for collating symbol/equivalence_class/character_class.
2681    I'm not sure, but maybe enough.  */
2682 #define BRACKET_NAME_BUF_SIZE 32
2683
2684 #ifndef _LIBC
2685   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2686      Build the range expression which starts from START_ELEM, and ends
2687      at END_ELEM.  The result are written to MBCSET and SBCSET.
2688      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2689      mbcset->range_ends, is a pointer argument since we may
2690      update it.  */
2691
2692 static reg_errcode_t
2693 internal_function
2694 # ifdef RE_ENABLE_I18N
2695 build_range_exp (const reg_syntax_t syntax,
2696                  bitset_t sbcset,
2697                  re_charset_t *mbcset,
2698                  Idx *range_alloc,
2699                  const bracket_elem_t *start_elem,
2700                  const bracket_elem_t *end_elem)
2701 # else /* not RE_ENABLE_I18N */
2702 build_range_exp (const reg_syntax_t syntax,
2703                  bitset_t sbcset,
2704                  const bracket_elem_t *start_elem,
2705                  const bracket_elem_t *end_elem)
2706 # endif /* not RE_ENABLE_I18N */
2707 {
2708   unsigned int start_ch, end_ch;
2709   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2710   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2711           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2712           0))
2713     return REG_ERANGE;
2714
2715   /* We can handle no multi character collating elements without libc
2716      support.  */
2717   if (BE ((start_elem->type == COLL_SYM
2718            && strlen ((char *) start_elem->opr.name) > 1)
2719           || (end_elem->type == COLL_SYM
2720               && strlen ((char *) end_elem->opr.name) > 1), 0))
2721     return REG_ECOLLATE;
2722
2723 # ifdef RE_ENABLE_I18N
2724   {
2725     wchar_t wc;
2726     wint_t start_wc;
2727     wint_t end_wc;
2728
2729     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2730                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2731                    : 0));
2732     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2733               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2734                  : 0));
2735     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2736                 ? __btowc (start_ch) : start_elem->opr.wch);
2737     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2738               ? __btowc (end_ch) : end_elem->opr.wch);
2739     if (start_wc == WEOF || end_wc == WEOF)
2740       return REG_ECOLLATE;
2741     else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2742       return REG_ERANGE;
2743
2744     /* Got valid collation sequence values, add them as a new entry.
2745        However, for !_LIBC we have no collation elements: if the
2746        character set is single byte, the single byte character set
2747        that we build below suffices.  parse_bracket_exp passes
2748        no MBCSET if dfa->mb_cur_max == 1.  */
2749     if (mbcset)
2750       {
2751         /* Check the space of the arrays.  */
2752         if (BE (*range_alloc == mbcset->nranges, 0))
2753           {
2754             /* There is not enough space, need realloc.  */
2755             wchar_t *new_array_start, *new_array_end;
2756             Idx new_nranges;
2757
2758             /* +1 in case of mbcset->nranges is 0.  */
2759             new_nranges = 2 * mbcset->nranges + 1;
2760             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2761                are NULL if *range_alloc == 0.  */
2762             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2763                                           new_nranges);
2764             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2765                                         new_nranges);
2766
2767             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2768               return REG_ESPACE;
2769
2770             mbcset->range_starts = new_array_start;
2771             mbcset->range_ends = new_array_end;
2772             *range_alloc = new_nranges;
2773           }
2774
2775         mbcset->range_starts[mbcset->nranges] = start_wc;
2776         mbcset->range_ends[mbcset->nranges++] = end_wc;
2777       }
2778
2779     /* Build the table for single byte characters.  */
2780     for (wc = 0; wc < SBC_MAX; ++wc)
2781       {
2782         if (start_wc <= wc && wc <= end_wc)
2783           bitset_set (sbcset, wc);
2784       }
2785   }
2786 # else /* not RE_ENABLE_I18N */
2787   {
2788     unsigned int ch;
2789     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2790                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2791                    : 0));
2792     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2793               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2794                  : 0));
2795     if (start_ch > end_ch)
2796       return REG_ERANGE;
2797     /* Build the table for single byte characters.  */
2798     for (ch = 0; ch < SBC_MAX; ++ch)
2799       if (start_ch <= ch  && ch <= end_ch)
2800         bitset_set (sbcset, ch);
2801   }
2802 # endif /* not RE_ENABLE_I18N */
2803   return REG_NOERROR;
2804 }
2805 #endif /* not _LIBC */
2806
2807 #ifndef _LIBC
2808 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2809    Build the collating element which is represented by NAME.
2810    The result are written to MBCSET and SBCSET.
2811    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2812    pointer argument since we may update it.  */
2813
2814 static reg_errcode_t
2815 internal_function
2816 # ifdef RE_ENABLE_I18N
2817 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2818                         Idx *coll_sym_alloc, const unsigned char *name)
2819 # else /* not RE_ENABLE_I18N */
2820 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2821 # endif /* not RE_ENABLE_I18N */
2822 {
2823   size_t name_len = strlen ((const char *) name);
2824   if (BE (name_len != 1, 0))
2825     return REG_ECOLLATE;
2826   else
2827     {
2828       bitset_set (sbcset, name[0]);
2829       return REG_NOERROR;
2830     }
2831 }
2832 #endif /* not _LIBC */
2833
2834 /* This function parse bracket expression like "[abc]", "[a-c]",
2835    "[[.a-a.]]" etc.  */
2836
2837 static bin_tree_t *
2838 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2839                    reg_syntax_t syntax, reg_errcode_t *err)
2840 {
2841 #ifdef _LIBC
2842   const unsigned char *collseqmb;
2843   const char *collseqwc;
2844   uint32_t nrules;
2845   int32_t table_size;
2846   const int32_t *symb_table;
2847   const unsigned char *extra;
2848
2849   /* Local function for parse_bracket_exp used in _LIBC environment.
2850      Seek the collating symbol entry corresponding to NAME.
2851      Return the index of the symbol in the SYMB_TABLE,
2852      or -1 if not found.  */
2853
2854   auto inline int32_t
2855   __attribute__ ((always_inline))
2856   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2857     {
2858       int32_t elem;
2859
2860       for (elem = 0; elem < table_size; elem++)
2861         if (symb_table[2 * elem] != 0)
2862           {
2863             int32_t idx = symb_table[2 * elem + 1];
2864             /* Skip the name of collating element name.  */
2865             idx += 1 + extra[idx];
2866             if (/* Compare the length of the name.  */
2867                 name_len == extra[idx]
2868                 /* Compare the name.  */
2869                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2870               /* Yep, this is the entry.  */
2871               return elem;
2872           }
2873       return -1;
2874     }
2875
2876   /* Local function for parse_bracket_exp used in _LIBC environment.
2877      Look up the collation sequence value of BR_ELEM.
2878      Return the value if succeeded, UINT_MAX otherwise.  */
2879
2880   auto inline unsigned int
2881   __attribute__ ((always_inline))
2882   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2883     {
2884       if (br_elem->type == SB_CHAR)
2885         {
2886           /*
2887           if (MB_CUR_MAX == 1)
2888           */
2889           if (nrules == 0)
2890             return collseqmb[br_elem->opr.ch];
2891           else
2892             {
2893               wint_t wc = __btowc (br_elem->opr.ch);
2894               return __collseq_table_lookup (collseqwc, wc);
2895             }
2896         }
2897       else if (br_elem->type == MB_CHAR)
2898         {
2899           if (nrules != 0)
2900             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2901         }
2902       else if (br_elem->type == COLL_SYM)
2903         {
2904           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2905           if (nrules != 0)
2906             {
2907               int32_t elem, idx;
2908               elem = seek_collating_symbol_entry (br_elem->opr.name,
2909                                                   sym_name_len);
2910               if (elem != -1)
2911                 {
2912                   /* We found the entry.  */
2913                   idx = symb_table[2 * elem + 1];
2914                   /* Skip the name of collating element name.  */
2915                   idx += 1 + extra[idx];
2916                   /* Skip the byte sequence of the collating element.  */
2917                   idx += 1 + extra[idx];
2918                   /* Adjust for the alignment.  */
2919                   idx = (idx + 3) & ~3;
2920                   /* Skip the multibyte collation sequence value.  */
2921                   idx += sizeof (unsigned int);
2922                   /* Skip the wide char sequence of the collating element.  */
2923                   idx += sizeof (unsigned int) *
2924                     (1 + *(unsigned int *) (extra + idx));
2925                   /* Return the collation sequence value.  */
2926                   return *(unsigned int *) (extra + idx);
2927                 }
2928               else if (sym_name_len == 1)
2929                 {
2930                   /* No valid character.  Match it as a single byte
2931                      character.  */
2932                   return collseqmb[br_elem->opr.name[0]];
2933                 }
2934             }
2935           else if (sym_name_len == 1)
2936             return collseqmb[br_elem->opr.name[0]];
2937         }
2938       return UINT_MAX;
2939     }
2940
2941   /* Local function for parse_bracket_exp used in _LIBC environment.
2942      Build the range expression which starts from START_ELEM, and ends
2943      at END_ELEM.  The result are written to MBCSET and SBCSET.
2944      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2945      mbcset->range_ends, is a pointer argument since we may
2946      update it.  */
2947
2948   auto inline reg_errcode_t
2949   __attribute__ ((always_inline))
2950   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2951                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2952     {
2953       unsigned int ch;
2954       uint32_t start_collseq;
2955       uint32_t end_collseq;
2956
2957       /* Equivalence Classes and Character Classes can't be a range
2958          start/end.  */
2959       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2960               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2961               0))
2962         return REG_ERANGE;
2963
2964       /* FIXME: Implement rational ranges here, too.  */
2965       start_collseq = lookup_collation_sequence_value (start_elem);
2966       end_collseq = lookup_collation_sequence_value (end_elem);
2967       /* Check start/end collation sequence values.  */
2968       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2969         return REG_ECOLLATE;
2970       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2971         return REG_ERANGE;
2972
2973       /* Got valid collation sequence values, add them as a new entry.
2974          However, if we have no collation elements, and the character set
2975          is single byte, the single byte character set that we
2976          build below suffices. */
2977       if (nrules > 0 || dfa->mb_cur_max > 1)
2978         {
2979           /* Check the space of the arrays.  */
2980           if (BE (*range_alloc == mbcset->nranges, 0))
2981             {
2982               /* There is not enough space, need realloc.  */
2983               uint32_t *new_array_start;
2984               uint32_t *new_array_end;
2985               Idx new_nranges;
2986
2987               /* +1 in case of mbcset->nranges is 0.  */
2988               new_nranges = 2 * mbcset->nranges + 1;
2989               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2990                                             new_nranges);
2991               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2992                                           new_nranges);
2993
2994               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2995                 return REG_ESPACE;
2996
2997               mbcset->range_starts = new_array_start;
2998               mbcset->range_ends = new_array_end;
2999               *range_alloc = new_nranges;
3000             }
3001
3002           mbcset->range_starts[mbcset->nranges] = start_collseq;
3003           mbcset->range_ends[mbcset->nranges++] = end_collseq;
3004         }
3005
3006       /* Build the table for single byte characters.  */
3007       for (ch = 0; ch < SBC_MAX; ch++)
3008         {
3009           uint32_t ch_collseq;
3010           /*
3011           if (MB_CUR_MAX == 1)
3012           */
3013           if (nrules == 0)
3014             ch_collseq = collseqmb[ch];
3015           else
3016             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3017           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3018             bitset_set (sbcset, ch);
3019         }
3020       return REG_NOERROR;
3021     }
3022
3023   /* Local function for parse_bracket_exp used in _LIBC environment.
3024      Build the collating element which is represented by NAME.
3025      The result are written to MBCSET and SBCSET.
3026      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3027      pointer argument since we may update it.  */
3028
3029   auto inline reg_errcode_t
3030   __attribute__ ((always_inline))
3031   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3032                           Idx *coll_sym_alloc, const unsigned char *name)
3033     {
3034       int32_t elem, idx;
3035       size_t name_len = strlen ((const char *) name);
3036       if (nrules != 0)
3037         {
3038           elem = seek_collating_symbol_entry (name, name_len);
3039           if (elem != -1)
3040             {
3041               /* We found the entry.  */
3042               idx = symb_table[2 * elem + 1];
3043               /* Skip the name of collating element name.  */
3044               idx += 1 + extra[idx];
3045             }
3046           else if (name_len == 1)
3047             {
3048               /* No valid character, treat it as a normal
3049                  character.  */
3050               bitset_set (sbcset, name[0]);
3051               return REG_NOERROR;
3052             }
3053           else
3054             return REG_ECOLLATE;
3055
3056           /* Got valid collation sequence, add it as a new entry.  */
3057           /* Check the space of the arrays.  */
3058           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3059             {
3060               /* Not enough, realloc it.  */
3061               /* +1 in case of mbcset->ncoll_syms is 0.  */
3062               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3063               /* Use realloc since mbcset->coll_syms is NULL
3064                  if *alloc == 0.  */
3065               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3066                                                    new_coll_sym_alloc);
3067               if (BE (new_coll_syms == NULL, 0))
3068                 return REG_ESPACE;
3069               mbcset->coll_syms = new_coll_syms;
3070               *coll_sym_alloc = new_coll_sym_alloc;
3071             }
3072           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3073           return REG_NOERROR;
3074         }
3075       else
3076         {
3077           if (BE (name_len != 1, 0))
3078             return REG_ECOLLATE;
3079           else
3080             {
3081               bitset_set (sbcset, name[0]);
3082               return REG_NOERROR;
3083             }
3084         }
3085     }
3086 #endif
3087
3088   re_token_t br_token;
3089   re_bitset_ptr_t sbcset;
3090 #ifdef RE_ENABLE_I18N
3091   re_charset_t *mbcset;
3092   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3093   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3094 #endif /* not RE_ENABLE_I18N */
3095   bool non_match = false;
3096   bin_tree_t *work_tree;
3097   int token_len;
3098   bool first_round = true;
3099 #ifdef _LIBC
3100   collseqmb = (const unsigned char *)
3101     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3102   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3103   if (nrules)
3104     {
3105       /*
3106       if (MB_CUR_MAX > 1)
3107       */
3108       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3109       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3110       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3111                                                   _NL_COLLATE_SYMB_TABLEMB);
3112       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3113                                                    _NL_COLLATE_SYMB_EXTRAMB);
3114     }
3115 #endif
3116   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3117 #ifdef RE_ENABLE_I18N
3118   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3119 #endif /* RE_ENABLE_I18N */
3120 #ifdef RE_ENABLE_I18N
3121   if (BE (sbcset == NULL || mbcset == NULL, 0))
3122 #else
3123   if (BE (sbcset == NULL, 0))
3124 #endif /* RE_ENABLE_I18N */
3125     {
3126       re_free (sbcset);
3127 #ifdef RE_ENABLE_I18N
3128       re_free (mbcset);
3129 #endif
3130       *err = REG_ESPACE;
3131       return NULL;
3132     }
3133
3134   token_len = peek_token_bracket (token, regexp, syntax);
3135   if (BE (token->type == END_OF_RE, 0))
3136     {
3137       *err = REG_BADPAT;
3138       goto parse_bracket_exp_free_return;
3139     }
3140   if (token->type == OP_NON_MATCH_LIST)
3141     {
3142 #ifdef RE_ENABLE_I18N
3143       mbcset->non_match = 1;
3144 #endif /* not RE_ENABLE_I18N */
3145       non_match = true;
3146       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3147         bitset_set (sbcset, '\n');
3148       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3149       token_len = peek_token_bracket (token, regexp, syntax);
3150       if (BE (token->type == END_OF_RE, 0))
3151         {
3152           *err = REG_BADPAT;
3153           goto parse_bracket_exp_free_return;
3154         }
3155     }
3156
3157   /* We treat the first ']' as a normal character.  */
3158   if (token->type == OP_CLOSE_BRACKET)
3159     token->type = CHARACTER;
3160
3161   while (1)
3162     {
3163       bracket_elem_t start_elem, end_elem;
3164       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3165       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3166       reg_errcode_t ret;
3167       int token_len2 = 0;
3168       bool is_range_exp = false;
3169       re_token_t token2;
3170
3171       start_elem.opr.name = start_name_buf;
3172       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3173                                    syntax, first_round);
3174       if (BE (ret != REG_NOERROR, 0))
3175         {
3176           *err = ret;
3177           goto parse_bracket_exp_free_return;
3178         }
3179       first_round = false;
3180
3181       /* Get information about the next token.  We need it in any case.  */
3182       token_len = peek_token_bracket (token, regexp, syntax);
3183
3184       /* Do not check for ranges if we know they are not allowed.  */
3185       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3186         {
3187           if (BE (token->type == END_OF_RE, 0))
3188             {
3189               *err = REG_EBRACK;
3190               goto parse_bracket_exp_free_return;
3191             }
3192           if (token->type == OP_CHARSET_RANGE)
3193             {
3194               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3195               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3196               if (BE (token2.type == END_OF_RE, 0))
3197                 {
3198                   *err = REG_EBRACK;
3199                   goto parse_bracket_exp_free_return;
3200                 }
3201               if (token2.type == OP_CLOSE_BRACKET)
3202                 {
3203                   /* We treat the last '-' as a normal character.  */
3204                   re_string_skip_bytes (regexp, -token_len);
3205                   token->type = CHARACTER;
3206                 }
3207               else
3208                 is_range_exp = true;
3209             }
3210         }
3211
3212       if (is_range_exp == true)
3213         {
3214           end_elem.opr.name = end_name_buf;
3215           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3216                                        dfa, syntax, true);
3217           if (BE (ret != REG_NOERROR, 0))
3218             {
3219               *err = ret;
3220               goto parse_bracket_exp_free_return;
3221             }
3222
3223           token_len = peek_token_bracket (token, regexp, syntax);
3224
3225 #ifdef _LIBC
3226           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3227                                   &start_elem, &end_elem);
3228 #else
3229 # ifdef RE_ENABLE_I18N
3230           *err = build_range_exp (syntax, sbcset,
3231                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3232                                   &range_alloc, &start_elem, &end_elem);
3233 # else
3234           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3235 # endif
3236 #endif /* RE_ENABLE_I18N */
3237           if (BE (*err != REG_NOERROR, 0))
3238             goto parse_bracket_exp_free_return;
3239         }
3240       else
3241         {
3242           switch (start_elem.type)
3243             {
3244             case SB_CHAR:
3245               bitset_set (sbcset, start_elem.opr.ch);
3246               break;
3247 #ifdef RE_ENABLE_I18N
3248             case MB_CHAR:
3249               /* Check whether the array has enough space.  */
3250               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3251                 {
3252                   wchar_t *new_mbchars;
3253                   /* Not enough, realloc it.  */
3254                   /* +1 in case of mbcset->nmbchars is 0.  */
3255                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3256                   /* Use realloc since array is NULL if *alloc == 0.  */
3257                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3258                                             mbchar_alloc);
3259                   if (BE (new_mbchars == NULL, 0))
3260                     goto parse_bracket_exp_espace;
3261                   mbcset->mbchars = new_mbchars;
3262                 }
3263               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3264               break;
3265 #endif /* RE_ENABLE_I18N */
3266             case EQUIV_CLASS:
3267               *err = build_equiv_class (sbcset,
3268 #ifdef RE_ENABLE_I18N
3269                                         mbcset, &equiv_class_alloc,
3270 #endif /* RE_ENABLE_I18N */
3271                                         start_elem.opr.name);
3272               if (BE (*err != REG_NOERROR, 0))
3273                 goto parse_bracket_exp_free_return;
3274               break;
3275             case COLL_SYM:
3276               *err = build_collating_symbol (sbcset,
3277 #ifdef RE_ENABLE_I18N
3278                                              mbcset, &coll_sym_alloc,
3279 #endif /* RE_ENABLE_I18N */
3280                                              start_elem.opr.name);
3281               if (BE (*err != REG_NOERROR, 0))
3282                 goto parse_bracket_exp_free_return;
3283               break;
3284             case CHAR_CLASS:
3285               *err = build_charclass (regexp->trans, sbcset,
3286 #ifdef RE_ENABLE_I18N
3287                                       mbcset, &char_class_alloc,
3288 #endif /* RE_ENABLE_I18N */
3289                                       (const char *) start_elem.opr.name,
3290                                       syntax);
3291               if (BE (*err != REG_NOERROR, 0))
3292                goto parse_bracket_exp_free_return;
3293               break;
3294             default:
3295               assert (0);
3296               break;
3297             }
3298         }
3299       if (BE (token->type == END_OF_RE, 0))
3300         {
3301           *err = REG_EBRACK;
3302           goto parse_bracket_exp_free_return;
3303         }
3304       if (token->type == OP_CLOSE_BRACKET)
3305         break;
3306     }
3307
3308   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3309
3310   /* If it is non-matching list.  */
3311   if (non_match)
3312     bitset_not (sbcset);
3313
3314 #ifdef RE_ENABLE_I18N
3315   /* Ensure only single byte characters are set.  */
3316   if (dfa->mb_cur_max > 1)
3317     bitset_mask (sbcset, dfa->sb_char);
3318
3319   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3320       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3321                                                      || mbcset->non_match)))
3322     {
3323       bin_tree_t *mbc_tree;
3324       int sbc_idx;
3325       /* Build a tree for complex bracket.  */
3326       dfa->has_mb_node = 1;
3327       br_token.type = COMPLEX_BRACKET;
3328       br_token.opr.mbcset = mbcset;
3329       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3330       if (BE (mbc_tree == NULL, 0))
3331         goto parse_bracket_exp_espace;
3332       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3333         if (sbcset[sbc_idx])
3334           break;
3335       /* If there are no bits set in sbcset, there is no point
3336          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3337       if (sbc_idx < BITSET_WORDS)
3338         {
3339           /* Build a tree for simple bracket.  */
3340           br_token.type = SIMPLE_BRACKET;
3341           br_token.opr.sbcset = sbcset;
3342           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3343           if (BE (work_tree == NULL, 0))
3344             goto parse_bracket_exp_espace;
3345
3346           /* Then join them by ALT node.  */
3347           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3348           if (BE (work_tree == NULL, 0))
3349             goto parse_bracket_exp_espace;
3350         }
3351       else
3352         {
3353           re_free (sbcset);
3354           work_tree = mbc_tree;
3355         }
3356     }
3357   else
3358 #endif /* not RE_ENABLE_I18N */
3359     {
3360 #ifdef RE_ENABLE_I18N
3361       free_charset (mbcset);
3362 #endif
3363       /* Build a tree for simple bracket.  */
3364       br_token.type = SIMPLE_BRACKET;
3365       br_token.opr.sbcset = sbcset;
3366       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3367       if (BE (work_tree == NULL, 0))
3368         goto parse_bracket_exp_espace;
3369     }
3370   return work_tree;
3371
3372  parse_bracket_exp_espace:
3373   *err = REG_ESPACE;
3374  parse_bracket_exp_free_return:
3375   re_free (sbcset);
3376 #ifdef RE_ENABLE_I18N
3377   free_charset (mbcset);
3378 #endif /* RE_ENABLE_I18N */
3379   return NULL;
3380 }
3381
3382 /* Parse an element in the bracket expression.  */
3383
3384 static reg_errcode_t
3385 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3386                        re_token_t *token, int token_len, re_dfa_t *dfa,
3387                        reg_syntax_t syntax, bool accept_hyphen)
3388 {
3389 #ifdef RE_ENABLE_I18N
3390   int cur_char_size;
3391   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3392   if (cur_char_size > 1)
3393     {
3394       elem->type = MB_CHAR;
3395       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3396       re_string_skip_bytes (regexp, cur_char_size);
3397       return REG_NOERROR;
3398     }
3399 #endif /* RE_ENABLE_I18N */
3400   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3401   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3402       || token->type == OP_OPEN_EQUIV_CLASS)
3403     return parse_bracket_symbol (elem, regexp, token);
3404   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3405     {
3406       /* A '-' must only appear as anything but a range indicator before
3407          the closing bracket.  Everything else is an error.  */
3408       re_token_t token2;
3409       (void) peek_token_bracket (&token2, regexp, syntax);
3410       if (token2.type != OP_CLOSE_BRACKET)
3411         /* The actual error value is not standardized since this whole
3412            case is undefined.  But ERANGE makes good sense.  */
3413         return REG_ERANGE;
3414     }
3415   elem->type = SB_CHAR;
3416   elem->opr.ch = token->opr.c;
3417   return REG_NOERROR;
3418 }
3419
3420 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3421    such as [:<character_class>:], [.<collating_element>.], and
3422    [=<equivalent_class>=].  */
3423
3424 static reg_errcode_t
3425 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3426                       re_token_t *token)
3427 {
3428   unsigned char ch, delim = token->opr.c;
3429   int i = 0;
3430   if (re_string_eoi(regexp))
3431     return REG_EBRACK;
3432   for (;; ++i)
3433     {
3434       if (i >= BRACKET_NAME_BUF_SIZE)
3435         return REG_EBRACK;
3436       if (token->type == OP_OPEN_CHAR_CLASS)
3437         ch = re_string_fetch_byte_case (regexp);
3438       else
3439         ch = re_string_fetch_byte (regexp);
3440       if (re_string_eoi(regexp))
3441         return REG_EBRACK;
3442       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3443         break;
3444       elem->opr.name[i] = ch;
3445     }
3446   re_string_skip_bytes (regexp, 1);
3447   elem->opr.name[i] = '\0';
3448   switch (token->type)
3449     {
3450     case OP_OPEN_COLL_ELEM:
3451       elem->type = COLL_SYM;
3452       break;
3453     case OP_OPEN_EQUIV_CLASS:
3454       elem->type = EQUIV_CLASS;
3455       break;
3456     case OP_OPEN_CHAR_CLASS:
3457       elem->type = CHAR_CLASS;
3458       break;
3459     default:
3460       break;
3461     }
3462   return REG_NOERROR;
3463 }
3464
3465   /* Helper function for parse_bracket_exp.
3466      Build the equivalence class which is represented by NAME.
3467      The result are written to MBCSET and SBCSET.
3468      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3469      is a pointer argument since we may update it.  */
3470
3471 static reg_errcode_t
3472 #ifdef RE_ENABLE_I18N
3473 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3474                    Idx *equiv_class_alloc, const unsigned char *name)
3475 #else /* not RE_ENABLE_I18N */
3476 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3477 #endif /* not RE_ENABLE_I18N */
3478 {
3479 #ifdef _LIBC
3480   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3481   if (nrules != 0)
3482     {
3483       const int32_t *table, *indirect;
3484       const unsigned char *weights, *extra, *cp;
3485       unsigned char char_buf[2];
3486       int32_t idx1, idx2;
3487       unsigned int ch;
3488       size_t len;
3489       /* This #include defines a local function!  */
3490 # include <locale/weight.h>
3491       /* Calculate the index for equivalence class.  */
3492       cp = name;
3493       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3494       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3495                                                _NL_COLLATE_WEIGHTMB);
3496       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3497                                                    _NL_COLLATE_EXTRAMB);
3498       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3499                                                 _NL_COLLATE_INDIRECTMB);
3500       idx1 = findidx (&cp, -1);
3501       if (BE (idx1 == 0 || *cp != '\0', 0))
3502         /* This isn't a valid character.  */
3503         return REG_ECOLLATE;
3504
3505       /* Build single byte matching table for this equivalence class.  */
3506       len = weights[idx1 & 0xffffff];
3507       for (ch = 0; ch < SBC_MAX; ++ch)
3508         {
3509           char_buf[0] = ch;
3510           cp = char_buf;
3511           idx2 = findidx (&cp, 1);
3512 /*
3513           idx2 = table[ch];
3514 */
3515           if (idx2 == 0)
3516             /* This isn't a valid character.  */
3517             continue;
3518           /* Compare only if the length matches and the collation rule
3519              index is the same.  */
3520           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3521             {
3522               int cnt = 0;
3523
3524               while (cnt <= len &&
3525                      weights[(idx1 & 0xffffff) + 1 + cnt]
3526                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3527                 ++cnt;
3528
3529               if (cnt > len)
3530                 bitset_set (sbcset, ch);
3531             }
3532         }
3533       /* Check whether the array has enough space.  */
3534       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3535         {
3536           /* Not enough, realloc it.  */
3537           /* +1 in case of mbcset->nequiv_classes is 0.  */
3538           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3539           /* Use realloc since the array is NULL if *alloc == 0.  */
3540           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3541                                                    int32_t,
3542                                                    new_equiv_class_alloc);
3543           if (BE (new_equiv_classes == NULL, 0))
3544             return REG_ESPACE;
3545           mbcset->equiv_classes = new_equiv_classes;
3546           *equiv_class_alloc = new_equiv_class_alloc;
3547         }
3548       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3549     }
3550   else
3551 #endif /* _LIBC */
3552     {
3553       if (BE (strlen ((const char *) name) != 1, 0))
3554         return REG_ECOLLATE;
3555       bitset_set (sbcset, *name);
3556     }
3557   return REG_NOERROR;
3558 }
3559
3560   /* Helper function for parse_bracket_exp.
3561      Build the character class which is represented by NAME.
3562      The result are written to MBCSET and SBCSET.
3563      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3564      is a pointer argument since we may update it.  */
3565
3566 static reg_errcode_t
3567 #ifdef RE_ENABLE_I18N
3568 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3569                  re_charset_t *mbcset, Idx *char_class_alloc,
3570                  const char *class_name, reg_syntax_t syntax)
3571 #else /* not RE_ENABLE_I18N */
3572 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3573                  const char *class_name, reg_syntax_t syntax)
3574 #endif /* not RE_ENABLE_I18N */
3575 {
3576   int i;
3577   const char *name = class_name;
3578
3579   /* In case of REG_ICASE "upper" and "lower" match the both of
3580      upper and lower cases.  */
3581   if ((syntax & RE_ICASE)
3582       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3583     name = "alpha";
3584
3585 #ifdef RE_ENABLE_I18N
3586   /* Check the space of the arrays.  */
3587   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3588     {
3589       /* Not enough, realloc it.  */
3590       /* +1 in case of mbcset->nchar_classes is 0.  */
3591       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3592       /* Use realloc since array is NULL if *alloc == 0.  */
3593       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3594                                                new_char_class_alloc);
3595       if (BE (new_char_classes == NULL, 0))
3596         return REG_ESPACE;
3597       mbcset->char_classes = new_char_classes;
3598       *char_class_alloc = new_char_class_alloc;
3599     }
3600   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3601 #endif /* RE_ENABLE_I18N */
3602
3603 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3604   do {                                          \
3605     if (BE (trans != NULL, 0))                  \
3606       {                                         \
3607         for (i = 0; i < SBC_MAX; ++i)           \
3608           if (ctype_func (i))                   \
3609             bitset_set (sbcset, trans[i]);      \
3610       }                                         \
3611     else                                        \
3612       {                                         \
3613         for (i = 0; i < SBC_MAX; ++i)           \
3614           if (ctype_func (i))                   \
3615             bitset_set (sbcset, i);             \
3616       }                                         \
3617   } while (0)
3618
3619   if (strcmp (name, "alnum") == 0)
3620     BUILD_CHARCLASS_LOOP (isalnum);
3621   else if (strcmp (name, "cntrl") == 0)
3622     BUILD_CHARCLASS_LOOP (iscntrl);
3623   else if (strcmp (name, "lower") == 0)
3624     BUILD_CHARCLASS_LOOP (islower);
3625   else if (strcmp (name, "space") == 0)
3626     BUILD_CHARCLASS_LOOP (isspace);
3627   else if (strcmp (name, "alpha") == 0)
3628     BUILD_CHARCLASS_LOOP (isalpha);
3629   else if (strcmp (name, "digit") == 0)
3630     BUILD_CHARCLASS_LOOP (isdigit);
3631   else if (strcmp (name, "print") == 0)
3632     BUILD_CHARCLASS_LOOP (isprint);
3633   else if (strcmp (name, "upper") == 0)
3634     BUILD_CHARCLASS_LOOP (isupper);
3635   else if (strcmp (name, "blank") == 0)
3636     BUILD_CHARCLASS_LOOP (isblank);
3637   else if (strcmp (name, "graph") == 0)
3638     BUILD_CHARCLASS_LOOP (isgraph);
3639   else if (strcmp (name, "punct") == 0)
3640     BUILD_CHARCLASS_LOOP (ispunct);
3641   else if (strcmp (name, "xdigit") == 0)
3642     BUILD_CHARCLASS_LOOP (isxdigit);
3643   else
3644     return REG_ECTYPE;
3645
3646   return REG_NOERROR;
3647 }
3648
3649 static bin_tree_t *
3650 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3651                     const char *class_name,
3652                     const char *extra, bool non_match,
3653                     reg_errcode_t *err)
3654 {
3655   re_bitset_ptr_t sbcset;
3656 #ifdef RE_ENABLE_I18N
3657   re_charset_t *mbcset;
3658   Idx alloc = 0;
3659 #endif /* not RE_ENABLE_I18N */
3660   reg_errcode_t ret;
3661   re_token_t br_token;
3662   bin_tree_t *tree;
3663
3664   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3665 #ifdef RE_ENABLE_I18N
3666   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3667 #endif /* RE_ENABLE_I18N */
3668
3669 #ifdef RE_ENABLE_I18N
3670   if (BE (sbcset == NULL || mbcset == NULL, 0))
3671 #else /* not RE_ENABLE_I18N */
3672   if (BE (sbcset == NULL, 0))
3673 #endif /* not RE_ENABLE_I18N */
3674     {
3675       *err = REG_ESPACE;
3676       return NULL;
3677     }
3678
3679   if (non_match)
3680     {
3681 #ifdef RE_ENABLE_I18N
3682       mbcset->non_match = 1;
3683 #endif /* not RE_ENABLE_I18N */
3684     }
3685
3686   /* We don't care the syntax in this case.  */
3687   ret = build_charclass (trans, sbcset,
3688 #ifdef RE_ENABLE_I18N
3689                          mbcset, &alloc,
3690 #endif /* RE_ENABLE_I18N */
3691                          class_name, 0);
3692
3693   if (BE (ret != REG_NOERROR, 0))
3694     {
3695       re_free (sbcset);
3696 #ifdef RE_ENABLE_I18N
3697       free_charset (mbcset);
3698 #endif /* RE_ENABLE_I18N */
3699       *err = ret;
3700       return NULL;
3701     }
3702   /* \w match '_' also.  */
3703   for (; *extra; extra++)
3704     bitset_set (sbcset, *extra);
3705
3706   /* If it is non-matching list.  */
3707   if (non_match)
3708     bitset_not (sbcset);
3709
3710 #ifdef RE_ENABLE_I18N
3711   /* Ensure only single byte characters are set.  */
3712   if (dfa->mb_cur_max > 1)
3713     bitset_mask (sbcset, dfa->sb_char);
3714 #endif
3715
3716   /* Build a tree for simple bracket.  */
3717   br_token.type = SIMPLE_BRACKET;
3718   br_token.opr.sbcset = sbcset;
3719   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3720   if (BE (tree == NULL, 0))
3721     goto build_word_op_espace;
3722
3723 #ifdef RE_ENABLE_I18N
3724   if (dfa->mb_cur_max > 1)
3725     {
3726       bin_tree_t *mbc_tree;
3727       /* Build a tree for complex bracket.  */
3728       br_token.type = COMPLEX_BRACKET;
3729       br_token.opr.mbcset = mbcset;
3730       dfa->has_mb_node = 1;
3731       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3732       if (BE (mbc_tree == NULL, 0))
3733         goto build_word_op_espace;
3734       /* Then join them by ALT node.  */
3735       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3736       if (BE (mbc_tree != NULL, 1))
3737         return tree;
3738     }
3739   else
3740     {
3741       free_charset (mbcset);
3742       return tree;
3743     }
3744 #else /* not RE_ENABLE_I18N */
3745   return tree;
3746 #endif /* not RE_ENABLE_I18N */
3747
3748  build_word_op_espace:
3749   re_free (sbcset);
3750 #ifdef RE_ENABLE_I18N
3751   free_charset (mbcset);
3752 #endif /* RE_ENABLE_I18N */
3753   *err = REG_ESPACE;
3754   return NULL;
3755 }
3756
3757 /* This is intended for the expressions like "a{1,3}".
3758    Fetch a number from 'input', and return the number.
3759    Return REG_MISSING if the number field is empty like "{,1}".
3760    Return RE_DUP_MAX + 1 if the number field is too large.
3761    Return REG_ERROR if an error occurred.  */
3762
3763 static Idx
3764 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3765 {
3766   Idx num = REG_MISSING;
3767   unsigned char c;
3768   while (1)
3769     {
3770       fetch_token (token, input, syntax);
3771       c = token->opr.c;
3772       if (BE (token->type == END_OF_RE, 0))
3773         return REG_ERROR;
3774       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3775         break;
3776       num = ((token->type != CHARACTER || c < '0' || '9' < c
3777               || num == REG_ERROR)
3778              ? REG_ERROR
3779              : num == REG_MISSING
3780              ? c - '0'
3781              : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3782     }
3783   return num;
3784 }
3785 \f
3786 #ifdef RE_ENABLE_I18N
3787 static void
3788 free_charset (re_charset_t *cset)
3789 {
3790   re_free (cset->mbchars);
3791 # ifdef _LIBC
3792   re_free (cset->coll_syms);
3793   re_free (cset->equiv_classes);
3794   re_free (cset->range_starts);
3795   re_free (cset->range_ends);
3796 # endif
3797   re_free (cset->char_classes);
3798   re_free (cset);
3799 }
3800 #endif /* RE_ENABLE_I18N */
3801 \f
3802 /* Functions for binary tree operation.  */
3803
3804 /* Create a tree node.  */
3805
3806 static bin_tree_t *
3807 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3808              re_token_type_t type)
3809 {
3810   re_token_t t;
3811   t.type = type;
3812   return create_token_tree (dfa, left, right, &t);
3813 }
3814
3815 static bin_tree_t *
3816 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3817                    const re_token_t *token)
3818 {
3819   bin_tree_t *tree;
3820   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3821     {
3822       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3823
3824       if (storage == NULL)
3825         return NULL;
3826       storage->next = dfa->str_tree_storage;
3827       dfa->str_tree_storage = storage;
3828       dfa->str_tree_storage_idx = 0;
3829     }
3830   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3831
3832   tree->parent = NULL;
3833   tree->left = left;
3834   tree->right = right;
3835   tree->token = *token;
3836   tree->token.duplicated = 0;
3837   tree->token.opt_subexp = 0;
3838   tree->first = NULL;
3839   tree->next = NULL;
3840   tree->node_idx = REG_MISSING;
3841
3842   if (left != NULL)
3843     left->parent = tree;
3844   if (right != NULL)
3845     right->parent = tree;
3846   return tree;
3847 }
3848
3849 /* Mark the tree SRC as an optional subexpression.
3850    To be called from preorder or postorder.  */
3851
3852 static reg_errcode_t
3853 mark_opt_subexp (void *extra, bin_tree_t *node)
3854 {
3855   Idx idx = (uintptr_t) extra;
3856   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3857     node->token.opt_subexp = 1;
3858
3859   return REG_NOERROR;
3860 }
3861
3862 /* Free the allocated memory inside NODE. */
3863
3864 static void
3865 free_token (re_token_t *node)
3866 {
3867 #ifdef RE_ENABLE_I18N
3868   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3869     free_charset (node->opr.mbcset);
3870   else
3871 #endif /* RE_ENABLE_I18N */
3872     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3873       re_free (node->opr.sbcset);
3874 }
3875
3876 /* Worker function for tree walking.  Free the allocated memory inside NODE
3877    and its children. */
3878
3879 static reg_errcode_t
3880 free_tree (void *extra, bin_tree_t *node)
3881 {
3882   free_token (&node->token);
3883   return REG_NOERROR;
3884 }
3885
3886
3887 /* Duplicate the node SRC, and return new node.  This is a preorder
3888    visit similar to the one implemented by the generic visitor, but
3889    we need more infrastructure to maintain two parallel trees --- so,
3890    it's easier to duplicate.  */
3891
3892 static bin_tree_t *
3893 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3894 {
3895   const bin_tree_t *node;
3896   bin_tree_t *dup_root;
3897   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3898
3899   for (node = root; ; )
3900     {
3901       /* Create a new tree and link it back to the current parent.  */
3902       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3903       if (*p_new == NULL)
3904         return NULL;
3905       (*p_new)->parent = dup_node;
3906       (*p_new)->token.duplicated = 1;
3907       dup_node = *p_new;
3908
3909       /* Go to the left node, or up and to the right.  */
3910       if (node->left)
3911         {
3912           node = node->left;
3913           p_new = &dup_node->left;
3914         }
3915       else
3916         {
3917           const bin_tree_t *prev = NULL;
3918           while (node->right == prev || node->right == NULL)
3919             {
3920               prev = node;
3921               node = node->parent;
3922               dup_node = dup_node->parent;
3923               if (!node)
3924                 return dup_root;
3925             }
3926           node = node->right;
3927           p_new = &dup_node->right;
3928         }
3929     }
3930 }