regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           size_t length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  49 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          int node, int root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static int fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax) internal_function;
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   int nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  int nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      int nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             int accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  90                                         re_charset_t *mbcset,
  91                                         int *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  94                                       bitset_t sbcset,
  95                                       re_charset_t *mbcset,
  96                                       int *char_class_alloc,
  97                                       const char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 103                                       bitset_t sbcset,
 104                                       const char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        RE_TRANSLATE_TYPE trans,
 109                                        const char *class_name,
 110                                        const char *extra,
 111                                        int non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 const char __re_error_msgid[] attribute_hidden =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 const size_t __re_error_msgid_idx[] attribute_hidden =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205
 206 #ifdef ZOS_USS
 207
 208 /* For ZOS USS we must define btowc */
 209
 210 wchar_t
 211 btowc (int c)
 212 {
 213    wchar_t wtmp[2];
 214    char tmp[2];
 215    mbstate_t mbs;
 216
 217    memset(& mbs, 0, sizeof(mbs));
 218    tmp[0] = c;
 219    tmp[1] = 0;
 220
 221    mbrtowc (wtmp, tmp, 1, & mbs);
 222    return wtmp[0];
 223 }
 224 #endif
 225
 226 /* re_compile_pattern is the GNU regular expression compiler: it
 227    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 228    Returns 0 if the pattern was valid, otherwise an error string.
 229
 230    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 231    are set in BUFP on entry.  */
 232
 233 const char *
 234 re_compile_pattern (pattern, length, bufp)
 235     const char *pattern;
 236     size_t length;
 237     struct re_pattern_buffer *bufp;
 238 {
 239   reg_errcode_t ret;
 240
 241   /* And GNU code determines whether or not to get register information
 242      by passing null for the REGS argument to re_match, etc., not by
 243      setting no_sub, unless RE_NO_SUB is set.  */
 244   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 245
 246   /* Match anchors at newline.  */
 247   bufp->newline_anchor = 1;
 248
 249   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 250
 251   if (!ret)
 252     return NULL;
 253   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 254 }
 255 #ifdef _LIBC
 256 weak_alias (__re_compile_pattern, re_compile_pattern)
 257 #endif
 258
 259 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 260    also be assigned to arbitrarily: each pattern buffer stores its own
 261    syntax, so it can be changed between regex compilations.  */
 262 /* This has no initializer because initialized variables in Emacs
 263    become read-only after dumping.  */
 264 reg_syntax_t re_syntax_options;
 265
 266
 267 /* Specify the precise syntax of regexps for compilation.  This provides
 268    for compatibility for various utilities which historically have
 269    different, incompatible syntaxes.
 270
 271    The argument SYNTAX is a bit mask comprised of the various bits
 272    defined in regex.h.  We return the old syntax.  */
 273
 274 reg_syntax_t
 275 re_set_syntax (syntax)
 276     reg_syntax_t syntax;
 277 {
 278   reg_syntax_t ret = re_syntax_options;
 279
 280   re_syntax_options = syntax;
 281   return ret;
 282 }
 283 #ifdef _LIBC
 284 weak_alias (__re_set_syntax, re_set_syntax)
 285 #endif
 286
 287 int
 288 re_compile_fastmap (bufp)
 289     struct re_pattern_buffer *bufp;
 290 {
 291   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 292   char *fastmap = bufp->fastmap;
 293
 294   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 295   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 296   if (dfa->init_state != dfa->init_state_word)
 297     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 298   if (dfa->init_state != dfa->init_state_nl)
 299     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 300   if (dfa->init_state != dfa->init_state_begbuf)
 301     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 302   bufp->fastmap_accurate = 1;
 303   return 0;
 304 }
 305 #ifdef _LIBC
 306 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 307 #endif
 308
 309 static inline void
 310 __attribute ((always_inline))
 311 re_set_fastmap (char *fastmap, int icase, int ch)
 312 {
 313   fastmap[ch] = 1;
 314   if (icase)
 315     fastmap[tolower (ch)] = 1;
 316 }
 317
 318 /* Helper function for re_compile_fastmap.
 319    Compile fastmap for the initial_state INIT_STATE.  */
 320
 321 static void
 322 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 323                          char *fastmap)
 324 {
 325   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 326   int node_cnt;
 327   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 328   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 329     {
 330       int node = init_state->nodes.elems[node_cnt];
 331       re_token_type_t type = dfa->nodes[node].type;
 332
 333       if (type == CHARACTER)
 334         {
 335           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 336 #ifdef RE_ENABLE_I18N
 337           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 338             {
 339               unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
 340               wchar_t wc;
 341               mbstate_t state;
 342
 343               p = buf;
 344               *p++ = dfa->nodes[node].opr.c;
 345               while (++node < dfa->nodes_len
 346                      && dfa->nodes[node].type == CHARACTER
 347                      && dfa->nodes[node].mb_partial)
 348                 *p++ = dfa->nodes[node].opr.c;
 349               memset (&state, '\0', sizeof (state));
 350               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 351                              &state) == p - buf
 352                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 353                       != (size_t) -1))
 354                 re_set_fastmap (fastmap, 0, buf[0]);
 355               re_free (buf);
 356             }
 357 #endif
 358         }
 359       else if (type == SIMPLE_BRACKET)
 360         {
 361           int i, ch;
 362           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 363             {
 364               int j;
 365               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 366               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 367                 if (w & ((bitset_word_t) 1 << j))
 368                   re_set_fastmap (fastmap, icase, ch);
 369             }
 370         }
 371 #ifdef RE_ENABLE_I18N
 372       else if (type == COMPLEX_BRACKET)
 373         {
 374           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 375           int i;
 376
 377 # ifdef _LIBC
 378           /* See if we have to try all bytes which start multiple collation
 379              elements.
 380              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 381                   collation element, and don't catch 'b' since 'b' is
 382                   the only collation element which starts from 'b' (and
 383                   it is caught by SIMPLE_BRACKET).  */
 384               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 385                   && (cset->ncoll_syms || cset->nranges))
 386                 {
 387                   const int32_t *table = (const int32_t *)
 388                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 389                   for (i = 0; i < SBC_MAX; ++i)
 390                     if (table[i] < 0)
 391                       re_set_fastmap (fastmap, icase, i);
 392                 }
 393 # endif /* _LIBC */
 394
 395           /* See if we have to start the match at all multibyte characters,
 396              i.e. where we would not find an invalid sequence.  This only
 397              applies to multibyte character sets; for single byte character
 398              sets, the SIMPLE_BRACKET again suffices.  */
 399           if (dfa->mb_cur_max > 1
 400               && (cset->nchar_classes || cset->non_match || cset->nranges
 401 # ifdef _LIBC
 402                   || cset->nequiv_classes
 403 # endif /* _LIBC */
 404                  ))
 405             {
 406               unsigned char c = 0;
 407               do
 408                 {
 409                   mbstate_t mbs;
 410                   memset (&mbs, 0, sizeof (mbs));
 411                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 412                     re_set_fastmap (fastmap, false, (int) c);
 413                 }
 414               while (++c != 0);
 415             }
 416
 417           else
 418             {
 419               /* ... Else catch all bytes which can start the mbchars.  */
 420               for (i = 0; i < cset->nmbchars; ++i)
 421                 {
 422                   char buf[256];
 423                   mbstate_t state;
 424                   memset (&state, '\0', sizeof (state));
 425                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 426                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 427                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 428                     {
 429                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 430                           != (size_t) -1)
 431                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 432                     }
 433                 }
 434             }
 435         }
 436 #endif /* RE_ENABLE_I18N */
 437       else if (type == OP_PERIOD
 438 #ifdef RE_ENABLE_I18N
 439                || type == OP_UTF8_PERIOD
 440 #endif /* RE_ENABLE_I18N */
 441                || type == END_OF_RE)
 442         {
 443           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 444           if (type == END_OF_RE)
 445             bufp->can_be_null = 1;
 446           return;
 447         }
 448     }
 449 }
 450 \f
 451 /* Entry point for POSIX code.  */
 452 /* regcomp takes a regular expression as a string and compiles it.
 453
 454    PREG is a regex_t *.  We do not expect any fields to be initialized,
 455    since POSIX says we shouldn't.  Thus, we set
 456
 457      `buffer' to the compiled pattern;
 458      `used' to the length of the compiled pattern;
 459      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 460        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 461        RE_SYNTAX_POSIX_BASIC;
 462      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 463      `fastmap' to an allocated space for the fastmap;
 464      `fastmap_accurate' to zero;
 465      `re_nsub' to the number of subexpressions in PATTERN.
 466
 467    PATTERN is the address of the pattern string.
 468
 469    CFLAGS is a series of bits which affect compilation.
 470
 471      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 472      use POSIX basic syntax.
 473
 474      If REG_NEWLINE is set, then . and [^...] don't match newline.
 475      Also, regexec will try a match beginning after every newline.
 476
 477      If REG_ICASE is set, then we considers upper- and lowercase
 478      versions of letters to be equivalent when matching.
 479
 480      If REG_NOSUB is set, then when PREG is passed to regexec, that
 481      routine will report only success or failure, and nothing about the
 482      registers.
 483
 484    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 485    the return codes and their meanings.)  */
 486
 487 int
 488 regcomp (preg, pattern, cflags)
 489     regex_t *__restrict preg;
 490     const char *__restrict pattern;
 491     int cflags;
 492 {
 493   reg_errcode_t ret;
 494   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 495                          : RE_SYNTAX_POSIX_BASIC);
 496
 497   preg->buffer = NULL;
 498   preg->allocated = 0;
 499   preg->used = 0;
 500
 501   /* Try to allocate space for the fastmap.  */
 502   preg->fastmap = re_malloc (char, SBC_MAX);
 503   if (BE (preg->fastmap == NULL, 0))
 504     return REG_ESPACE;
 505
 506   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 507
 508   /* If REG_NEWLINE is set, newlines are treated differently.  */
 509   if (cflags & REG_NEWLINE)
 510     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 511       syntax &= ~RE_DOT_NEWLINE;
 512       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 513       /* It also changes the matching behavior.  */
 514       preg->newline_anchor = 1;
 515     }
 516   else
 517     preg->newline_anchor = 0;
 518   preg->no_sub = !!(cflags & REG_NOSUB);
 519   preg->translate = NULL;
 520
 521   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 522
 523   /* POSIX doesn't distinguish between an unmatched open-group and an
 524      unmatched close-group: both are REG_EPAREN.  */
 525   if (ret == REG_ERPAREN)
 526     ret = REG_EPAREN;
 527
 528   /* We have already checked preg->fastmap != NULL.  */
 529   if (BE (ret == REG_NOERROR, 1))
 530     /* Compute the fastmap now, since regexec cannot modify the pattern
 531        buffer.  This function never fails in this implementation.  */
 532     (void) re_compile_fastmap (preg);
 533   else
 534     {
 535       /* Some error occurred while compiling the expression.  */
 536       re_free (preg->fastmap);
 537       preg->fastmap = NULL;
 538     }
 539
 540   return (int) ret;
 541 }
 542 #ifdef _LIBC
 543 weak_alias (__regcomp, regcomp)
 544 #endif
 545
 546 /* Returns a message corresponding to an error code, ERRCODE, returned
 547    from either regcomp or regexec.   We don't use PREG here.  */
 548
 549 size_t
 550 regerror (errcode, preg, errbuf, errbuf_size)
 551     int errcode;
 552     const regex_t *__restrict preg;
 553     char *__restrict errbuf;
 554     size_t errbuf_size;
 555 {
 556   const char *msg;
 557   size_t msg_size;
 558
 559   if (BE (errcode < 0
 560           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 561                                / sizeof (__re_error_msgid_idx[0])), 0))
 562     /* Only error codes returned by the rest of the code should be passed
 563        to this routine.  If we are given anything else, or if other regex
 564        code generates an invalid error code, then the program has a bug.
 565        Dump core so we can fix it.  */
 566     abort ();
 567
 568   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 569
 570   msg_size = strlen (msg) + 1; /* Includes the null.  */
 571
 572   if (BE (errbuf_size != 0, 1))
 573     {
 574       if (BE (msg_size > errbuf_size, 0))
 575         {
 576 #if defined HAVE_MEMPCPY || defined _LIBC
 577           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 578 #else
 579           memcpy (errbuf, msg, errbuf_size - 1);
 580           errbuf[errbuf_size - 1] = 0;
 581 #endif
 582         }
 583       else
 584         memcpy (errbuf, msg, msg_size);
 585     }
 586
 587   return msg_size;
 588 }
 589 #ifdef _LIBC
 590 weak_alias (__regerror, regerror)
 591 #endif
 592
 593
 594 #ifdef RE_ENABLE_I18N
 595 /* This static array is used for the map to single-byte characters when
 596    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 597    it the same all the time.  UTF-8 is the preferred encoding so this is
 598    a worthwhile optimization.  */
 599 #if __GNUC__ >= 3
 600 static const bitset_t utf8_sb_map = {
 601   /* Set the first 128 bits.  */
 602   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 603 };
 604 #else /* ! (__GNUC__ >= 3) */
 605 static bitset_t utf8_sb_map;
 606 #endif /* __GNUC__ >= 3 */
 607 #endif /* RE_ENABLE_I18N */
 608
 609
 610 static void
 611 free_dfa_content (re_dfa_t *dfa)
 612 {
 613   int i, j;
 614
 615   if (dfa->nodes)
 616     for (i = 0; i < dfa->nodes_len; ++i)
 617       free_token (dfa->nodes + i);
 618   re_free (dfa->nexts);
 619   for (i = 0; i < dfa->nodes_len; ++i)
 620     {
 621       if (dfa->eclosures != NULL)
 622         re_node_set_free (dfa->eclosures + i);
 623       if (dfa->inveclosures != NULL)
 624         re_node_set_free (dfa->inveclosures + i);
 625       if (dfa->edests != NULL)
 626         re_node_set_free (dfa->edests + i);
 627     }
 628   re_free (dfa->edests);
 629   re_free (dfa->eclosures);
 630   re_free (dfa->inveclosures);
 631   re_free (dfa->nodes);
 632
 633   if (dfa->state_table)
 634     for (i = 0; i <= dfa->state_hash_mask; ++i)
 635       {
 636         struct re_state_table_entry *entry = dfa->state_table + i;
 637         for (j = 0; j < entry->num; ++j)
 638           {
 639             re_dfastate_t *state = entry->array[j];
 640             free_state (state);
 641           }
 642         re_free (entry->array);
 643       }
 644   re_free (dfa->state_table);
 645 #ifdef RE_ENABLE_I18N
 646   if (dfa->sb_char != utf8_sb_map)
 647     re_free (dfa->sb_char);
 648 #endif
 649   re_free (dfa->subexp_map);
 650 #ifdef DEBUG
 651   re_free (dfa->re_str);
 652 #endif
 653
 654   re_free (dfa);
 655 }
 656
 657
 658 /* Free dynamically allocated space used by PREG.  */
 659
 660 void
 661 regfree (preg)
 662     regex_t *preg;
 663 {
 664   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 665   if (BE (dfa != NULL, 1))
 666     free_dfa_content (dfa);
 667   preg->buffer = NULL;
 668   preg->allocated = 0;
 669
 670   re_free (preg->fastmap);
 671   preg->fastmap = NULL;
 672
 673   re_free (preg->translate);
 674   preg->translate = NULL;
 675 }
 676 #ifdef _LIBC
 677 weak_alias (__regfree, regfree)
 678 #endif
 679 \f
 680 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 681    them unless specifically requested.  */
 682
 683 #if defined _REGEX_RE_COMP || defined _LIBC
 684
 685 /* BSD has one and only one pattern buffer.  */
 686 static struct re_pattern_buffer re_comp_buf;
 687
 688 char *
 689 # ifdef _LIBC
 690 /* Make these definitions weak in libc, so POSIX programs can redefine
 691    these names if they don't use our functions, and still use
 692    regcomp/regexec above without link errors.  */
 693 weak_function
 694 # endif
 695 re_comp (s)
 696      const char *s;
 697 {
 698   reg_errcode_t ret;
 699   char *fastmap;
 700
 701   if (!s)
 702     {
 703       if (!re_comp_buf.buffer)
 704         return gettext ("No previous regular expression");
 705       return 0;
 706     }
 707
 708   if (re_comp_buf.buffer)
 709     {
 710       fastmap = re_comp_buf.fastmap;
 711       re_comp_buf.fastmap = NULL;
 712       __regfree (&re_comp_buf);
 713       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 714       re_comp_buf.fastmap = fastmap;
 715     }
 716
 717   if (re_comp_buf.fastmap == NULL)
 718     {
 719       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 720       if (re_comp_buf.fastmap == NULL)
 721         return (char *) gettext (__re_error_msgid
 722                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 723     }
 724
 725   /* Since `re_exec' always passes NULL for the `regs' argument, we
 726      don't need to initialize the pattern buffer fields which affect it.  */
 727
 728   /* Match anchors at newlines.  */
 729   re_comp_buf.newline_anchor = 1;
 730
 731   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 732
 733   if (!ret)
 734     return NULL;
 735
 736   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 737   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 738 }
 739
 740 #ifdef _LIBC
 741 libc_freeres_fn (free_mem)
 742 {
 743   __regfree (&re_comp_buf);
 744 }
 745 #endif
 746
 747 #endif /* _REGEX_RE_COMP */
 748 \f
 749 /* Internal entry point.
 750    Compile the regular expression PATTERN, whose length is LENGTH.
 751    SYNTAX indicate regular expression's syntax.  */
 752
 753 static reg_errcode_t
 754 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 755                      reg_syntax_t syntax)
 756 {
 757   reg_errcode_t err = REG_NOERROR;
 758   re_dfa_t *dfa;
 759   re_string_t regexp;
 760
 761   /* Initialize the pattern buffer.  */
 762   preg->fastmap_accurate = 0;
 763   preg->syntax = syntax;
 764   preg->not_bol = preg->not_eol = 0;
 765   preg->used = 0;
 766   preg->re_nsub = 0;
 767   preg->can_be_null = 0;
 768   preg->regs_allocated = REGS_UNALLOCATED;
 769
 770   /* Initialize the dfa.  */
 771   dfa = (re_dfa_t *) preg->buffer;
 772   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 773     {
 774       /* If zero allocated, but buffer is non-null, try to realloc
 775          enough space.  This loses if buffer's address is bogus, but
 776          that is the user's responsibility.  If ->buffer is NULL this
 777          is a simple allocation.  */
 778       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 779       if (dfa == NULL)
 780         return REG_ESPACE;
 781       preg->allocated = sizeof (re_dfa_t);
 782       preg->buffer = (unsigned char *) dfa;
 783     }
 784   preg->used = sizeof (re_dfa_t);
 785
 786   err = init_dfa (dfa, length);
 787   if (BE (err != REG_NOERROR, 0))
 788     {
 789       free_dfa_content (dfa);
 790       preg->buffer = NULL;
 791       preg->allocated = 0;
 792       return err;
 793     }
 794 #ifdef DEBUG
 795   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 796   dfa->re_str = re_malloc (char, length + 1);
 797   strncpy (dfa->re_str, pattern, length + 1);
 798 #endif
 799
 800   __libc_lock_init (dfa->lock);
 801
 802   err = re_string_construct (&regexp, pattern, length, preg->translate,
 803                              syntax & RE_ICASE, dfa);
 804   if (BE (err != REG_NOERROR, 0))
 805     {
 806     re_compile_internal_free_return:
 807       free_workarea_compile (preg);
 808       re_string_destruct (&regexp);
 809       free_dfa_content (dfa);
 810       preg->buffer = NULL;
 811       preg->allocated = 0;
 812       return err;
 813     }
 814
 815   /* Parse the regular expression, and build a structure tree.  */
 816   preg->re_nsub = 0;
 817   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 818   if (BE (dfa->str_tree == NULL, 0))
 819     goto re_compile_internal_free_return;
 820
 821   /* Analyze the tree and create the nfa.  */
 822   err = analyze (preg);
 823   if (BE (err != REG_NOERROR, 0))
 824     goto re_compile_internal_free_return;
 825
 826 #ifdef RE_ENABLE_I18N
 827   /* If possible, do searching in single byte encoding to speed things up.  */
 828   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 829     optimize_utf8 (dfa);
 830 #endif
 831
 832   /* Then create the initial state of the dfa.  */
 833   err = create_initial_state (dfa);
 834
 835   /* Release work areas.  */
 836   free_workarea_compile (preg);
 837   re_string_destruct (&regexp);
 838
 839   if (BE (err != REG_NOERROR, 0))
 840     {
 841       free_dfa_content (dfa);
 842       preg->buffer = NULL;
 843       preg->allocated = 0;
 844     }
 845
 846   return err;
 847 }
 848
 849 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 850    as the initial length of some arrays.  */
 851
 852 static reg_errcode_t
 853 init_dfa (re_dfa_t *dfa, size_t pat_len)
 854 {
 855   unsigned int table_size;
 856 #ifndef _LIBC
 857   char *codeset_name;
 858 #endif
 859 #if defined(GAWK) && defined(LIBC_IS_BORKED)
 860   /* Needed for brain damaged systems */
 861   extern int gawk_mb_cur_max;
 862 #endif
 863
 864   memset (dfa, '\0', sizeof (re_dfa_t));
 865
 866   /* Force allocation of str_tree_storage the first time.  */
 867   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 868
 869   /* Avoid overflows.  */
 870   if (pat_len == SIZE_MAX)
 871     return REG_ESPACE;
 872
 873   dfa->nodes_alloc = pat_len + 1;
 874   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 875
 876   /*  table_size = 2 ^ ceil(log pat_len) */
 877   for (table_size = 1; ; table_size <<= 1)
 878     if (table_size > pat_len)
 879       break;
 880
 881   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 882   dfa->state_hash_mask = table_size - 1;
 883
 884 #if defined(GAWK) && defined(LIBC_IS_BORKED)
 885   dfa->mb_cur_max = gawk_mb_cur_max;
 886 #else
 887   dfa->mb_cur_max = MB_CUR_MAX;
 888 #endif
 889 #ifdef _LIBC
 890   if (dfa->mb_cur_max == 6
 891       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 892     dfa->is_utf8 = 1;
 893   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 894                        != 0);
 895 #else
 896 # ifdef HAVE_LANGINFO_CODESET
 897   codeset_name = nl_langinfo (CODESET);
 898 # else
 899   codeset_name = getenv ("LC_ALL");
 900   if (codeset_name == NULL || codeset_name[0] == '\0')
 901     codeset_name = getenv ("LC_CTYPE");
 902   if (codeset_name == NULL || codeset_name[0] == '\0')
 903     codeset_name = getenv ("LANG");
 904   if (codeset_name == NULL)
 905     codeset_name = "";
 906   else if (strchr (codeset_name, '.') !=  NULL)
 907     codeset_name = strchr (codeset_name, '.') + 1;
 908 # endif
 909
 910   /* strcasecmp isn't a standard interface. brute force check */
 911 #ifndef GAWK
 912   if (strcasecmp (codeset_name, "UTF-8") == 0
 913       || strcasecmp (codeset_name, "UTF8") == 0)
 914     dfa->is_utf8 = 1;
 915 #else
 916   if (   (codeset_name[0] == 'U' || codeset_name[0] == 'u')
 917       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 918       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 919       && (codeset_name[3] == '-'
 920           ? codeset_name[4] == '8' && codeset_name[5] == '\0'
 921           : codeset_name[3] == '8' && codeset_name[4] == '\0'))
 922     dfa->is_utf8 = 1;
 923 #if defined(GAWK) && defined(LIBC_IS_BORKED)
 924   if (gawk_mb_cur_max == 1)
 925     dfa->is_utf8 = 0;
 926 #endif /* defined(GAWK) && defined(LIBC_IS_BORKED) */
 927 #endif
 928
 929   /* We check exhaustively in the loop below if this charset is a
 930      superset of ASCII.  */
 931   dfa->map_notascii = 0;
 932 #endif
 933
 934 #ifdef RE_ENABLE_I18N
 935   if (dfa->mb_cur_max > 1)
 936     {
 937       if (dfa->is_utf8)
 938         {
 939 #if !defined(__GNUC__) || __GNUC__ < 3
 940           static short utf8_sb_map_inited = 0;
 941
 942           if (! utf8_sb_map_inited)
 943             {
 944                 int i;
 945
 946                 utf8_sb_map_inited = 0;
 947                 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
 948                   utf8_sb_map[i] = BITSET_WORD_MAX;
 949             }
 950 #endif
 951           dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 952         }
 953       else
 954         {
 955           int i, j, ch;
 956
 957           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 958           if (BE (dfa->sb_char == NULL, 0))
 959             return REG_ESPACE;
 960
 961           /* Set the bits corresponding to single byte chars.  */
 962           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 963             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 964               {
 965                 wint_t wch = __btowc (ch);
 966                 if (wch != WEOF)
 967                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 968 # ifndef _LIBC
 969                 if (isascii (ch) && wch != ch)
 970                   dfa->map_notascii = 1;
 971 # endif
 972               }
 973         }
 974     }
 975 #endif
 976
 977   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 978     return REG_ESPACE;
 979   return REG_NOERROR;
 980 }
 981
 982 /* Initialize WORD_CHAR table, which indicate which character is
 983    "word".  In this case "word" means that it is the word construction
 984    character used by some operators like "\<", "\>", etc.  */
 985
 986 static void
 987 internal_function
 988 init_word_char (re_dfa_t *dfa)
 989 {
 990   int i, j, ch;
 991   dfa->word_ops_used = 1;
 992 #ifndef GAWK
 993   if (BE (dfa->map_notascii == 0, 1))
 994     {
 995       if (sizeof (dfa->word_char[0]) == 8)
 996         {
 997           /* The extra temporaries here avoid "implicitly truncated"
 998              warnings in the case when this is dead code, i.e. 32-bit.  */
 999           const uint64_t wc0 = UINT64_C (0x03ff000000000000);
1000           const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
1001           dfa->word_char[0] = wc0;
1002           dfa->word_char[1] = wc1;
1003           i = 2;
1004         }
1005       else if (sizeof (dfa->word_char[0]) == 4)
1006         {
1007           dfa->word_char[0] = UINT32_C (0x00000000);
1008           dfa->word_char[1] = UINT32_C (0x03ff0000);
1009           dfa->word_char[2] = UINT32_C (0x87fffffe);
1010           dfa->word_char[3] = UINT32_C (0x07fffffe);
1011           i = 4;
1012         }
1013       else
1014         abort ();
1015       ch = 128;
1016
1017       if (BE (dfa->is_utf8, 1))
1018         {
1019           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
1020           return;
1021         }
1022     }
1023 #endif
1024
1025   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
1026     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
1027       if (isalnum (ch) || ch == '_')
1028         dfa->word_char[i] |= (bitset_word_t) 1 << j;
1029 }
1030
1031 /* Free the work area which are only used while compiling.  */
1032
1033 static void
1034 free_workarea_compile (regex_t *preg)
1035 {
1036   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1037   bin_tree_storage_t *storage, *next;
1038   for (storage = dfa->str_tree_storage; storage; storage = next)
1039     {
1040       next = storage->next;
1041       re_free (storage);
1042     }
1043   dfa->str_tree_storage = NULL;
1044   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1045   dfa->str_tree = NULL;
1046   re_free (dfa->org_indices);
1047   dfa->org_indices = NULL;
1048 }
1049
1050 /* Create initial states for all contexts.  */
1051
1052 static reg_errcode_t
1053 create_initial_state (re_dfa_t *dfa)
1054 {
1055   int first, i;
1056   reg_errcode_t err;
1057   re_node_set init_nodes;
1058
1059   /* Initial states have the epsilon closure of the node which is
1060      the first node of the regular expression.  */
1061   first = dfa->str_tree->first->node_idx;
1062   dfa->init_node = first;
1063   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1064   if (BE (err != REG_NOERROR, 0))
1065     return err;
1066
1067   /* The back-references which are in initial states can epsilon transit,
1068      since in this case all of the subexpressions can be null.
1069      Then we add epsilon closures of the nodes which are the next nodes of
1070      the back-references.  */
1071   if (dfa->nbackref > 0)
1072     for (i = 0; i < init_nodes.nelem; ++i)
1073       {
1074         int node_idx = init_nodes.elems[i];
1075         re_token_type_t type = dfa->nodes[node_idx].type;
1076
1077         int clexp_idx;
1078         if (type != OP_BACK_REF)
1079           continue;
1080         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1081           {
1082             re_token_t *clexp_node;
1083             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1084             if (clexp_node->type == OP_CLOSE_SUBEXP
1085                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1086               break;
1087           }
1088         if (clexp_idx == init_nodes.nelem)
1089           continue;
1090
1091         if (type == OP_BACK_REF)
1092           {
1093             int dest_idx = dfa->edests[node_idx].elems[0];
1094             if (!re_node_set_contains (&init_nodes, dest_idx))
1095               {
1096                 reg_errcode_t err = re_node_set_merge (&init_nodes,
1097                                                        dfa->eclosures
1098                                                        + dest_idx);
1099                 if (err != REG_NOERROR)
1100                   return err;
1101                 i = 0;
1102               }
1103           }
1104       }
1105
1106   /* It must be the first time to invoke acquire_state.  */
1107   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1108   /* We don't check ERR here, since the initial state must not be NULL.  */
1109   if (BE (dfa->init_state == NULL, 0))
1110     return err;
1111   if (dfa->init_state->has_constraint)
1112     {
1113       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1114                                                        CONTEXT_WORD);
1115       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1116                                                      CONTEXT_NEWLINE);
1117       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1118                                                          &init_nodes,
1119                                                          CONTEXT_NEWLINE
1120                                                          | CONTEXT_BEGBUF);
1121       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1122               || dfa->init_state_begbuf == NULL, 0))
1123         return err;
1124     }
1125   else
1126     dfa->init_state_word = dfa->init_state_nl
1127       = dfa->init_state_begbuf = dfa->init_state;
1128
1129   re_node_set_free (&init_nodes);
1130   return REG_NOERROR;
1131 }
1132 \f
1133 #ifdef RE_ENABLE_I18N
1134 /* If it is possible to do searching in single byte encoding instead of UTF-8
1135    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1136    DFA nodes where needed.  */
1137
1138 static void
1139 optimize_utf8 (re_dfa_t *dfa)
1140 {
1141   int node, i, mb_chars = 0, has_period = 0;
1142
1143   for (node = 0; node < dfa->nodes_len; ++node)
1144     switch (dfa->nodes[node].type)
1145       {
1146       case CHARACTER:
1147         if (dfa->nodes[node].opr.c >= 0x80)
1148           mb_chars = 1;
1149         break;
1150       case ANCHOR:
1151         switch (dfa->nodes[node].opr.ctx_type)
1152           {
1153           case LINE_FIRST:
1154           case LINE_LAST:
1155           case BUF_FIRST:
1156           case BUF_LAST:
1157             break;
1158           default:
1159             /* Word anchors etc. cannot be handled.  It's okay to test
1160                opr.ctx_type since constraints (for all DFA nodes) are
1161                created by ORing one or more opr.ctx_type values.  */
1162             return;
1163           }
1164         break;
1165       case OP_PERIOD:
1166         has_period = 1;
1167         break;
1168       case OP_BACK_REF:
1169       case OP_ALT:
1170       case END_OF_RE:
1171       case OP_DUP_ASTERISK:
1172       case OP_OPEN_SUBEXP:
1173       case OP_CLOSE_SUBEXP:
1174         break;
1175       case COMPLEX_BRACKET:
1176         return;
1177       case SIMPLE_BRACKET:
1178         /* Just double check.  The non-ASCII range starts at 0x80.  */
1179         assert (0x80 % BITSET_WORD_BITS == 0);
1180         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1181           if (dfa->nodes[node].opr.sbcset[i])
1182             return;
1183         break;
1184       default:
1185         abort ();
1186       }
1187
1188   if (mb_chars || has_period)
1189     for (node = 0; node < dfa->nodes_len; ++node)
1190       {
1191         if (dfa->nodes[node].type == CHARACTER
1192             && dfa->nodes[node].opr.c >= 0x80)
1193           dfa->nodes[node].mb_partial = 0;
1194         else if (dfa->nodes[node].type == OP_PERIOD)
1195           dfa->nodes[node].type = OP_UTF8_PERIOD;
1196       }
1197
1198   /* The search can be in single byte locale.  */
1199   dfa->mb_cur_max = 1;
1200   dfa->is_utf8 = 0;
1201   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1202 }
1203 #endif
1204 \f
1205 /* Analyze the structure tree, and calculate "first", "next", "edest",
1206    "eclosure", and "inveclosure".  */
1207
1208 static reg_errcode_t
1209 analyze (regex_t *preg)
1210 {
1211   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1212   reg_errcode_t ret;
1213
1214   /* Allocate arrays.  */
1215   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1216   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1217   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1218   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1219   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1220           || dfa->eclosures == NULL, 0))
1221     return REG_ESPACE;
1222
1223   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1224   if (dfa->subexp_map != NULL)
1225     {
1226       int i;
1227       for (i = 0; i < preg->re_nsub; i++)
1228         dfa->subexp_map[i] = i;
1229       preorder (dfa->str_tree, optimize_subexps, dfa);
1230       for (i = 0; i < preg->re_nsub; i++)
1231         if (dfa->subexp_map[i] != i)
1232           break;
1233       if (i == preg->re_nsub)
1234         {
1235           free (dfa->subexp_map);
1236           dfa->subexp_map = NULL;
1237         }
1238     }
1239
1240   ret = postorder (dfa->str_tree, lower_subexps, preg);
1241   if (BE (ret != REG_NOERROR, 0))
1242     return ret;
1243   ret = postorder (dfa->str_tree, calc_first, dfa);
1244   if (BE (ret != REG_NOERROR, 0))
1245     return ret;
1246   preorder (dfa->str_tree, calc_next, dfa);
1247   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1248   if (BE (ret != REG_NOERROR, 0))
1249     return ret;
1250   ret = calc_eclosure (dfa);
1251   if (BE (ret != REG_NOERROR, 0))
1252     return ret;
1253
1254   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1255      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1256   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1257       || dfa->nbackref)
1258     {
1259       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1260       if (BE (dfa->inveclosures == NULL, 0))
1261         return REG_ESPACE;
1262       ret = calc_inveclosure (dfa);
1263     }
1264
1265   return ret;
1266 }
1267
1268 /* Our parse trees are very unbalanced, so we cannot use a stack to
1269    implement parse tree visits.  Instead, we use parent pointers and
1270    some hairy code in these two functions.  */
1271 static reg_errcode_t
1272 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1273            void *extra)
1274 {
1275   bin_tree_t *node, *prev;
1276
1277   for (node = root; ; )
1278     {
1279       /* Descend down the tree, preferably to the left (or to the right
1280          if that's the only child).  */
1281       while (node->left || node->right)
1282         if (node->left)
1283           node = node->left;
1284         else
1285           node = node->right;
1286
1287       do
1288         {
1289           reg_errcode_t err = fn (extra, node);
1290           if (BE (err != REG_NOERROR, 0))
1291             return err;
1292           if (node->parent == NULL)
1293             return REG_NOERROR;
1294           prev = node;
1295           node = node->parent;
1296         }
1297       /* Go up while we have a node that is reached from the right.  */
1298       while (node->right == prev || node->right == NULL);
1299       node = node->right;
1300     }
1301 }
1302
1303 static reg_errcode_t
1304 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1305           void *extra)
1306 {
1307   bin_tree_t *node;
1308
1309   for (node = root; ; )
1310     {
1311       reg_errcode_t err = fn (extra, node);
1312       if (BE (err != REG_NOERROR, 0))
1313         return err;
1314
1315       /* Go to the left node, or up and to the right.  */
1316       if (node->left)
1317         node = node->left;
1318       else
1319         {
1320           bin_tree_t *prev = NULL;
1321           while (node->right == prev || node->right == NULL)
1322             {
1323               prev = node;
1324               node = node->parent;
1325               if (!node)
1326                 return REG_NOERROR;
1327             }
1328           node = node->right;
1329         }
1330     }
1331 }
1332
1333 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1334    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1335    backreferences as well.  Requires a preorder visit.  */
1336 static reg_errcode_t
1337 optimize_subexps (void *extra, bin_tree_t *node)
1338 {
1339   re_dfa_t *dfa = (re_dfa_t *) extra;
1340
1341   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1342     {
1343       int idx = node->token.opr.idx;
1344       node->token.opr.idx = dfa->subexp_map[idx];
1345       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1346     }
1347
1348   else if (node->token.type == SUBEXP
1349            && node->left && node->left->token.type == SUBEXP)
1350     {
1351       int other_idx = node->left->token.opr.idx;
1352
1353       node->left = node->left->left;
1354       if (node->left)
1355         node->left->parent = node;
1356
1357       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1358       if (other_idx < BITSET_WORD_BITS)
1359           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1360     }
1361
1362   return REG_NOERROR;
1363 }
1364
1365 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1366    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1367 static reg_errcode_t
1368 lower_subexps (void *extra, bin_tree_t *node)
1369 {
1370   regex_t *preg = (regex_t *) extra;
1371   reg_errcode_t err = REG_NOERROR;
1372
1373   if (node->left && node->left->token.type == SUBEXP)
1374     {
1375       node->left = lower_subexp (&err, preg, node->left);
1376       if (node->left)
1377         node->left->parent = node;
1378     }
1379   if (node->right && node->right->token.type == SUBEXP)
1380     {
1381       node->right = lower_subexp (&err, preg, node->right);
1382       if (node->right)
1383         node->right->parent = node;
1384     }
1385
1386   return err;
1387 }
1388
1389 static bin_tree_t *
1390 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1391 {
1392   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1393   bin_tree_t *body = node->left;
1394   bin_tree_t *op, *cls, *tree1, *tree;
1395
1396   if (preg->no_sub
1397       /* We do not optimize empty subexpressions, because otherwise we may
1398          have bad CONCAT nodes with NULL children.  This is obviously not
1399          very common, so we do not lose much.  An example that triggers
1400          this case is the sed "script" /\(\)/x.  */
1401       && node->left != NULL
1402       && (node->token.opr.idx >= BITSET_WORD_BITS
1403           || !(dfa->used_bkref_map
1404                & ((bitset_word_t) 1 << node->token.opr.idx))))
1405     return node->left;
1406
1407   /* Convert the SUBEXP node to the concatenation of an
1408      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1409   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1410   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1411   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1412   tree = create_tree (dfa, op, tree1, CONCAT);
1413   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1414     {
1415       *err = REG_ESPACE;
1416       return NULL;
1417     }
1418
1419   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1420   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1421   return tree;
1422 }
1423
1424 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1425    nodes.  Requires a postorder visit.  */
1426 static reg_errcode_t
1427 calc_first (void *extra, bin_tree_t *node)
1428 {
1429   re_dfa_t *dfa = (re_dfa_t *) extra;
1430   if (node->token.type == CONCAT)
1431     {
1432       node->first = node->left->first;
1433       node->node_idx = node->left->node_idx;
1434     }
1435   else
1436     {
1437       node->first = node;
1438       node->node_idx = re_dfa_add_node (dfa, node->token);
1439       if (BE (node->node_idx == -1, 0))
1440         return REG_ESPACE;
1441       if (node->token.type == ANCHOR)
1442         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1443     }
1444   return REG_NOERROR;
1445 }
1446
1447 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1448 static reg_errcode_t
1449 calc_next (void *extra, bin_tree_t *node)
1450 {
1451   switch (node->token.type)
1452     {
1453     case OP_DUP_ASTERISK:
1454       node->left->next = node;
1455       break;
1456     case CONCAT:
1457       node->left->next = node->right->first;
1458       node->right->next = node->next;
1459       break;
1460     default:
1461       if (node->left)
1462         node->left->next = node->next;
1463       if (node->right)
1464         node->right->next = node->next;
1465       break;
1466     }
1467   return REG_NOERROR;
1468 }
1469
1470 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1471 static reg_errcode_t
1472 link_nfa_nodes (void *extra, bin_tree_t *node)
1473 {
1474   re_dfa_t *dfa = (re_dfa_t *) extra;
1475   int idx = node->node_idx;
1476   reg_errcode_t err = REG_NOERROR;
1477
1478   switch (node->token.type)
1479     {
1480     case CONCAT:
1481       break;
1482
1483     case END_OF_RE:
1484       assert (node->next == NULL);
1485       break;
1486
1487     case OP_DUP_ASTERISK:
1488     case OP_ALT:
1489       {
1490         int left, right;
1491         dfa->has_plural_match = 1;
1492         if (node->left != NULL)
1493           left = node->left->first->node_idx;
1494         else
1495           left = node->next->node_idx;
1496         if (node->right != NULL)
1497           right = node->right->first->node_idx;
1498         else
1499           right = node->next->node_idx;
1500         assert (left > -1);
1501         assert (right > -1);
1502         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1503       }
1504       break;
1505
1506     case ANCHOR:
1507     case OP_OPEN_SUBEXP:
1508     case OP_CLOSE_SUBEXP:
1509       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1510       break;
1511
1512     case OP_BACK_REF:
1513       dfa->nexts[idx] = node->next->node_idx;
1514       if (node->token.type == OP_BACK_REF)
1515         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1516       break;
1517
1518     default:
1519       assert (!IS_EPSILON_NODE (node->token.type));
1520       dfa->nexts[idx] = node->next->node_idx;
1521       break;
1522     }
1523
1524   return err;
1525 }
1526
1527 /* Duplicate the epsilon closure of the node ROOT_NODE.
1528    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1529    to their own constraint.  */
1530
1531 static reg_errcode_t
1532 internal_function
1533 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1534                         int root_node, unsigned int init_constraint)
1535 {
1536   int org_node, clone_node, ret;
1537   unsigned int constraint = init_constraint;
1538   for (org_node = top_org_node, clone_node = top_clone_node;;)
1539     {
1540       int org_dest, clone_dest;
1541       if (dfa->nodes[org_node].type == OP_BACK_REF)
1542         {
1543           /* If the back reference epsilon-transit, its destination must
1544              also have the constraint.  Then duplicate the epsilon closure
1545              of the destination of the back reference, and store it in
1546              edests of the back reference.  */
1547           org_dest = dfa->nexts[org_node];
1548           re_node_set_empty (dfa->edests + clone_node);
1549           clone_dest = duplicate_node (dfa, org_dest, constraint);
1550           if (BE (clone_dest == -1, 0))
1551             return REG_ESPACE;
1552           dfa->nexts[clone_node] = dfa->nexts[org_node];
1553           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1554           if (BE (ret < 0, 0))
1555             return REG_ESPACE;
1556         }
1557       else if (dfa->edests[org_node].nelem == 0)
1558         {
1559           /* In case of the node can't epsilon-transit, don't duplicate the
1560              destination and store the original destination as the
1561              destination of the node.  */
1562           dfa->nexts[clone_node] = dfa->nexts[org_node];
1563           break;
1564         }
1565       else if (dfa->edests[org_node].nelem == 1)
1566         {
1567           /* In case of the node can epsilon-transit, and it has only one
1568              destination.  */
1569           org_dest = dfa->edests[org_node].elems[0];
1570           re_node_set_empty (dfa->edests + clone_node);
1571           /* If the node is root_node itself, it means the epsilon clsoure
1572              has a loop.   Then tie it to the destination of the root_node.  */
1573           if (org_node == root_node && clone_node != org_node)
1574             {
1575               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1576               if (BE (ret < 0, 0))
1577                 return REG_ESPACE;
1578               break;
1579             }
1580           /* In case of the node has another constraint, add it.  */
1581           constraint |= dfa->nodes[org_node].constraint;
1582           clone_dest = duplicate_node (dfa, org_dest, constraint);
1583           if (BE (clone_dest == -1, 0))
1584             return REG_ESPACE;
1585           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1586           if (BE (ret < 0, 0))
1587             return REG_ESPACE;
1588         }
1589       else /* dfa->edests[org_node].nelem == 2 */
1590         {
1591           /* In case of the node can epsilon-transit, and it has two
1592              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1593           org_dest = dfa->edests[org_node].elems[0];
1594           re_node_set_empty (dfa->edests + clone_node);
1595           /* Search for a duplicated node which satisfies the constraint.  */
1596           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1597           if (clone_dest == -1)
1598             {
1599               /* There is no such duplicated node, create a new one.  */
1600               reg_errcode_t err;
1601               clone_dest = duplicate_node (dfa, org_dest, constraint);
1602               if (BE (clone_dest == -1, 0))
1603                 return REG_ESPACE;
1604               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1605               if (BE (ret < 0, 0))
1606                 return REG_ESPACE;
1607               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1608                                             root_node, constraint);
1609               if (BE (err != REG_NOERROR, 0))
1610                 return err;
1611             }
1612           else
1613             {
1614               /* There is a duplicated node which satisfies the constraint,
1615                  use it to avoid infinite loop.  */
1616               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1617               if (BE (ret < 0, 0))
1618                 return REG_ESPACE;
1619             }
1620
1621           org_dest = dfa->edests[org_node].elems[1];
1622           clone_dest = duplicate_node (dfa, org_dest, constraint);
1623           if (BE (clone_dest == -1, 0))
1624             return REG_ESPACE;
1625           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1626           if (BE (ret < 0, 0))
1627             return REG_ESPACE;
1628         }
1629       org_node = org_dest;
1630       clone_node = clone_dest;
1631     }
1632   return REG_NOERROR;
1633 }
1634
1635 /* Search for a node which is duplicated from the node ORG_NODE, and
1636    satisfies the constraint CONSTRAINT.  */
1637
1638 static int
1639 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1640                         unsigned int constraint)
1641 {
1642   int idx;
1643   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1644     {
1645       if (org_node == dfa->org_indices[idx]
1646           && constraint == dfa->nodes[idx].constraint)
1647         return idx; /* Found.  */
1648     }
1649   return -1; /* Not found.  */
1650 }
1651
1652 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1653    Return the index of the new node, or -1 if insufficient storage is
1654    available.  */
1655
1656 static int
1657 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1658 {
1659   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1660   if (BE (dup_idx != -1, 1))
1661     {
1662       dfa->nodes[dup_idx].constraint = constraint;
1663       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1664       dfa->nodes[dup_idx].duplicated = 1;
1665
1666       /* Store the index of the original node.  */
1667       dfa->org_indices[dup_idx] = org_idx;
1668     }
1669   return dup_idx;
1670 }
1671
1672 static reg_errcode_t
1673 calc_inveclosure (re_dfa_t *dfa)
1674 {
1675   int src, idx, ret;
1676   for (idx = 0; idx < dfa->nodes_len; ++idx)
1677     re_node_set_init_empty (dfa->inveclosures + idx);
1678
1679   for (src = 0; src < dfa->nodes_len; ++src)
1680     {
1681       int *elems = dfa->eclosures[src].elems;
1682       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1683         {
1684           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1685           if (BE (ret == -1, 0))
1686             return REG_ESPACE;
1687         }
1688     }
1689
1690   return REG_NOERROR;
1691 }
1692
1693 /* Calculate "eclosure" for all the node in DFA.  */
1694
1695 static reg_errcode_t
1696 calc_eclosure (re_dfa_t *dfa)
1697 {
1698   int node_idx, incomplete;
1699 #ifdef DEBUG
1700   assert (dfa->nodes_len > 0);
1701 #endif
1702   incomplete = 0;
1703   /* For each nodes, calculate epsilon closure.  */
1704   for (node_idx = 0; ; ++node_idx)
1705     {
1706       reg_errcode_t err;
1707       re_node_set eclosure_elem;
1708       if (node_idx == dfa->nodes_len)
1709         {
1710           if (!incomplete)
1711             break;
1712           incomplete = 0;
1713           node_idx = 0;
1714         }
1715
1716 #ifdef DEBUG
1717       assert (dfa->eclosures[node_idx].nelem != -1);
1718 #endif
1719
1720       /* If we have already calculated, skip it.  */
1721       if (dfa->eclosures[node_idx].nelem != 0)
1722         continue;
1723       /* Calculate epsilon closure of `node_idx'.  */
1724       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1725       if (BE (err != REG_NOERROR, 0))
1726         return err;
1727
1728       if (dfa->eclosures[node_idx].nelem == 0)
1729         {
1730           incomplete = 1;
1731           re_node_set_free (&eclosure_elem);
1732         }
1733     }
1734   return REG_NOERROR;
1735 }
1736
1737 /* Calculate epsilon closure of NODE.  */
1738
1739 static reg_errcode_t
1740 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1741 {
1742   reg_errcode_t err;
1743   int i;
1744   re_node_set eclosure;
1745   int ret;
1746   int incomplete = 0;
1747   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1748   if (BE (err != REG_NOERROR, 0))
1749     return err;
1750
1751   /* This indicates that we are calculating this node now.
1752      We reference this value to avoid infinite loop.  */
1753   dfa->eclosures[node].nelem = -1;
1754
1755   /* If the current node has constraints, duplicate all nodes
1756      since they must inherit the constraints.  */
1757   if (dfa->nodes[node].constraint
1758       && dfa->edests[node].nelem
1759       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1760     {
1761       err = duplicate_node_closure (dfa, node, node, node,
1762                                     dfa->nodes[node].constraint);
1763       if (BE (err != REG_NOERROR, 0))
1764         return err;
1765     }
1766
1767   /* Expand each epsilon destination nodes.  */
1768   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1769     for (i = 0; i < dfa->edests[node].nelem; ++i)
1770       {
1771         re_node_set eclosure_elem;
1772         int edest = dfa->edests[node].elems[i];
1773         /* If calculating the epsilon closure of `edest' is in progress,
1774            return intermediate result.  */
1775         if (dfa->eclosures[edest].nelem == -1)
1776           {
1777             incomplete = 1;
1778             continue;
1779           }
1780         /* If we haven't calculated the epsilon closure of `edest' yet,
1781            calculate now. Otherwise use calculated epsilon closure.  */
1782         if (dfa->eclosures[edest].nelem == 0)
1783           {
1784             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1785             if (BE (err != REG_NOERROR, 0))
1786               return err;
1787           }
1788         else
1789           eclosure_elem = dfa->eclosures[edest];
1790         /* Merge the epsilon closure of `edest'.  */
1791         err = re_node_set_merge (&eclosure, &eclosure_elem);
1792         if (BE (err != REG_NOERROR, 0))
1793           return err;
1794         /* If the epsilon closure of `edest' is incomplete,
1795            the epsilon closure of this node is also incomplete.  */
1796         if (dfa->eclosures[edest].nelem == 0)
1797           {
1798             incomplete = 1;
1799             re_node_set_free (&eclosure_elem);
1800           }
1801       }
1802
1803   /* An epsilon closure includes itself.  */
1804   ret = re_node_set_insert (&eclosure, node);
1805   if (BE (ret < 0, 0))
1806     return REG_ESPACE;
1807   if (incomplete && !root)
1808     dfa->eclosures[node].nelem = 0;
1809   else
1810     dfa->eclosures[node] = eclosure;
1811   *new_set = eclosure;
1812   return REG_NOERROR;
1813 }
1814 \f
1815 /* Functions for token which are used in the parser.  */
1816
1817 /* Fetch a token from INPUT.
1818    We must not use this function inside bracket expressions.  */
1819
1820 static void
1821 internal_function
1822 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1823 {
1824   re_string_skip_bytes (input, peek_token (result, input, syntax));
1825 }
1826
1827 /* Peek a token from INPUT, and return the length of the token.
1828    We must not use this function inside bracket expressions.  */
1829
1830 static int
1831 internal_function
1832 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1833 {
1834   unsigned char c;
1835
1836   if (re_string_eoi (input))
1837     {
1838       token->type = END_OF_RE;
1839       return 0;
1840     }
1841
1842   c = re_string_peek_byte (input, 0);
1843   token->opr.c = c;
1844
1845   token->word_char = 0;
1846 #ifdef RE_ENABLE_I18N
1847   token->mb_partial = 0;
1848   if (input->mb_cur_max > 1 &&
1849       !re_string_first_byte (input, re_string_cur_idx (input)))
1850     {
1851       token->type = CHARACTER;
1852       token->mb_partial = 1;
1853       return 1;
1854     }
1855 #endif
1856   if (c == '\\')
1857     {
1858       unsigned char c2;
1859       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1860         {
1861           token->type = BACK_SLASH;
1862           return 1;
1863         }
1864
1865       c2 = re_string_peek_byte_case (input, 1);
1866       token->opr.c = c2;
1867       token->type = CHARACTER;
1868 #ifdef RE_ENABLE_I18N
1869       if (input->mb_cur_max > 1)
1870         {
1871           wint_t wc = re_string_wchar_at (input,
1872                                           re_string_cur_idx (input) + 1);
1873           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1874         }
1875       else
1876 #endif
1877         token->word_char = IS_WORD_CHAR (c2) != 0;
1878
1879       switch (c2)
1880         {
1881         case '|':
1882           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1883             token->type = OP_ALT;
1884           break;
1885         case '1': case '2': case '3': case '4': case '5':
1886         case '6': case '7': case '8': case '9':
1887           if (!(syntax & RE_NO_BK_REFS))
1888             {
1889               token->type = OP_BACK_REF;
1890               token->opr.idx = c2 - '1';
1891             }
1892           break;
1893         case '<':
1894           if (!(syntax & RE_NO_GNU_OPS))
1895             {
1896               token->type = ANCHOR;
1897               token->opr.ctx_type = WORD_FIRST;
1898             }
1899           break;
1900         case '>':
1901           if (!(syntax & RE_NO_GNU_OPS))
1902             {
1903               token->type = ANCHOR;
1904               token->opr.ctx_type = WORD_LAST;
1905             }
1906           break;
1907         case 'b':
1908           if (!(syntax & RE_NO_GNU_OPS))
1909             {
1910               token->type = ANCHOR;
1911               token->opr.ctx_type = WORD_DELIM;
1912             }
1913           break;
1914         case 'B':
1915           if (!(syntax & RE_NO_GNU_OPS))
1916             {
1917               token->type = ANCHOR;
1918               token->opr.ctx_type = NOT_WORD_DELIM;
1919             }
1920           break;
1921         case 'w':
1922           if (!(syntax & RE_NO_GNU_OPS))
1923             token->type = OP_WORD;
1924           break;
1925         case 'W':
1926           if (!(syntax & RE_NO_GNU_OPS))
1927             token->type = OP_NOTWORD;
1928           break;
1929         case 's':
1930           if (!(syntax & RE_NO_GNU_OPS))
1931             token->type = OP_SPACE;
1932           break;
1933         case 'S':
1934           if (!(syntax & RE_NO_GNU_OPS))
1935             token->type = OP_NOTSPACE;
1936           break;
1937         case '`':
1938           if (!(syntax & RE_NO_GNU_OPS))
1939             {
1940               token->type = ANCHOR;
1941               token->opr.ctx_type = BUF_FIRST;
1942             }
1943           break;
1944         case '\'':
1945           if (!(syntax & RE_NO_GNU_OPS))
1946             {
1947               token->type = ANCHOR;
1948               token->opr.ctx_type = BUF_LAST;
1949             }
1950           break;
1951         case '(':
1952           if (!(syntax & RE_NO_BK_PARENS))
1953             token->type = OP_OPEN_SUBEXP;
1954           break;
1955         case ')':
1956           if (!(syntax & RE_NO_BK_PARENS))
1957             token->type = OP_CLOSE_SUBEXP;
1958           break;
1959         case '+':
1960           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1961             token->type = OP_DUP_PLUS;
1962           break;
1963         case '?':
1964           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1965             token->type = OP_DUP_QUESTION;
1966           break;
1967         case '{':
1968           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1969             token->type = OP_OPEN_DUP_NUM;
1970           break;
1971         case '}':
1972           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1973             token->type = OP_CLOSE_DUP_NUM;
1974           break;
1975         default:
1976           break;
1977         }
1978       return 2;
1979     }
1980
1981   token->type = CHARACTER;
1982 #ifdef RE_ENABLE_I18N
1983   if (input->mb_cur_max > 1)
1984     {
1985       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1986       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1987     }
1988   else
1989 #endif
1990     token->word_char = IS_WORD_CHAR (token->opr.c);
1991
1992   switch (c)
1993     {
1994     case '\n':
1995       if (syntax & RE_NEWLINE_ALT)
1996         token->type = OP_ALT;
1997       break;
1998     case '|':
1999       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
2000         token->type = OP_ALT;
2001       break;
2002     case '*':
2003       token->type = OP_DUP_ASTERISK;
2004       break;
2005     case '+':
2006       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
2007         token->type = OP_DUP_PLUS;
2008       break;
2009     case '?':
2010       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
2011         token->type = OP_DUP_QUESTION;
2012       break;
2013     case '{':
2014       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2015         token->type = OP_OPEN_DUP_NUM;
2016       break;
2017     case '}':
2018       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2019         token->type = OP_CLOSE_DUP_NUM;
2020       break;
2021     case '(':
2022       if (syntax & RE_NO_BK_PARENS)
2023         token->type = OP_OPEN_SUBEXP;
2024       break;
2025     case ')':
2026       if (syntax & RE_NO_BK_PARENS)
2027         token->type = OP_CLOSE_SUBEXP;
2028       break;
2029     case '[':
2030       token->type = OP_OPEN_BRACKET;
2031       break;
2032     case '.':
2033       token->type = OP_PERIOD;
2034       break;
2035     case '^':
2036       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2037           re_string_cur_idx (input) != 0)
2038         {
2039           char prev = re_string_peek_byte (input, -1);
2040           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2041             break;
2042         }
2043       token->type = ANCHOR;
2044       token->opr.ctx_type = LINE_FIRST;
2045       break;
2046     case '$':
2047       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2048           re_string_cur_idx (input) + 1 != re_string_length (input))
2049         {
2050           re_token_t next;
2051           re_string_skip_bytes (input, 1);
2052           peek_token (&next, input, syntax);
2053           re_string_skip_bytes (input, -1);
2054           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2055             break;
2056         }
2057       token->type = ANCHOR;
2058       token->opr.ctx_type = LINE_LAST;
2059       break;
2060     default:
2061       break;
2062     }
2063   return 1;
2064 }
2065
2066 /* Peek a token from INPUT, and return the length of the token.
2067    We must not use this function out of bracket expressions.  */
2068
2069 static int
2070 internal_function
2071 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2072 {
2073   unsigned char c;
2074   if (re_string_eoi (input))
2075     {
2076       token->type = END_OF_RE;
2077       return 0;
2078     }
2079   c = re_string_peek_byte (input, 0);
2080   token->opr.c = c;
2081
2082 #ifdef RE_ENABLE_I18N
2083   if (input->mb_cur_max > 1 &&
2084       !re_string_first_byte (input, re_string_cur_idx (input)))
2085     {
2086       token->type = CHARACTER;
2087       return 1;
2088     }
2089 #endif /* RE_ENABLE_I18N */
2090
2091   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2092       && re_string_cur_idx (input) + 1 < re_string_length (input))
2093     {
2094       /* In this case, '\' escape a character.  */
2095       unsigned char c2;
2096       re_string_skip_bytes (input, 1);
2097       c2 = re_string_peek_byte (input, 0);
2098       token->opr.c = c2;
2099       token->type = CHARACTER;
2100       return 1;
2101     }
2102   if (c == '[') /* '[' is a special char in a bracket exps.  */
2103     {
2104       unsigned char c2;
2105       int token_len;
2106       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2107         c2 = re_string_peek_byte (input, 1);
2108       else
2109         c2 = 0;
2110       token->opr.c = c2;
2111       token_len = 2;
2112       switch (c2)
2113         {
2114         case '.':
2115           token->type = OP_OPEN_COLL_ELEM;
2116           break;
2117         case '=':
2118           token->type = OP_OPEN_EQUIV_CLASS;
2119           break;
2120         case ':':
2121           if (syntax & RE_CHAR_CLASSES)
2122             {
2123               token->type = OP_OPEN_CHAR_CLASS;
2124               break;
2125             }
2126           /* else fall through.  */
2127         default:
2128           token->type = CHARACTER;
2129           token->opr.c = c;
2130           token_len = 1;
2131           break;
2132         }
2133       return token_len;
2134     }
2135   switch (c)
2136     {
2137     case '-':
2138       token->type = OP_CHARSET_RANGE;
2139       break;
2140     case ']':
2141       token->type = OP_CLOSE_BRACKET;
2142       break;
2143     case '^':
2144       token->type = OP_NON_MATCH_LIST;
2145       break;
2146     default:
2147       token->type = CHARACTER;
2148     }
2149   return 1;
2150 }
2151 \f
2152 /* Functions for parser.  */
2153
2154 /* Entry point of the parser.
2155    Parse the regular expression REGEXP and return the structure tree.
2156    If an error is occured, ERR is set by error code, and return NULL.
2157    This function build the following tree, from regular expression <reg_exp>:
2158            CAT
2159            / \
2160           /   \
2161    <reg_exp>  EOR
2162
2163    CAT means concatenation.
2164    EOR means end of regular expression.  */
2165
2166 static bin_tree_t *
2167 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2168        reg_errcode_t *err)
2169 {
2170   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2171   bin_tree_t *tree, *eor, *root;
2172   re_token_t current_token;
2173   dfa->syntax = syntax;
2174   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2175   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2176   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2177     return NULL;
2178   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2179   if (tree != NULL)
2180     root = create_tree (dfa, tree, eor, CONCAT);
2181   else
2182     root = eor;
2183   if (BE (eor == NULL || root == NULL, 0))
2184     {
2185       *err = REG_ESPACE;
2186       return NULL;
2187     }
2188   return root;
2189 }
2190
2191 /* This function build the following tree, from regular expression
2192    <branch1>|<branch2>:
2193            ALT
2194            / \
2195           /   \
2196    <branch1> <branch2>
2197
2198    ALT means alternative, which represents the operator `|'.  */
2199
2200 static bin_tree_t *
2201 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2202                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2203 {
2204   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2205   bin_tree_t *tree, *branch = NULL;
2206   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2207   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2208     return NULL;
2209
2210   while (token->type == OP_ALT)
2211     {
2212       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2213       if (token->type != OP_ALT && token->type != END_OF_RE
2214           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2215         {
2216           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2217           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2218             return NULL;
2219         }
2220       else
2221         branch = NULL;
2222       tree = create_tree (dfa, tree, branch, OP_ALT);
2223       if (BE (tree == NULL, 0))
2224         {
2225           *err = REG_ESPACE;
2226           return NULL;
2227         }
2228     }
2229   return tree;
2230 }
2231
2232 /* This function build the following tree, from regular expression
2233    <exp1><exp2>:
2234         CAT
2235         / \
2236        /   \
2237    <exp1> <exp2>
2238
2239    CAT means concatenation.  */
2240
2241 static bin_tree_t *
2242 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2243               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2244 {
2245   bin_tree_t *tree, *exp;
2246   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2247   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2248   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2249     return NULL;
2250
2251   while (token->type != OP_ALT && token->type != END_OF_RE
2252          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2253     {
2254       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2255       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2256         {
2257           if (tree != NULL)
2258             postorder (tree, free_tree, NULL);
2259           return NULL;
2260         }
2261       if (tree != NULL && exp != NULL)
2262         {
2263           bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2264           if (newtree == NULL)
2265             {
2266               postorder (exp, free_tree, NULL);
2267               postorder (tree, free_tree, NULL);
2268               *err = REG_ESPACE;
2269               return NULL;
2270             }
2271           tree = newtree;
2272         }
2273       else if (tree == NULL)
2274         tree = exp;
2275       /* Otherwise exp == NULL, we don't need to create new tree.  */
2276     }
2277   return tree;
2278 }
2279
2280 /* This function build the following tree, from regular expression a*:
2281          *
2282          |
2283          a
2284 */
2285
2286 static bin_tree_t *
2287 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2288                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2289 {
2290   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2291   bin_tree_t *tree;
2292   switch (token->type)
2293     {
2294     case CHARACTER:
2295       tree = create_token_tree (dfa, NULL, NULL, token);
2296       if (BE (tree == NULL, 0))
2297         {
2298           *err = REG_ESPACE;
2299           return NULL;
2300         }
2301 #ifdef RE_ENABLE_I18N
2302       if (dfa->mb_cur_max > 1)
2303         {
2304           while (!re_string_eoi (regexp)
2305                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2306             {
2307               bin_tree_t *mbc_remain;
2308               fetch_token (token, regexp, syntax);
2309               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2310               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2311               if (BE (mbc_remain == NULL || tree == NULL, 0))
2312                 {
2313                   *err = REG_ESPACE;
2314                   return NULL;
2315                 }
2316             }
2317         }
2318 #endif
2319       break;
2320     case OP_OPEN_SUBEXP:
2321       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2322       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2323         return NULL;
2324       break;
2325     case OP_OPEN_BRACKET:
2326       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2327       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2328         return NULL;
2329       break;
2330     case OP_BACK_REF:
2331       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2332         {
2333           *err = REG_ESUBREG;
2334           return NULL;
2335         }
2336       dfa->used_bkref_map |= 1 << token->opr.idx;
2337       tree = create_token_tree (dfa, NULL, NULL, token);
2338       if (BE (tree == NULL, 0))
2339         {
2340           *err = REG_ESPACE;
2341           return NULL;
2342         }
2343       ++dfa->nbackref;
2344       dfa->has_mb_node = 1;
2345       break;
2346     case OP_OPEN_DUP_NUM:
2347       if (syntax & RE_CONTEXT_INVALID_DUP)
2348         {
2349           *err = REG_BADRPT;
2350           return NULL;
2351         }
2352       /* FALLTHROUGH */
2353     case OP_DUP_ASTERISK:
2354     case OP_DUP_PLUS:
2355     case OP_DUP_QUESTION:
2356       if (syntax & RE_CONTEXT_INVALID_OPS)
2357         {
2358           *err = REG_BADRPT;
2359           return NULL;
2360         }
2361       else if (syntax & RE_CONTEXT_INDEP_OPS)
2362         {
2363           fetch_token (token, regexp, syntax);
2364           return parse_expression (regexp, preg, token, syntax, nest, err);
2365         }
2366       /* else fall through  */
2367     case OP_CLOSE_SUBEXP:
2368       if ((token->type == OP_CLOSE_SUBEXP) &&
2369           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2370         {
2371           *err = REG_ERPAREN;
2372           return NULL;
2373         }
2374       /* else fall through  */
2375     case OP_CLOSE_DUP_NUM:
2376       /* We treat it as a normal character.  */
2377
2378       /* Then we can these characters as normal characters.  */
2379       token->type = CHARACTER;
2380       /* mb_partial and word_char bits should be initialized already
2381          by peek_token.  */
2382       tree = create_token_tree (dfa, NULL, NULL, token);
2383       if (BE (tree == NULL, 0))
2384         {
2385           *err = REG_ESPACE;
2386           return NULL;
2387         }
2388       break;
2389     case ANCHOR:
2390       if ((token->opr.ctx_type
2391            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2392           && dfa->word_ops_used == 0)
2393         init_word_char (dfa);
2394       if (token->opr.ctx_type == WORD_DELIM
2395           || token->opr.ctx_type == NOT_WORD_DELIM)
2396         {
2397           bin_tree_t *tree_first, *tree_last;
2398           if (token->opr.ctx_type == WORD_DELIM)
2399             {
2400               token->opr.ctx_type = WORD_FIRST;
2401               tree_first = create_token_tree (dfa, NULL, NULL, token);
2402               token->opr.ctx_type = WORD_LAST;
2403             }
2404           else
2405             {
2406               token->opr.ctx_type = INSIDE_WORD;
2407               tree_first = create_token_tree (dfa, NULL, NULL, token);
2408               token->opr.ctx_type = INSIDE_NOTWORD;
2409             }
2410           tree_last = create_token_tree (dfa, NULL, NULL, token);
2411           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2412           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2413             {
2414               *err = REG_ESPACE;
2415               return NULL;
2416             }
2417         }
2418       else
2419         {
2420           tree = create_token_tree (dfa, NULL, NULL, token);
2421           if (BE (tree == NULL, 0))
2422             {
2423               *err = REG_ESPACE;
2424               return NULL;
2425             }
2426         }
2427       /* We must return here, since ANCHORs can't be followed
2428          by repetition operators.
2429          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2430              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2431       fetch_token (token, regexp, syntax);
2432       return tree;
2433     case OP_PERIOD:
2434       tree = create_token_tree (dfa, NULL, NULL, token);
2435       if (BE (tree == NULL, 0))
2436         {
2437           *err = REG_ESPACE;
2438           return NULL;
2439         }
2440       if (dfa->mb_cur_max > 1)
2441         dfa->has_mb_node = 1;
2442       break;
2443     case OP_WORD:
2444     case OP_NOTWORD:
2445       tree = build_charclass_op (dfa, regexp->trans,
2446                                  "alnum",
2447                                  "_",
2448                                  token->type == OP_NOTWORD, err);
2449       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2450         return NULL;
2451       break;
2452     case OP_SPACE:
2453     case OP_NOTSPACE:
2454       tree = build_charclass_op (dfa, regexp->trans,
2455                                  "space",
2456                                  "",
2457                                  token->type == OP_NOTSPACE, err);
2458       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2459         return NULL;
2460       break;
2461     case OP_ALT:
2462     case END_OF_RE:
2463       return NULL;
2464     case BACK_SLASH:
2465       *err = REG_EESCAPE;
2466       return NULL;
2467     default:
2468       /* Must not happen?  */
2469 #ifdef DEBUG
2470       assert (0);
2471 #endif
2472       return NULL;
2473     }
2474   fetch_token (token, regexp, syntax);
2475
2476   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2477          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2478     {
2479       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2480       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2481         return NULL;
2482       /* In BRE consecutive duplications are not allowed.  */
2483       if ((syntax & RE_CONTEXT_INVALID_DUP)
2484           && (token->type == OP_DUP_ASTERISK
2485               || token->type == OP_OPEN_DUP_NUM))
2486         {
2487           *err = REG_BADRPT;
2488           return NULL;
2489         }
2490     }
2491
2492   return tree;
2493 }
2494
2495 /* This function build the following tree, from regular expression
2496    (<reg_exp>):
2497          SUBEXP
2498             |
2499         <reg_exp>
2500 */
2501
2502 static bin_tree_t *
2503 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2504                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2505 {
2506   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2507   bin_tree_t *tree;
2508   size_t cur_nsub;
2509   cur_nsub = preg->re_nsub++;
2510
2511   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2512
2513   /* The subexpression may be a null string.  */
2514   if (token->type == OP_CLOSE_SUBEXP)
2515     tree = NULL;
2516   else
2517     {
2518       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2519       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2520         {
2521           if (tree != NULL)
2522             postorder (tree, free_tree, NULL);
2523           *err = REG_EPAREN;
2524         }
2525       if (BE (*err != REG_NOERROR, 0))
2526         return NULL;
2527     }
2528
2529   if (cur_nsub <= '9' - '1')
2530     dfa->completed_bkref_map |= 1 << cur_nsub;
2531
2532   tree = create_tree (dfa, tree, NULL, SUBEXP);
2533   if (BE (tree == NULL, 0))
2534     {
2535       *err = REG_ESPACE;
2536       return NULL;
2537     }
2538   tree->token.opr.idx = cur_nsub;
2539   return tree;
2540 }
2541
2542 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2543
2544 static bin_tree_t *
2545 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2546               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2547 {
2548   bin_tree_t *tree = NULL, *old_tree = NULL;
2549   int i, start, end, start_idx = re_string_cur_idx (regexp);
2550   re_token_t start_token = *token;
2551
2552   if (token->type == OP_OPEN_DUP_NUM)
2553     {
2554       end = 0;
2555       start = fetch_number (regexp, token, syntax);
2556       if (start == -1)
2557         {
2558           if (token->type == CHARACTER && token->opr.c == ',')
2559             start = 0; /* We treat "{,m}" as "{0,m}".  */
2560           else
2561             {
2562               *err = REG_BADBR; /* <re>{} is invalid.  */
2563               return NULL;
2564             }
2565         }
2566       if (BE (start != -2, 1))
2567         {
2568           /* We treat "{n}" as "{n,n}".  */
2569           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2570                  : ((token->type == CHARACTER && token->opr.c == ',')
2571                     ? fetch_number (regexp, token, syntax) : -2));
2572         }
2573       if (BE (start == -2 || end == -2, 0))
2574         {
2575           /* Invalid sequence.  */
2576           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2577             {
2578               if (token->type == END_OF_RE)
2579                 *err = REG_EBRACE;
2580               else
2581                 *err = REG_BADBR;
2582
2583               return NULL;
2584             }
2585
2586           /* If the syntax bit is set, rollback.  */
2587           re_string_set_index (regexp, start_idx);
2588           *token = start_token;
2589           token->type = CHARACTER;
2590           /* mb_partial and word_char bits should be already initialized by
2591              peek_token.  */
2592           return elem;
2593         }
2594
2595       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2596         {
2597           /* First number greater than second.  */
2598           *err = REG_BADBR;
2599           return NULL;
2600         }
2601     }
2602   else
2603     {
2604       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2605       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2606     }
2607
2608   fetch_token (token, regexp, syntax);
2609
2610   if (BE (elem == NULL, 0))
2611     return NULL;
2612   if (BE (start == 0 && end == 0, 0))
2613     {
2614       postorder (elem, free_tree, NULL);
2615       return NULL;
2616     }
2617
2618   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2619   if (BE (start > 0, 0))
2620     {
2621       tree = elem;
2622       for (i = 2; i <= start; ++i)
2623         {
2624           elem = duplicate_tree (elem, dfa);
2625           tree = create_tree (dfa, tree, elem, CONCAT);
2626           if (BE (elem == NULL || tree == NULL, 0))
2627             goto parse_dup_op_espace;
2628         }
2629
2630       if (start == end)
2631         return tree;
2632
2633       /* Duplicate ELEM before it is marked optional.  */
2634       elem = duplicate_tree (elem, dfa);
2635       old_tree = tree;
2636     }
2637   else
2638     old_tree = NULL;
2639
2640   if (elem->token.type == SUBEXP)
2641     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2642
2643   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2644   if (BE (tree == NULL, 0))
2645     goto parse_dup_op_espace;
2646
2647   /* This loop is actually executed only when end != -1,
2648      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2649      already created the start+1-th copy.  */
2650   for (i = start + 2; i <= end; ++i)
2651     {
2652       elem = duplicate_tree (elem, dfa);
2653       tree = create_tree (dfa, tree, elem, CONCAT);
2654       if (BE (elem == NULL || tree == NULL, 0))
2655         goto parse_dup_op_espace;
2656
2657       tree = create_tree (dfa, tree, NULL, OP_ALT);
2658       if (BE (tree == NULL, 0))
2659         goto parse_dup_op_espace;
2660     }
2661
2662   if (old_tree)
2663     tree = create_tree (dfa, old_tree, tree, CONCAT);
2664
2665   return tree;
2666
2667  parse_dup_op_espace:
2668   *err = REG_ESPACE;
2669   return NULL;
2670 }
2671
2672 /* Size of the names for collating symbol/equivalence_class/character_class.
2673    I'm not sure, but maybe enough.  */
2674 #define BRACKET_NAME_BUF_SIZE 32
2675
2676 #ifndef _LIBC
2677   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2678      Build the range expression which starts from START_ELEM, and ends
2679      at END_ELEM.  The result are written to MBCSET and SBCSET.
2680      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2681      mbcset->range_ends, is a pointer argument sinse we may
2682      update it.  */
2683
2684 static reg_errcode_t
2685 internal_function
2686 # ifdef RE_ENABLE_I18N
2687 build_range_exp (reg_syntax_t syntax, bitset_t sbcset, re_charset_t *mbcset,
2688                 int *range_alloc, bracket_elem_t *start_elem,
2689                 bracket_elem_t *end_elem)
2690 # else /* not RE_ENABLE_I18N */
2691 build_range_exp (reg_syntax_t syntax, bitset_t sbcset,
2692                 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2693 # endif /* not RE_ENABLE_I18N */
2694 {
2695   unsigned int start_ch, end_ch;
2696   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2697   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2698           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2699           0))
2700     return REG_ERANGE;
2701
2702   /* We can handle no multi character collating elements without libc
2703      support.  */
2704   if (BE ((start_elem->type == COLL_SYM
2705            && strlen ((char *) start_elem->opr.name) > 1)
2706           || (end_elem->type == COLL_SYM
2707               && strlen ((char *) end_elem->opr.name) > 1), 0))
2708     return REG_ECOLLATE;
2709
2710 # ifdef RE_ENABLE_I18N
2711   {
2712     wchar_t wc;
2713     wint_t start_wc;
2714     wint_t end_wc;
2715
2716     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2717                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2718                    : 0));
2719     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2720               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2721                  : 0));
2722 #ifdef GAWK
2723     /*
2724      * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2725      * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2726      * unsigned, so we don't have sign extension problems.
2727      */
2728     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2729                 ? start_ch : start_elem->opr.wch);
2730     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2731               ? end_ch : end_elem->opr.wch);
2732 #else
2733     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2734                 ? __btowc (start_ch) : start_elem->opr.wch);
2735     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2736               ? __btowc (end_ch) : end_elem->opr.wch);
2737 #endif
2738     if (start_wc == WEOF || end_wc == WEOF)
2739       return REG_ECOLLATE;
2740     else if ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc)
2741       return REG_ERANGE;
2742
2743     /* Got valid collation sequence values, add them as a new entry.
2744        However, for !_LIBC we have no collation elements: if the
2745        character set is single byte, the single byte character set
2746        that we build below suffices.  parse_bracket_exp passes
2747        no MBCSET if dfa->mb_cur_max == 1.  */
2748     if (mbcset)
2749       {
2750         /* Check the space of the arrays.  */
2751         if (BE (*range_alloc == mbcset->nranges, 0))
2752           {
2753             /* There is not enough space, need realloc.  */
2754             wchar_t *new_array_start, *new_array_end;
2755             int new_nranges;
2756
2757             /* +1 in case of mbcset->nranges is 0.  */
2758             new_nranges = 2 * mbcset->nranges + 1;
2759             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2760                are NULL if *range_alloc == 0.  */
2761             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2762                                           new_nranges);
2763             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2764                                         new_nranges);
2765
2766             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2767               {
2768                  /* if one is not NULL, free it to avoid leaks */
2769                  if (new_array_start != NULL)
2770                      re_free(new_array_start);
2771                  if (new_array_end != NULL)
2772                      re_free(new_array_end);
2773                  return REG_ESPACE;
2774               }
2775
2776             mbcset->range_starts = new_array_start;
2777             mbcset->range_ends = new_array_end;
2778             *range_alloc = new_nranges;
2779           }
2780
2781         mbcset->range_starts[mbcset->nranges] = start_wc;
2782         mbcset->range_ends[mbcset->nranges++] = end_wc;
2783       }
2784
2785     /* Build the table for single byte characters.  */
2786     for (wc = 0; wc < SBC_MAX; ++wc)
2787       {
2788          if (start_wc <= wc && wc <= end_wc)
2789            bitset_set (sbcset, wc);
2790       }
2791   }
2792 # else /* not RE_ENABLE_I18N */
2793   {
2794     unsigned int ch;
2795     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2796                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2797                    : 0));
2798     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2799               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2800                  : 0));
2801     if (start_ch > end_ch)
2802       return REG_ERANGE;
2803     /* Build the table for single byte characters.  */
2804     for (ch = 0; ch < SBC_MAX; ++ch)
2805       if (start_ch <= ch  && ch <= end_ch)
2806         bitset_set (sbcset, ch);
2807   }
2808 # endif /* not RE_ENABLE_I18N */
2809   return REG_NOERROR;
2810 }
2811 #endif /* not _LIBC */
2812
2813 #ifndef _LIBC
2814 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2815    Build the collating element which is represented by NAME.
2816    The result are written to MBCSET and SBCSET.
2817    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2818    pointer argument since we may update it.  */
2819
2820 static reg_errcode_t
2821 internal_function
2822 # ifdef RE_ENABLE_I18N
2823 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2824                         int *coll_sym_alloc, const unsigned char *name)
2825 # else /* not RE_ENABLE_I18N */
2826 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2827 # endif /* not RE_ENABLE_I18N */
2828 {
2829   size_t name_len = strlen ((const char *) name);
2830   if (BE (name_len != 1, 0))
2831     return REG_ECOLLATE;
2832   else
2833     {
2834       bitset_set (sbcset, name[0]);
2835       return REG_NOERROR;
2836     }
2837 }
2838 #endif /* not _LIBC */
2839
2840 /* This function parse bracket expression like "[abc]", "[a-c]",
2841    "[[.a-a.]]" etc.  */
2842
2843 static bin_tree_t *
2844 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2845                    reg_syntax_t syntax, reg_errcode_t *err)
2846 {
2847 #ifdef _LIBC
2848   const unsigned char *collseqmb;
2849   const char *collseqwc;
2850   uint32_t nrules;
2851   int32_t table_size;
2852   const int32_t *symb_table;
2853   const unsigned char *extra;
2854
2855   /* Local function for parse_bracket_exp used in _LIBC environement.
2856      Seek the collating symbol entry correspondings to NAME.
2857      Return the index of the symbol in the SYMB_TABLE,
2858      or -1 if not found.  */
2859
2860   auto inline int32_t
2861   __attribute ((always_inline))
2862   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2863     {
2864       int32_t elem;
2865
2866       for (elem = 0; elem < table_size; elem++)
2867         if (symb_table[2 * elem] != 0)
2868           {
2869             int32_t idx = symb_table[2 * elem + 1];
2870             /* Skip the name of collating element name.  */
2871             idx += 1 + extra[idx];
2872             if (/* Compare the length of the name.  */
2873                 name_len == extra[idx]
2874                 /* Compare the name.  */
2875                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2876               /* Yep, this is the entry.  */
2877               return elem;
2878           }
2879       return -1;
2880     }
2881
2882   /* Local function for parse_bracket_exp used in _LIBC environment.
2883      Look up the collation sequence value of BR_ELEM.
2884      Return the value if succeeded, UINT_MAX otherwise.  */
2885
2886   auto inline unsigned int
2887   __attribute ((always_inline))
2888   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2889     {
2890       if (br_elem->type == SB_CHAR)
2891         {
2892           /*
2893           if (MB_CUR_MAX == 1)
2894           */
2895           if (nrules == 0)
2896             return collseqmb[br_elem->opr.ch];
2897           else
2898             {
2899               wint_t wc = __btowc (br_elem->opr.ch);
2900               return __collseq_table_lookup (collseqwc, wc);
2901             }
2902         }
2903       else if (br_elem->type == MB_CHAR)
2904         {
2905           if (nrules != 0)
2906             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2907         }
2908       else if (br_elem->type == COLL_SYM)
2909         {
2910           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2911           if (nrules != 0)
2912             {
2913               int32_t elem, idx;
2914               elem = seek_collating_symbol_entry (br_elem->opr.name,
2915                                                   sym_name_len);
2916               if (elem != -1)
2917                 {
2918                   /* We found the entry.  */
2919                   idx = symb_table[2 * elem + 1];
2920                   /* Skip the name of collating element name.  */
2921                   idx += 1 + extra[idx];
2922                   /* Skip the byte sequence of the collating element.  */
2923                   idx += 1 + extra[idx];
2924                   /* Adjust for the alignment.  */
2925                   idx = (idx + 3) & ~3;
2926                   /* Skip the multibyte collation sequence value.  */
2927                   idx += sizeof (unsigned int);
2928                   /* Skip the wide char sequence of the collating element.  */
2929                   idx += sizeof (unsigned int) *
2930                     (1 + *(unsigned int *) (extra + idx));
2931                   /* Return the collation sequence value.  */
2932                   return *(unsigned int *) (extra + idx);
2933                 }
2934               else if (sym_name_len == 1)
2935                 {
2936                   /* No valid character.  Match it as a single byte
2937                      character.  */
2938                   return collseqmb[br_elem->opr.name[0]];
2939                 }
2940             }
2941           else if (sym_name_len == 1)
2942             return collseqmb[br_elem->opr.name[0]];
2943         }
2944       return UINT_MAX;
2945     }
2946
2947   /* Local function for parse_bracket_exp used in _LIBC environement.
2948      Build the range expression which starts from START_ELEM, and ends
2949      at END_ELEM.  The result are written to MBCSET and SBCSET.
2950      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2951      mbcset->range_ends, is a pointer argument sinse we may
2952      update it.  */
2953
2954   auto inline reg_errcode_t
2955   __attribute ((always_inline))
2956   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2957                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2958     {
2959       unsigned int ch;
2960       uint32_t start_collseq;
2961       uint32_t end_collseq;
2962
2963       /* Equivalence Classes and Character Classes can't be a range
2964          start/end.  */
2965       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2966               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2967               0))
2968         return REG_ERANGE;
2969
2970       start_collseq = lookup_collation_sequence_value (start_elem);
2971       end_collseq = lookup_collation_sequence_value (end_elem);
2972       /* Check start/end collation sequence values.  */
2973       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2974         return REG_ECOLLATE;
2975       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2976         return REG_ERANGE;
2977
2978       /* Got valid collation sequence values, add them as a new entry.
2979          However, if we have no collation elements, and the character set
2980          is single byte, the single byte character set that we
2981          build below suffices. */
2982       if (nrules > 0 || dfa->mb_cur_max > 1)
2983         {
2984           /* Check the space of the arrays.  */
2985           if (BE (*range_alloc == mbcset->nranges, 0))
2986             {
2987               /* There is not enough space, need realloc.  */
2988               uint32_t *new_array_start;
2989               uint32_t *new_array_end;
2990               int new_nranges;
2991
2992               /* +1 in case of mbcset->nranges is 0.  */
2993               new_nranges = 2 * mbcset->nranges + 1;
2994               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2995                                             new_nranges);
2996               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2997                                           new_nranges);
2998
2999               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3000                 return REG_ESPACE;
3001
3002               mbcset->range_starts = new_array_start;
3003               mbcset->range_ends = new_array_end;
3004               *range_alloc = new_nranges;
3005             }
3006
3007           mbcset->range_starts[mbcset->nranges] = start_collseq;
3008           mbcset->range_ends[mbcset->nranges++] = end_collseq;
3009         }
3010
3011       /* Build the table for single byte characters.  */
3012       for (ch = 0; ch < SBC_MAX; ch++)
3013         {
3014           uint32_t ch_collseq;
3015           /*
3016           if (MB_CUR_MAX == 1)
3017           */
3018           if (nrules == 0)
3019             ch_collseq = collseqmb[ch];
3020           else
3021             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3022           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3023             bitset_set (sbcset, ch);
3024         }
3025       return REG_NOERROR;
3026     }
3027
3028   /* Local function for parse_bracket_exp used in _LIBC environement.
3029      Build the collating element which is represented by NAME.
3030      The result are written to MBCSET and SBCSET.
3031      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3032      pointer argument sinse we may update it.  */
3033
3034   auto inline reg_errcode_t
3035   __attribute ((always_inline))
3036   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3037                           int *coll_sym_alloc, const unsigned char *name)
3038     {
3039       int32_t elem, idx;
3040       size_t name_len = strlen ((const char *) name);
3041       if (nrules != 0)
3042         {
3043           elem = seek_collating_symbol_entry (name, name_len);
3044           if (elem != -1)
3045             {
3046               /* We found the entry.  */
3047               idx = symb_table[2 * elem + 1];
3048               /* Skip the name of collating element name.  */
3049               idx += 1 + extra[idx];
3050             }
3051           else if (name_len == 1)
3052             {
3053               /* No valid character, treat it as a normal
3054                  character.  */
3055               bitset_set (sbcset, name[0]);
3056               return REG_NOERROR;
3057             }
3058           else
3059             return REG_ECOLLATE;
3060
3061           /* Got valid collation sequence, add it as a new entry.  */
3062           /* Check the space of the arrays.  */
3063           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3064             {
3065               /* Not enough, realloc it.  */
3066               /* +1 in case of mbcset->ncoll_syms is 0.  */
3067               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3068               /* Use realloc since mbcset->coll_syms is NULL
3069                  if *alloc == 0.  */
3070               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3071                                                    new_coll_sym_alloc);
3072               if (BE (new_coll_syms == NULL, 0))
3073                 return REG_ESPACE;
3074               mbcset->coll_syms = new_coll_syms;
3075               *coll_sym_alloc = new_coll_sym_alloc;
3076             }
3077           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3078           return REG_NOERROR;
3079         }
3080       else
3081         {
3082           if (BE (name_len != 1, 0))
3083             return REG_ECOLLATE;
3084           else
3085             {
3086               bitset_set (sbcset, name[0]);
3087               return REG_NOERROR;
3088             }
3089         }
3090     }
3091 #endif
3092
3093   re_token_t br_token;
3094   re_bitset_ptr_t sbcset;
3095 #ifdef RE_ENABLE_I18N
3096   re_charset_t *mbcset;
3097   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3098   int equiv_class_alloc = 0, char_class_alloc = 0;
3099 #endif /* not RE_ENABLE_I18N */
3100   int non_match = 0;
3101   bin_tree_t *work_tree;
3102   int token_len;
3103   int first_round = 1;
3104 #ifdef _LIBC
3105   collseqmb = (const unsigned char *)
3106     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3107   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3108   if (nrules)
3109     {
3110       /*
3111       if (MB_CUR_MAX > 1)
3112       */
3113       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3114       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3115       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3116                                                   _NL_COLLATE_SYMB_TABLEMB);
3117       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3118                                                    _NL_COLLATE_SYMB_EXTRAMB);
3119     }
3120 #endif
3121   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3122 #ifdef RE_ENABLE_I18N
3123   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3124 #endif /* RE_ENABLE_I18N */
3125 #ifdef RE_ENABLE_I18N
3126   if (BE (sbcset == NULL || mbcset == NULL, 0))
3127 #else
3128   if (BE (sbcset == NULL, 0))
3129 #endif /* RE_ENABLE_I18N */
3130     {
3131 #ifdef RE_ENABLE_I18N
3132       re_free (sbcset);
3133       re_free (mbcset);
3134 #endif
3135       *err = REG_ESPACE;
3136       return NULL;
3137     }
3138
3139   token_len = peek_token_bracket (token, regexp, syntax);
3140   if (BE (token->type == END_OF_RE, 0))
3141     {
3142       *err = REG_BADPAT;
3143       goto parse_bracket_exp_free_return;
3144     }
3145   if (token->type == OP_NON_MATCH_LIST)
3146     {
3147 #ifdef RE_ENABLE_I18N
3148       mbcset->non_match = 1;
3149 #endif /* not RE_ENABLE_I18N */
3150       non_match = 1;
3151       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3152         bitset_set (sbcset, '\n');
3153       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3154       token_len = peek_token_bracket (token, regexp, syntax);
3155       if (BE (token->type == END_OF_RE, 0))
3156         {
3157           *err = REG_BADPAT;
3158           goto parse_bracket_exp_free_return;
3159         }
3160     }
3161
3162   /* We treat the first ']' as a normal character.  */
3163   if (token->type == OP_CLOSE_BRACKET)
3164     token->type = CHARACTER;
3165
3166   while (1)
3167     {
3168       bracket_elem_t start_elem, end_elem;
3169       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3170       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3171       reg_errcode_t ret;
3172       int token_len2 = 0, is_range_exp = 0;
3173       re_token_t token2;
3174
3175       start_elem.opr.name = start_name_buf;
3176       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3177                                    syntax, first_round);
3178       if (BE (ret != REG_NOERROR, 0))
3179         {
3180           *err = ret;
3181           goto parse_bracket_exp_free_return;
3182         }
3183       first_round = 0;
3184
3185       /* Get information about the next token.  We need it in any case.  */
3186       token_len = peek_token_bracket (token, regexp, syntax);
3187
3188       /* Do not check for ranges if we know they are not allowed.  */
3189       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3190         {
3191           if (BE (token->type == END_OF_RE, 0))
3192             {
3193               *err = REG_EBRACK;
3194               goto parse_bracket_exp_free_return;
3195             }
3196           if (token->type == OP_CHARSET_RANGE)
3197             {
3198               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3199               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3200               if (BE (token2.type == END_OF_RE, 0))
3201                 {
3202                   *err = REG_EBRACK;
3203                   goto parse_bracket_exp_free_return;
3204                 }
3205               if (token2.type == OP_CLOSE_BRACKET)
3206                 {
3207                   /* We treat the last '-' as a normal character.  */
3208                   re_string_skip_bytes (regexp, -token_len);
3209                   token->type = CHARACTER;
3210                 }
3211               else
3212                 is_range_exp = 1;
3213             }
3214         }
3215
3216       if (is_range_exp == 1)
3217         {
3218           end_elem.opr.name = end_name_buf;
3219           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3220                                        dfa, syntax, 1);
3221           if (BE (ret != REG_NOERROR, 0))
3222             {
3223               *err = ret;
3224               goto parse_bracket_exp_free_return;
3225             }
3226
3227           token_len = peek_token_bracket (token, regexp, syntax);
3228
3229 #ifdef _LIBC
3230           *err = build_range_exp (syntax, sbcset, mbcset, &range_alloc,
3231                                   &start_elem, &end_elem);
3232 #else
3233 # ifdef RE_ENABLE_I18N
3234           *err = build_range_exp (syntax, sbcset,
3235                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3236                                   &range_alloc, &start_elem, &end_elem);
3237 # else
3238           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3239 # endif
3240 #endif /* RE_ENABLE_I18N */
3241           if (BE (*err != REG_NOERROR, 0))
3242             goto parse_bracket_exp_free_return;
3243         }
3244       else
3245         {
3246           switch (start_elem.type)
3247             {
3248             case SB_CHAR:
3249               bitset_set (sbcset, start_elem.opr.ch);
3250               break;
3251 #ifdef RE_ENABLE_I18N
3252             case MB_CHAR:
3253               /* Check whether the array has enough space.  */
3254               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3255                 {
3256                   wchar_t *new_mbchars;
3257                   /* Not enough, realloc it.  */
3258                   /* +1 in case of mbcset->nmbchars is 0.  */
3259                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3260                   /* Use realloc since array is NULL if *alloc == 0.  */
3261                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3262                                             mbchar_alloc);
3263                   if (BE (new_mbchars == NULL, 0))
3264                     goto parse_bracket_exp_espace;
3265                   mbcset->mbchars = new_mbchars;
3266                 }
3267               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3268               break;
3269 #endif /* RE_ENABLE_I18N */
3270             case EQUIV_CLASS:
3271               *err = build_equiv_class (sbcset,
3272 #ifdef RE_ENABLE_I18N
3273                                         mbcset, &equiv_class_alloc,
3274 #endif /* RE_ENABLE_I18N */
3275                                         start_elem.opr.name);
3276               if (BE (*err != REG_NOERROR, 0))
3277                 goto parse_bracket_exp_free_return;
3278               break;
3279             case COLL_SYM:
3280               *err = build_collating_symbol (sbcset,
3281 #ifdef RE_ENABLE_I18N
3282                                              mbcset, &coll_sym_alloc,
3283 #endif /* RE_ENABLE_I18N */
3284                                              start_elem.opr.name);
3285               if (BE (*err != REG_NOERROR, 0))
3286                 goto parse_bracket_exp_free_return;
3287               break;
3288             case CHAR_CLASS:
3289               *err = build_charclass (regexp->trans, sbcset,
3290 #ifdef RE_ENABLE_I18N
3291                                       mbcset, &char_class_alloc,
3292 #endif /* RE_ENABLE_I18N */
3293                                       (const char *) start_elem.opr.name, syntax);
3294               if (BE (*err != REG_NOERROR, 0))
3295                goto parse_bracket_exp_free_return;
3296               break;
3297             default:
3298               assert (0);
3299               break;
3300             }
3301         }
3302       if (BE (token->type == END_OF_RE, 0))
3303         {
3304           *err = REG_EBRACK;
3305           goto parse_bracket_exp_free_return;
3306         }
3307       if (token->type == OP_CLOSE_BRACKET)
3308         break;
3309     }
3310
3311   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3312
3313   /* If it is non-matching list.  */
3314   if (non_match)
3315     bitset_not (sbcset);
3316
3317 #ifdef RE_ENABLE_I18N
3318   /* Ensure only single byte characters are set.  */
3319   if (dfa->mb_cur_max > 1)
3320     bitset_mask (sbcset, dfa->sb_char);
3321
3322   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3323       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3324                                                      || mbcset->non_match)))
3325     {
3326       bin_tree_t *mbc_tree;
3327       int sbc_idx;
3328       /* Build a tree for complex bracket.  */
3329       dfa->has_mb_node = 1;
3330       br_token.type = COMPLEX_BRACKET;
3331       br_token.opr.mbcset = mbcset;
3332       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3333       if (BE (mbc_tree == NULL, 0))
3334         goto parse_bracket_exp_espace;
3335       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3336         if (sbcset[sbc_idx])
3337           break;
3338       /* If there are no bits set in sbcset, there is no point
3339          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3340       if (sbc_idx < BITSET_WORDS)
3341         {
3342           /* Build a tree for simple bracket.  */
3343           br_token.type = SIMPLE_BRACKET;
3344           br_token.opr.sbcset = sbcset;
3345           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3346           if (BE (work_tree == NULL, 0))
3347             goto parse_bracket_exp_espace;
3348
3349           /* Then join them by ALT node.  */
3350           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3351           if (BE (work_tree == NULL, 0))
3352             goto parse_bracket_exp_espace;
3353         }
3354       else
3355         {
3356           re_free (sbcset);
3357           work_tree = mbc_tree;
3358         }
3359     }
3360   else
3361 #endif /* not RE_ENABLE_I18N */
3362     {
3363 #ifdef RE_ENABLE_I18N
3364       free_charset (mbcset);
3365 #endif
3366       /* Build a tree for simple bracket.  */
3367       br_token.type = SIMPLE_BRACKET;
3368       br_token.opr.sbcset = sbcset;
3369       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3370       if (BE (work_tree == NULL, 0))
3371         goto parse_bracket_exp_espace;
3372     }
3373   return work_tree;
3374
3375  parse_bracket_exp_espace:
3376   *err = REG_ESPACE;
3377  parse_bracket_exp_free_return:
3378   re_free (sbcset);
3379 #ifdef RE_ENABLE_I18N
3380   free_charset (mbcset);
3381 #endif /* RE_ENABLE_I18N */
3382   return NULL;
3383 }
3384
3385 /* Parse an element in the bracket expression.  */
3386
3387 static reg_errcode_t
3388 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3389                        re_token_t *token, int token_len, re_dfa_t *dfa,
3390                        reg_syntax_t syntax, int accept_hyphen)
3391 {
3392 #ifdef RE_ENABLE_I18N
3393   int cur_char_size;
3394   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3395   if (cur_char_size > 1)
3396     {
3397       elem->type = MB_CHAR;
3398       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3399       re_string_skip_bytes (regexp, cur_char_size);
3400       return REG_NOERROR;
3401     }
3402 #endif /* RE_ENABLE_I18N */
3403   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3404   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3405       || token->type == OP_OPEN_EQUIV_CLASS)
3406     return parse_bracket_symbol (elem, regexp, token);
3407   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3408     {
3409       /* A '-' must only appear as anything but a range indicator before
3410          the closing bracket.  Everything else is an error.  */
3411       re_token_t token2;
3412       (void) peek_token_bracket (&token2, regexp, syntax);
3413       if (token2.type != OP_CLOSE_BRACKET)
3414         /* The actual error value is not standardized since this whole
3415            case is undefined.  But ERANGE makes good sense.  */
3416         return REG_ERANGE;
3417     }
3418   elem->type = SB_CHAR;
3419   elem->opr.ch = token->opr.c;
3420   return REG_NOERROR;
3421 }
3422
3423 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3424    such as [:<character_class>:], [.<collating_element>.], and
3425    [=<equivalent_class>=].  */
3426
3427 static reg_errcode_t
3428 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3429                       re_token_t *token)
3430 {
3431   unsigned char ch, delim = token->opr.c;
3432   int i = 0;
3433   if (re_string_eoi(regexp))
3434     return REG_EBRACK;
3435   for (;; ++i)
3436     {
3437       if (i >= BRACKET_NAME_BUF_SIZE)
3438         return REG_EBRACK;
3439       if (token->type == OP_OPEN_CHAR_CLASS)
3440         ch = re_string_fetch_byte_case (regexp);
3441       else
3442         ch = re_string_fetch_byte (regexp);
3443       if (re_string_eoi(regexp))
3444         return REG_EBRACK;
3445       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3446         break;
3447       elem->opr.name[i] = ch;
3448     }
3449   re_string_skip_bytes (regexp, 1);
3450   elem->opr.name[i] = '\0';
3451   switch (token->type)
3452     {
3453     case OP_OPEN_COLL_ELEM:
3454       elem->type = COLL_SYM;
3455       break;
3456     case OP_OPEN_EQUIV_CLASS:
3457       elem->type = EQUIV_CLASS;
3458       break;
3459     case OP_OPEN_CHAR_CLASS:
3460       elem->type = CHAR_CLASS;
3461       break;
3462     default:
3463       break;
3464     }
3465   return REG_NOERROR;
3466 }
3467
3468   /* Helper function for parse_bracket_exp.
3469      Build the equivalence class which is represented by NAME.
3470      The result are written to MBCSET and SBCSET.
3471      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3472      is a pointer argument sinse we may update it.  */
3473
3474 static reg_errcode_t
3475 #ifdef RE_ENABLE_I18N
3476 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3477                    int *equiv_class_alloc, const unsigned char *name)
3478 #else /* not RE_ENABLE_I18N */
3479 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3480 #endif /* not RE_ENABLE_I18N */
3481 {
3482 #ifdef _LIBC
3483   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3484   if (nrules != 0)
3485     {
3486       const int32_t *table, *indirect;
3487       const unsigned char *weights, *extra, *cp;
3488       unsigned char char_buf[2];
3489       int32_t idx1, idx2;
3490       unsigned int ch;
3491       size_t len;
3492       /* This #include defines a local function!  */
3493 # include <locale/weight.h>
3494       /* Calculate the index for equivalence class.  */
3495       cp = name;
3496       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3497       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3498                                                _NL_COLLATE_WEIGHTMB);
3499       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3500                                                    _NL_COLLATE_EXTRAMB);
3501       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3502                                                 _NL_COLLATE_INDIRECTMB);
3503       idx1 = findidx (&cp, -1);
3504       if (BE (idx1 == 0 || *cp != '\0', 0))
3505         /* This isn't a valid character.  */
3506         return REG_ECOLLATE;
3507
3508       /* Build single byte matcing table for this equivalence class.  */
3509       char_buf[1] = (unsigned char) '\0';
3510       len = weights[idx1 & 0xffffff];
3511       for (ch = 0; ch < SBC_MAX; ++ch)
3512         {
3513           char_buf[0] = ch;
3514           cp = char_buf;
3515           idx2 = findidx (&cp, 1);
3516 /*
3517           idx2 = table[ch];
3518 */
3519           if (idx2 == 0)
3520             /* This isn't a valid character.  */
3521             continue;
3522           /* Compare only if the length matches and the collation rule
3523              index is the same.  */
3524           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3525             {
3526               int cnt = 0;
3527
3528               while (cnt <= len &&
3529                      weights[(idx1 & 0xffffff) + 1 + cnt]
3530                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3531                 ++cnt;
3532
3533               if (cnt > len)
3534                 bitset_set (sbcset, ch);
3535             }
3536         }
3537       /* Check whether the array has enough space.  */
3538       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3539         {
3540           /* Not enough, realloc it.  */
3541           /* +1 in case of mbcset->nequiv_classes is 0.  */
3542           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3543           /* Use realloc since the array is NULL if *alloc == 0.  */
3544           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3545                                                    int32_t,
3546                                                    new_equiv_class_alloc);
3547           if (BE (new_equiv_classes == NULL, 0))
3548             return REG_ESPACE;
3549           mbcset->equiv_classes = new_equiv_classes;
3550           *equiv_class_alloc = new_equiv_class_alloc;
3551         }
3552       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3553     }
3554   else
3555 #endif /* _LIBC */
3556     {
3557       if (BE (strlen ((const char *) name) != 1, 0))
3558         return REG_ECOLLATE;
3559       bitset_set (sbcset, *name);
3560     }
3561   return REG_NOERROR;
3562 }
3563
3564   /* Helper function for parse_bracket_exp.
3565      Build the character class which is represented by NAME.
3566      The result are written to MBCSET and SBCSET.
3567      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3568      is a pointer argument sinse we may update it.  */
3569
3570 static reg_errcode_t
3571 #ifdef RE_ENABLE_I18N
3572 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3573                  re_charset_t *mbcset, int *char_class_alloc,
3574                  const char *class_name, reg_syntax_t syntax)
3575 #else /* not RE_ENABLE_I18N */
3576 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3577                  const char *class_name, reg_syntax_t syntax)
3578 #endif /* not RE_ENABLE_I18N */
3579 {
3580   int i;
3581
3582   /* In case of REG_ICASE "upper" and "lower" match the both of
3583      upper and lower cases.  */
3584   if ((syntax & RE_ICASE)
3585       && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3586     class_name = "alpha";
3587
3588 #ifdef RE_ENABLE_I18N
3589   /* Check the space of the arrays.  */
3590   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3591     {
3592       /* Not enough, realloc it.  */
3593       /* +1 in case of mbcset->nchar_classes is 0.  */
3594       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3595       /* Use realloc since array is NULL if *alloc == 0.  */
3596       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3597                                                new_char_class_alloc);
3598       if (BE (new_char_classes == NULL, 0))
3599         return REG_ESPACE;
3600       mbcset->char_classes = new_char_classes;
3601       *char_class_alloc = new_char_class_alloc;
3602     }
3603   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3604 #endif /* RE_ENABLE_I18N */
3605
3606 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3607   do {                                          \
3608     if (BE (trans != NULL, 0))                  \
3609       {                                         \
3610         for (i = 0; i < SBC_MAX; ++i)           \
3611           if (ctype_func (i))                   \
3612             bitset_set (sbcset, trans[i]);      \
3613       }                                         \
3614     else                                        \
3615       {                                         \
3616         for (i = 0; i < SBC_MAX; ++i)           \
3617           if (ctype_func (i))                   \
3618             bitset_set (sbcset, i);             \
3619       }                                         \
3620   } while (0)
3621
3622   if (strcmp (class_name, "alnum") == 0)
3623     BUILD_CHARCLASS_LOOP (isalnum);
3624   else if (strcmp (class_name, "cntrl") == 0)
3625     BUILD_CHARCLASS_LOOP (iscntrl);
3626   else if (strcmp (class_name, "lower") == 0)
3627     BUILD_CHARCLASS_LOOP (islower);
3628   else if (strcmp (class_name, "space") == 0)
3629     BUILD_CHARCLASS_LOOP (isspace);
3630   else if (strcmp (class_name, "alpha") == 0)
3631     BUILD_CHARCLASS_LOOP (isalpha);
3632   else if (strcmp (class_name, "digit") == 0)
3633     BUILD_CHARCLASS_LOOP (isdigit);
3634   else if (strcmp (class_name, "print") == 0)
3635     BUILD_CHARCLASS_LOOP (isprint);
3636   else if (strcmp (class_name, "upper") == 0)
3637     BUILD_CHARCLASS_LOOP (isupper);
3638   else if (strcmp (class_name, "blank") == 0)
3639 #ifndef GAWK
3640     BUILD_CHARCLASS_LOOP (isblank);
3641 #else
3642     /* see comments above */
3643     BUILD_CHARCLASS_LOOP (is_blank);
3644 #endif
3645   else if (strcmp (class_name, "graph") == 0)
3646     BUILD_CHARCLASS_LOOP (isgraph);
3647   else if (strcmp (class_name, "punct") == 0)
3648     BUILD_CHARCLASS_LOOP (ispunct);
3649   else if (strcmp (class_name, "xdigit") == 0)
3650     BUILD_CHARCLASS_LOOP (isxdigit);
3651   else
3652     return REG_ECTYPE;
3653
3654   return REG_NOERROR;
3655 }
3656
3657 static bin_tree_t *
3658 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3659                     const char *class_name,
3660                     const char *extra, int non_match,
3661                     reg_errcode_t *err)
3662 {
3663   re_bitset_ptr_t sbcset;
3664 #ifdef RE_ENABLE_I18N
3665   re_charset_t *mbcset;
3666   int alloc = 0;
3667 #endif /* not RE_ENABLE_I18N */
3668   reg_errcode_t ret;
3669   re_token_t br_token;
3670   bin_tree_t *tree;
3671
3672   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3673 #ifdef RE_ENABLE_I18N
3674   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3675 #endif /* RE_ENABLE_I18N */
3676
3677 #ifdef RE_ENABLE_I18N
3678   if (BE (sbcset == NULL || mbcset == NULL, 0))
3679 #else /* not RE_ENABLE_I18N */
3680   if (BE (sbcset == NULL, 0))
3681 #endif /* not RE_ENABLE_I18N */
3682     {
3683       /* if one is not NULL, free it to avoid leaks */
3684       if (sbcset != NULL)
3685          free(sbcset);
3686 #ifdef RE_ENABLE_I18N
3687       if (mbcset != NULL)
3688          free(mbcset);
3689 #endif
3690       *err = REG_ESPACE;
3691       return NULL;
3692     }
3693
3694   if (non_match)
3695     {
3696 #ifdef RE_ENABLE_I18N
3697       mbcset->non_match = 1;
3698 #endif /* not RE_ENABLE_I18N */
3699     }
3700
3701   /* We don't care the syntax in this case.  */
3702   ret = build_charclass (trans, sbcset,
3703 #ifdef RE_ENABLE_I18N
3704                          mbcset, &alloc,
3705 #endif /* RE_ENABLE_I18N */
3706                          class_name, 0);
3707
3708   if (BE (ret != REG_NOERROR, 0))
3709     {
3710       re_free (sbcset);
3711 #ifdef RE_ENABLE_I18N
3712       free_charset (mbcset);
3713 #endif /* RE_ENABLE_I18N */
3714       *err = ret;
3715       return NULL;
3716     }
3717   /* \w match '_' also.  */
3718   for (; *extra; extra++)
3719     bitset_set (sbcset, *extra);
3720
3721   /* If it is non-matching list.  */
3722   if (non_match)
3723     bitset_not (sbcset);
3724
3725 #ifdef RE_ENABLE_I18N
3726   /* Ensure only single byte characters are set.  */
3727   if (dfa->mb_cur_max > 1)
3728     bitset_mask (sbcset, dfa->sb_char);
3729 #endif
3730
3731   /* Build a tree for simple bracket.  */
3732   memset(& br_token, 0, sizeof(br_token));      /* silence "not initialized" errors froms static checkers */
3733   br_token.type = SIMPLE_BRACKET;
3734   br_token.opr.sbcset = sbcset;
3735   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3736   if (BE (tree == NULL, 0))
3737     goto build_word_op_espace;
3738
3739 #ifdef RE_ENABLE_I18N
3740   if (dfa->mb_cur_max > 1)
3741     {
3742       bin_tree_t *mbc_tree;
3743       /* Build a tree for complex bracket.  */
3744       br_token.type = COMPLEX_BRACKET;
3745       br_token.opr.mbcset = mbcset;
3746       dfa->has_mb_node = 1;
3747       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3748       if (BE (mbc_tree == NULL, 0))
3749         goto build_word_op_espace;
3750       /* Then join them by ALT node.  */
3751       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3752       if (BE (mbc_tree != NULL, 1))
3753         return tree;
3754     }
3755   else
3756     {
3757       free_charset (mbcset);
3758       return tree;
3759     }
3760 #else /* not RE_ENABLE_I18N */
3761   return tree;
3762 #endif /* not RE_ENABLE_I18N */
3763
3764  build_word_op_espace:
3765   re_free (sbcset);
3766 #ifdef RE_ENABLE_I18N
3767   free_charset (mbcset);
3768 #endif /* RE_ENABLE_I18N */
3769   *err = REG_ESPACE;
3770   return NULL;
3771 }
3772
3773 /* This is intended for the expressions like "a{1,3}".
3774    Fetch a number from `input', and return the number.
3775    Return -1, if the number field is empty like "{,1}".
3776    Return -2, If an error is occured.  */
3777
3778 static int
3779 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3780 {
3781   int num = -1;
3782   unsigned char c;
3783   while (1)
3784     {
3785       fetch_token (token, input, syntax);
3786       c = token->opr.c;
3787       if (BE (token->type == END_OF_RE, 0))
3788         return -2;
3789       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3790         break;
3791       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3792              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3793       num = (num > RE_DUP_MAX) ? -2 : num;
3794     }
3795   return num;
3796 }
3797 \f
3798 #ifdef RE_ENABLE_I18N
3799 static void
3800 free_charset (re_charset_t *cset)
3801 {
3802   re_free (cset->mbchars);
3803 # ifdef _LIBC
3804   re_free (cset->coll_syms);
3805   re_free (cset->equiv_classes);
3806   re_free (cset->range_starts);
3807   re_free (cset->range_ends);
3808 # endif
3809   re_free (cset->char_classes);
3810   re_free (cset);
3811 }
3812 #endif /* RE_ENABLE_I18N */
3813 \f
3814 /* Functions for binary tree operation.  */
3815
3816 /* Create a tree node.  */
3817
3818 static bin_tree_t *
3819 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3820              re_token_type_t type)
3821 {
3822   re_token_t t;
3823   memset(& t, 0, sizeof(t));    /* silence "not initialized" errors froms static checkers */
3824   t.type = type;
3825   return create_token_tree (dfa, left, right, &t);
3826 }
3827
3828 static bin_tree_t *
3829 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3830                    const re_token_t *token)
3831 {
3832   bin_tree_t *tree;
3833   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3834     {
3835       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3836
3837       if (storage == NULL)
3838         return NULL;
3839       storage->next = dfa->str_tree_storage;
3840       dfa->str_tree_storage = storage;
3841       dfa->str_tree_storage_idx = 0;
3842     }
3843   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3844
3845   tree->parent = NULL;
3846   tree->left = left;
3847   tree->right = right;
3848   tree->token = *token;
3849   tree->token.duplicated = 0;
3850   tree->token.opt_subexp = 0;
3851   tree->first = NULL;
3852   tree->next = NULL;
3853   tree->node_idx = -1;
3854
3855   if (left != NULL)
3856     left->parent = tree;
3857   if (right != NULL)
3858     right->parent = tree;
3859   return tree;
3860 }
3861
3862 /* Mark the tree SRC as an optional subexpression.
3863    To be called from preorder or postorder.  */
3864
3865 static reg_errcode_t
3866 mark_opt_subexp (void *extra, bin_tree_t *node)
3867 {
3868   int idx = (int) (long) extra;
3869   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3870     node->token.opt_subexp = 1;
3871
3872   return REG_NOERROR;
3873 }
3874
3875 /* Free the allocated memory inside NODE. */
3876
3877 static void
3878 free_token (re_token_t *node)
3879 {
3880 #ifdef RE_ENABLE_I18N
3881   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3882     free_charset (node->opr.mbcset);
3883   else
3884 #endif /* RE_ENABLE_I18N */
3885     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3886       re_free (node->opr.sbcset);
3887 }
3888
3889 /* Worker function for tree walking.  Free the allocated memory inside NODE
3890    and its children. */
3891
3892 static reg_errcode_t
3893 free_tree (void *extra, bin_tree_t *node)
3894 {
3895   free_token (&node->token);
3896   return REG_NOERROR;
3897 }
3898
3899
3900 /* Duplicate the node SRC, and return new node.  This is a preorder
3901    visit similar to the one implemented by the generic visitor, but
3902    we need more infrastructure to maintain two parallel trees --- so,
3903    it's easier to duplicate.  */
3904
3905 static bin_tree_t *
3906 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3907 {
3908   const bin_tree_t *node;
3909   bin_tree_t *dup_root;
3910   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3911
3912   for (node = root; ; )
3913     {
3914       /* Create a new tree and link it back to the current parent.  */
3915       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3916       if (*p_new == NULL)
3917         return NULL;
3918       (*p_new)->parent = dup_node;
3919       (*p_new)->token.duplicated = 1;
3920       dup_node = *p_new;
3921
3922       /* Go to the left node, or up and to the right.  */
3923       if (node->left)
3924         {
3925           node = node->left;
3926           p_new = &dup_node->left;
3927         }
3928       else
3929         {
3930           const bin_tree_t *prev = NULL;
3931           while (node->right == prev || node->right == NULL)
3932             {
3933               prev = node;
3934               node = node->parent;
3935               dup_node = dup_node->parent;
3936               if (!node)
3937                 return dup_root;
3938             }
3939           node = node->right;
3940           p_new = &dup_node->right;
3941         }
3942     }
3943 }