lib/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2021 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU General Public
   8    License as published by the Free Software Foundation; either
   9    version 3 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 #ifdef _LIBC
  21 # include <locale/weight.h>
  22 #endif
  23
  24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  25                                           size_t length, reg_syntax_t syntax);
  26 static void re_compile_fastmap_iter (regex_t *bufp,
  27                                      const re_dfastate_t *init_state,
  28                                      char *fastmap);
  29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  30 #ifdef RE_ENABLE_I18N
  31 static void free_charset (re_charset_t *cset);
  32 #endif /* RE_ENABLE_I18N */
  33 static void free_workarea_compile (regex_t *preg);
  34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  35 #ifdef RE_ENABLE_I18N
  36 static void optimize_utf8 (re_dfa_t *dfa);
  37 #endif
  38 static reg_errcode_t analyze (regex_t *preg);
  39 static reg_errcode_t preorder (bin_tree_t *root,
  40                                reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                void *extra);
  42 static reg_errcode_t postorder (bin_tree_t *root,
  43                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  44                                 void *extra);
  45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  48                                  bin_tree_t *node);
  49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  54                                    unsigned int constraint);
  55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  57                                          Idx node, bool root);
  58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  59 static Idx fetch_number (re_string_t *input, re_token_t *token,
  60                          reg_syntax_t syntax);
  61 static int peek_token (re_token_t *token, re_string_t *input,
  62                         reg_syntax_t syntax);
  63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  64                           reg_syntax_t syntax, reg_errcode_t *err);
  65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  66                                   re_token_t *token, reg_syntax_t syntax,
  67                                   Idx nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  69                                  re_token_t *token, reg_syntax_t syntax,
  70                                  Idx nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  72                                      re_token_t *token, reg_syntax_t syntax,
  73                                      Idx nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  75                                   re_token_t *token, reg_syntax_t syntax,
  76                                   Idx nest, reg_errcode_t *err);
  77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  78                                  re_dfa_t *dfa, re_token_t *token,
  79                                  reg_syntax_t syntax, reg_errcode_t *err);
  80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  81                                       re_token_t *token, reg_syntax_t syntax,
  82                                       reg_errcode_t *err);
  83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  84                                             re_string_t *regexp,
  85                                             re_token_t *token, int token_len,
  86                                             re_dfa_t *dfa,
  87                                             reg_syntax_t syntax,
  88                                             bool accept_hyphen);
  89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  90                                           re_string_t *regexp,
  91                                           re_token_t *token);
  92 #ifdef RE_ENABLE_I18N
  93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  94                                         re_charset_t *mbcset,
  95                                         Idx *equiv_class_alloc,
  96                                         const unsigned char *name);
  97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  98                                       bitset_t sbcset,
  99                                       re_charset_t *mbcset,
 100                                       Idx *char_class_alloc,
 101                                       const char *class_name,
 102                                       reg_syntax_t syntax);
 103 #else  /* not RE_ENABLE_I18N */
 104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 105                                         const unsigned char *name);
 106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 107                                       bitset_t sbcset,
 108                                       const char *class_name,
 109                                       reg_syntax_t syntax);
 110 #endif /* not RE_ENABLE_I18N */
 111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 112                                        RE_TRANSLATE_TYPE trans,
 113                                        const char *class_name,
 114                                        const char *extra,
 115                                        bool non_match, reg_errcode_t *err);
 116 static bin_tree_t *create_tree (re_dfa_t *dfa,
 117                                 bin_tree_t *left, bin_tree_t *right,
 118                                 re_token_type_t type);
 119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 120                                       bin_tree_t *left, bin_tree_t *right,
 121                                       const re_token_t *token);
 122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 123 static void free_token (re_token_t *node);
 124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 126 \f
 127 /* This table gives an error message for each of the error codes listed
 128    in regex.h.  Obviously the order here has to be same as there.
 129    POSIX doesn't require that we do anything for REG_NOERROR,
 130    but why not be nice?  */
 131
 132 static const char __re_error_msgid[] =
 133   {
 134 #define REG_NOERROR_IDX 0
 135     gettext_noop ("Success")    /* REG_NOERROR */
 136     "\0"
 137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 138     gettext_noop ("No match")   /* REG_NOMATCH */
 139     "\0"
 140 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 141     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 142     "\0"
 143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 144     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 145     "\0"
 146 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 147     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 148     "\0"
 149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 150     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 151     "\0"
 152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 153     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 154     "\0"
 155 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 156     gettext_noop ("Unmatched [, [^, [:, [., or [=")     /* REG_EBRACK */
 157     "\0"
 158 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
 159     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 160     "\0"
 161 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 162     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 163     "\0"
 164 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 165     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 166     "\0"
 167 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 168     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 169     "\0"
 170 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 171     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 172     "\0"
 173 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 174     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 175     "\0"
 176 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 177     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 178     "\0"
 179 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 180     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 181     "\0"
 182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 183     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 184   };
 185
 186 static const size_t __re_error_msgid_idx[] =
 187   {
 188     REG_NOERROR_IDX,
 189     REG_NOMATCH_IDX,
 190     REG_BADPAT_IDX,
 191     REG_ECOLLATE_IDX,
 192     REG_ECTYPE_IDX,
 193     REG_EESCAPE_IDX,
 194     REG_ESUBREG_IDX,
 195     REG_EBRACK_IDX,
 196     REG_EPAREN_IDX,
 197     REG_EBRACE_IDX,
 198     REG_BADBR_IDX,
 199     REG_ERANGE_IDX,
 200     REG_ESPACE_IDX,
 201     REG_BADRPT_IDX,
 202     REG_EEND_IDX,
 203     REG_ESIZE_IDX,
 204     REG_ERPAREN_IDX
 205   };
 206 \f
 207 /* Entry points for GNU code.  */
 208
 209 /* re_compile_pattern is the GNU regular expression compiler: it
 210    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 211    Returns 0 if the pattern was valid, otherwise an error string.
 212
 213    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 214    are set in BUFP on entry.  */
 215
 216 const char *
 217 re_compile_pattern (const char *pattern, size_t length,
 218                     struct re_pattern_buffer *bufp)
 219 {
 220   reg_errcode_t ret;
 221
 222   /* And GNU code determines whether or not to get register information
 223      by passing null for the REGS argument to re_match, etc., not by
 224      setting no_sub, unless RE_NO_SUB is set.  */
 225   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 226
 227   /* Match anchors at newline.  */
 228   bufp->newline_anchor = 1;
 229
 230   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 231
 232   if (!ret)
 233     return NULL;
 234   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 235 }
 236 weak_alias (__re_compile_pattern, re_compile_pattern)
 237
 238 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 239    also be assigned to arbitrarily: each pattern buffer stores its own
 240    syntax, so it can be changed between regex compilations.  */
 241 /* This has no initializer because initialized variables in Emacs
 242    become read-only after dumping.  */
 243 reg_syntax_t re_syntax_options;
 244
 245
 246 /* Specify the precise syntax of regexps for compilation.  This provides
 247    for compatibility for various utilities which historically have
 248    different, incompatible syntaxes.
 249
 250    The argument SYNTAX is a bit mask comprised of the various bits
 251    defined in regex.h.  We return the old syntax.  */
 252
 253 reg_syntax_t
 254 re_set_syntax (reg_syntax_t syntax)
 255 {
 256   reg_syntax_t ret = re_syntax_options;
 257
 258   re_syntax_options = syntax;
 259   return ret;
 260 }
 261 weak_alias (__re_set_syntax, re_set_syntax)
 262
 263 int
 264 re_compile_fastmap (struct re_pattern_buffer *bufp)
 265 {
 266   re_dfa_t *dfa = bufp->buffer;
 267   char *fastmap = bufp->fastmap;
 268
 269   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 270   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 271   if (dfa->init_state != dfa->init_state_word)
 272     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 273   if (dfa->init_state != dfa->init_state_nl)
 274     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 275   if (dfa->init_state != dfa->init_state_begbuf)
 276     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 277   bufp->fastmap_accurate = 1;
 278   return 0;
 279 }
 280 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 281
 282 static inline void
 283 __attribute__ ((always_inline))
 284 re_set_fastmap (char *fastmap, bool icase, int ch)
 285 {
 286   fastmap[ch] = 1;
 287   if (icase)
 288     fastmap[tolower (ch)] = 1;
 289 }
 290
 291 /* Helper function for re_compile_fastmap.
 292    Compile fastmap for the initial_state INIT_STATE.  */
 293
 294 static void
 295 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 296                          char *fastmap)
 297 {
 298   re_dfa_t *dfa = bufp->buffer;
 299   Idx node_cnt;
 300   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 301   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 302     {
 303       Idx node = init_state->nodes.elems[node_cnt];
 304       re_token_type_t type = dfa->nodes[node].type;
 305
 306       if (type == CHARACTER)
 307         {
 308           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 309 #ifdef RE_ENABLE_I18N
 310           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 311             {
 312               unsigned char buf[MB_LEN_MAX];
 313               unsigned char *p;
 314               wchar_t wc;
 315               mbstate_t state;
 316
 317               p = buf;
 318               *p++ = dfa->nodes[node].opr.c;
 319               while (++node < dfa->nodes_len
 320                      && dfa->nodes[node].type == CHARACTER
 321                      && dfa->nodes[node].mb_partial)
 322                 *p++ = dfa->nodes[node].opr.c;
 323               memset (&state, '\0', sizeof (state));
 324               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 325                              &state) == p - buf
 326                   && (__wcrtomb ((char *) buf, __towlower (wc), &state)
 327                       != (size_t) -1))
 328                 re_set_fastmap (fastmap, false, buf[0]);
 329             }
 330 #endif
 331         }
 332       else if (type == SIMPLE_BRACKET)
 333         {
 334           int i, ch;
 335           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 336             {
 337               int j;
 338               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 339               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 340                 if (w & ((bitset_word_t) 1 << j))
 341                   re_set_fastmap (fastmap, icase, ch);
 342             }
 343         }
 344 #ifdef RE_ENABLE_I18N
 345       else if (type == COMPLEX_BRACKET)
 346         {
 347           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 348           Idx i;
 349
 350 # ifdef _LIBC
 351           /* See if we have to try all bytes which start multiple collation
 352              elements.
 353              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 354                   collation element, and don't catch 'b' since 'b' is
 355                   the only collation element which starts from 'b' (and
 356                   it is caught by SIMPLE_BRACKET).  */
 357               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 358                   && (cset->ncoll_syms || cset->nranges))
 359                 {
 360                   const int32_t *table = (const int32_t *)
 361                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 362                   for (i = 0; i < SBC_MAX; ++i)
 363                     if (table[i] < 0)
 364                       re_set_fastmap (fastmap, icase, i);
 365                 }
 366 # endif /* _LIBC */
 367
 368           /* See if we have to start the match at all multibyte characters,
 369              i.e. where we would not find an invalid sequence.  This only
 370              applies to multibyte character sets; for single byte character
 371              sets, the SIMPLE_BRACKET again suffices.  */
 372           if (dfa->mb_cur_max > 1
 373               && (cset->nchar_classes || cset->non_match || cset->nranges
 374 # ifdef _LIBC
 375                   || cset->nequiv_classes
 376 # endif /* _LIBC */
 377                  ))
 378             {
 379               unsigned char c = 0;
 380               do
 381                 {
 382                   mbstate_t mbs;
 383                   memset (&mbs, 0, sizeof (mbs));
 384                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 385                     re_set_fastmap (fastmap, false, (int) c);
 386                 }
 387               while (++c != 0);
 388             }
 389
 390           else
 391             {
 392               /* ... Else catch all bytes which can start the mbchars.  */
 393               for (i = 0; i < cset->nmbchars; ++i)
 394                 {
 395                   char buf[256];
 396                   mbstate_t state;
 397                   memset (&state, '\0', sizeof (state));
 398                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 399                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 400                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 401                     {
 402                       if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
 403                           != (size_t) -1)
 404                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 405                     }
 406                 }
 407             }
 408         }
 409 #endif /* RE_ENABLE_I18N */
 410       else if (type == OP_PERIOD
 411 #ifdef RE_ENABLE_I18N
 412                || type == OP_UTF8_PERIOD
 413 #endif /* RE_ENABLE_I18N */
 414                || type == END_OF_RE)
 415         {
 416           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 417           if (type == END_OF_RE)
 418             bufp->can_be_null = 1;
 419           return;
 420         }
 421     }
 422 }
 423 \f
 424 /* Entry point for POSIX code.  */
 425 /* regcomp takes a regular expression as a string and compiles it.
 426
 427    PREG is a regex_t *.  We do not expect any fields to be initialized,
 428    since POSIX says we shouldn't.  Thus, we set
 429
 430      'buffer' to the compiled pattern;
 431      'used' to the length of the compiled pattern;
 432      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 433        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 434        RE_SYNTAX_POSIX_BASIC;
 435      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 436      'fastmap' to an allocated space for the fastmap;
 437      'fastmap_accurate' to zero;
 438      're_nsub' to the number of subexpressions in PATTERN.
 439
 440    PATTERN is the address of the pattern string.
 441
 442    CFLAGS is a series of bits which affect compilation.
 443
 444      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 445      use POSIX basic syntax.
 446
 447      If REG_NEWLINE is set, then . and [^...] don't match newline.
 448      Also, regexec will try a match beginning after every newline.
 449
 450      If REG_ICASE is set, then we considers upper- and lowercase
 451      versions of letters to be equivalent when matching.
 452
 453      If REG_NOSUB is set, then when PREG is passed to regexec, that
 454      routine will report only success or failure, and nothing about the
 455      registers.
 456
 457    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 458    the return codes and their meanings.)  */
 459
 460 int
 461 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
 462 {
 463   reg_errcode_t ret;
 464   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 465                          : RE_SYNTAX_POSIX_BASIC);
 466
 467   preg->buffer = NULL;
 468   preg->allocated = 0;
 469   preg->used = 0;
 470
 471   /* Try to allocate space for the fastmap.  */
 472   preg->fastmap = re_malloc (char, SBC_MAX);
 473   if (__glibc_unlikely (preg->fastmap == NULL))
 474     return REG_ESPACE;
 475
 476   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 477
 478   /* If REG_NEWLINE is set, newlines are treated differently.  */
 479   if (cflags & REG_NEWLINE)
 480     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 481       syntax &= ~RE_DOT_NEWLINE;
 482       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 483       /* It also changes the matching behavior.  */
 484       preg->newline_anchor = 1;
 485     }
 486   else
 487     preg->newline_anchor = 0;
 488   preg->no_sub = !!(cflags & REG_NOSUB);
 489   preg->translate = NULL;
 490
 491   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 492
 493   /* POSIX doesn't distinguish between an unmatched open-group and an
 494      unmatched close-group: both are REG_EPAREN.  */
 495   if (ret == REG_ERPAREN)
 496     ret = REG_EPAREN;
 497
 498   /* We have already checked preg->fastmap != NULL.  */
 499   if (__glibc_likely (ret == REG_NOERROR))
 500     /* Compute the fastmap now, since regexec cannot modify the pattern
 501        buffer.  This function never fails in this implementation.  */
 502     (void) re_compile_fastmap (preg);
 503   else
 504     {
 505       /* Some error occurred while compiling the expression.  */
 506       re_free (preg->fastmap);
 507       preg->fastmap = NULL;
 508     }
 509
 510   return (int) ret;
 511 }
 512 libc_hidden_def (__regcomp)
 513 weak_alias (__regcomp, regcomp)
 514
 515 /* Returns a message corresponding to an error code, ERRCODE, returned
 516    from either regcomp or regexec.   We don't use PREG here.  */
 517
 518 size_t
 519 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
 520           size_t errbuf_size)
 521 {
 522   const char *msg;
 523   size_t msg_size;
 524   int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
 525
 526   if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
 527     /* Only error codes returned by the rest of the code should be passed
 528        to this routine.  If we are given anything else, or if other regex
 529        code generates an invalid error code, then the program has a bug.
 530        Dump core so we can fix it.  */
 531     abort ();
 532
 533   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 534
 535   msg_size = strlen (msg) + 1; /* Includes the null.  */
 536
 537   if (__glibc_likely (errbuf_size != 0))
 538     {
 539       size_t cpy_size = msg_size;
 540       if (__glibc_unlikely (msg_size > errbuf_size))
 541         {
 542           cpy_size = errbuf_size - 1;
 543           errbuf[cpy_size] = '\0';
 544         }
 545       memcpy (errbuf, msg, cpy_size);
 546     }
 547
 548   return msg_size;
 549 }
 550 weak_alias (__regerror, regerror)
 551
 552
 553 #ifdef RE_ENABLE_I18N
 554 /* This static array is used for the map to single-byte characters when
 555    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 556    it the same all the time.  UTF-8 is the preferred encoding so this is
 557    a worthwhile optimization.  */
 558 static const bitset_t utf8_sb_map =
 559 {
 560   /* Set the first 128 bits.  */
 561 # if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
 562   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 563 # else
 564 #  if 4 * BITSET_WORD_BITS < ASCII_CHARS
 565 #   error "bitset_word_t is narrower than 32 bits"
 566 #  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 567   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 568 #  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 569   BITSET_WORD_MAX, BITSET_WORD_MAX,
 570 #  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 571   BITSET_WORD_MAX,
 572 #  endif
 573   (BITSET_WORD_MAX
 574    >> (SBC_MAX % BITSET_WORD_BITS == 0
 575        ? 0
 576        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 577 # endif
 578 };
 579 #endif
 580
 581
 582 static void
 583 free_dfa_content (re_dfa_t *dfa)
 584 {
 585   Idx i, j;
 586
 587   if (dfa->nodes)
 588     for (i = 0; i < dfa->nodes_len; ++i)
 589       free_token (dfa->nodes + i);
 590   re_free (dfa->nexts);
 591   for (i = 0; i < dfa->nodes_len; ++i)
 592     {
 593       if (dfa->eclosures != NULL)
 594         re_node_set_free (dfa->eclosures + i);
 595       if (dfa->inveclosures != NULL)
 596         re_node_set_free (dfa->inveclosures + i);
 597       if (dfa->edests != NULL)
 598         re_node_set_free (dfa->edests + i);
 599     }
 600   re_free (dfa->edests);
 601   re_free (dfa->eclosures);
 602   re_free (dfa->inveclosures);
 603   re_free (dfa->nodes);
 604
 605   if (dfa->state_table)
 606     for (i = 0; i <= dfa->state_hash_mask; ++i)
 607       {
 608         struct re_state_table_entry *entry = dfa->state_table + i;
 609         for (j = 0; j < entry->num; ++j)
 610           {
 611             re_dfastate_t *state = entry->array[j];
 612             free_state (state);
 613           }
 614         re_free (entry->array);
 615       }
 616   re_free (dfa->state_table);
 617 #ifdef RE_ENABLE_I18N
 618   if (dfa->sb_char != utf8_sb_map)
 619     re_free (dfa->sb_char);
 620 #endif
 621   re_free (dfa->subexp_map);
 622 #ifdef DEBUG
 623   re_free (dfa->re_str);
 624 #endif
 625
 626   re_free (dfa);
 627 }
 628
 629
 630 /* Free dynamically allocated space used by PREG.  */
 631
 632 void
 633 regfree (regex_t *preg)
 634 {
 635   re_dfa_t *dfa = preg->buffer;
 636   if (__glibc_likely (dfa != NULL))
 637     {
 638       lock_fini (dfa->lock);
 639       free_dfa_content (dfa);
 640     }
 641   preg->buffer = NULL;
 642   preg->allocated = 0;
 643
 644   re_free (preg->fastmap);
 645   preg->fastmap = NULL;
 646
 647   re_free (preg->translate);
 648   preg->translate = NULL;
 649 }
 650 libc_hidden_def (__regfree)
 651 weak_alias (__regfree, regfree)
 652 \f
 653 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 654    them unless specifically requested.  */
 655
 656 #if defined _REGEX_RE_COMP || defined _LIBC
 657
 658 /* BSD has one and only one pattern buffer.  */
 659 static struct re_pattern_buffer re_comp_buf;
 660
 661 char *
 662 # ifdef _LIBC
 663 /* Make these definitions weak in libc, so POSIX programs can redefine
 664    these names if they don't use our functions, and still use
 665    regcomp/regexec above without link errors.  */
 666 weak_function
 667 # endif
 668 re_comp (const char *s)
 669 {
 670   reg_errcode_t ret;
 671   char *fastmap;
 672
 673   if (!s)
 674     {
 675       if (!re_comp_buf.buffer)
 676         return gettext ("No previous regular expression");
 677       return 0;
 678     }
 679
 680   if (re_comp_buf.buffer)
 681     {
 682       fastmap = re_comp_buf.fastmap;
 683       re_comp_buf.fastmap = NULL;
 684       __regfree (&re_comp_buf);
 685       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 686       re_comp_buf.fastmap = fastmap;
 687     }
 688
 689   if (re_comp_buf.fastmap == NULL)
 690     {
 691       re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
 692       if (re_comp_buf.fastmap == NULL)
 693         return (char *) gettext (__re_error_msgid
 694                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 695     }
 696
 697   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 698      don't need to initialize the pattern buffer fields which affect it.  */
 699
 700   /* Match anchors at newlines.  */
 701   re_comp_buf.newline_anchor = 1;
 702
 703   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 704
 705   if (!ret)
 706     return NULL;
 707
 708   /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
 709   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 710 }
 711
 712 #ifdef _LIBC
 713 libc_freeres_fn (free_mem)
 714 {
 715   __regfree (&re_comp_buf);
 716 }
 717 #endif
 718
 719 #endif /* _REGEX_RE_COMP */
 720 \f
 721 /* Internal entry point.
 722    Compile the regular expression PATTERN, whose length is LENGTH.
 723    SYNTAX indicate regular expression's syntax.  */
 724
 725 static reg_errcode_t
 726 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 727                      reg_syntax_t syntax)
 728 {
 729   reg_errcode_t err = REG_NOERROR;
 730   re_dfa_t *dfa;
 731   re_string_t regexp;
 732
 733   /* Initialize the pattern buffer.  */
 734   preg->fastmap_accurate = 0;
 735   preg->syntax = syntax;
 736   preg->not_bol = preg->not_eol = 0;
 737   preg->used = 0;
 738   preg->re_nsub = 0;
 739   preg->can_be_null = 0;
 740   preg->regs_allocated = REGS_UNALLOCATED;
 741
 742   /* Initialize the dfa.  */
 743   dfa = preg->buffer;
 744   if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
 745     {
 746       /* If zero allocated, but buffer is non-null, try to realloc
 747          enough space.  This loses if buffer's address is bogus, but
 748          that is the user's responsibility.  If ->buffer is NULL this
 749          is a simple allocation.  */
 750       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 751       if (dfa == NULL)
 752         return REG_ESPACE;
 753       preg->allocated = sizeof (re_dfa_t);
 754       preg->buffer = dfa;
 755     }
 756   preg->used = sizeof (re_dfa_t);
 757
 758   err = init_dfa (dfa, length);
 759   if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
 760     err = REG_ESPACE;
 761   if (__glibc_unlikely (err != REG_NOERROR))
 762     {
 763       free_dfa_content (dfa);
 764       preg->buffer = NULL;
 765       preg->allocated = 0;
 766       return err;
 767     }
 768 #ifdef DEBUG
 769   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 770   dfa->re_str = re_malloc (char, length + 1);
 771   strncpy (dfa->re_str, pattern, length + 1);
 772 #endif
 773
 774   err = re_string_construct (&regexp, pattern, length, preg->translate,
 775                              (syntax & RE_ICASE) != 0, dfa);
 776   if (__glibc_unlikely (err != REG_NOERROR))
 777     {
 778     re_compile_internal_free_return:
 779       free_workarea_compile (preg);
 780       re_string_destruct (&regexp);
 781       lock_fini (dfa->lock);
 782       free_dfa_content (dfa);
 783       preg->buffer = NULL;
 784       preg->allocated = 0;
 785       return err;
 786     }
 787
 788   /* Parse the regular expression, and build a structure tree.  */
 789   preg->re_nsub = 0;
 790   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 791   if (__glibc_unlikely (dfa->str_tree == NULL))
 792     goto re_compile_internal_free_return;
 793
 794   /* Analyze the tree and create the nfa.  */
 795   err = analyze (preg);
 796   if (__glibc_unlikely (err != REG_NOERROR))
 797     goto re_compile_internal_free_return;
 798
 799 #ifdef RE_ENABLE_I18N
 800   /* If possible, do searching in single byte encoding to speed things up.  */
 801   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 802     optimize_utf8 (dfa);
 803 #endif
 804
 805   /* Then create the initial state of the dfa.  */
 806   err = create_initial_state (dfa);
 807
 808   /* Release work areas.  */
 809   free_workarea_compile (preg);
 810   re_string_destruct (&regexp);
 811
 812   if (__glibc_unlikely (err != REG_NOERROR))
 813     {
 814       lock_fini (dfa->lock);
 815       free_dfa_content (dfa);
 816       preg->buffer = NULL;
 817       preg->allocated = 0;
 818     }
 819
 820   return err;
 821 }
 822
 823 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 824    as the initial length of some arrays.  */
 825
 826 static reg_errcode_t
 827 init_dfa (re_dfa_t *dfa, size_t pat_len)
 828 {
 829   __re_size_t table_size;
 830 #ifndef _LIBC
 831   const char *codeset_name;
 832 #endif
 833 #ifdef RE_ENABLE_I18N
 834   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 835 #else
 836   size_t max_i18n_object_size = 0;
 837 #endif
 838   size_t max_object_size =
 839     MAX (sizeof (struct re_state_table_entry),
 840          MAX (sizeof (re_token_t),
 841               MAX (sizeof (re_node_set),
 842                    MAX (sizeof (regmatch_t),
 843                         max_i18n_object_size))));
 844
 845   memset (dfa, '\0', sizeof (re_dfa_t));
 846
 847   /* Force allocation of str_tree_storage the first time.  */
 848   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 849
 850   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 851      calculation below, and for similar doubling calculations
 852      elsewhere.  And it's <= rather than <, because some of the
 853      doubling calculations add 1 afterwards.  */
 854   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
 855                         <= pat_len))
 856     return REG_ESPACE;
 857
 858   dfa->nodes_alloc = pat_len + 1;
 859   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 860
 861   /*  table_size = 2 ^ ceil(log pat_len) */
 862   for (table_size = 1; ; table_size <<= 1)
 863     if (table_size > pat_len)
 864       break;
 865
 866   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 867   dfa->state_hash_mask = table_size - 1;
 868
 869   dfa->mb_cur_max = MB_CUR_MAX;
 870 #ifdef _LIBC
 871   if (dfa->mb_cur_max == 6
 872       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 873     dfa->is_utf8 = 1;
 874   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 875                        != 0);
 876 #else
 877   codeset_name = nl_langinfo (CODESET);
 878   if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
 879       && (codeset_name[1] == 'T' || codeset_name[1] == 't')
 880       && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
 881       && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
 882     dfa->is_utf8 = 1;
 883
 884   /* We check exhaustively in the loop below if this charset is a
 885      superset of ASCII.  */
 886   dfa->map_notascii = 0;
 887 #endif
 888
 889 #ifdef RE_ENABLE_I18N
 890   if (dfa->mb_cur_max > 1)
 891     {
 892       if (dfa->is_utf8)
 893         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 894       else
 895         {
 896           int i, j, ch;
 897
 898           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 899           if (__glibc_unlikely (dfa->sb_char == NULL))
 900             return REG_ESPACE;
 901
 902           /* Set the bits corresponding to single byte chars.  */
 903           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 904             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 905               {
 906                 wint_t wch = __btowc (ch);
 907                 if (wch != WEOF)
 908                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 909 # ifndef _LIBC
 910                 if (isascii (ch) && wch != ch)
 911                   dfa->map_notascii = 1;
 912 # endif
 913               }
 914         }
 915     }
 916 #endif
 917
 918   if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
 919     return REG_ESPACE;
 920   return REG_NOERROR;
 921 }
 922
 923 /* Initialize WORD_CHAR table, which indicate which character is
 924    "word".  In this case "word" means that it is the word construction
 925    character used by some operators like "\<", "\>", etc.  */
 926
 927 static void
 928 init_word_char (re_dfa_t *dfa)
 929 {
 930   int i = 0;
 931   int j;
 932   int ch = 0;
 933   dfa->word_ops_used = 1;
 934   if (__glibc_likely (dfa->map_notascii == 0))
 935     {
 936       /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
 937          them, an issue when this code is used in Gnulib.  */
 938       bitset_word_t bits0 = 0x00000000;
 939       bitset_word_t bits1 = 0x03ff0000;
 940       bitset_word_t bits2 = 0x87fffffe;
 941       bitset_word_t bits3 = 0x07fffffe;
 942       if (BITSET_WORD_BITS == 64)
 943         {
 944           /* Pacify gcc -Woverflow on 32-bit platformns.  */
 945           dfa->word_char[0] = bits1 << 31 << 1 | bits0;
 946           dfa->word_char[1] = bits3 << 31 << 1 | bits2;
 947           i = 2;
 948         }
 949       else if (BITSET_WORD_BITS == 32)
 950         {
 951           dfa->word_char[0] = bits0;
 952           dfa->word_char[1] = bits1;
 953           dfa->word_char[2] = bits2;
 954           dfa->word_char[3] = bits3;
 955           i = 4;
 956         }
 957       else
 958         goto general_case;
 959       ch = 128;
 960
 961       if (__glibc_likely (dfa->is_utf8))
 962         {
 963           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 964           return;
 965         }
 966     }
 967
 968  general_case:
 969   for (; i < BITSET_WORDS; ++i)
 970     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 971       if (isalnum (ch) || ch == '_')
 972         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 973 }
 974
 975 /* Free the work area which are only used while compiling.  */
 976
 977 static void
 978 free_workarea_compile (regex_t *preg)
 979 {
 980   re_dfa_t *dfa = preg->buffer;
 981   bin_tree_storage_t *storage, *next;
 982   for (storage = dfa->str_tree_storage; storage; storage = next)
 983     {
 984       next = storage->next;
 985       re_free (storage);
 986     }
 987   dfa->str_tree_storage = NULL;
 988   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 989   dfa->str_tree = NULL;
 990   re_free (dfa->org_indices);
 991   dfa->org_indices = NULL;
 992 }
 993
 994 /* Create initial states for all contexts.  */
 995
 996 static reg_errcode_t
 997 create_initial_state (re_dfa_t *dfa)
 998 {
 999   Idx first, i;
1000   reg_errcode_t err;
1001   re_node_set init_nodes;
1002
1003   /* Initial states have the epsilon closure of the node which is
1004      the first node of the regular expression.  */
1005   first = dfa->str_tree->first->node_idx;
1006   dfa->init_node = first;
1007   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1008   if (__glibc_unlikely (err != REG_NOERROR))
1009     return err;
1010
1011   /* The back-references which are in initial states can epsilon transit,
1012      since in this case all of the subexpressions can be null.
1013      Then we add epsilon closures of the nodes which are the next nodes of
1014      the back-references.  */
1015   if (dfa->nbackref > 0)
1016     for (i = 0; i < init_nodes.nelem; ++i)
1017       {
1018         Idx node_idx = init_nodes.elems[i];
1019         re_token_type_t type = dfa->nodes[node_idx].type;
1020
1021         Idx clexp_idx;
1022         if (type != OP_BACK_REF)
1023           continue;
1024         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1025           {
1026             re_token_t *clexp_node;
1027             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1028             if (clexp_node->type == OP_CLOSE_SUBEXP
1029                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1030               break;
1031           }
1032         if (clexp_idx == init_nodes.nelem)
1033           continue;
1034
1035         if (type == OP_BACK_REF)
1036           {
1037             Idx dest_idx = dfa->edests[node_idx].elems[0];
1038             if (!re_node_set_contains (&init_nodes, dest_idx))
1039               {
1040                 reg_errcode_t merge_err
1041                   = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1042                 if (merge_err != REG_NOERROR)
1043                   return merge_err;
1044                 i = 0;
1045               }
1046           }
1047       }
1048
1049   /* It must be the first time to invoke acquire_state.  */
1050   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1051   /* We don't check ERR here, since the initial state must not be NULL.  */
1052   if (__glibc_unlikely (dfa->init_state == NULL))
1053     return err;
1054   if (dfa->init_state->has_constraint)
1055     {
1056       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1057                                                        CONTEXT_WORD);
1058       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1059                                                      CONTEXT_NEWLINE);
1060       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1061                                                          &init_nodes,
1062                                                          CONTEXT_NEWLINE
1063                                                          | CONTEXT_BEGBUF);
1064       if (__glibc_unlikely (dfa->init_state_word == NULL
1065                             || dfa->init_state_nl == NULL
1066                             || dfa->init_state_begbuf == NULL))
1067         return err;
1068     }
1069   else
1070     dfa->init_state_word = dfa->init_state_nl
1071       = dfa->init_state_begbuf = dfa->init_state;
1072
1073   re_node_set_free (&init_nodes);
1074   return REG_NOERROR;
1075 }
1076 \f
1077 #ifdef RE_ENABLE_I18N
1078 /* If it is possible to do searching in single byte encoding instead of UTF-8
1079    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1080    DFA nodes where needed.  */
1081
1082 static void
1083 optimize_utf8 (re_dfa_t *dfa)
1084 {
1085   Idx node;
1086   int i;
1087   bool mb_chars = false;
1088   bool has_period = false;
1089
1090   for (node = 0; node < dfa->nodes_len; ++node)
1091     switch (dfa->nodes[node].type)
1092       {
1093       case CHARACTER:
1094         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1095           mb_chars = true;
1096         break;
1097       case ANCHOR:
1098         switch (dfa->nodes[node].opr.ctx_type)
1099           {
1100           case LINE_FIRST:
1101           case LINE_LAST:
1102           case BUF_FIRST:
1103           case BUF_LAST:
1104             break;
1105           default:
1106             /* Word anchors etc. cannot be handled.  It's okay to test
1107                opr.ctx_type since constraints (for all DFA nodes) are
1108                created by ORing one or more opr.ctx_type values.  */
1109             return;
1110           }
1111         break;
1112       case OP_PERIOD:
1113         has_period = true;
1114         break;
1115       case OP_BACK_REF:
1116       case OP_ALT:
1117       case END_OF_RE:
1118       case OP_DUP_ASTERISK:
1119       case OP_OPEN_SUBEXP:
1120       case OP_CLOSE_SUBEXP:
1121         break;
1122       case COMPLEX_BRACKET:
1123         return;
1124       case SIMPLE_BRACKET:
1125         /* Just double check.  */
1126         {
1127           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1128                         ? 0
1129                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1130           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1131             {
1132               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1133                 return;
1134               rshift = 0;
1135             }
1136         }
1137         break;
1138       default:
1139         abort ();
1140       }
1141
1142   if (mb_chars || has_period)
1143     for (node = 0; node < dfa->nodes_len; ++node)
1144       {
1145         if (dfa->nodes[node].type == CHARACTER
1146             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1147           dfa->nodes[node].mb_partial = 0;
1148         else if (dfa->nodes[node].type == OP_PERIOD)
1149           dfa->nodes[node].type = OP_UTF8_PERIOD;
1150       }
1151
1152   /* The search can be in single byte locale.  */
1153   dfa->mb_cur_max = 1;
1154   dfa->is_utf8 = 0;
1155   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1156 }
1157 #endif
1158 \f
1159 /* Analyze the structure tree, and calculate "first", "next", "edest",
1160    "eclosure", and "inveclosure".  */
1161
1162 static reg_errcode_t
1163 analyze (regex_t *preg)
1164 {
1165   re_dfa_t *dfa = preg->buffer;
1166   reg_errcode_t ret;
1167
1168   /* Allocate arrays.  */
1169   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1170   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1171   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1172   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1173   if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1174                         || dfa->edests == NULL || dfa->eclosures == NULL))
1175     return REG_ESPACE;
1176
1177   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1178   if (dfa->subexp_map != NULL)
1179     {
1180       Idx i;
1181       for (i = 0; i < preg->re_nsub; i++)
1182         dfa->subexp_map[i] = i;
1183       preorder (dfa->str_tree, optimize_subexps, dfa);
1184       for (i = 0; i < preg->re_nsub; i++)
1185         if (dfa->subexp_map[i] != i)
1186           break;
1187       if (i == preg->re_nsub)
1188         {
1189           re_free (dfa->subexp_map);
1190           dfa->subexp_map = NULL;
1191         }
1192     }
1193
1194   ret = postorder (dfa->str_tree, lower_subexps, preg);
1195   if (__glibc_unlikely (ret != REG_NOERROR))
1196     return ret;
1197   ret = postorder (dfa->str_tree, calc_first, dfa);
1198   if (__glibc_unlikely (ret != REG_NOERROR))
1199     return ret;
1200   preorder (dfa->str_tree, calc_next, dfa);
1201   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1202   if (__glibc_unlikely (ret != REG_NOERROR))
1203     return ret;
1204   ret = calc_eclosure (dfa);
1205   if (__glibc_unlikely (ret != REG_NOERROR))
1206     return ret;
1207
1208   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1209      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1210   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1211       || dfa->nbackref)
1212     {
1213       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1214       if (__glibc_unlikely (dfa->inveclosures == NULL))
1215         return REG_ESPACE;
1216       ret = calc_inveclosure (dfa);
1217     }
1218
1219   return ret;
1220 }
1221
1222 /* Our parse trees are very unbalanced, so we cannot use a stack to
1223    implement parse tree visits.  Instead, we use parent pointers and
1224    some hairy code in these two functions.  */
1225 static reg_errcode_t
1226 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1227            void *extra)
1228 {
1229   bin_tree_t *node, *prev;
1230
1231   for (node = root; ; )
1232     {
1233       /* Descend down the tree, preferably to the left (or to the right
1234          if that's the only child).  */
1235       while (node->left || node->right)
1236         if (node->left)
1237           node = node->left;
1238         else
1239           node = node->right;
1240
1241       do
1242         {
1243           reg_errcode_t err = fn (extra, node);
1244           if (__glibc_unlikely (err != REG_NOERROR))
1245             return err;
1246           if (node->parent == NULL)
1247             return REG_NOERROR;
1248           prev = node;
1249           node = node->parent;
1250         }
1251       /* Go up while we have a node that is reached from the right.  */
1252       while (node->right == prev || node->right == NULL);
1253       node = node->right;
1254     }
1255 }
1256
1257 static reg_errcode_t
1258 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1259           void *extra)
1260 {
1261   bin_tree_t *node;
1262
1263   for (node = root; ; )
1264     {
1265       reg_errcode_t err = fn (extra, node);
1266       if (__glibc_unlikely (err != REG_NOERROR))
1267         return err;
1268
1269       /* Go to the left node, or up and to the right.  */
1270       if (node->left)
1271         node = node->left;
1272       else
1273         {
1274           bin_tree_t *prev = NULL;
1275           while (node->right == prev || node->right == NULL)
1276             {
1277               prev = node;
1278               node = node->parent;
1279               if (!node)
1280                 return REG_NOERROR;
1281             }
1282           node = node->right;
1283         }
1284     }
1285 }
1286
1287 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1288    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1289    backreferences as well.  Requires a preorder visit.  */
1290 static reg_errcode_t
1291 optimize_subexps (void *extra, bin_tree_t *node)
1292 {
1293   re_dfa_t *dfa = (re_dfa_t *) extra;
1294
1295   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1296     {
1297       int idx = node->token.opr.idx;
1298       node->token.opr.idx = dfa->subexp_map[idx];
1299       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1300     }
1301
1302   else if (node->token.type == SUBEXP
1303            && node->left && node->left->token.type == SUBEXP)
1304     {
1305       Idx other_idx = node->left->token.opr.idx;
1306
1307       node->left = node->left->left;
1308       if (node->left)
1309         node->left->parent = node;
1310
1311       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1312       if (other_idx < BITSET_WORD_BITS)
1313         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1314     }
1315
1316   return REG_NOERROR;
1317 }
1318
1319 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1320    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1321 static reg_errcode_t
1322 lower_subexps (void *extra, bin_tree_t *node)
1323 {
1324   regex_t *preg = (regex_t *) extra;
1325   reg_errcode_t err = REG_NOERROR;
1326
1327   if (node->left && node->left->token.type == SUBEXP)
1328     {
1329       node->left = lower_subexp (&err, preg, node->left);
1330       if (node->left)
1331         node->left->parent = node;
1332     }
1333   if (node->right && node->right->token.type == SUBEXP)
1334     {
1335       node->right = lower_subexp (&err, preg, node->right);
1336       if (node->right)
1337         node->right->parent = node;
1338     }
1339
1340   return err;
1341 }
1342
1343 static bin_tree_t *
1344 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1345 {
1346   re_dfa_t *dfa = preg->buffer;
1347   bin_tree_t *body = node->left;
1348   bin_tree_t *op, *cls, *tree1, *tree;
1349
1350   if (preg->no_sub
1351       /* We do not optimize empty subexpressions, because otherwise we may
1352          have bad CONCAT nodes with NULL children.  This is obviously not
1353          very common, so we do not lose much.  An example that triggers
1354          this case is the sed "script" /\(\)/x.  */
1355       && node->left != NULL
1356       && (node->token.opr.idx >= BITSET_WORD_BITS
1357           || !(dfa->used_bkref_map
1358                & ((bitset_word_t) 1 << node->token.opr.idx))))
1359     return node->left;
1360
1361   /* Convert the SUBEXP node to the concatenation of an
1362      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1363   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1364   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1365   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1366   tree = create_tree (dfa, op, tree1, CONCAT);
1367   if (__glibc_unlikely (tree == NULL || tree1 == NULL
1368                         || op == NULL || cls == NULL))
1369     {
1370       *err = REG_ESPACE;
1371       return NULL;
1372     }
1373
1374   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1375   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1376   return tree;
1377 }
1378
1379 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1380    nodes.  Requires a postorder visit.  */
1381 static reg_errcode_t
1382 calc_first (void *extra, bin_tree_t *node)
1383 {
1384   re_dfa_t *dfa = (re_dfa_t *) extra;
1385   if (node->token.type == CONCAT)
1386     {
1387       node->first = node->left->first;
1388       node->node_idx = node->left->node_idx;
1389     }
1390   else
1391     {
1392       node->first = node;
1393       node->node_idx = re_dfa_add_node (dfa, node->token);
1394       if (__glibc_unlikely (node->node_idx == -1))
1395         return REG_ESPACE;
1396       if (node->token.type == ANCHOR)
1397         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1398     }
1399   return REG_NOERROR;
1400 }
1401
1402 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1403 static reg_errcode_t
1404 calc_next (void *extra, bin_tree_t *node)
1405 {
1406   switch (node->token.type)
1407     {
1408     case OP_DUP_ASTERISK:
1409       node->left->next = node;
1410       break;
1411     case CONCAT:
1412       node->left->next = node->right->first;
1413       node->right->next = node->next;
1414       break;
1415     default:
1416       if (node->left)
1417         node->left->next = node->next;
1418       if (node->right)
1419         node->right->next = node->next;
1420       break;
1421     }
1422   return REG_NOERROR;
1423 }
1424
1425 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1426 static reg_errcode_t
1427 link_nfa_nodes (void *extra, bin_tree_t *node)
1428 {
1429   re_dfa_t *dfa = (re_dfa_t *) extra;
1430   Idx idx = node->node_idx;
1431   reg_errcode_t err = REG_NOERROR;
1432
1433   switch (node->token.type)
1434     {
1435     case CONCAT:
1436       break;
1437
1438     case END_OF_RE:
1439       DEBUG_ASSERT (node->next == NULL);
1440       break;
1441
1442     case OP_DUP_ASTERISK:
1443     case OP_ALT:
1444       {
1445         Idx left, right;
1446         dfa->has_plural_match = 1;
1447         if (node->left != NULL)
1448           left = node->left->first->node_idx;
1449         else
1450           left = node->next->node_idx;
1451         if (node->right != NULL)
1452           right = node->right->first->node_idx;
1453         else
1454           right = node->next->node_idx;
1455         DEBUG_ASSERT (left > -1);
1456         DEBUG_ASSERT (right > -1);
1457         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1458       }
1459       break;
1460
1461     case ANCHOR:
1462     case OP_OPEN_SUBEXP:
1463     case OP_CLOSE_SUBEXP:
1464       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1465       break;
1466
1467     case OP_BACK_REF:
1468       dfa->nexts[idx] = node->next->node_idx;
1469       if (node->token.type == OP_BACK_REF)
1470         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1471       break;
1472
1473     default:
1474       DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1475       dfa->nexts[idx] = node->next->node_idx;
1476       break;
1477     }
1478
1479   return err;
1480 }
1481
1482 /* Duplicate the epsilon closure of the node ROOT_NODE.
1483    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1484    to their own constraint.  */
1485
1486 static reg_errcode_t
1487 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1488                         Idx root_node, unsigned int init_constraint)
1489 {
1490   Idx org_node, clone_node;
1491   bool ok;
1492   unsigned int constraint = init_constraint;
1493   for (org_node = top_org_node, clone_node = top_clone_node;;)
1494     {
1495       Idx org_dest, clone_dest;
1496       if (dfa->nodes[org_node].type == OP_BACK_REF)
1497         {
1498           /* If the back reference epsilon-transit, its destination must
1499              also have the constraint.  Then duplicate the epsilon closure
1500              of the destination of the back reference, and store it in
1501              edests of the back reference.  */
1502           org_dest = dfa->nexts[org_node];
1503           re_node_set_empty (dfa->edests + clone_node);
1504           clone_dest = duplicate_node (dfa, org_dest, constraint);
1505           if (__glibc_unlikely (clone_dest == -1))
1506             return REG_ESPACE;
1507           dfa->nexts[clone_node] = dfa->nexts[org_node];
1508           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1509           if (__glibc_unlikely (! ok))
1510             return REG_ESPACE;
1511         }
1512       else if (dfa->edests[org_node].nelem == 0)
1513         {
1514           /* In case of the node can't epsilon-transit, don't duplicate the
1515              destination and store the original destination as the
1516              destination of the node.  */
1517           dfa->nexts[clone_node] = dfa->nexts[org_node];
1518           break;
1519         }
1520       else if (dfa->edests[org_node].nelem == 1)
1521         {
1522           /* In case of the node can epsilon-transit, and it has only one
1523              destination.  */
1524           org_dest = dfa->edests[org_node].elems[0];
1525           re_node_set_empty (dfa->edests + clone_node);
1526           /* If the node is root_node itself, it means the epsilon closure
1527              has a loop.  Then tie it to the destination of the root_node.  */
1528           if (org_node == root_node && clone_node != org_node)
1529             {
1530               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1531               if (__glibc_unlikely (! ok))
1532                 return REG_ESPACE;
1533               break;
1534             }
1535           /* In case the node has another constraint, append it.  */
1536           constraint |= dfa->nodes[org_node].constraint;
1537           clone_dest = duplicate_node (dfa, org_dest, constraint);
1538           if (__glibc_unlikely (clone_dest == -1))
1539             return REG_ESPACE;
1540           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1541           if (__glibc_unlikely (! ok))
1542             return REG_ESPACE;
1543         }
1544       else /* dfa->edests[org_node].nelem == 2 */
1545         {
1546           /* In case of the node can epsilon-transit, and it has two
1547              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1548           org_dest = dfa->edests[org_node].elems[0];
1549           re_node_set_empty (dfa->edests + clone_node);
1550           /* Search for a duplicated node which satisfies the constraint.  */
1551           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1552           if (clone_dest == -1)
1553             {
1554               /* There is no such duplicated node, create a new one.  */
1555               reg_errcode_t err;
1556               clone_dest = duplicate_node (dfa, org_dest, constraint);
1557               if (__glibc_unlikely (clone_dest == -1))
1558                 return REG_ESPACE;
1559               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560               if (__glibc_unlikely (! ok))
1561                 return REG_ESPACE;
1562               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1563                                             root_node, constraint);
1564               if (__glibc_unlikely (err != REG_NOERROR))
1565                 return err;
1566             }
1567           else
1568             {
1569               /* There is a duplicated node which satisfies the constraint,
1570                  use it to avoid infinite loop.  */
1571               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572               if (__glibc_unlikely (! ok))
1573                 return REG_ESPACE;
1574             }
1575
1576           org_dest = dfa->edests[org_node].elems[1];
1577           clone_dest = duplicate_node (dfa, org_dest, constraint);
1578           if (__glibc_unlikely (clone_dest == -1))
1579             return REG_ESPACE;
1580           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1581           if (__glibc_unlikely (! ok))
1582             return REG_ESPACE;
1583         }
1584       org_node = org_dest;
1585       clone_node = clone_dest;
1586     }
1587   return REG_NOERROR;
1588 }
1589
1590 /* Search for a node which is duplicated from the node ORG_NODE, and
1591    satisfies the constraint CONSTRAINT.  */
1592
1593 static Idx
1594 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1595                         unsigned int constraint)
1596 {
1597   Idx idx;
1598   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1599     {
1600       if (org_node == dfa->org_indices[idx]
1601           && constraint == dfa->nodes[idx].constraint)
1602         return idx; /* Found.  */
1603     }
1604   return -1; /* Not found.  */
1605 }
1606
1607 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1608    Return the index of the new node, or -1 if insufficient storage is
1609    available.  */
1610
1611 static Idx
1612 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1613 {
1614   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1615   if (__glibc_likely (dup_idx != -1))
1616     {
1617       dfa->nodes[dup_idx].constraint = constraint;
1618       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1619       dfa->nodes[dup_idx].duplicated = 1;
1620
1621       /* Store the index of the original node.  */
1622       dfa->org_indices[dup_idx] = org_idx;
1623     }
1624   return dup_idx;
1625 }
1626
1627 static reg_errcode_t
1628 calc_inveclosure (re_dfa_t *dfa)
1629 {
1630   Idx src, idx;
1631   bool ok;
1632   for (idx = 0; idx < dfa->nodes_len; ++idx)
1633     re_node_set_init_empty (dfa->inveclosures + idx);
1634
1635   for (src = 0; src < dfa->nodes_len; ++src)
1636     {
1637       Idx *elems = dfa->eclosures[src].elems;
1638       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1639         {
1640           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1641           if (__glibc_unlikely (! ok))
1642             return REG_ESPACE;
1643         }
1644     }
1645
1646   return REG_NOERROR;
1647 }
1648
1649 /* Calculate "eclosure" for all the node in DFA.  */
1650
1651 static reg_errcode_t
1652 calc_eclosure (re_dfa_t *dfa)
1653 {
1654   Idx node_idx;
1655   bool incomplete;
1656   DEBUG_ASSERT (dfa->nodes_len > 0);
1657   incomplete = false;
1658   /* For each nodes, calculate epsilon closure.  */
1659   for (node_idx = 0; ; ++node_idx)
1660     {
1661       reg_errcode_t err;
1662       re_node_set eclosure_elem;
1663       if (node_idx == dfa->nodes_len)
1664         {
1665           if (!incomplete)
1666             break;
1667           incomplete = false;
1668           node_idx = 0;
1669         }
1670
1671       DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1672
1673       /* If we have already calculated, skip it.  */
1674       if (dfa->eclosures[node_idx].nelem != 0)
1675         continue;
1676       /* Calculate epsilon closure of 'node_idx'.  */
1677       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1678       if (__glibc_unlikely (err != REG_NOERROR))
1679         return err;
1680
1681       if (dfa->eclosures[node_idx].nelem == 0)
1682         {
1683           incomplete = true;
1684           re_node_set_free (&eclosure_elem);
1685         }
1686     }
1687   return REG_NOERROR;
1688 }
1689
1690 /* Calculate epsilon closure of NODE.  */
1691
1692 static reg_errcode_t
1693 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1694 {
1695   reg_errcode_t err;
1696   Idx i;
1697   re_node_set eclosure;
1698   bool incomplete = false;
1699   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1700   if (__glibc_unlikely (err != REG_NOERROR))
1701     return err;
1702
1703   /* An epsilon closure includes itself.  */
1704   eclosure.elems[eclosure.nelem++] = node;
1705
1706   /* This indicates that we are calculating this node now.
1707      We reference this value to avoid infinite loop.  */
1708   dfa->eclosures[node].nelem = -1;
1709
1710   /* If the current node has constraints, duplicate all nodes
1711      since they must inherit the constraints.  */
1712   if (dfa->nodes[node].constraint
1713       && dfa->edests[node].nelem
1714       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1715     {
1716       err = duplicate_node_closure (dfa, node, node, node,
1717                                     dfa->nodes[node].constraint);
1718       if (__glibc_unlikely (err != REG_NOERROR))
1719         return err;
1720     }
1721
1722   /* Expand each epsilon destination nodes.  */
1723   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1724     for (i = 0; i < dfa->edests[node].nelem; ++i)
1725       {
1726         re_node_set eclosure_elem;
1727         Idx edest = dfa->edests[node].elems[i];
1728         /* If calculating the epsilon closure of 'edest' is in progress,
1729            return intermediate result.  */
1730         if (dfa->eclosures[edest].nelem == -1)
1731           {
1732             incomplete = true;
1733             continue;
1734           }
1735         /* If we haven't calculated the epsilon closure of 'edest' yet,
1736            calculate now. Otherwise use calculated epsilon closure.  */
1737         if (dfa->eclosures[edest].nelem == 0)
1738           {
1739             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1740             if (__glibc_unlikely (err != REG_NOERROR))
1741               return err;
1742           }
1743         else
1744           eclosure_elem = dfa->eclosures[edest];
1745         /* Merge the epsilon closure of 'edest'.  */
1746         err = re_node_set_merge (&eclosure, &eclosure_elem);
1747         if (__glibc_unlikely (err != REG_NOERROR))
1748           return err;
1749         /* If the epsilon closure of 'edest' is incomplete,
1750            the epsilon closure of this node is also incomplete.  */
1751         if (dfa->eclosures[edest].nelem == 0)
1752           {
1753             incomplete = true;
1754             re_node_set_free (&eclosure_elem);
1755           }
1756       }
1757
1758   if (incomplete && !root)
1759     dfa->eclosures[node].nelem = 0;
1760   else
1761     dfa->eclosures[node] = eclosure;
1762   *new_set = eclosure;
1763   return REG_NOERROR;
1764 }
1765 \f
1766 /* Functions for token which are used in the parser.  */
1767
1768 /* Fetch a token from INPUT.
1769    We must not use this function inside bracket expressions.  */
1770
1771 static void
1772 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1773 {
1774   re_string_skip_bytes (input, peek_token (result, input, syntax));
1775 }
1776
1777 /* Peek a token from INPUT, and return the length of the token.
1778    We must not use this function inside bracket expressions.  */
1779
1780 static int
1781 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1782 {
1783   unsigned char c;
1784
1785   if (re_string_eoi (input))
1786     {
1787       token->type = END_OF_RE;
1788       return 0;
1789     }
1790
1791   c = re_string_peek_byte (input, 0);
1792   token->opr.c = c;
1793
1794   token->word_char = 0;
1795 #ifdef RE_ENABLE_I18N
1796   token->mb_partial = 0;
1797   if (input->mb_cur_max > 1
1798       && !re_string_first_byte (input, re_string_cur_idx (input)))
1799     {
1800       token->type = CHARACTER;
1801       token->mb_partial = 1;
1802       return 1;
1803     }
1804 #endif
1805   if (c == '\\')
1806     {
1807       unsigned char c2;
1808       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1809         {
1810           token->type = BACK_SLASH;
1811           return 1;
1812         }
1813
1814       c2 = re_string_peek_byte_case (input, 1);
1815       token->opr.c = c2;
1816       token->type = CHARACTER;
1817 #ifdef RE_ENABLE_I18N
1818       if (input->mb_cur_max > 1)
1819         {
1820           wint_t wc = re_string_wchar_at (input,
1821                                           re_string_cur_idx (input) + 1);
1822           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1823         }
1824       else
1825 #endif
1826         token->word_char = IS_WORD_CHAR (c2) != 0;
1827
1828       switch (c2)
1829         {
1830         case '|':
1831           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1832             token->type = OP_ALT;
1833           break;
1834         case '1': case '2': case '3': case '4': case '5':
1835         case '6': case '7': case '8': case '9':
1836           if (!(syntax & RE_NO_BK_REFS))
1837             {
1838               token->type = OP_BACK_REF;
1839               token->opr.idx = c2 - '1';
1840             }
1841           break;
1842         case '<':
1843           if (!(syntax & RE_NO_GNU_OPS))
1844             {
1845               token->type = ANCHOR;
1846               token->opr.ctx_type = WORD_FIRST;
1847             }
1848           break;
1849         case '>':
1850           if (!(syntax & RE_NO_GNU_OPS))
1851             {
1852               token->type = ANCHOR;
1853               token->opr.ctx_type = WORD_LAST;
1854             }
1855           break;
1856         case 'b':
1857           if (!(syntax & RE_NO_GNU_OPS))
1858             {
1859               token->type = ANCHOR;
1860               token->opr.ctx_type = WORD_DELIM;
1861             }
1862           break;
1863         case 'B':
1864           if (!(syntax & RE_NO_GNU_OPS))
1865             {
1866               token->type = ANCHOR;
1867               token->opr.ctx_type = NOT_WORD_DELIM;
1868             }
1869           break;
1870         case 'w':
1871           if (!(syntax & RE_NO_GNU_OPS))
1872             token->type = OP_WORD;
1873           break;
1874         case 'W':
1875           if (!(syntax & RE_NO_GNU_OPS))
1876             token->type = OP_NOTWORD;
1877           break;
1878         case 's':
1879           if (!(syntax & RE_NO_GNU_OPS))
1880             token->type = OP_SPACE;
1881           break;
1882         case 'S':
1883           if (!(syntax & RE_NO_GNU_OPS))
1884             token->type = OP_NOTSPACE;
1885           break;
1886         case '`':
1887           if (!(syntax & RE_NO_GNU_OPS))
1888             {
1889               token->type = ANCHOR;
1890               token->opr.ctx_type = BUF_FIRST;
1891             }
1892           break;
1893         case '\'':
1894           if (!(syntax & RE_NO_GNU_OPS))
1895             {
1896               token->type = ANCHOR;
1897               token->opr.ctx_type = BUF_LAST;
1898             }
1899           break;
1900         case '(':
1901           if (!(syntax & RE_NO_BK_PARENS))
1902             token->type = OP_OPEN_SUBEXP;
1903           break;
1904         case ')':
1905           if (!(syntax & RE_NO_BK_PARENS))
1906             token->type = OP_CLOSE_SUBEXP;
1907           break;
1908         case '+':
1909           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1910             token->type = OP_DUP_PLUS;
1911           break;
1912         case '?':
1913           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1914             token->type = OP_DUP_QUESTION;
1915           break;
1916         case '{':
1917           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1918             token->type = OP_OPEN_DUP_NUM;
1919           break;
1920         case '}':
1921           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1922             token->type = OP_CLOSE_DUP_NUM;
1923           break;
1924         default:
1925           break;
1926         }
1927       return 2;
1928     }
1929
1930   token->type = CHARACTER;
1931 #ifdef RE_ENABLE_I18N
1932   if (input->mb_cur_max > 1)
1933     {
1934       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1935       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1936     }
1937   else
1938 #endif
1939     token->word_char = IS_WORD_CHAR (token->opr.c);
1940
1941   switch (c)
1942     {
1943     case '\n':
1944       if (syntax & RE_NEWLINE_ALT)
1945         token->type = OP_ALT;
1946       break;
1947     case '|':
1948       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1949         token->type = OP_ALT;
1950       break;
1951     case '*':
1952       token->type = OP_DUP_ASTERISK;
1953       break;
1954     case '+':
1955       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1956         token->type = OP_DUP_PLUS;
1957       break;
1958     case '?':
1959       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1960         token->type = OP_DUP_QUESTION;
1961       break;
1962     case '{':
1963       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1964         token->type = OP_OPEN_DUP_NUM;
1965       break;
1966     case '}':
1967       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1968         token->type = OP_CLOSE_DUP_NUM;
1969       break;
1970     case '(':
1971       if (syntax & RE_NO_BK_PARENS)
1972         token->type = OP_OPEN_SUBEXP;
1973       break;
1974     case ')':
1975       if (syntax & RE_NO_BK_PARENS)
1976         token->type = OP_CLOSE_SUBEXP;
1977       break;
1978     case '[':
1979       token->type = OP_OPEN_BRACKET;
1980       break;
1981     case '.':
1982       token->type = OP_PERIOD;
1983       break;
1984     case '^':
1985       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1986           && re_string_cur_idx (input) != 0)
1987         {
1988           char prev = re_string_peek_byte (input, -1);
1989           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1990             break;
1991         }
1992       token->type = ANCHOR;
1993       token->opr.ctx_type = LINE_FIRST;
1994       break;
1995     case '$':
1996       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1997           && re_string_cur_idx (input) + 1 != re_string_length (input))
1998         {
1999           re_token_t next;
2000           re_string_skip_bytes (input, 1);
2001           peek_token (&next, input, syntax);
2002           re_string_skip_bytes (input, -1);
2003           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2004             break;
2005         }
2006       token->type = ANCHOR;
2007       token->opr.ctx_type = LINE_LAST;
2008       break;
2009     default:
2010       break;
2011     }
2012   return 1;
2013 }
2014
2015 /* Peek a token from INPUT, and return the length of the token.
2016    We must not use this function out of bracket expressions.  */
2017
2018 static int
2019 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2020 {
2021   unsigned char c;
2022   if (re_string_eoi (input))
2023     {
2024       token->type = END_OF_RE;
2025       return 0;
2026     }
2027   c = re_string_peek_byte (input, 0);
2028   token->opr.c = c;
2029
2030 #ifdef RE_ENABLE_I18N
2031   if (input->mb_cur_max > 1
2032       && !re_string_first_byte (input, re_string_cur_idx (input)))
2033     {
2034       token->type = CHARACTER;
2035       return 1;
2036     }
2037 #endif /* RE_ENABLE_I18N */
2038
2039   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2040       && re_string_cur_idx (input) + 1 < re_string_length (input))
2041     {
2042       /* In this case, '\' escape a character.  */
2043       unsigned char c2;
2044       re_string_skip_bytes (input, 1);
2045       c2 = re_string_peek_byte (input, 0);
2046       token->opr.c = c2;
2047       token->type = CHARACTER;
2048       return 1;
2049     }
2050   if (c == '[') /* '[' is a special char in a bracket exps.  */
2051     {
2052       unsigned char c2;
2053       int token_len;
2054       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2055         c2 = re_string_peek_byte (input, 1);
2056       else
2057         c2 = 0;
2058       token->opr.c = c2;
2059       token_len = 2;
2060       switch (c2)
2061         {
2062         case '.':
2063           token->type = OP_OPEN_COLL_ELEM;
2064           break;
2065
2066         case '=':
2067           token->type = OP_OPEN_EQUIV_CLASS;
2068           break;
2069
2070         case ':':
2071           if (syntax & RE_CHAR_CLASSES)
2072             {
2073               token->type = OP_OPEN_CHAR_CLASS;
2074               break;
2075             }
2076           FALLTHROUGH;
2077         default:
2078           token->type = CHARACTER;
2079           token->opr.c = c;
2080           token_len = 1;
2081           break;
2082         }
2083       return token_len;
2084     }
2085   switch (c)
2086     {
2087     case '-':
2088       token->type = OP_CHARSET_RANGE;
2089       break;
2090     case ']':
2091       token->type = OP_CLOSE_BRACKET;
2092       break;
2093     case '^':
2094       token->type = OP_NON_MATCH_LIST;
2095       break;
2096     default:
2097       token->type = CHARACTER;
2098     }
2099   return 1;
2100 }
2101 \f
2102 /* Functions for parser.  */
2103
2104 /* Entry point of the parser.
2105    Parse the regular expression REGEXP and return the structure tree.
2106    If an error occurs, ERR is set by error code, and return NULL.
2107    This function build the following tree, from regular expression <reg_exp>:
2108            CAT
2109            / \
2110           /   \
2111    <reg_exp>  EOR
2112
2113    CAT means concatenation.
2114    EOR means end of regular expression.  */
2115
2116 static bin_tree_t *
2117 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2118        reg_errcode_t *err)
2119 {
2120   re_dfa_t *dfa = preg->buffer;
2121   bin_tree_t *tree, *eor, *root;
2122   re_token_t current_token;
2123   dfa->syntax = syntax;
2124   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2125   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2126   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2127     return NULL;
2128   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2129   if (tree != NULL)
2130     root = create_tree (dfa, tree, eor, CONCAT);
2131   else
2132     root = eor;
2133   if (__glibc_unlikely (eor == NULL || root == NULL))
2134     {
2135       *err = REG_ESPACE;
2136       return NULL;
2137     }
2138   return root;
2139 }
2140
2141 /* This function build the following tree, from regular expression
2142    <branch1>|<branch2>:
2143            ALT
2144            / \
2145           /   \
2146    <branch1> <branch2>
2147
2148    ALT means alternative, which represents the operator '|'.  */
2149
2150 static bin_tree_t *
2151 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2152                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2153 {
2154   re_dfa_t *dfa = preg->buffer;
2155   bin_tree_t *tree, *branch = NULL;
2156   bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2157   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2158   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2159     return NULL;
2160
2161   while (token->type == OP_ALT)
2162     {
2163       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2164       if (token->type != OP_ALT && token->type != END_OF_RE
2165           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2166         {
2167           bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2168           dfa->completed_bkref_map = initial_bkref_map;
2169           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2170           if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2171             {
2172               if (tree != NULL)
2173                 postorder (tree, free_tree, NULL);
2174               return NULL;
2175             }
2176           dfa->completed_bkref_map |= accumulated_bkref_map;
2177         }
2178       else
2179         branch = NULL;
2180       tree = create_tree (dfa, tree, branch, OP_ALT);
2181       if (__glibc_unlikely (tree == NULL))
2182         {
2183           *err = REG_ESPACE;
2184           return NULL;
2185         }
2186     }
2187   return tree;
2188 }
2189
2190 /* This function build the following tree, from regular expression
2191    <exp1><exp2>:
2192         CAT
2193         / \
2194        /   \
2195    <exp1> <exp2>
2196
2197    CAT means concatenation.  */
2198
2199 static bin_tree_t *
2200 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2201               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2202 {
2203   bin_tree_t *tree, *expr;
2204   re_dfa_t *dfa = preg->buffer;
2205   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2206   if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2207     return NULL;
2208
2209   while (token->type != OP_ALT && token->type != END_OF_RE
2210          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2211     {
2212       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2213       if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2214         {
2215           if (tree != NULL)
2216             postorder (tree, free_tree, NULL);
2217           return NULL;
2218         }
2219       if (tree != NULL && expr != NULL)
2220         {
2221           bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2222           if (newtree == NULL)
2223             {
2224               postorder (expr, free_tree, NULL);
2225               postorder (tree, free_tree, NULL);
2226               *err = REG_ESPACE;
2227               return NULL;
2228             }
2229           tree = newtree;
2230         }
2231       else if (tree == NULL)
2232         tree = expr;
2233       /* Otherwise expr == NULL, we don't need to create new tree.  */
2234     }
2235   return tree;
2236 }
2237
2238 /* This function build the following tree, from regular expression a*:
2239          *
2240          |
2241          a
2242 */
2243
2244 static bin_tree_t *
2245 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2246                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2247 {
2248   re_dfa_t *dfa = preg->buffer;
2249   bin_tree_t *tree;
2250   switch (token->type)
2251     {
2252     case CHARACTER:
2253       tree = create_token_tree (dfa, NULL, NULL, token);
2254       if (__glibc_unlikely (tree == NULL))
2255         {
2256           *err = REG_ESPACE;
2257           return NULL;
2258         }
2259 #ifdef RE_ENABLE_I18N
2260       if (dfa->mb_cur_max > 1)
2261         {
2262           while (!re_string_eoi (regexp)
2263                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2264             {
2265               bin_tree_t *mbc_remain;
2266               fetch_token (token, regexp, syntax);
2267               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2268               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2269               if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2270                 {
2271                   *err = REG_ESPACE;
2272                   return NULL;
2273                 }
2274             }
2275         }
2276 #endif
2277       break;
2278
2279     case OP_OPEN_SUBEXP:
2280       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2281       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2282         return NULL;
2283       break;
2284
2285     case OP_OPEN_BRACKET:
2286       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2287       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2288         return NULL;
2289       break;
2290
2291     case OP_BACK_REF:
2292       if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2293         {
2294           *err = REG_ESUBREG;
2295           return NULL;
2296         }
2297       dfa->used_bkref_map |= 1 << token->opr.idx;
2298       tree = create_token_tree (dfa, NULL, NULL, token);
2299       if (__glibc_unlikely (tree == NULL))
2300         {
2301           *err = REG_ESPACE;
2302           return NULL;
2303         }
2304       ++dfa->nbackref;
2305       dfa->has_mb_node = 1;
2306       break;
2307
2308     case OP_OPEN_DUP_NUM:
2309       if (syntax & RE_CONTEXT_INVALID_DUP)
2310         {
2311           *err = REG_BADRPT;
2312           return NULL;
2313         }
2314       FALLTHROUGH;
2315     case OP_DUP_ASTERISK:
2316     case OP_DUP_PLUS:
2317     case OP_DUP_QUESTION:
2318       if (syntax & RE_CONTEXT_INVALID_OPS)
2319         {
2320           *err = REG_BADRPT;
2321           return NULL;
2322         }
2323       else if (syntax & RE_CONTEXT_INDEP_OPS)
2324         {
2325           fetch_token (token, regexp, syntax);
2326           return parse_expression (regexp, preg, token, syntax, nest, err);
2327         }
2328       FALLTHROUGH;
2329     case OP_CLOSE_SUBEXP:
2330       if ((token->type == OP_CLOSE_SUBEXP)
2331           && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2332         {
2333           *err = REG_ERPAREN;
2334           return NULL;
2335         }
2336       FALLTHROUGH;
2337     case OP_CLOSE_DUP_NUM:
2338       /* We treat it as a normal character.  */
2339
2340       /* Then we can these characters as normal characters.  */
2341       token->type = CHARACTER;
2342       /* mb_partial and word_char bits should be initialized already
2343          by peek_token.  */
2344       tree = create_token_tree (dfa, NULL, NULL, token);
2345       if (__glibc_unlikely (tree == NULL))
2346         {
2347           *err = REG_ESPACE;
2348           return NULL;
2349         }
2350       break;
2351
2352     case ANCHOR:
2353       if ((token->opr.ctx_type
2354            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2355           && dfa->word_ops_used == 0)
2356         init_word_char (dfa);
2357       if (token->opr.ctx_type == WORD_DELIM
2358           || token->opr.ctx_type == NOT_WORD_DELIM)
2359         {
2360           bin_tree_t *tree_first, *tree_last;
2361           if (token->opr.ctx_type == WORD_DELIM)
2362             {
2363               token->opr.ctx_type = WORD_FIRST;
2364               tree_first = create_token_tree (dfa, NULL, NULL, token);
2365               token->opr.ctx_type = WORD_LAST;
2366             }
2367           else
2368             {
2369               token->opr.ctx_type = INSIDE_WORD;
2370               tree_first = create_token_tree (dfa, NULL, NULL, token);
2371               token->opr.ctx_type = INSIDE_NOTWORD;
2372             }
2373           tree_last = create_token_tree (dfa, NULL, NULL, token);
2374           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2375           if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2376                                 || tree == NULL))
2377             {
2378               *err = REG_ESPACE;
2379               return NULL;
2380             }
2381         }
2382       else
2383         {
2384           tree = create_token_tree (dfa, NULL, NULL, token);
2385           if (__glibc_unlikely (tree == NULL))
2386             {
2387               *err = REG_ESPACE;
2388               return NULL;
2389             }
2390         }
2391       /* We must return here, since ANCHORs can't be followed
2392          by repetition operators.
2393          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2394              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2395       fetch_token (token, regexp, syntax);
2396       return tree;
2397
2398     case OP_PERIOD:
2399       tree = create_token_tree (dfa, NULL, NULL, token);
2400       if (__glibc_unlikely (tree == NULL))
2401         {
2402           *err = REG_ESPACE;
2403           return NULL;
2404         }
2405       if (dfa->mb_cur_max > 1)
2406         dfa->has_mb_node = 1;
2407       break;
2408
2409     case OP_WORD:
2410     case OP_NOTWORD:
2411       tree = build_charclass_op (dfa, regexp->trans,
2412                                  "alnum",
2413                                  "_",
2414                                  token->type == OP_NOTWORD, err);
2415       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2416         return NULL;
2417       break;
2418
2419     case OP_SPACE:
2420     case OP_NOTSPACE:
2421       tree = build_charclass_op (dfa, regexp->trans,
2422                                  "space",
2423                                  "",
2424                                  token->type == OP_NOTSPACE, err);
2425       if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2426         return NULL;
2427       break;
2428
2429     case OP_ALT:
2430     case END_OF_RE:
2431       return NULL;
2432
2433     case BACK_SLASH:
2434       *err = REG_EESCAPE;
2435       return NULL;
2436
2437     default:
2438       /* Must not happen?  */
2439       DEBUG_ASSERT (false);
2440       return NULL;
2441     }
2442   fetch_token (token, regexp, syntax);
2443
2444   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2445          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2446     {
2447       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2448                                            syntax, err);
2449       if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2450         {
2451           if (tree != NULL)
2452             postorder (tree, free_tree, NULL);
2453           return NULL;
2454         }
2455       tree = dup_tree;
2456       /* In BRE consecutive duplications are not allowed.  */
2457       if ((syntax & RE_CONTEXT_INVALID_DUP)
2458           && (token->type == OP_DUP_ASTERISK
2459               || token->type == OP_OPEN_DUP_NUM))
2460         {
2461           if (tree != NULL)
2462             postorder (tree, free_tree, NULL);
2463           *err = REG_BADRPT;
2464           return NULL;
2465         }
2466     }
2467
2468   return tree;
2469 }
2470
2471 /* This function build the following tree, from regular expression
2472    (<reg_exp>):
2473          SUBEXP
2474             |
2475         <reg_exp>
2476 */
2477
2478 static bin_tree_t *
2479 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2480                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2481 {
2482   re_dfa_t *dfa = preg->buffer;
2483   bin_tree_t *tree;
2484   size_t cur_nsub;
2485   cur_nsub = preg->re_nsub++;
2486
2487   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2488
2489   /* The subexpression may be a null string.  */
2490   if (token->type == OP_CLOSE_SUBEXP)
2491     tree = NULL;
2492   else
2493     {
2494       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2495       if (__glibc_unlikely (*err == REG_NOERROR
2496                             && token->type != OP_CLOSE_SUBEXP))
2497         {
2498           if (tree != NULL)
2499             postorder (tree, free_tree, NULL);
2500           *err = REG_EPAREN;
2501         }
2502       if (__glibc_unlikely (*err != REG_NOERROR))
2503         return NULL;
2504     }
2505
2506   if (cur_nsub <= '9' - '1')
2507     dfa->completed_bkref_map |= 1 << cur_nsub;
2508
2509   tree = create_tree (dfa, tree, NULL, SUBEXP);
2510   if (__glibc_unlikely (tree == NULL))
2511     {
2512       *err = REG_ESPACE;
2513       return NULL;
2514     }
2515   tree->token.opr.idx = cur_nsub;
2516   return tree;
2517 }
2518
2519 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2520
2521 static bin_tree_t *
2522 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2523               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2524 {
2525   bin_tree_t *tree = NULL, *old_tree = NULL;
2526   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2527   re_token_t start_token = *token;
2528
2529   if (token->type == OP_OPEN_DUP_NUM)
2530     {
2531       end = 0;
2532       start = fetch_number (regexp, token, syntax);
2533       if (start == -1)
2534         {
2535           if (token->type == CHARACTER && token->opr.c == ',')
2536             start = 0; /* We treat "{,m}" as "{0,m}".  */
2537           else
2538             {
2539               *err = REG_BADBR; /* <re>{} is invalid.  */
2540               return NULL;
2541             }
2542         }
2543       if (__glibc_likely (start != -2))
2544         {
2545           /* We treat "{n}" as "{n,n}".  */
2546           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2547                  : ((token->type == CHARACTER && token->opr.c == ',')
2548                     ? fetch_number (regexp, token, syntax) : -2));
2549         }
2550       if (__glibc_unlikely (start == -2 || end == -2))
2551         {
2552           /* Invalid sequence.  */
2553           if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2554             {
2555               if (token->type == END_OF_RE)
2556                 *err = REG_EBRACE;
2557               else
2558                 *err = REG_BADBR;
2559
2560               return NULL;
2561             }
2562
2563           /* If the syntax bit is set, rollback.  */
2564           re_string_set_index (regexp, start_idx);
2565           *token = start_token;
2566           token->type = CHARACTER;
2567           /* mb_partial and word_char bits should be already initialized by
2568              peek_token.  */
2569           return elem;
2570         }
2571
2572       if (__glibc_unlikely ((end != -1 && start > end)
2573                             || token->type != OP_CLOSE_DUP_NUM))
2574         {
2575           /* First number greater than second.  */
2576           *err = REG_BADBR;
2577           return NULL;
2578         }
2579
2580       if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2581         {
2582           *err = REG_ESIZE;
2583           return NULL;
2584         }
2585     }
2586   else
2587     {
2588       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2589       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2590     }
2591
2592   fetch_token (token, regexp, syntax);
2593
2594   if (__glibc_unlikely (elem == NULL))
2595     return NULL;
2596   if (__glibc_unlikely (start == 0 && end == 0))
2597     {
2598       postorder (elem, free_tree, NULL);
2599       return NULL;
2600     }
2601
2602   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2603   if (__glibc_unlikely (start > 0))
2604     {
2605       tree = elem;
2606       for (i = 2; i <= start; ++i)
2607         {
2608           elem = duplicate_tree (elem, dfa);
2609           tree = create_tree (dfa, tree, elem, CONCAT);
2610           if (__glibc_unlikely (elem == NULL || tree == NULL))
2611             goto parse_dup_op_espace;
2612         }
2613
2614       if (start == end)
2615         return tree;
2616
2617       /* Duplicate ELEM before it is marked optional.  */
2618       elem = duplicate_tree (elem, dfa);
2619       if (__glibc_unlikely (elem == NULL))
2620         goto parse_dup_op_espace;
2621       old_tree = tree;
2622     }
2623   else
2624     old_tree = NULL;
2625
2626   if (elem->token.type == SUBEXP)
2627     {
2628       uintptr_t subidx = elem->token.opr.idx;
2629       postorder (elem, mark_opt_subexp, (void *) subidx);
2630     }
2631
2632   tree = create_tree (dfa, elem, NULL,
2633                       (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2634   if (__glibc_unlikely (tree == NULL))
2635     goto parse_dup_op_espace;
2636
2637   /* This loop is actually executed only when end != -1,
2638      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2639      already created the start+1-th copy.  */
2640   if (TYPE_SIGNED (Idx) || end != -1)
2641     for (i = start + 2; i <= end; ++i)
2642       {
2643         elem = duplicate_tree (elem, dfa);
2644         tree = create_tree (dfa, tree, elem, CONCAT);
2645         if (__glibc_unlikely (elem == NULL || tree == NULL))
2646           goto parse_dup_op_espace;
2647
2648         tree = create_tree (dfa, tree, NULL, OP_ALT);
2649         if (__glibc_unlikely (tree == NULL))
2650           goto parse_dup_op_espace;
2651       }
2652
2653   if (old_tree)
2654     tree = create_tree (dfa, old_tree, tree, CONCAT);
2655
2656   return tree;
2657
2658  parse_dup_op_espace:
2659   *err = REG_ESPACE;
2660   return NULL;
2661 }
2662
2663 /* Size of the names for collating symbol/equivalence_class/character_class.
2664    I'm not sure, but maybe enough.  */
2665 #define BRACKET_NAME_BUF_SIZE 32
2666
2667 #ifndef _LIBC
2668
2669 # ifdef RE_ENABLE_I18N
2670 /* Convert the byte B to the corresponding wide character.  In a
2671    unibyte locale, treat B as itself.  In a multibyte locale, return
2672    WEOF if B is an encoding error.  */
2673 static wint_t
2674 parse_byte (unsigned char b, re_charset_t *mbcset)
2675 {
2676   return mbcset == NULL ? b : __btowc (b);
2677 }
2678 # endif
2679
2680   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2681      Build the range expression which starts from START_ELEM, and ends
2682      at END_ELEM.  The result are written to MBCSET and SBCSET.
2683      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2684      mbcset->range_ends, is a pointer argument since we may
2685      update it.  */
2686
2687 static reg_errcode_t
2688 # ifdef RE_ENABLE_I18N
2689 build_range_exp (const reg_syntax_t syntax,
2690                  bitset_t sbcset,
2691                  re_charset_t *mbcset,
2692                  Idx *range_alloc,
2693                  const bracket_elem_t *start_elem,
2694                  const bracket_elem_t *end_elem)
2695 # else /* not RE_ENABLE_I18N */
2696 build_range_exp (const reg_syntax_t syntax,
2697                  bitset_t sbcset,
2698                  const bracket_elem_t *start_elem,
2699                  const bracket_elem_t *end_elem)
2700 # endif /* not RE_ENABLE_I18N */
2701 {
2702   unsigned int start_ch, end_ch;
2703   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2704   if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2705                         || start_elem->type == CHAR_CLASS
2706                         || end_elem->type == EQUIV_CLASS
2707                         || end_elem->type == CHAR_CLASS))
2708     return REG_ERANGE;
2709
2710   /* We can handle no multi character collating elements without libc
2711      support.  */
2712   if (__glibc_unlikely ((start_elem->type == COLL_SYM
2713                          && strlen ((char *) start_elem->opr.name) > 1)
2714                         || (end_elem->type == COLL_SYM
2715                             && strlen ((char *) end_elem->opr.name) > 1)))
2716     return REG_ECOLLATE;
2717
2718 # ifdef RE_ENABLE_I18N
2719   {
2720     wchar_t wc;
2721     wint_t start_wc;
2722     wint_t end_wc;
2723
2724     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2725                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2726                    : 0));
2727     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2728               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2729                  : 0));
2730     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2731                 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2732     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2733               ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2734     if (start_wc == WEOF || end_wc == WEOF)
2735       return REG_ECOLLATE;
2736     else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2737                                && start_wc > end_wc))
2738       return REG_ERANGE;
2739
2740     /* Got valid collation sequence values, add them as a new entry.
2741        However, for !_LIBC we have no collation elements: if the
2742        character set is single byte, the single byte character set
2743        that we build below suffices.  parse_bracket_exp passes
2744        no MBCSET if dfa->mb_cur_max == 1.  */
2745     if (mbcset)
2746       {
2747         /* Check the space of the arrays.  */
2748         if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2749           {
2750             /* There is not enough space, need realloc.  */
2751             wchar_t *new_array_start, *new_array_end;
2752             Idx new_nranges;
2753
2754             /* +1 in case of mbcset->nranges is 0.  */
2755             new_nranges = 2 * mbcset->nranges + 1;
2756             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2757                are NULL if *range_alloc == 0.  */
2758             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2759                                           new_nranges);
2760             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2761                                         new_nranges);
2762
2763             if (__glibc_unlikely (new_array_start == NULL
2764                                   || new_array_end == NULL))
2765               {
2766                 re_free (new_array_start);
2767                 re_free (new_array_end);
2768                 return REG_ESPACE;
2769               }
2770
2771             mbcset->range_starts = new_array_start;
2772             mbcset->range_ends = new_array_end;
2773             *range_alloc = new_nranges;
2774           }
2775
2776         mbcset->range_starts[mbcset->nranges] = start_wc;
2777         mbcset->range_ends[mbcset->nranges++] = end_wc;
2778       }
2779
2780     /* Build the table for single byte characters.  */
2781     for (wc = 0; wc < SBC_MAX; ++wc)
2782       {
2783         if (start_wc <= wc && wc <= end_wc)
2784           bitset_set (sbcset, wc);
2785       }
2786   }
2787 # else /* not RE_ENABLE_I18N */
2788   {
2789     unsigned int ch;
2790     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2791                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2792                    : 0));
2793     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2794               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2795                  : 0));
2796     if (start_ch > end_ch)
2797       return REG_ERANGE;
2798     /* Build the table for single byte characters.  */
2799     for (ch = 0; ch < SBC_MAX; ++ch)
2800       if (start_ch <= ch  && ch <= end_ch)
2801         bitset_set (sbcset, ch);
2802   }
2803 # endif /* not RE_ENABLE_I18N */
2804   return REG_NOERROR;
2805 }
2806 #endif /* not _LIBC */
2807
2808 #ifndef _LIBC
2809 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2810    Build the collating element which is represented by NAME.
2811    The result are written to MBCSET and SBCSET.
2812    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2813    pointer argument since we may update it.  */
2814
2815 static reg_errcode_t
2816 # ifdef RE_ENABLE_I18N
2817 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2818                         Idx *coll_sym_alloc, const unsigned char *name)
2819 # else /* not RE_ENABLE_I18N */
2820 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2821 # endif /* not RE_ENABLE_I18N */
2822 {
2823   size_t name_len = strlen ((const char *) name);
2824   if (__glibc_unlikely (name_len != 1))
2825     return REG_ECOLLATE;
2826   else
2827     {
2828       bitset_set (sbcset, name[0]);
2829       return REG_NOERROR;
2830     }
2831 }
2832 #endif /* not _LIBC */
2833
2834 /* This function parse bracket expression like "[abc]", "[a-c]",
2835    "[[.a-a.]]" etc.  */
2836
2837 static bin_tree_t *
2838 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2839                    reg_syntax_t syntax, reg_errcode_t *err)
2840 {
2841 #ifdef _LIBC
2842   const unsigned char *collseqmb;
2843   const char *collseqwc;
2844   uint32_t nrules;
2845   int32_t table_size;
2846   const int32_t *symb_table;
2847   const unsigned char *extra;
2848
2849   /* Local function for parse_bracket_exp used in _LIBC environment.
2850      Seek the collating symbol entry corresponding to NAME.
2851      Return the index of the symbol in the SYMB_TABLE,
2852      or -1 if not found.  */
2853
2854   auto inline int32_t
2855   __attribute__ ((always_inline))
2856   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2857     {
2858       int32_t elem;
2859
2860       for (elem = 0; elem < table_size; elem++)
2861         if (symb_table[2 * elem] != 0)
2862           {
2863             int32_t idx = symb_table[2 * elem + 1];
2864             /* Skip the name of collating element name.  */
2865             idx += 1 + extra[idx];
2866             if (/* Compare the length of the name.  */
2867                 name_len == extra[idx]
2868                 /* Compare the name.  */
2869                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2870               /* Yep, this is the entry.  */
2871               return elem;
2872           }
2873       return -1;
2874     }
2875
2876   /* Local function for parse_bracket_exp used in _LIBC environment.
2877      Look up the collation sequence value of BR_ELEM.
2878      Return the value if succeeded, UINT_MAX otherwise.  */
2879
2880   auto inline unsigned int
2881   __attribute__ ((always_inline))
2882   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2883     {
2884       if (br_elem->type == SB_CHAR)
2885         {
2886           /*
2887           if (MB_CUR_MAX == 1)
2888           */
2889           if (nrules == 0)
2890             return collseqmb[br_elem->opr.ch];
2891           else
2892             {
2893               wint_t wc = __btowc (br_elem->opr.ch);
2894               return __collseq_table_lookup (collseqwc, wc);
2895             }
2896         }
2897       else if (br_elem->type == MB_CHAR)
2898         {
2899           if (nrules != 0)
2900             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2901         }
2902       else if (br_elem->type == COLL_SYM)
2903         {
2904           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2905           if (nrules != 0)
2906             {
2907               int32_t elem, idx;
2908               elem = seek_collating_symbol_entry (br_elem->opr.name,
2909                                                   sym_name_len);
2910               if (elem != -1)
2911                 {
2912                   /* We found the entry.  */
2913                   idx = symb_table[2 * elem + 1];
2914                   /* Skip the name of collating element name.  */
2915                   idx += 1 + extra[idx];
2916                   /* Skip the byte sequence of the collating element.  */
2917                   idx += 1 + extra[idx];
2918                   /* Adjust for the alignment.  */
2919                   idx = (idx + 3) & ~3;
2920                   /* Skip the multibyte collation sequence value.  */
2921                   idx += sizeof (unsigned int);
2922                   /* Skip the wide char sequence of the collating element.  */
2923                   idx += sizeof (unsigned int) *
2924                     (1 + *(unsigned int *) (extra + idx));
2925                   /* Return the collation sequence value.  */
2926                   return *(unsigned int *) (extra + idx);
2927                 }
2928               else if (sym_name_len == 1)
2929                 {
2930                   /* No valid character.  Match it as a single byte
2931                      character.  */
2932                   return collseqmb[br_elem->opr.name[0]];
2933                 }
2934             }
2935           else if (sym_name_len == 1)
2936             return collseqmb[br_elem->opr.name[0]];
2937         }
2938       return UINT_MAX;
2939     }
2940
2941   /* Local function for parse_bracket_exp used in _LIBC environment.
2942      Build the range expression which starts from START_ELEM, and ends
2943      at END_ELEM.  The result are written to MBCSET and SBCSET.
2944      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2945      mbcset->range_ends, is a pointer argument since we may
2946      update it.  */
2947
2948   auto inline reg_errcode_t
2949   __attribute__ ((always_inline))
2950   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2951                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2952     {
2953       unsigned int ch;
2954       uint32_t start_collseq;
2955       uint32_t end_collseq;
2956
2957       /* Equivalence Classes and Character Classes can't be a range
2958          start/end.  */
2959       if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2960                             || start_elem->type == CHAR_CLASS
2961                             || end_elem->type == EQUIV_CLASS
2962                             || end_elem->type == CHAR_CLASS))
2963         return REG_ERANGE;
2964
2965       /* FIXME: Implement rational ranges here, too.  */
2966       start_collseq = lookup_collation_sequence_value (start_elem);
2967       end_collseq = lookup_collation_sequence_value (end_elem);
2968       /* Check start/end collation sequence values.  */
2969       if (__glibc_unlikely (start_collseq == UINT_MAX
2970                             || end_collseq == UINT_MAX))
2971         return REG_ECOLLATE;
2972       if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2973                             && start_collseq > end_collseq))
2974         return REG_ERANGE;
2975
2976       /* Got valid collation sequence values, add them as a new entry.
2977          However, if we have no collation elements, and the character set
2978          is single byte, the single byte character set that we
2979          build below suffices. */
2980       if (nrules > 0 || dfa->mb_cur_max > 1)
2981         {
2982           /* Check the space of the arrays.  */
2983           if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2984             {
2985               /* There is not enough space, need realloc.  */
2986               uint32_t *new_array_start;
2987               uint32_t *new_array_end;
2988               Idx new_nranges;
2989
2990               /* +1 in case of mbcset->nranges is 0.  */
2991               new_nranges = 2 * mbcset->nranges + 1;
2992               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2993                                             new_nranges);
2994               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2995                                           new_nranges);
2996
2997               if (__glibc_unlikely (new_array_start == NULL
2998                                     || new_array_end == NULL))
2999                 return REG_ESPACE;
3000
3001               mbcset->range_starts = new_array_start;
3002               mbcset->range_ends = new_array_end;
3003               *range_alloc = new_nranges;
3004             }
3005
3006           mbcset->range_starts[mbcset->nranges] = start_collseq;
3007           mbcset->range_ends[mbcset->nranges++] = end_collseq;
3008         }
3009
3010       /* Build the table for single byte characters.  */
3011       for (ch = 0; ch < SBC_MAX; ch++)
3012         {
3013           uint32_t ch_collseq;
3014           /*
3015           if (MB_CUR_MAX == 1)
3016           */
3017           if (nrules == 0)
3018             ch_collseq = collseqmb[ch];
3019           else
3020             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3021           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3022             bitset_set (sbcset, ch);
3023         }
3024       return REG_NOERROR;
3025     }
3026
3027   /* Local function for parse_bracket_exp used in _LIBC environment.
3028      Build the collating element which is represented by NAME.
3029      The result are written to MBCSET and SBCSET.
3030      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3031      pointer argument since we may update it.  */
3032
3033   auto inline reg_errcode_t
3034   __attribute__ ((always_inline))
3035   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3036                           Idx *coll_sym_alloc, const unsigned char *name)
3037     {
3038       int32_t elem, idx;
3039       size_t name_len = strlen ((const char *) name);
3040       if (nrules != 0)
3041         {
3042           elem = seek_collating_symbol_entry (name, name_len);
3043           if (elem != -1)
3044             {
3045               /* We found the entry.  */
3046               idx = symb_table[2 * elem + 1];
3047               /* Skip the name of collating element name.  */
3048               idx += 1 + extra[idx];
3049             }
3050           else if (name_len == 1)
3051             {
3052               /* No valid character, treat it as a normal
3053                  character.  */
3054               bitset_set (sbcset, name[0]);
3055               return REG_NOERROR;
3056             }
3057           else
3058             return REG_ECOLLATE;
3059
3060           /* Got valid collation sequence, add it as a new entry.  */
3061           /* Check the space of the arrays.  */
3062           if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3063             {
3064               /* Not enough, realloc it.  */
3065               /* +1 in case of mbcset->ncoll_syms is 0.  */
3066               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3067               /* Use realloc since mbcset->coll_syms is NULL
3068                  if *alloc == 0.  */
3069               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3070                                                    new_coll_sym_alloc);
3071               if (__glibc_unlikely (new_coll_syms == NULL))
3072                 return REG_ESPACE;
3073               mbcset->coll_syms = new_coll_syms;
3074               *coll_sym_alloc = new_coll_sym_alloc;
3075             }
3076           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3077           return REG_NOERROR;
3078         }
3079       else
3080         {
3081           if (__glibc_unlikely (name_len != 1))
3082             return REG_ECOLLATE;
3083           else
3084             {
3085               bitset_set (sbcset, name[0]);
3086               return REG_NOERROR;
3087             }
3088         }
3089     }
3090 #endif
3091
3092   re_token_t br_token;
3093   re_bitset_ptr_t sbcset;
3094 #ifdef RE_ENABLE_I18N
3095   re_charset_t *mbcset;
3096   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3097   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3098 #endif /* not RE_ENABLE_I18N */
3099   bool non_match = false;
3100   bin_tree_t *work_tree;
3101   int token_len;
3102   bool first_round = true;
3103 #ifdef _LIBC
3104   collseqmb = (const unsigned char *)
3105     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3106   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3107   if (nrules)
3108     {
3109       /*
3110       if (MB_CUR_MAX > 1)
3111       */
3112       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3113       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3114       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3115                                                   _NL_COLLATE_SYMB_TABLEMB);
3116       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3117                                                    _NL_COLLATE_SYMB_EXTRAMB);
3118     }
3119 #endif
3120   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3121 #ifdef RE_ENABLE_I18N
3122   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3123 #endif /* RE_ENABLE_I18N */
3124 #ifdef RE_ENABLE_I18N
3125   if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3126 #else
3127   if (__glibc_unlikely (sbcset == NULL))
3128 #endif /* RE_ENABLE_I18N */
3129     {
3130       re_free (sbcset);
3131 #ifdef RE_ENABLE_I18N
3132       re_free (mbcset);
3133 #endif
3134       *err = REG_ESPACE;
3135       return NULL;
3136     }
3137
3138   token_len = peek_token_bracket (token, regexp, syntax);
3139   if (__glibc_unlikely (token->type == END_OF_RE))
3140     {
3141       *err = REG_BADPAT;
3142       goto parse_bracket_exp_free_return;
3143     }
3144   if (token->type == OP_NON_MATCH_LIST)
3145     {
3146 #ifdef RE_ENABLE_I18N
3147       mbcset->non_match = 1;
3148 #endif /* not RE_ENABLE_I18N */
3149       non_match = true;
3150       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3151         bitset_set (sbcset, '\n');
3152       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3153       token_len = peek_token_bracket (token, regexp, syntax);
3154       if (__glibc_unlikely (token->type == END_OF_RE))
3155         {
3156           *err = REG_BADPAT;
3157           goto parse_bracket_exp_free_return;
3158         }
3159     }
3160
3161   /* We treat the first ']' as a normal character.  */
3162   if (token->type == OP_CLOSE_BRACKET)
3163     token->type = CHARACTER;
3164
3165   while (1)
3166     {
3167       bracket_elem_t start_elem, end_elem;
3168       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3169       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3170       reg_errcode_t ret;
3171       int token_len2 = 0;
3172       bool is_range_exp = false;
3173       re_token_t token2;
3174
3175       start_elem.opr.name = start_name_buf;
3176       start_elem.type = COLL_SYM;
3177       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3178                                    syntax, first_round);
3179       if (__glibc_unlikely (ret != REG_NOERROR))
3180         {
3181           *err = ret;
3182           goto parse_bracket_exp_free_return;
3183         }
3184       first_round = false;
3185
3186       /* Get information about the next token.  We need it in any case.  */
3187       token_len = peek_token_bracket (token, regexp, syntax);
3188
3189       /* Do not check for ranges if we know they are not allowed.  */
3190       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3191         {
3192           if (__glibc_unlikely (token->type == END_OF_RE))
3193             {
3194               *err = REG_EBRACK;
3195               goto parse_bracket_exp_free_return;
3196             }
3197           if (token->type == OP_CHARSET_RANGE)
3198             {
3199               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3200               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3201               if (__glibc_unlikely (token2.type == END_OF_RE))
3202                 {
3203                   *err = REG_EBRACK;
3204                   goto parse_bracket_exp_free_return;
3205                 }
3206               if (token2.type == OP_CLOSE_BRACKET)
3207                 {
3208                   /* We treat the last '-' as a normal character.  */
3209                   re_string_skip_bytes (regexp, -token_len);
3210                   token->type = CHARACTER;
3211                 }
3212               else
3213                 is_range_exp = true;
3214             }
3215         }
3216
3217       if (is_range_exp == true)
3218         {
3219           end_elem.opr.name = end_name_buf;
3220           end_elem.type = COLL_SYM;
3221           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3222                                        dfa, syntax, true);
3223           if (__glibc_unlikely (ret != REG_NOERROR))
3224             {
3225               *err = ret;
3226               goto parse_bracket_exp_free_return;
3227             }
3228
3229           token_len = peek_token_bracket (token, regexp, syntax);
3230
3231 #ifdef _LIBC
3232           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3233                                   &start_elem, &end_elem);
3234 #else
3235 # ifdef RE_ENABLE_I18N
3236           *err = build_range_exp (syntax, sbcset,
3237                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3238                                   &range_alloc, &start_elem, &end_elem);
3239 # else
3240           *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3241 # endif
3242 #endif /* RE_ENABLE_I18N */
3243           if (__glibc_unlikely (*err != REG_NOERROR))
3244             goto parse_bracket_exp_free_return;
3245         }
3246       else
3247         {
3248           switch (start_elem.type)
3249             {
3250             case SB_CHAR:
3251               bitset_set (sbcset, start_elem.opr.ch);
3252               break;
3253 #ifdef RE_ENABLE_I18N
3254             case MB_CHAR:
3255               /* Check whether the array has enough space.  */
3256               if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3257                 {
3258                   wchar_t *new_mbchars;
3259                   /* Not enough, realloc it.  */
3260                   /* +1 in case of mbcset->nmbchars is 0.  */
3261                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3262                   /* Use realloc since array is NULL if *alloc == 0.  */
3263                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3264                                             mbchar_alloc);
3265                   if (__glibc_unlikely (new_mbchars == NULL))
3266                     goto parse_bracket_exp_espace;
3267                   mbcset->mbchars = new_mbchars;
3268                 }
3269               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3270               break;
3271 #endif /* RE_ENABLE_I18N */
3272             case EQUIV_CLASS:
3273               *err = build_equiv_class (sbcset,
3274 #ifdef RE_ENABLE_I18N
3275                                         mbcset, &equiv_class_alloc,
3276 #endif /* RE_ENABLE_I18N */
3277                                         start_elem.opr.name);
3278               if (__glibc_unlikely (*err != REG_NOERROR))
3279                 goto parse_bracket_exp_free_return;
3280               break;
3281             case COLL_SYM:
3282               *err = build_collating_symbol (sbcset,
3283 #ifdef RE_ENABLE_I18N
3284                                              mbcset, &coll_sym_alloc,
3285 #endif /* RE_ENABLE_I18N */
3286                                              start_elem.opr.name);
3287               if (__glibc_unlikely (*err != REG_NOERROR))
3288                 goto parse_bracket_exp_free_return;
3289               break;
3290             case CHAR_CLASS:
3291               *err = build_charclass (regexp->trans, sbcset,
3292 #ifdef RE_ENABLE_I18N
3293                                       mbcset, &char_class_alloc,
3294 #endif /* RE_ENABLE_I18N */
3295                                       (const char *) start_elem.opr.name,
3296                                       syntax);
3297               if (__glibc_unlikely (*err != REG_NOERROR))
3298                goto parse_bracket_exp_free_return;
3299               break;
3300             default:
3301               DEBUG_ASSERT (false);
3302               break;
3303             }
3304         }
3305       if (__glibc_unlikely (token->type == END_OF_RE))
3306         {
3307           *err = REG_EBRACK;
3308           goto parse_bracket_exp_free_return;
3309         }
3310       if (token->type == OP_CLOSE_BRACKET)
3311         break;
3312     }
3313
3314   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3315
3316   /* If it is non-matching list.  */
3317   if (non_match)
3318     bitset_not (sbcset);
3319
3320 #ifdef RE_ENABLE_I18N
3321   /* Ensure only single byte characters are set.  */
3322   if (dfa->mb_cur_max > 1)
3323     bitset_mask (sbcset, dfa->sb_char);
3324
3325   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3326       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3327                                                      || mbcset->non_match)))
3328     {
3329       bin_tree_t *mbc_tree;
3330       int sbc_idx;
3331       /* Build a tree for complex bracket.  */
3332       dfa->has_mb_node = 1;
3333       br_token.type = COMPLEX_BRACKET;
3334       br_token.opr.mbcset = mbcset;
3335       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3336       if (__glibc_unlikely (mbc_tree == NULL))
3337         goto parse_bracket_exp_espace;
3338       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3339         if (sbcset[sbc_idx])
3340           break;
3341       /* If there are no bits set in sbcset, there is no point
3342          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3343       if (sbc_idx < BITSET_WORDS)
3344         {
3345           /* Build a tree for simple bracket.  */
3346           br_token.type = SIMPLE_BRACKET;
3347           br_token.opr.sbcset = sbcset;
3348           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3349           if (__glibc_unlikely (work_tree == NULL))
3350             goto parse_bracket_exp_espace;
3351
3352           /* Then join them by ALT node.  */
3353           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3354           if (__glibc_unlikely (work_tree == NULL))
3355             goto parse_bracket_exp_espace;
3356         }
3357       else
3358         {
3359           re_free (sbcset);
3360           work_tree = mbc_tree;
3361         }
3362     }
3363   else
3364 #endif /* not RE_ENABLE_I18N */
3365     {
3366 #ifdef RE_ENABLE_I18N
3367       free_charset (mbcset);
3368 #endif
3369       /* Build a tree for simple bracket.  */
3370       br_token.type = SIMPLE_BRACKET;
3371       br_token.opr.sbcset = sbcset;
3372       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3373       if (__glibc_unlikely (work_tree == NULL))
3374         goto parse_bracket_exp_espace;
3375     }
3376   return work_tree;
3377
3378  parse_bracket_exp_espace:
3379   *err = REG_ESPACE;
3380  parse_bracket_exp_free_return:
3381   re_free (sbcset);
3382 #ifdef RE_ENABLE_I18N
3383   free_charset (mbcset);
3384 #endif /* RE_ENABLE_I18N */
3385   return NULL;
3386 }
3387
3388 /* Parse an element in the bracket expression.  */
3389
3390 static reg_errcode_t
3391 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3392                        re_token_t *token, int token_len, re_dfa_t *dfa,
3393                        reg_syntax_t syntax, bool accept_hyphen)
3394 {
3395 #ifdef RE_ENABLE_I18N
3396   int cur_char_size;
3397   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3398   if (cur_char_size > 1)
3399     {
3400       elem->type = MB_CHAR;
3401       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3402       re_string_skip_bytes (regexp, cur_char_size);
3403       return REG_NOERROR;
3404     }
3405 #endif /* RE_ENABLE_I18N */
3406   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3407   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3408       || token->type == OP_OPEN_EQUIV_CLASS)
3409     return parse_bracket_symbol (elem, regexp, token);
3410   if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3411     {
3412       /* A '-' must only appear as anything but a range indicator before
3413          the closing bracket.  Everything else is an error.  */
3414       re_token_t token2;
3415       (void) peek_token_bracket (&token2, regexp, syntax);
3416       if (token2.type != OP_CLOSE_BRACKET)
3417         /* The actual error value is not standardized since this whole
3418            case is undefined.  But ERANGE makes good sense.  */
3419         return REG_ERANGE;
3420     }
3421   elem->type = SB_CHAR;
3422   elem->opr.ch = token->opr.c;
3423   return REG_NOERROR;
3424 }
3425
3426 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3427    such as [:<character_class>:], [.<collating_element>.], and
3428    [=<equivalent_class>=].  */
3429
3430 static reg_errcode_t
3431 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3432                       re_token_t *token)
3433 {
3434   unsigned char ch, delim = token->opr.c;
3435   int i = 0;
3436   if (re_string_eoi(regexp))
3437     return REG_EBRACK;
3438   for (;; ++i)
3439     {
3440       if (i >= BRACKET_NAME_BUF_SIZE)
3441         return REG_EBRACK;
3442       if (token->type == OP_OPEN_CHAR_CLASS)
3443         ch = re_string_fetch_byte_case (regexp);
3444       else
3445         ch = re_string_fetch_byte (regexp);
3446       if (re_string_eoi(regexp))
3447         return REG_EBRACK;
3448       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3449         break;
3450       elem->opr.name[i] = ch;
3451     }
3452   re_string_skip_bytes (regexp, 1);
3453   elem->opr.name[i] = '\0';
3454   switch (token->type)
3455     {
3456     case OP_OPEN_COLL_ELEM:
3457       elem->type = COLL_SYM;
3458       break;
3459     case OP_OPEN_EQUIV_CLASS:
3460       elem->type = EQUIV_CLASS;
3461       break;
3462     case OP_OPEN_CHAR_CLASS:
3463       elem->type = CHAR_CLASS;
3464       break;
3465     default:
3466       break;
3467     }
3468   return REG_NOERROR;
3469 }
3470
3471   /* Helper function for parse_bracket_exp.
3472      Build the equivalence class which is represented by NAME.
3473      The result are written to MBCSET and SBCSET.
3474      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3475      is a pointer argument since we may update it.  */
3476
3477 static reg_errcode_t
3478 #ifdef RE_ENABLE_I18N
3479 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3480                    Idx *equiv_class_alloc, const unsigned char *name)
3481 #else /* not RE_ENABLE_I18N */
3482 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3483 #endif /* not RE_ENABLE_I18N */
3484 {
3485 #ifdef _LIBC
3486   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3487   if (nrules != 0)
3488     {
3489       const int32_t *table, *indirect;
3490       const unsigned char *weights, *extra, *cp;
3491       unsigned char char_buf[2];
3492       int32_t idx1, idx2;
3493       unsigned int ch;
3494       size_t len;
3495       /* Calculate the index for equivalence class.  */
3496       cp = name;
3497       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3498       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3499                                                _NL_COLLATE_WEIGHTMB);
3500       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3501                                                    _NL_COLLATE_EXTRAMB);
3502       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3503                                                 _NL_COLLATE_INDIRECTMB);
3504       idx1 = findidx (table, indirect, extra, &cp, -1);
3505       if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3506         /* This isn't a valid character.  */
3507         return REG_ECOLLATE;
3508
3509       /* Build single byte matching table for this equivalence class.  */
3510       len = weights[idx1 & 0xffffff];
3511       for (ch = 0; ch < SBC_MAX; ++ch)
3512         {
3513           char_buf[0] = ch;
3514           cp = char_buf;
3515           idx2 = findidx (table, indirect, extra, &cp, 1);
3516 /*
3517           idx2 = table[ch];
3518 */
3519           if (idx2 == 0)
3520             /* This isn't a valid character.  */
3521             continue;
3522           /* Compare only if the length matches and the collation rule
3523              index is the same.  */
3524           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3525               && memcmp (weights + (idx1 & 0xffffff) + 1,
3526                          weights + (idx2 & 0xffffff) + 1, len) == 0)
3527             bitset_set (sbcset, ch);
3528         }
3529       /* Check whether the array has enough space.  */
3530       if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3531         {
3532           /* Not enough, realloc it.  */
3533           /* +1 in case of mbcset->nequiv_classes is 0.  */
3534           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3535           /* Use realloc since the array is NULL if *alloc == 0.  */
3536           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3537                                                    int32_t,
3538                                                    new_equiv_class_alloc);
3539           if (__glibc_unlikely (new_equiv_classes == NULL))
3540             return REG_ESPACE;
3541           mbcset->equiv_classes = new_equiv_classes;
3542           *equiv_class_alloc = new_equiv_class_alloc;
3543         }
3544       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3545     }
3546   else
3547 #endif /* _LIBC */
3548     {
3549       if (__glibc_unlikely (strlen ((const char *) name) != 1))
3550         return REG_ECOLLATE;
3551       bitset_set (sbcset, *name);
3552     }
3553   return REG_NOERROR;
3554 }
3555
3556   /* Helper function for parse_bracket_exp.
3557      Build the character class which is represented by NAME.
3558      The result are written to MBCSET and SBCSET.
3559      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3560      is a pointer argument since we may update it.  */
3561
3562 static reg_errcode_t
3563 #ifdef RE_ENABLE_I18N
3564 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3565                  re_charset_t *mbcset, Idx *char_class_alloc,
3566                  const char *class_name, reg_syntax_t syntax)
3567 #else /* not RE_ENABLE_I18N */
3568 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3569                  const char *class_name, reg_syntax_t syntax)
3570 #endif /* not RE_ENABLE_I18N */
3571 {
3572   int i;
3573   const char *name = class_name;
3574
3575   /* In case of REG_ICASE "upper" and "lower" match the both of
3576      upper and lower cases.  */
3577   if ((syntax & RE_ICASE)
3578       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3579     name = "alpha";
3580
3581 #ifdef RE_ENABLE_I18N
3582   /* Check the space of the arrays.  */
3583   if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3584     {
3585       /* Not enough, realloc it.  */
3586       /* +1 in case of mbcset->nchar_classes is 0.  */
3587       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3588       /* Use realloc since array is NULL if *alloc == 0.  */
3589       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3590                                                new_char_class_alloc);
3591       if (__glibc_unlikely (new_char_classes == NULL))
3592         return REG_ESPACE;
3593       mbcset->char_classes = new_char_classes;
3594       *char_class_alloc = new_char_class_alloc;
3595     }
3596   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3597 #endif /* RE_ENABLE_I18N */
3598
3599 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3600   do {                                          \
3601     if (__glibc_unlikely (trans != NULL))                       \
3602       {                                         \
3603         for (i = 0; i < SBC_MAX; ++i)           \
3604           if (ctype_func (i))                   \
3605             bitset_set (sbcset, trans[i]);      \
3606       }                                         \
3607     else                                        \
3608       {                                         \
3609         for (i = 0; i < SBC_MAX; ++i)           \
3610           if (ctype_func (i))                   \
3611             bitset_set (sbcset, i);             \
3612       }                                         \
3613   } while (0)
3614
3615   if (strcmp (name, "alnum") == 0)
3616     BUILD_CHARCLASS_LOOP (isalnum);
3617   else if (strcmp (name, "cntrl") == 0)
3618     BUILD_CHARCLASS_LOOP (iscntrl);
3619   else if (strcmp (name, "lower") == 0)
3620     BUILD_CHARCLASS_LOOP (islower);
3621   else if (strcmp (name, "space") == 0)
3622     BUILD_CHARCLASS_LOOP (isspace);
3623   else if (strcmp (name, "alpha") == 0)
3624     BUILD_CHARCLASS_LOOP (isalpha);
3625   else if (strcmp (name, "digit") == 0)
3626     BUILD_CHARCLASS_LOOP (isdigit);
3627   else if (strcmp (name, "print") == 0)
3628     BUILD_CHARCLASS_LOOP (isprint);
3629   else if (strcmp (name, "upper") == 0)
3630     BUILD_CHARCLASS_LOOP (isupper);
3631   else if (strcmp (name, "blank") == 0)
3632     BUILD_CHARCLASS_LOOP (isblank);
3633   else if (strcmp (name, "graph") == 0)
3634     BUILD_CHARCLASS_LOOP (isgraph);
3635   else if (strcmp (name, "punct") == 0)
3636     BUILD_CHARCLASS_LOOP (ispunct);
3637   else if (strcmp (name, "xdigit") == 0)
3638     BUILD_CHARCLASS_LOOP (isxdigit);
3639   else
3640     return REG_ECTYPE;
3641
3642   return REG_NOERROR;
3643 }
3644
3645 static bin_tree_t *
3646 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3647                     const char *class_name,
3648                     const char *extra, bool non_match,
3649                     reg_errcode_t *err)
3650 {
3651   re_bitset_ptr_t sbcset;
3652 #ifdef RE_ENABLE_I18N
3653   re_charset_t *mbcset;
3654   Idx alloc = 0;
3655 #endif /* not RE_ENABLE_I18N */
3656   reg_errcode_t ret;
3657   bin_tree_t *tree;
3658
3659   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3660   if (__glibc_unlikely (sbcset == NULL))
3661     {
3662       *err = REG_ESPACE;
3663       return NULL;
3664     }
3665 #ifdef RE_ENABLE_I18N
3666   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3667   if (__glibc_unlikely (mbcset == NULL))
3668     {
3669       re_free (sbcset);
3670       *err = REG_ESPACE;
3671       return NULL;
3672     }
3673   mbcset->non_match = non_match;
3674 #endif /* RE_ENABLE_I18N */
3675
3676   /* We don't care the syntax in this case.  */
3677   ret = build_charclass (trans, sbcset,
3678 #ifdef RE_ENABLE_I18N
3679                          mbcset, &alloc,
3680 #endif /* RE_ENABLE_I18N */
3681                          class_name, 0);
3682
3683   if (__glibc_unlikely (ret != REG_NOERROR))
3684     {
3685       re_free (sbcset);
3686 #ifdef RE_ENABLE_I18N
3687       free_charset (mbcset);
3688 #endif /* RE_ENABLE_I18N */
3689       *err = ret;
3690       return NULL;
3691     }
3692   /* \w match '_' also.  */
3693   for (; *extra; extra++)
3694     bitset_set (sbcset, *extra);
3695
3696   /* If it is non-matching list.  */
3697   if (non_match)
3698     bitset_not (sbcset);
3699
3700 #ifdef RE_ENABLE_I18N
3701   /* Ensure only single byte characters are set.  */
3702   if (dfa->mb_cur_max > 1)
3703     bitset_mask (sbcset, dfa->sb_char);
3704 #endif
3705
3706   /* Build a tree for simple bracket.  */
3707   re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3708   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3709   if (__glibc_unlikely (tree == NULL))
3710     goto build_word_op_espace;
3711
3712 #ifdef RE_ENABLE_I18N
3713   if (dfa->mb_cur_max > 1)
3714     {
3715       bin_tree_t *mbc_tree;
3716       /* Build a tree for complex bracket.  */
3717       br_token.type = COMPLEX_BRACKET;
3718       br_token.opr.mbcset = mbcset;
3719       dfa->has_mb_node = 1;
3720       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3721       if (__glibc_unlikely (mbc_tree == NULL))
3722         goto build_word_op_espace;
3723       /* Then join them by ALT node.  */
3724       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3725       if (__glibc_likely (mbc_tree != NULL))
3726         return tree;
3727     }
3728   else
3729     {
3730       free_charset (mbcset);
3731       return tree;
3732     }
3733 #else /* not RE_ENABLE_I18N */
3734   return tree;
3735 #endif /* not RE_ENABLE_I18N */
3736
3737  build_word_op_espace:
3738   re_free (sbcset);
3739 #ifdef RE_ENABLE_I18N
3740   free_charset (mbcset);
3741 #endif /* RE_ENABLE_I18N */
3742   *err = REG_ESPACE;
3743   return NULL;
3744 }
3745
3746 /* This is intended for the expressions like "a{1,3}".
3747    Fetch a number from 'input', and return the number.
3748    Return -1 if the number field is empty like "{,1}".
3749    Return RE_DUP_MAX + 1 if the number field is too large.
3750    Return -2 if an error occurred.  */
3751
3752 static Idx
3753 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3754 {
3755   Idx num = -1;
3756   unsigned char c;
3757   while (1)
3758     {
3759       fetch_token (token, input, syntax);
3760       c = token->opr.c;
3761       if (__glibc_unlikely (token->type == END_OF_RE))
3762         return -2;
3763       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3764         break;
3765       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3766              ? -2
3767              : num == -1
3768              ? c - '0'
3769              : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3770     }
3771   return num;
3772 }
3773 \f
3774 #ifdef RE_ENABLE_I18N
3775 static void
3776 free_charset (re_charset_t *cset)
3777 {
3778   re_free (cset->mbchars);
3779 # ifdef _LIBC
3780   re_free (cset->coll_syms);
3781   re_free (cset->equiv_classes);
3782 # endif
3783   re_free (cset->range_starts);
3784   re_free (cset->range_ends);
3785   re_free (cset->char_classes);
3786   re_free (cset);
3787 }
3788 #endif /* RE_ENABLE_I18N */
3789 \f
3790 /* Functions for binary tree operation.  */
3791
3792 /* Create a tree node.  */
3793
3794 static bin_tree_t *
3795 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3796              re_token_type_t type)
3797 {
3798   re_token_t t = { .type = type };
3799   return create_token_tree (dfa, left, right, &t);
3800 }
3801
3802 static bin_tree_t *
3803 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3804                    const re_token_t *token)
3805 {
3806   bin_tree_t *tree;
3807   if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3808     {
3809       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3810
3811       if (storage == NULL)
3812         return NULL;
3813       storage->next = dfa->str_tree_storage;
3814       dfa->str_tree_storage = storage;
3815       dfa->str_tree_storage_idx = 0;
3816     }
3817   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3818
3819   tree->parent = NULL;
3820   tree->left = left;
3821   tree->right = right;
3822   tree->token = *token;
3823   tree->token.duplicated = 0;
3824   tree->token.opt_subexp = 0;
3825   tree->first = NULL;
3826   tree->next = NULL;
3827   tree->node_idx = -1;
3828
3829   if (left != NULL)
3830     left->parent = tree;
3831   if (right != NULL)
3832     right->parent = tree;
3833   return tree;
3834 }
3835
3836 /* Mark the tree SRC as an optional subexpression.
3837    To be called from preorder or postorder.  */
3838
3839 static reg_errcode_t
3840 mark_opt_subexp (void *extra, bin_tree_t *node)
3841 {
3842   Idx idx = (uintptr_t) extra;
3843   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3844     node->token.opt_subexp = 1;
3845
3846   return REG_NOERROR;
3847 }
3848
3849 /* Free the allocated memory inside NODE. */
3850
3851 static void
3852 free_token (re_token_t *node)
3853 {
3854 #ifdef RE_ENABLE_I18N
3855   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3856     free_charset (node->opr.mbcset);
3857   else
3858 #endif /* RE_ENABLE_I18N */
3859     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3860       re_free (node->opr.sbcset);
3861 }
3862
3863 /* Worker function for tree walking.  Free the allocated memory inside NODE
3864    and its children. */
3865
3866 static reg_errcode_t
3867 free_tree (void *extra, bin_tree_t *node)
3868 {
3869   free_token (&node->token);
3870   return REG_NOERROR;
3871 }
3872
3873
3874 /* Duplicate the node SRC, and return new node.  This is a preorder
3875    visit similar to the one implemented by the generic visitor, but
3876    we need more infrastructure to maintain two parallel trees --- so,
3877    it's easier to duplicate.  */
3878
3879 static bin_tree_t *
3880 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3881 {
3882   const bin_tree_t *node;
3883   bin_tree_t *dup_root;
3884   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3885
3886   for (node = root; ; )
3887     {
3888       /* Create a new tree and link it back to the current parent.  */
3889       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3890       if (*p_new == NULL)
3891         return NULL;
3892       (*p_new)->parent = dup_node;
3893       (*p_new)->token.duplicated = 1;
3894       dup_node = *p_new;
3895
3896       /* Go to the left node, or up and to the right.  */
3897       if (node->left)
3898         {
3899           node = node->left;
3900           p_new = &dup_node->left;
3901         }
3902       else
3903         {
3904           const bin_tree_t *prev = NULL;
3905           while (node->right == prev || node->right == NULL)
3906             {
3907               prev = node;
3908               node = node->parent;
3909               dup_node = dup_node->parent;
3910               if (!node)
3911                 return dup_root;
3912             }
3913           node = node->right;
3914           p_new = &dup_node->right;
3915         }
3916     }
3917 }