lib/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2021 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  21                                      Idx n);
  22 static void match_ctx_clean (re_match_context_t *mctx);
  23 static void match_ctx_free (re_match_context_t *cache);
  24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
  25                                           Idx str_idx, Idx from, Idx to);
  26 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
  27 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
  28                                            Idx str_idx);
  29 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  30                                                     Idx node, Idx str_idx);
  31 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  32                            re_dfastate_t **limited_sts, Idx last_node,
  33                            Idx last_str_idx);
  34 static reg_errcode_t re_search_internal (const regex_t *preg,
  35                                          const char *string, Idx length,
  36                                          Idx start, Idx last_start, Idx stop,
  37                                          size_t nmatch, regmatch_t pmatch[],
  38                                          int eflags);
  39 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
  40                                   const char *string1, Idx length1,
  41                                   const char *string2, Idx length2,
  42                                   Idx start, regoff_t range,
  43                                   struct re_registers *regs,
  44                                   Idx stop, bool ret_len);
  45 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
  46                                 const char *string, Idx length, Idx start,
  47                                 regoff_t range, Idx stop,
  48                                 struct re_registers *regs,
  49                                 bool ret_len);
  50 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  51                               Idx nregs, int regs_allocated);
  52 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
  53 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
  54                            Idx *p_match_first);
  55 static Idx check_halt_state_context (const re_match_context_t *mctx,
  56                                      const re_dfastate_t *state, Idx idx);
  57 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  58                          regmatch_t *prev_idx_match, Idx cur_node,
  59                          Idx cur_idx, Idx nmatch);
  60 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  61                                       Idx str_idx, Idx dest_node, Idx nregs,
  62                                       regmatch_t *regs, regmatch_t *prevregs,
  63                                       re_node_set *eps_via_nodes);
  64 static reg_errcode_t set_regs (const regex_t *preg,
  65                                const re_match_context_t *mctx,
  66                                size_t nmatch, regmatch_t *pmatch,
  67                                bool fl_backtrack);
  68 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
  69
  70 #ifdef RE_ENABLE_I18N
  71 static int sift_states_iter_mb (const re_match_context_t *mctx,
  72                                 re_sift_context_t *sctx,
  73                                 Idx node_idx, Idx str_idx, Idx max_str_idx);
  74 #endif /* RE_ENABLE_I18N */
  75 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  76                                            re_sift_context_t *sctx);
  77 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  78                                           re_sift_context_t *sctx, Idx str_idx,
  79                                           re_node_set *cur_dest);
  80 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  81                                               re_sift_context_t *sctx,
  82                                               Idx str_idx,
  83                                               re_node_set *dest_nodes);
  84 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  85                                             re_node_set *dest_nodes,
  86                                             const re_node_set *candidates);
  87 static bool check_dst_limits (const re_match_context_t *mctx,
  88                               const re_node_set *limits,
  89                               Idx dst_node, Idx dst_idx, Idx src_node,
  90                               Idx src_idx);
  91 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
  92                                         int boundaries, Idx subexp_idx,
  93                                         Idx from_node, Idx bkref_idx);
  94 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
  95                                       Idx limit, Idx subexp_idx,
  96                                       Idx node, Idx str_idx,
  97                                       Idx bkref_idx);
  98 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
  99                                           re_node_set *dest_nodes,
 100                                           const re_node_set *candidates,
 101                                           re_node_set *limits,
 102                                           struct re_backref_cache_entry *bkref_ents,
 103                                           Idx str_idx);
 104 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 105                                         re_sift_context_t *sctx,
 106                                         Idx str_idx, const re_node_set *candidates);
 107 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 108                                         re_dfastate_t **dst,
 109                                         re_dfastate_t **src, Idx num);
 110 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 111                                          re_match_context_t *mctx);
 112 static re_dfastate_t *transit_state (reg_errcode_t *err,
 113                                      re_match_context_t *mctx,
 114                                      re_dfastate_t *state);
 115 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 116                                             re_match_context_t *mctx,
 117                                             re_dfastate_t *next_state);
 118 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 119                                                 re_node_set *cur_nodes,
 120                                                 Idx str_idx);
 121 #if 0
 122 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 123                                         re_match_context_t *mctx,
 124                                         re_dfastate_t *pstate);
 125 #endif
 126 #ifdef RE_ENABLE_I18N
 127 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 128                                        re_dfastate_t *pstate);
 129 #endif /* RE_ENABLE_I18N */
 130 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 131                                           const re_node_set *nodes);
 132 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 133                                  Idx bkref_node, Idx bkref_str_idx);
 134 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 135                                      const re_sub_match_top_t *sub_top,
 136                                      re_sub_match_last_t *sub_last,
 137                                      Idx bkref_node, Idx bkref_str);
 138 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 139                              Idx subexp_idx, int type);
 140 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 141                                     state_array_t *path, Idx top_node,
 142                                     Idx top_str, Idx last_node, Idx last_str,
 143                                     int type);
 144 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 145                                                    Idx str_idx,
 146                                                    re_node_set *cur_nodes,
 147                                                    re_node_set *next_nodes);
 148 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 149                                                re_node_set *cur_nodes,
 150                                                Idx ex_subexp, int type);
 151 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 152                                                    re_node_set *dst_nodes,
 153                                                    Idx target, Idx ex_subexp,
 154                                                    int type);
 155 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 156                                          re_node_set *cur_nodes, Idx cur_str,
 157                                          Idx subexp_num, int type);
 158 static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
 159 #ifdef RE_ENABLE_I18N
 160 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
 161                                     const re_string_t *input, Idx idx);
 162 # ifdef _LIBC
 163 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 164                                                    size_t name_len);
 165 # endif /* _LIBC */
 166 #endif /* RE_ENABLE_I18N */
 167 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
 168                                        const re_dfastate_t *state,
 169                                        re_node_set *states_node,
 170                                        bitset_t *states_ch);
 171 static bool check_node_accept (const re_match_context_t *mctx,
 172                                const re_token_t *node, Idx idx);
 173 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
 174 \f
 175 /* Entry point for POSIX code.  */
 176
 177 /* regexec searches for a given pattern, specified by PREG, in the
 178    string STRING.
 179
 180    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 181    'regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 182    least NMATCH elements, and we set them to the offsets of the
 183    corresponding matched substrings.
 184
 185    EFLAGS specifies "execution flags" which affect matching: if
 186    REG_NOTBOL is set, then ^ does not match at the beginning of the
 187    string; if REG_NOTEOL is set, then $ does not match at the end.
 188
 189    Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
 190    EFLAGS is invalid.  */
 191
 192 int
 193 regexec (const regex_t *__restrict preg, const char *__restrict string,
 194          size_t nmatch, regmatch_t pmatch[], int eflags)
 195 {
 196   reg_errcode_t err;
 197   Idx start, length;
 198   re_dfa_t *dfa = preg->buffer;
 199
 200   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 201     return REG_BADPAT;
 202
 203   if (eflags & REG_STARTEND)
 204     {
 205       start = pmatch[0].rm_so;
 206       length = pmatch[0].rm_eo;
 207     }
 208   else
 209     {
 210       start = 0;
 211       length = strlen (string);
 212     }
 213
 214   lock_lock (dfa->lock);
 215   if (preg->no_sub)
 216     err = re_search_internal (preg, string, length, start, length,
 217                               length, 0, NULL, eflags);
 218   else
 219     err = re_search_internal (preg, string, length, start, length,
 220                               length, nmatch, pmatch, eflags);
 221   lock_unlock (dfa->lock);
 222   return err != REG_NOERROR;
 223 }
 224
 225 #ifdef _LIBC
 226 libc_hidden_def (__regexec)
 227
 228 # include <shlib-compat.h>
 229 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 230
 231 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 232 __typeof__ (__regexec) __compat_regexec;
 233
 234 int
 235 attribute_compat_text_section
 236 __compat_regexec (const regex_t *__restrict preg,
 237                   const char *__restrict string, size_t nmatch,
 238                   regmatch_t pmatch[], int eflags)
 239 {
 240   return regexec (preg, string, nmatch, pmatch,
 241                   eflags & (REG_NOTBOL | REG_NOTEOL));
 242 }
 243 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 244 # endif
 245 #endif
 246
 247 /* Entry points for GNU code.  */
 248
 249 /* re_match, re_search, re_match_2, re_search_2
 250
 251    The former two functions operate on STRING with length LENGTH,
 252    while the later two operate on concatenation of STRING1 and STRING2
 253    with lengths LENGTH1 and LENGTH2, respectively.
 254
 255    re_match() matches the compiled pattern in BUFP against the string,
 256    starting at index START.
 257
 258    re_search() first tries matching at index START, then it tries to match
 259    starting from index START + 1, and so on.  The last start position tried
 260    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 261    way as re_match().)
 262
 263    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 264    the first STOP characters of the concatenation of the strings should be
 265    concerned.
 266
 267    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 268    and all groups is stored in REGS.  (For the "_2" variants, the offsets are
 269    computed relative to the concatenation, not relative to the individual
 270    strings.)
 271
 272    On success, re_match* functions return the length of the match, re_search*
 273    return the position of the start of the match.  They return -1 on
 274    match failure, -2 on error.  */
 275
 276 regoff_t
 277 re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
 278           Idx start, struct re_registers *regs)
 279 {
 280   return re_search_stub (bufp, string, length, start, 0, length, regs, true);
 281 }
 282 #ifdef _LIBC
 283 weak_alias (__re_match, re_match)
 284 #endif
 285
 286 regoff_t
 287 re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
 288            Idx start, regoff_t range, struct re_registers *regs)
 289 {
 290   return re_search_stub (bufp, string, length, start, range, length, regs,
 291                          false);
 292 }
 293 #ifdef _LIBC
 294 weak_alias (__re_search, re_search)
 295 #endif
 296
 297 regoff_t
 298 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
 299             const char *string2, Idx length2, Idx start,
 300             struct re_registers *regs, Idx stop)
 301 {
 302   return re_search_2_stub (bufp, string1, length1, string2, length2,
 303                            start, 0, regs, stop, true);
 304 }
 305 #ifdef _LIBC
 306 weak_alias (__re_match_2, re_match_2)
 307 #endif
 308
 309 regoff_t
 310 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
 311              const char *string2, Idx length2, Idx start, regoff_t range,
 312              struct re_registers *regs, Idx stop)
 313 {
 314   return re_search_2_stub (bufp, string1, length1, string2, length2,
 315                            start, range, regs, stop, false);
 316 }
 317 #ifdef _LIBC
 318 weak_alias (__re_search_2, re_search_2)
 319 #endif
 320
 321 static regoff_t
 322 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
 323                   Idx length1, const char *string2, Idx length2, Idx start,
 324                   regoff_t range, struct re_registers *regs,
 325                   Idx stop, bool ret_len)
 326 {
 327   const char *str;
 328   regoff_t rval;
 329   Idx len;
 330   char *s = NULL;
 331
 332   if (__glibc_unlikely ((length1 < 0 || length2 < 0 || stop < 0
 333                          || INT_ADD_WRAPV (length1, length2, &len))))
 334     return -2;
 335
 336   /* Concatenate the strings.  */
 337   if (length2 > 0)
 338     if (length1 > 0)
 339       {
 340         s = re_malloc (char, len);
 341
 342         if (__glibc_unlikely (s == NULL))
 343           return -2;
 344 #ifdef _LIBC
 345         memcpy (__mempcpy (s, string1, length1), string2, length2);
 346 #else
 347         memcpy (s, string1, length1);
 348         memcpy (s + length1, string2, length2);
 349 #endif
 350         str = s;
 351       }
 352     else
 353       str = string2;
 354   else
 355     str = string1;
 356
 357   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 358                          ret_len);
 359   re_free (s);
 360   return rval;
 361 }
 362
 363 /* The parameters have the same meaning as those of re_search.
 364    Additional parameters:
 365    If RET_LEN is true the length of the match is returned (re_match style);
 366    otherwise the position of the match is returned.  */
 367
 368 static regoff_t
 369 re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
 370                 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
 371                 bool ret_len)
 372 {
 373   reg_errcode_t result;
 374   regmatch_t *pmatch;
 375   Idx nregs;
 376   regoff_t rval;
 377   int eflags = 0;
 378   re_dfa_t *dfa = bufp->buffer;
 379   Idx last_start = start + range;
 380
 381   /* Check for out-of-range.  */
 382   if (__glibc_unlikely (start < 0 || start > length))
 383     return -1;
 384   if (__glibc_unlikely (length < last_start
 385                         || (0 <= range && last_start < start)))
 386     last_start = length;
 387   else if (__glibc_unlikely (last_start < 0
 388                              || (range < 0 && start <= last_start)))
 389     last_start = 0;
 390
 391   lock_lock (dfa->lock);
 392
 393   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 394   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 395
 396   /* Compile fastmap if we haven't yet.  */
 397   if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 398     re_compile_fastmap (bufp);
 399
 400   if (__glibc_unlikely (bufp->no_sub))
 401     regs = NULL;
 402
 403   /* We need at least 1 register.  */
 404   if (regs == NULL)
 405     nregs = 1;
 406   else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
 407                              && regs->num_regs <= bufp->re_nsub))
 408     {
 409       nregs = regs->num_regs;
 410       if (__glibc_unlikely (nregs < 1))
 411         {
 412           /* Nothing can be copied to regs.  */
 413           regs = NULL;
 414           nregs = 1;
 415         }
 416     }
 417   else
 418     nregs = bufp->re_nsub + 1;
 419   pmatch = re_malloc (regmatch_t, nregs);
 420   if (__glibc_unlikely (pmatch == NULL))
 421     {
 422       rval = -2;
 423       goto out;
 424     }
 425
 426   result = re_search_internal (bufp, string, length, start, last_start, stop,
 427                                nregs, pmatch, eflags);
 428
 429   rval = 0;
 430
 431   /* I hope we needn't fill their regs with -1's when no match was found.  */
 432   if (result != REG_NOERROR)
 433     rval = result == REG_NOMATCH ? -1 : -2;
 434   else if (regs != NULL)
 435     {
 436       /* If caller wants register contents data back, copy them.  */
 437       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 438                                            bufp->regs_allocated);
 439       if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
 440         rval = -2;
 441     }
 442
 443   if (__glibc_likely (rval == 0))
 444     {
 445       if (ret_len)
 446         {
 447           DEBUG_ASSERT (pmatch[0].rm_so == start);
 448           rval = pmatch[0].rm_eo - start;
 449         }
 450       else
 451         rval = pmatch[0].rm_so;
 452     }
 453   re_free (pmatch);
 454  out:
 455   lock_unlock (dfa->lock);
 456   return rval;
 457 }
 458
 459 static unsigned
 460 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
 461               int regs_allocated)
 462 {
 463   int rval = REGS_REALLOCATE;
 464   Idx i;
 465   Idx need_regs = nregs + 1;
 466   /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
 467      uses.  */
 468
 469   /* Have the register data arrays been allocated?  */
 470   if (regs_allocated == REGS_UNALLOCATED)
 471     { /* No.  So allocate them with malloc.  */
 472       regs->start = re_malloc (regoff_t, need_regs);
 473       if (__glibc_unlikely (regs->start == NULL))
 474         return REGS_UNALLOCATED;
 475       regs->end = re_malloc (regoff_t, need_regs);
 476       if (__glibc_unlikely (regs->end == NULL))
 477         {
 478           re_free (regs->start);
 479           return REGS_UNALLOCATED;
 480         }
 481       regs->num_regs = need_regs;
 482     }
 483   else if (regs_allocated == REGS_REALLOCATE)
 484     { /* Yes.  If we need more elements than were already
 485          allocated, reallocate them.  If we need fewer, just
 486          leave it alone.  */
 487       if (__glibc_unlikely (need_regs > regs->num_regs))
 488         {
 489           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 490           regoff_t *new_end;
 491           if (__glibc_unlikely (new_start == NULL))
 492             return REGS_UNALLOCATED;
 493           new_end = re_realloc (regs->end, regoff_t, need_regs);
 494           if (__glibc_unlikely (new_end == NULL))
 495             {
 496               re_free (new_start);
 497               return REGS_UNALLOCATED;
 498             }
 499           regs->start = new_start;
 500           regs->end = new_end;
 501           regs->num_regs = need_regs;
 502         }
 503     }
 504   else
 505     {
 506       DEBUG_ASSERT (regs_allocated == REGS_FIXED);
 507       /* This function may not be called with REGS_FIXED and nregs too big.  */
 508       DEBUG_ASSERT (nregs <= regs->num_regs);
 509       rval = REGS_FIXED;
 510     }
 511
 512   /* Copy the regs.  */
 513   for (i = 0; i < nregs; ++i)
 514     {
 515       regs->start[i] = pmatch[i].rm_so;
 516       regs->end[i] = pmatch[i].rm_eo;
 517     }
 518   for ( ; i < regs->num_regs; ++i)
 519     regs->start[i] = regs->end[i] = -1;
 520
 521   return rval;
 522 }
 523
 524 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 525    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 526    this memory for recording register information.  STARTS and ENDS
 527    must be allocated using the malloc library routine, and must each
 528    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 529
 530    If NUM_REGS == 0, then subsequent matches should allocate their own
 531    register data.
 532
 533    Unless this function is called, the first search or match using
 534    PATTERN_BUFFER will allocate its own register data, without
 535    freeing the old data.  */
 536
 537 void
 538 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
 539                   __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
 540 {
 541   if (num_regs)
 542     {
 543       bufp->regs_allocated = REGS_REALLOCATE;
 544       regs->num_regs = num_regs;
 545       regs->start = starts;
 546       regs->end = ends;
 547     }
 548   else
 549     {
 550       bufp->regs_allocated = REGS_UNALLOCATED;
 551       regs->num_regs = 0;
 552       regs->start = regs->end = NULL;
 553     }
 554 }
 555 #ifdef _LIBC
 556 weak_alias (__re_set_registers, re_set_registers)
 557 #endif
 558 \f
 559 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 560    them unless specifically requested.  */
 561
 562 #if defined _REGEX_RE_COMP || defined _LIBC
 563 int
 564 # ifdef _LIBC
 565 weak_function
 566 # endif
 567 re_exec (const char *s)
 568 {
 569   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 570 }
 571 #endif /* _REGEX_RE_COMP */
 572 \f
 573 /* Internal entry point.  */
 574
 575 /* Searches for a compiled pattern PREG in the string STRING, whose
 576    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 577    meaning as with regexec.  LAST_START is START + RANGE, where
 578    START and RANGE have the same meaning as with re_search.
 579    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 580    otherwise return the error code.
 581    Note: We assume front end functions already check ranges.
 582    (0 <= LAST_START && LAST_START <= LENGTH)  */
 583
 584 static reg_errcode_t
 585 __attribute_warn_unused_result__
 586 re_search_internal (const regex_t *preg, const char *string, Idx length,
 587                     Idx start, Idx last_start, Idx stop, size_t nmatch,
 588                     regmatch_t pmatch[], int eflags)
 589 {
 590   reg_errcode_t err;
 591   const re_dfa_t *dfa = preg->buffer;
 592   Idx left_lim, right_lim;
 593   int incr;
 594   bool fl_longest_match;
 595   int match_kind;
 596   Idx match_first;
 597   Idx match_last = -1;
 598   Idx extra_nmatch;
 599   bool sb;
 600   int ch;
 601   re_match_context_t mctx = { .dfa = dfa };
 602   char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
 603                     && start != last_start && !preg->can_be_null)
 604                    ? preg->fastmap : NULL);
 605   RE_TRANSLATE_TYPE t = preg->translate;
 606
 607   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 608   nmatch -= extra_nmatch;
 609
 610   /* Check if the DFA haven't been compiled.  */
 611   if (__glibc_unlikely (preg->used == 0 || dfa->init_state == NULL
 612                         || dfa->init_state_word == NULL
 613                         || dfa->init_state_nl == NULL
 614                         || dfa->init_state_begbuf == NULL))
 615     return REG_NOMATCH;
 616
 617   /* We assume front-end functions already check them.  */
 618   DEBUG_ASSERT (0 <= last_start && last_start <= length);
 619
 620   /* If initial states with non-begbuf contexts have no elements,
 621      the regex must be anchored.  If preg->newline_anchor is set,
 622      we'll never use init_state_nl, so do not check it.  */
 623   if (dfa->init_state->nodes.nelem == 0
 624       && dfa->init_state_word->nodes.nelem == 0
 625       && (dfa->init_state_nl->nodes.nelem == 0
 626           || !preg->newline_anchor))
 627     {
 628       if (start != 0 && last_start != 0)
 629         return REG_NOMATCH;
 630       start = last_start = 0;
 631     }
 632
 633   /* We must check the longest matching, if nmatch > 0.  */
 634   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 635
 636   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 637                             preg->translate, (preg->syntax & RE_ICASE) != 0,
 638                             dfa);
 639   if (__glibc_unlikely (err != REG_NOERROR))
 640     goto free_return;
 641   mctx.input.stop = stop;
 642   mctx.input.raw_stop = stop;
 643   mctx.input.newline_anchor = preg->newline_anchor;
 644
 645   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 646   if (__glibc_unlikely (err != REG_NOERROR))
 647     goto free_return;
 648
 649   /* We will log all the DFA states through which the dfa pass,
 650      if nmatch > 1, or this dfa has "multibyte node", which is a
 651      back-reference or a node which can accept multibyte character or
 652      multi character collating element.  */
 653   if (nmatch > 1 || dfa->has_mb_node)
 654     {
 655       /* Avoid overflow.  */
 656       if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
 657                              <= mctx.input.bufs_len)))
 658         {
 659           err = REG_ESPACE;
 660           goto free_return;
 661         }
 662
 663       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 664       if (__glibc_unlikely (mctx.state_log == NULL))
 665         {
 666           err = REG_ESPACE;
 667           goto free_return;
 668         }
 669     }
 670
 671   match_first = start;
 672   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 673                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 674
 675   /* Check incrementally whether the input string matches.  */
 676   incr = (last_start < start) ? -1 : 1;
 677   left_lim = (last_start < start) ? last_start : start;
 678   right_lim = (last_start < start) ? start : last_start;
 679   sb = dfa->mb_cur_max == 1;
 680   match_kind =
 681     (fastmap
 682      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 683         | (start <= last_start ? 2 : 0)
 684         | (t != NULL ? 1 : 0))
 685      : 8);
 686
 687   for (;; match_first += incr)
 688     {
 689       err = REG_NOMATCH;
 690       if (match_first < left_lim || right_lim < match_first)
 691         goto free_return;
 692
 693       /* Advance as rapidly as possible through the string, until we
 694          find a plausible place to start matching.  This may be done
 695          with varying efficiency, so there are various possibilities:
 696          only the most common of them are specialized, in order to
 697          save on code size.  We use a switch statement for speed.  */
 698       switch (match_kind)
 699         {
 700         case 8:
 701           /* No fastmap.  */
 702           break;
 703
 704         case 7:
 705           /* Fastmap with single-byte translation, match forward.  */
 706           while (__glibc_likely (match_first < right_lim)
 707                  && !fastmap[t[(unsigned char) string[match_first]]])
 708             ++match_first;
 709           goto forward_match_found_start_or_reached_end;
 710
 711         case 6:
 712           /* Fastmap without translation, match forward.  */
 713           while (__glibc_likely (match_first < right_lim)
 714                  && !fastmap[(unsigned char) string[match_first]])
 715             ++match_first;
 716
 717         forward_match_found_start_or_reached_end:
 718           if (__glibc_unlikely (match_first == right_lim))
 719             {
 720               ch = match_first >= length
 721                        ? 0 : (unsigned char) string[match_first];
 722               if (!fastmap[t ? t[ch] : ch])
 723                 goto free_return;
 724             }
 725           break;
 726
 727         case 4:
 728         case 5:
 729           /* Fastmap without multi-byte translation, match backwards.  */
 730           while (match_first >= left_lim)
 731             {
 732               ch = match_first >= length
 733                        ? 0 : (unsigned char) string[match_first];
 734               if (fastmap[t ? t[ch] : ch])
 735                 break;
 736               --match_first;
 737             }
 738           if (match_first < left_lim)
 739             goto free_return;
 740           break;
 741
 742         default:
 743           /* In this case, we can't determine easily the current byte,
 744              since it might be a component byte of a multibyte
 745              character.  Then we use the constructed buffer instead.  */
 746           for (;;)
 747             {
 748               /* If MATCH_FIRST is out of the valid range, reconstruct the
 749                  buffers.  */
 750               __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
 751               if (__glibc_unlikely (offset
 752                                     >= (__re_size_t) mctx.input.valid_raw_len))
 753                 {
 754                   err = re_string_reconstruct (&mctx.input, match_first,
 755                                                eflags);
 756                   if (__glibc_unlikely (err != REG_NOERROR))
 757                     goto free_return;
 758
 759                   offset = match_first - mctx.input.raw_mbs_idx;
 760                 }
 761               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 762                  Note that MATCH_FIRST must not be smaller than 0.  */
 763               ch = (match_first >= length
 764                     ? 0 : re_string_byte_at (&mctx.input, offset));
 765               if (fastmap[ch])
 766                 break;
 767               match_first += incr;
 768               if (match_first < left_lim || match_first > right_lim)
 769                 {
 770                   err = REG_NOMATCH;
 771                   goto free_return;
 772                 }
 773             }
 774           break;
 775         }
 776
 777       /* Reconstruct the buffers so that the matcher can assume that
 778          the matching starts from the beginning of the buffer.  */
 779       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 780       if (__glibc_unlikely (err != REG_NOERROR))
 781         goto free_return;
 782
 783 #ifdef RE_ENABLE_I18N
 784      /* Don't consider this char as a possible match start if it part,
 785         yet isn't the head, of a multibyte character.  */
 786       if (!sb && !re_string_first_byte (&mctx.input, 0))
 787         continue;
 788 #endif
 789
 790       /* It seems to be appropriate one, then use the matcher.  */
 791       /* We assume that the matching starts from 0.  */
 792       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 793       match_last = check_matching (&mctx, fl_longest_match,
 794                                    start <= last_start ? &match_first : NULL);
 795       if (match_last != -1)
 796         {
 797           if (__glibc_unlikely (match_last == -2))
 798             {
 799               err = REG_ESPACE;
 800               goto free_return;
 801             }
 802           else
 803             {
 804               mctx.match_last = match_last;
 805               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 806                 {
 807                   re_dfastate_t *pstate = mctx.state_log[match_last];
 808                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 809                                                              match_last);
 810                 }
 811               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 812                   || dfa->nbackref)
 813                 {
 814                   err = prune_impossible_nodes (&mctx);
 815                   if (err == REG_NOERROR)
 816                     break;
 817                   if (__glibc_unlikely (err != REG_NOMATCH))
 818                     goto free_return;
 819                   match_last = -1;
 820                 }
 821               else
 822                 break; /* We found a match.  */
 823             }
 824         }
 825
 826       match_ctx_clean (&mctx);
 827     }
 828
 829   DEBUG_ASSERT (match_last != -1);
 830   DEBUG_ASSERT (err == REG_NOERROR);
 831
 832   /* Set pmatch[] if we need.  */
 833   if (nmatch > 0)
 834     {
 835       Idx reg_idx;
 836
 837       /* Initialize registers.  */
 838       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 839         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 840
 841       /* Set the points where matching start/end.  */
 842       pmatch[0].rm_so = 0;
 843       pmatch[0].rm_eo = mctx.match_last;
 844       /* FIXME: This function should fail if mctx.match_last exceeds
 845          the maximum possible regoff_t value.  We need a new error
 846          code REG_OVERFLOW.  */
 847
 848       if (!preg->no_sub && nmatch > 1)
 849         {
 850           err = set_regs (preg, &mctx, nmatch, pmatch,
 851                           dfa->has_plural_match && dfa->nbackref > 0);
 852           if (__glibc_unlikely (err != REG_NOERROR))
 853             goto free_return;
 854         }
 855
 856       /* At last, add the offset to each register, since we slid
 857          the buffers so that we could assume that the matching starts
 858          from 0.  */
 859       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 860         if (pmatch[reg_idx].rm_so != -1)
 861           {
 862 #ifdef RE_ENABLE_I18N
 863             if (__glibc_unlikely (mctx.input.offsets_needed != 0))
 864               {
 865                 pmatch[reg_idx].rm_so =
 866                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 867                    ? mctx.input.valid_raw_len
 868                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 869                 pmatch[reg_idx].rm_eo =
 870                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 871                    ? mctx.input.valid_raw_len
 872                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 873               }
 874 #else
 875             DEBUG_ASSERT (mctx.input.offsets_needed == 0);
 876 #endif
 877             pmatch[reg_idx].rm_so += match_first;
 878             pmatch[reg_idx].rm_eo += match_first;
 879           }
 880       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 881         {
 882           pmatch[nmatch + reg_idx].rm_so = -1;
 883           pmatch[nmatch + reg_idx].rm_eo = -1;
 884         }
 885
 886       if (dfa->subexp_map)
 887         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 888           if (dfa->subexp_map[reg_idx] != reg_idx)
 889             {
 890               pmatch[reg_idx + 1].rm_so
 891                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 892               pmatch[reg_idx + 1].rm_eo
 893                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 894             }
 895     }
 896
 897  free_return:
 898   re_free (mctx.state_log);
 899   if (dfa->nbackref)
 900     match_ctx_free (&mctx);
 901   re_string_destruct (&mctx.input);
 902   return err;
 903 }
 904
 905 static reg_errcode_t
 906 __attribute_warn_unused_result__
 907 prune_impossible_nodes (re_match_context_t *mctx)
 908 {
 909   const re_dfa_t *const dfa = mctx->dfa;
 910   Idx halt_node, match_last;
 911   reg_errcode_t ret;
 912   re_dfastate_t **sifted_states;
 913   re_dfastate_t **lim_states = NULL;
 914   re_sift_context_t sctx;
 915   DEBUG_ASSERT (mctx->state_log != NULL);
 916   match_last = mctx->match_last;
 917   halt_node = mctx->last_node;
 918
 919   /* Avoid overflow.  */
 920   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
 921                         <= match_last))
 922     return REG_ESPACE;
 923
 924   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 925   if (__glibc_unlikely (sifted_states == NULL))
 926     {
 927       ret = REG_ESPACE;
 928       goto free_return;
 929     }
 930   if (dfa->nbackref)
 931     {
 932       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 933       if (__glibc_unlikely (lim_states == NULL))
 934         {
 935           ret = REG_ESPACE;
 936           goto free_return;
 937         }
 938       while (1)
 939         {
 940           memset (lim_states, '\0',
 941                   sizeof (re_dfastate_t *) * (match_last + 1));
 942           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 943                          match_last);
 944           ret = sift_states_backward (mctx, &sctx);
 945           re_node_set_free (&sctx.limits);
 946           if (__glibc_unlikely (ret != REG_NOERROR))
 947               goto free_return;
 948           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 949             break;
 950           do
 951             {
 952               --match_last;
 953               if (match_last < 0)
 954                 {
 955                   ret = REG_NOMATCH;
 956                   goto free_return;
 957                 }
 958             } while (mctx->state_log[match_last] == NULL
 959                      || !mctx->state_log[match_last]->halt);
 960           halt_node = check_halt_state_context (mctx,
 961                                                 mctx->state_log[match_last],
 962                                                 match_last);
 963         }
 964       ret = merge_state_array (dfa, sifted_states, lim_states,
 965                                match_last + 1);
 966       re_free (lim_states);
 967       lim_states = NULL;
 968       if (__glibc_unlikely (ret != REG_NOERROR))
 969         goto free_return;
 970     }
 971   else
 972     {
 973       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 974       ret = sift_states_backward (mctx, &sctx);
 975       re_node_set_free (&sctx.limits);
 976       if (__glibc_unlikely (ret != REG_NOERROR))
 977         goto free_return;
 978       if (sifted_states[0] == NULL)
 979         {
 980           ret = REG_NOMATCH;
 981           goto free_return;
 982         }
 983     }
 984   re_free (mctx->state_log);
 985   mctx->state_log = sifted_states;
 986   sifted_states = NULL;
 987   mctx->last_node = halt_node;
 988   mctx->match_last = match_last;
 989   ret = REG_NOERROR;
 990  free_return:
 991   re_free (sifted_states);
 992   re_free (lim_states);
 993   return ret;
 994 }
 995
 996 /* Acquire an initial state and return it.
 997    We must select appropriate initial state depending on the context,
 998    since initial states may have constraints like "\<", "^", etc..  */
 999
1000 static inline re_dfastate_t *
1001 __attribute__ ((always_inline))
1002 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1003                             Idx idx)
1004 {
1005   const re_dfa_t *const dfa = mctx->dfa;
1006   if (dfa->init_state->has_constraint)
1007     {
1008       unsigned int context;
1009       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1010       if (IS_WORD_CONTEXT (context))
1011         return dfa->init_state_word;
1012       else if (IS_ORDINARY_CONTEXT (context))
1013         return dfa->init_state;
1014       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1015         return dfa->init_state_begbuf;
1016       else if (IS_NEWLINE_CONTEXT (context))
1017         return dfa->init_state_nl;
1018       else if (IS_BEGBUF_CONTEXT (context))
1019         {
1020           /* It is relatively rare case, then calculate on demand.  */
1021           return re_acquire_state_context (err, dfa,
1022                                            dfa->init_state->entrance_nodes,
1023                                            context);
1024         }
1025       else
1026         /* Must not happen?  */
1027         return dfa->init_state;
1028     }
1029   else
1030     return dfa->init_state;
1031 }
1032
1033 /* Check whether the regular expression match input string INPUT or not,
1034    and return the index where the matching end.  Return -1 if
1035    there is no match, and return -2 in case of an error.
1036    FL_LONGEST_MATCH means we want the POSIX longest matching.
1037    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1038    next place where we may want to try matching.
1039    Note that the matcher assumes that the matching starts from the current
1040    index of the buffer.  */
1041
1042 static Idx
1043 __attribute_warn_unused_result__
1044 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1045                 Idx *p_match_first)
1046 {
1047   const re_dfa_t *const dfa = mctx->dfa;
1048   reg_errcode_t err;
1049   Idx match = 0;
1050   Idx match_last = -1;
1051   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1052   re_dfastate_t *cur_state;
1053   bool at_init_state = p_match_first != NULL;
1054   Idx next_start_idx = cur_str_idx;
1055
1056   err = REG_NOERROR;
1057   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1058   /* An initial state must not be NULL (invalid).  */
1059   if (__glibc_unlikely (cur_state == NULL))
1060     {
1061       DEBUG_ASSERT (err == REG_ESPACE);
1062       return -2;
1063     }
1064
1065   if (mctx->state_log != NULL)
1066     {
1067       mctx->state_log[cur_str_idx] = cur_state;
1068
1069       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1070          later.  E.g. Processing back references.  */
1071       if (__glibc_unlikely (dfa->nbackref))
1072         {
1073           at_init_state = false;
1074           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1075           if (__glibc_unlikely (err != REG_NOERROR))
1076             return err;
1077
1078           if (cur_state->has_backref)
1079             {
1080               err = transit_state_bkref (mctx, &cur_state->nodes);
1081               if (__glibc_unlikely (err != REG_NOERROR))
1082                 return err;
1083             }
1084         }
1085     }
1086
1087   /* If the RE accepts NULL string.  */
1088   if (__glibc_unlikely (cur_state->halt))
1089     {
1090       if (!cur_state->has_constraint
1091           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1092         {
1093           if (!fl_longest_match)
1094             return cur_str_idx;
1095           else
1096             {
1097               match_last = cur_str_idx;
1098               match = 1;
1099             }
1100         }
1101     }
1102
1103   while (!re_string_eoi (&mctx->input))
1104     {
1105       re_dfastate_t *old_state = cur_state;
1106       Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1107
1108       if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
1109            && mctx->input.bufs_len < mctx->input.len)
1110           || (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
1111               && mctx->input.valid_len < mctx->input.len))
1112         {
1113           err = extend_buffers (mctx, next_char_idx + 1);
1114           if (__glibc_unlikely (err != REG_NOERROR))
1115             {
1116               DEBUG_ASSERT (err == REG_ESPACE);
1117               return -2;
1118             }
1119         }
1120
1121       cur_state = transit_state (&err, mctx, cur_state);
1122       if (mctx->state_log != NULL)
1123         cur_state = merge_state_with_log (&err, mctx, cur_state);
1124
1125       if (cur_state == NULL)
1126         {
1127           /* Reached the invalid state or an error.  Try to recover a valid
1128              state using the state log, if available and if we have not
1129              already found a valid (even if not the longest) match.  */
1130           if (__glibc_unlikely (err != REG_NOERROR))
1131             return -2;
1132
1133           if (mctx->state_log == NULL
1134               || (match && !fl_longest_match)
1135               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1136             break;
1137         }
1138
1139       if (__glibc_unlikely (at_init_state))
1140         {
1141           if (old_state == cur_state)
1142             next_start_idx = next_char_idx;
1143           else
1144             at_init_state = false;
1145         }
1146
1147       if (cur_state->halt)
1148         {
1149           /* Reached a halt state.
1150              Check the halt state can satisfy the current context.  */
1151           if (!cur_state->has_constraint
1152               || check_halt_state_context (mctx, cur_state,
1153                                            re_string_cur_idx (&mctx->input)))
1154             {
1155               /* We found an appropriate halt state.  */
1156               match_last = re_string_cur_idx (&mctx->input);
1157               match = 1;
1158
1159               /* We found a match, do not modify match_first below.  */
1160               p_match_first = NULL;
1161               if (!fl_longest_match)
1162                 break;
1163             }
1164         }
1165     }
1166
1167   if (p_match_first)
1168     *p_match_first += next_start_idx;
1169
1170   return match_last;
1171 }
1172
1173 /* Check NODE match the current context.  */
1174
1175 static bool
1176 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1177 {
1178   re_token_type_t type = dfa->nodes[node].type;
1179   unsigned int constraint = dfa->nodes[node].constraint;
1180   if (type != END_OF_RE)
1181     return false;
1182   if (!constraint)
1183     return true;
1184   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1185     return false;
1186   return true;
1187 }
1188
1189 /* Check the halt state STATE match the current context.
1190    Return 0 if not match, if the node, STATE has, is a halt node and
1191    match the context, return the node.  */
1192
1193 static Idx
1194 check_halt_state_context (const re_match_context_t *mctx,
1195                           const re_dfastate_t *state, Idx idx)
1196 {
1197   Idx i;
1198   unsigned int context;
1199   DEBUG_ASSERT (state->halt);
1200   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1201   for (i = 0; i < state->nodes.nelem; ++i)
1202     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1203       return state->nodes.elems[i];
1204   return 0;
1205 }
1206
1207 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1208    corresponding to the DFA).
1209    Return the destination node, and update EPS_VIA_NODES;
1210    return -1 on match failure, -2 on error.  */
1211
1212 static Idx
1213 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1214                    regmatch_t *prevregs,
1215                    Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1216                    struct re_fail_stack_t *fs)
1217 {
1218   const re_dfa_t *const dfa = mctx->dfa;
1219   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1220     {
1221       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1222       re_node_set *edests = &dfa->edests[node];
1223
1224       if (! re_node_set_contains (eps_via_nodes, node))
1225         {
1226           bool ok = re_node_set_insert (eps_via_nodes, node);
1227           if (__glibc_unlikely (! ok))
1228             return -2;
1229         }
1230
1231       /* Pick a valid destination, or return -1 if none is found.  */
1232       Idx dest_node = -1;
1233       for (Idx i = 0; i < edests->nelem; i++)
1234         {
1235           Idx candidate = edests->elems[i];
1236           if (!re_node_set_contains (cur_nodes, candidate))
1237             continue;
1238           if (dest_node == -1)
1239             dest_node = candidate;
1240
1241           else
1242             {
1243               /* In order to avoid infinite loop like "(a*)*", return the second
1244                  epsilon-transition if the first was already considered.  */
1245               if (re_node_set_contains (eps_via_nodes, dest_node))
1246                 return candidate;
1247
1248               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1249               else if (fs != NULL
1250                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1251                                            prevregs, eps_via_nodes))
1252                 return -2;
1253
1254               /* We know we are going to exit.  */
1255               break;
1256             }
1257         }
1258       return dest_node;
1259     }
1260   else
1261     {
1262       Idx naccepted = 0;
1263       re_token_type_t type = dfa->nodes[node].type;
1264
1265 #ifdef RE_ENABLE_I18N
1266       if (dfa->nodes[node].accept_mb)
1267         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1268       else
1269 #endif /* RE_ENABLE_I18N */
1270       if (type == OP_BACK_REF)
1271         {
1272           Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1273           if (subexp_idx < nregs)
1274             naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1275           if (fs != NULL)
1276             {
1277               if (subexp_idx >= nregs
1278                   || regs[subexp_idx].rm_so == -1
1279                   || regs[subexp_idx].rm_eo == -1)
1280                 return -1;
1281               else if (naccepted)
1282                 {
1283                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1284                   if (mctx->input.valid_len - *pidx < naccepted
1285                       || (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1286                                   naccepted)
1287                           != 0))
1288                     return -1;
1289                 }
1290             }
1291
1292           if (naccepted == 0)
1293             {
1294               Idx dest_node;
1295               bool ok = re_node_set_insert (eps_via_nodes, node);
1296               if (__glibc_unlikely (! ok))
1297                 return -2;
1298               dest_node = dfa->edests[node].elems[0];
1299               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1300                                         dest_node))
1301                 return dest_node;
1302             }
1303         }
1304
1305       if (naccepted != 0
1306           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1307         {
1308           Idx dest_node = dfa->nexts[node];
1309           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1310           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1311                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1312                                                dest_node)))
1313             return -1;
1314           re_node_set_empty (eps_via_nodes);
1315           return dest_node;
1316         }
1317     }
1318   return -1;
1319 }
1320
1321 static reg_errcode_t
1322 __attribute_warn_unused_result__
1323 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1324                  Idx nregs, regmatch_t *regs, regmatch_t *prevregs,
1325                  re_node_set *eps_via_nodes)
1326 {
1327   reg_errcode_t err;
1328   Idx num = fs->num++;
1329   if (fs->num == fs->alloc)
1330     {
1331       struct re_fail_stack_ent_t *new_array;
1332       new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1333                               fs->alloc * 2);
1334       if (new_array == NULL)
1335         return REG_ESPACE;
1336       fs->alloc *= 2;
1337       fs->stack = new_array;
1338     }
1339   fs->stack[num].idx = str_idx;
1340   fs->stack[num].node = dest_node;
1341   fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
1342   if (fs->stack[num].regs == NULL)
1343     return REG_ESPACE;
1344   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1345   memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
1346   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1347   return err;
1348 }
1349
1350 static Idx
1351 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1352                 regmatch_t *regs, regmatch_t *prevregs,
1353                 re_node_set *eps_via_nodes)
1354 {
1355   if (fs == NULL || fs->num == 0)
1356     return -1;
1357   Idx num = --fs->num;
1358   *pidx = fs->stack[num].idx;
1359   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1360   memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
1361   re_node_set_free (eps_via_nodes);
1362   re_free (fs->stack[num].regs);
1363   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1364   DEBUG_ASSERT (0 <= fs->stack[num].node);
1365   return fs->stack[num].node;
1366 }
1367
1368
1369 #define DYNARRAY_STRUCT  regmatch_list
1370 #define DYNARRAY_ELEMENT regmatch_t
1371 #define DYNARRAY_PREFIX  regmatch_list_
1372 #include <malloc/dynarray-skeleton.c>
1373
1374 /* Set the positions where the subexpressions are starts/ends to registers
1375    PMATCH.
1376    Note: We assume that pmatch[0] is already set, and
1377    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1378
1379 static reg_errcode_t
1380 __attribute_warn_unused_result__
1381 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1382           regmatch_t *pmatch, bool fl_backtrack)
1383 {
1384   const re_dfa_t *dfa = preg->buffer;
1385   Idx idx, cur_node;
1386   re_node_set eps_via_nodes;
1387   struct re_fail_stack_t *fs;
1388   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1389   struct regmatch_list prev_match;
1390   regmatch_list_init (&prev_match);
1391
1392   DEBUG_ASSERT (nmatch > 1);
1393   DEBUG_ASSERT (mctx->state_log != NULL);
1394   if (fl_backtrack)
1395     {
1396       fs = &fs_body;
1397       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1398       if (fs->stack == NULL)
1399         return REG_ESPACE;
1400     }
1401   else
1402     fs = NULL;
1403
1404   cur_node = dfa->init_node;
1405   re_node_set_init_empty (&eps_via_nodes);
1406
1407   if (!regmatch_list_resize (&prev_match, nmatch))
1408     {
1409       regmatch_list_free (&prev_match);
1410       free_fail_stack_return (fs);
1411       return REG_ESPACE;
1412     }
1413   regmatch_t *prev_idx_match = regmatch_list_begin (&prev_match);
1414   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1415
1416   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1417     {
1418       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1419
1420       if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1421           || (fs && re_node_set_contains (&eps_via_nodes, cur_node)))
1422         {
1423           Idx reg_idx;
1424           cur_node = -1;
1425           if (fs)
1426             {
1427               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1428                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1429                   {
1430                     cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1431                                                prev_idx_match, &eps_via_nodes);
1432                     break;
1433                   }
1434             }
1435           if (cur_node < 0)
1436             {
1437               re_node_set_free (&eps_via_nodes);
1438               regmatch_list_free (&prev_match);
1439               return free_fail_stack_return (fs);
1440             }
1441         }
1442
1443       /* Proceed to next node.  */
1444       cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
1445                                     &idx, cur_node,
1446                                     &eps_via_nodes, fs);
1447
1448       if (__glibc_unlikely (cur_node < 0))
1449         {
1450           if (__glibc_unlikely (cur_node == -2))
1451             {
1452               re_node_set_free (&eps_via_nodes);
1453               regmatch_list_free (&prev_match);
1454               free_fail_stack_return (fs);
1455               return REG_ESPACE;
1456             }
1457           cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1458                                      prev_idx_match, &eps_via_nodes);
1459           if (cur_node < 0)
1460             {
1461               re_node_set_free (&eps_via_nodes);
1462               regmatch_list_free (&prev_match);
1463               free_fail_stack_return (fs);
1464               return REG_NOMATCH;
1465             }
1466         }
1467     }
1468   re_node_set_free (&eps_via_nodes);
1469   regmatch_list_free (&prev_match);
1470   return free_fail_stack_return (fs);
1471 }
1472
1473 static reg_errcode_t
1474 free_fail_stack_return (struct re_fail_stack_t *fs)
1475 {
1476   if (fs)
1477     {
1478       Idx fs_idx;
1479       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1480         {
1481           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1482           re_free (fs->stack[fs_idx].regs);
1483         }
1484       re_free (fs->stack);
1485     }
1486   return REG_NOERROR;
1487 }
1488
1489 static void
1490 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1491              regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1492 {
1493   int type = dfa->nodes[cur_node].type;
1494   if (type == OP_OPEN_SUBEXP)
1495     {
1496       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1497
1498       /* We are at the first node of this sub expression.  */
1499       if (reg_num < nmatch)
1500         {
1501           pmatch[reg_num].rm_so = cur_idx;
1502           pmatch[reg_num].rm_eo = -1;
1503         }
1504     }
1505   else if (type == OP_CLOSE_SUBEXP)
1506     {
1507       /* We are at the last node of this sub expression.  */
1508       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1509       if (reg_num < nmatch)
1510         {
1511           if (pmatch[reg_num].rm_so < cur_idx)
1512             {
1513               pmatch[reg_num].rm_eo = cur_idx;
1514               /* This is a non-empty match or we are not inside an optional
1515                  subexpression.  Accept this right away.  */
1516               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1517             }
1518           else
1519             {
1520               if (dfa->nodes[cur_node].opt_subexp
1521                   && prev_idx_match[reg_num].rm_so != -1)
1522                 /* We transited through an empty match for an optional
1523                    subexpression, like (a?)*, and this is not the subexp's
1524                    first match.  Copy back the old content of the registers
1525                    so that matches of an inner subexpression are undone as
1526                    well, like in ((a?))*.  */
1527                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1528               else
1529                 /* We completed a subexpression, but it may be part of
1530                    an optional one, so do not update PREV_IDX_MATCH.  */
1531                 pmatch[reg_num].rm_eo = cur_idx;
1532             }
1533         }
1534     }
1535 }
1536
1537 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1538    and sift the nodes in each states according to the following rules.
1539    Updated state_log will be wrote to STATE_LOG.
1540
1541    Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1542      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1543         If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1544         the LAST_NODE, we throw away the node 'a'.
1545      2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1546         string 's' and transit to 'b':
1547         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1548            away the node 'a'.
1549         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1550             thrown away, we throw away the node 'a'.
1551      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1552         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1553            node 'a'.
1554         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1555             we throw away the node 'a'.  */
1556
1557 #define STATE_NODE_CONTAINS(state,node) \
1558   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1559
1560 static reg_errcode_t
1561 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1562 {
1563   reg_errcode_t err;
1564   int null_cnt = 0;
1565   Idx str_idx = sctx->last_str_idx;
1566   re_node_set cur_dest;
1567
1568   DEBUG_ASSERT (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1569
1570   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1571      transit to the last_node and the last_node itself.  */
1572   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1573   if (__glibc_unlikely (err != REG_NOERROR))
1574     return err;
1575   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1576   if (__glibc_unlikely (err != REG_NOERROR))
1577     goto free_return;
1578
1579   /* Then check each states in the state_log.  */
1580   while (str_idx > 0)
1581     {
1582       /* Update counters.  */
1583       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1584       if (null_cnt > mctx->max_mb_elem_len)
1585         {
1586           memset (sctx->sifted_states, '\0',
1587                   sizeof (re_dfastate_t *) * str_idx);
1588           re_node_set_free (&cur_dest);
1589           return REG_NOERROR;
1590         }
1591       re_node_set_empty (&cur_dest);
1592       --str_idx;
1593
1594       if (mctx->state_log[str_idx])
1595         {
1596           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1597           if (__glibc_unlikely (err != REG_NOERROR))
1598             goto free_return;
1599         }
1600
1601       /* Add all the nodes which satisfy the following conditions:
1602          - It can epsilon transit to a node in CUR_DEST.
1603          - It is in CUR_SRC.
1604          And update state_log.  */
1605       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1606       if (__glibc_unlikely (err != REG_NOERROR))
1607         goto free_return;
1608     }
1609   err = REG_NOERROR;
1610  free_return:
1611   re_node_set_free (&cur_dest);
1612   return err;
1613 }
1614
1615 static reg_errcode_t
1616 __attribute_warn_unused_result__
1617 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1618                      Idx str_idx, re_node_set *cur_dest)
1619 {
1620   const re_dfa_t *const dfa = mctx->dfa;
1621   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1622   Idx i;
1623
1624   /* Then build the next sifted state.
1625      We build the next sifted state on 'cur_dest', and update
1626      'sifted_states[str_idx]' with 'cur_dest'.
1627      Note:
1628      'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1629      'cur_src' points the node_set of the old 'state_log[str_idx]'
1630      (with the epsilon nodes pre-filtered out).  */
1631   for (i = 0; i < cur_src->nelem; i++)
1632     {
1633       Idx prev_node = cur_src->elems[i];
1634       int naccepted = 0;
1635       bool ok;
1636       DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[prev_node].type));
1637
1638 #ifdef RE_ENABLE_I18N
1639       /* If the node may accept "multi byte".  */
1640       if (dfa->nodes[prev_node].accept_mb)
1641         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1642                                          str_idx, sctx->last_str_idx);
1643 #endif /* RE_ENABLE_I18N */
1644
1645       /* We don't check backreferences here.
1646          See update_cur_sifted_state().  */
1647       if (!naccepted
1648           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1649           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1650                                   dfa->nexts[prev_node]))
1651         naccepted = 1;
1652
1653       if (naccepted == 0)
1654         continue;
1655
1656       if (sctx->limits.nelem)
1657         {
1658           Idx to_idx = str_idx + naccepted;
1659           if (check_dst_limits (mctx, &sctx->limits,
1660                                 dfa->nexts[prev_node], to_idx,
1661                                 prev_node, str_idx))
1662             continue;
1663         }
1664       ok = re_node_set_insert (cur_dest, prev_node);
1665       if (__glibc_unlikely (! ok))
1666         return REG_ESPACE;
1667     }
1668
1669   return REG_NOERROR;
1670 }
1671
1672 /* Helper functions.  */
1673
1674 static reg_errcode_t
1675 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1676 {
1677   Idx top = mctx->state_log_top;
1678
1679   if ((next_state_log_idx >= mctx->input.bufs_len
1680        && mctx->input.bufs_len < mctx->input.len)
1681       || (next_state_log_idx >= mctx->input.valid_len
1682           && mctx->input.valid_len < mctx->input.len))
1683     {
1684       reg_errcode_t err;
1685       err = extend_buffers (mctx, next_state_log_idx + 1);
1686       if (__glibc_unlikely (err != REG_NOERROR))
1687         return err;
1688     }
1689
1690   if (top < next_state_log_idx)
1691     {
1692       memset (mctx->state_log + top + 1, '\0',
1693               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1694       mctx->state_log_top = next_state_log_idx;
1695     }
1696   return REG_NOERROR;
1697 }
1698
1699 static reg_errcode_t
1700 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1701                    re_dfastate_t **src, Idx num)
1702 {
1703   Idx st_idx;
1704   reg_errcode_t err;
1705   for (st_idx = 0; st_idx < num; ++st_idx)
1706     {
1707       if (dst[st_idx] == NULL)
1708         dst[st_idx] = src[st_idx];
1709       else if (src[st_idx] != NULL)
1710         {
1711           re_node_set merged_set;
1712           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1713                                         &src[st_idx]->nodes);
1714           if (__glibc_unlikely (err != REG_NOERROR))
1715             return err;
1716           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1717           re_node_set_free (&merged_set);
1718           if (__glibc_unlikely (err != REG_NOERROR))
1719             return err;
1720         }
1721     }
1722   return REG_NOERROR;
1723 }
1724
1725 static reg_errcode_t
1726 update_cur_sifted_state (const re_match_context_t *mctx,
1727                          re_sift_context_t *sctx, Idx str_idx,
1728                          re_node_set *dest_nodes)
1729 {
1730   const re_dfa_t *const dfa = mctx->dfa;
1731   reg_errcode_t err = REG_NOERROR;
1732   const re_node_set *candidates;
1733   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1734                 : &mctx->state_log[str_idx]->nodes);
1735
1736   if (dest_nodes->nelem == 0)
1737     sctx->sifted_states[str_idx] = NULL;
1738   else
1739     {
1740       if (candidates)
1741         {
1742           /* At first, add the nodes which can epsilon transit to a node in
1743              DEST_NODE.  */
1744           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1745           if (__glibc_unlikely (err != REG_NOERROR))
1746             return err;
1747
1748           /* Then, check the limitations in the current sift_context.  */
1749           if (sctx->limits.nelem)
1750             {
1751               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1752                                          mctx->bkref_ents, str_idx);
1753               if (__glibc_unlikely (err != REG_NOERROR))
1754                 return err;
1755             }
1756         }
1757
1758       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1759       if (__glibc_unlikely (err != REG_NOERROR))
1760         return err;
1761     }
1762
1763   if (candidates && mctx->state_log[str_idx]->has_backref)
1764     {
1765       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1766       if (__glibc_unlikely (err != REG_NOERROR))
1767         return err;
1768     }
1769   return REG_NOERROR;
1770 }
1771
1772 static reg_errcode_t
1773 __attribute_warn_unused_result__
1774 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1775                        const re_node_set *candidates)
1776 {
1777   reg_errcode_t err = REG_NOERROR;
1778   Idx i;
1779
1780   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1781   if (__glibc_unlikely (err != REG_NOERROR))
1782     return err;
1783
1784   if (!state->inveclosure.alloc)
1785     {
1786       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1787       if (__glibc_unlikely (err != REG_NOERROR))
1788         return REG_ESPACE;
1789       for (i = 0; i < dest_nodes->nelem; i++)
1790         {
1791           err = re_node_set_merge (&state->inveclosure,
1792                                    dfa->inveclosures + dest_nodes->elems[i]);
1793           if (__glibc_unlikely (err != REG_NOERROR))
1794             return REG_ESPACE;
1795         }
1796     }
1797   return re_node_set_add_intersect (dest_nodes, candidates,
1798                                     &state->inveclosure);
1799 }
1800
1801 static reg_errcode_t
1802 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1803                        const re_node_set *candidates)
1804 {
1805     Idx ecl_idx;
1806     reg_errcode_t err;
1807     re_node_set *inv_eclosure = dfa->inveclosures + node;
1808     re_node_set except_nodes;
1809     re_node_set_init_empty (&except_nodes);
1810     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1811       {
1812         Idx cur_node = inv_eclosure->elems[ecl_idx];
1813         if (cur_node == node)
1814           continue;
1815         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1816           {
1817             Idx edst1 = dfa->edests[cur_node].elems[0];
1818             Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1819                          ? dfa->edests[cur_node].elems[1] : -1);
1820             if ((!re_node_set_contains (inv_eclosure, edst1)
1821                  && re_node_set_contains (dest_nodes, edst1))
1822                 || (edst2 > 0
1823                     && !re_node_set_contains (inv_eclosure, edst2)
1824                     && re_node_set_contains (dest_nodes, edst2)))
1825               {
1826                 err = re_node_set_add_intersect (&except_nodes, candidates,
1827                                                  dfa->inveclosures + cur_node);
1828                 if (__glibc_unlikely (err != REG_NOERROR))
1829                   {
1830                     re_node_set_free (&except_nodes);
1831                     return err;
1832                   }
1833               }
1834           }
1835       }
1836     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1837       {
1838         Idx cur_node = inv_eclosure->elems[ecl_idx];
1839         if (!re_node_set_contains (&except_nodes, cur_node))
1840           {
1841             Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1842             re_node_set_remove_at (dest_nodes, idx);
1843           }
1844       }
1845     re_node_set_free (&except_nodes);
1846     return REG_NOERROR;
1847 }
1848
1849 static bool
1850 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1851                   Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1852 {
1853   const re_dfa_t *const dfa = mctx->dfa;
1854   Idx lim_idx, src_pos, dst_pos;
1855
1856   Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1857   Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1858   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1859     {
1860       Idx subexp_idx;
1861       struct re_backref_cache_entry *ent;
1862       ent = mctx->bkref_ents + limits->elems[lim_idx];
1863       subexp_idx = dfa->nodes[ent->node].opr.idx;
1864
1865       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1866                                            subexp_idx, dst_node, dst_idx,
1867                                            dst_bkref_idx);
1868       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1869                                            subexp_idx, src_node, src_idx,
1870                                            src_bkref_idx);
1871
1872       /* In case of:
1873          <src> <dst> ( <subexp> )
1874          ( <subexp> ) <src> <dst>
1875          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1876       if (src_pos == dst_pos)
1877         continue; /* This is unrelated limitation.  */
1878       else
1879         return true;
1880     }
1881   return false;
1882 }
1883
1884 static int
1885 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1886                              Idx subexp_idx, Idx from_node, Idx bkref_idx)
1887 {
1888   const re_dfa_t *const dfa = mctx->dfa;
1889   const re_node_set *eclosures = dfa->eclosures + from_node;
1890   Idx node_idx;
1891
1892   /* Else, we are on the boundary: examine the nodes on the epsilon
1893      closure.  */
1894   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1895     {
1896       Idx node = eclosures->elems[node_idx];
1897       switch (dfa->nodes[node].type)
1898         {
1899         case OP_BACK_REF:
1900           if (bkref_idx != -1)
1901             {
1902               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1903               do
1904                 {
1905                   Idx dst;
1906                   int cpos;
1907
1908                   if (ent->node != node)
1909                     continue;
1910
1911                   if (subexp_idx < BITSET_WORD_BITS
1912                       && !(ent->eps_reachable_subexps_map
1913                            & ((bitset_word_t) 1 << subexp_idx)))
1914                     continue;
1915
1916                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1917                      OP_CLOSE_SUBEXP cases below.  But, if the
1918                      destination node is the same node as the source
1919                      node, don't recurse because it would cause an
1920                      infinite loop: a regex that exhibits this behavior
1921                      is ()\1*\1*  */
1922                   dst = dfa->edests[node].elems[0];
1923                   if (dst == from_node)
1924                     {
1925                       if (boundaries & 1)
1926                         return -1;
1927                       else /* if (boundaries & 2) */
1928                         return 0;
1929                     }
1930
1931                   cpos =
1932                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1933                                                  dst, bkref_idx);
1934                   if (cpos == -1 /* && (boundaries & 1) */)
1935                     return -1;
1936                   if (cpos == 0 && (boundaries & 2))
1937                     return 0;
1938
1939                   if (subexp_idx < BITSET_WORD_BITS)
1940                     ent->eps_reachable_subexps_map
1941                       &= ~((bitset_word_t) 1 << subexp_idx);
1942                 }
1943               while (ent++->more);
1944             }
1945           break;
1946
1947         case OP_OPEN_SUBEXP:
1948           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1949             return -1;
1950           break;
1951
1952         case OP_CLOSE_SUBEXP:
1953           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1954             return 0;
1955           break;
1956
1957         default:
1958             break;
1959         }
1960     }
1961
1962   return (boundaries & 2) ? 1 : 0;
1963 }
1964
1965 static int
1966 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1967                            Idx subexp_idx, Idx from_node, Idx str_idx,
1968                            Idx bkref_idx)
1969 {
1970   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1971   int boundaries;
1972
1973   /* If we are outside the range of the subexpression, return -1 or 1.  */
1974   if (str_idx < lim->subexp_from)
1975     return -1;
1976
1977   if (lim->subexp_to < str_idx)
1978     return 1;
1979
1980   /* If we are within the subexpression, return 0.  */
1981   boundaries = (str_idx == lim->subexp_from);
1982   boundaries |= (str_idx == lim->subexp_to) << 1;
1983   if (boundaries == 0)
1984     return 0;
1985
1986   /* Else, examine epsilon closure.  */
1987   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1988                                       from_node, bkref_idx);
1989 }
1990
1991 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1992    which are against limitations from DEST_NODES. */
1993
1994 static reg_errcode_t
1995 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1996                      const re_node_set *candidates, re_node_set *limits,
1997                      struct re_backref_cache_entry *bkref_ents, Idx str_idx)
1998 {
1999   reg_errcode_t err;
2000   Idx node_idx, lim_idx;
2001
2002   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2003     {
2004       Idx subexp_idx;
2005       struct re_backref_cache_entry *ent;
2006       ent = bkref_ents + limits->elems[lim_idx];
2007
2008       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2009         continue; /* This is unrelated limitation.  */
2010
2011       subexp_idx = dfa->nodes[ent->node].opr.idx;
2012       if (ent->subexp_to == str_idx)
2013         {
2014           Idx ops_node = -1;
2015           Idx cls_node = -1;
2016           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2017             {
2018               Idx node = dest_nodes->elems[node_idx];
2019               re_token_type_t type = dfa->nodes[node].type;
2020               if (type == OP_OPEN_SUBEXP
2021                   && subexp_idx == dfa->nodes[node].opr.idx)
2022                 ops_node = node;
2023               else if (type == OP_CLOSE_SUBEXP
2024                        && subexp_idx == dfa->nodes[node].opr.idx)
2025                 cls_node = node;
2026             }
2027
2028           /* Check the limitation of the open subexpression.  */
2029           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2030           if (ops_node >= 0)
2031             {
2032               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2033                                            candidates);
2034               if (__glibc_unlikely (err != REG_NOERROR))
2035                 return err;
2036             }
2037
2038           /* Check the limitation of the close subexpression.  */
2039           if (cls_node >= 0)
2040             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2041               {
2042                 Idx node = dest_nodes->elems[node_idx];
2043                 if (!re_node_set_contains (dfa->inveclosures + node,
2044                                            cls_node)
2045                     && !re_node_set_contains (dfa->eclosures + node,
2046                                               cls_node))
2047                   {
2048                     /* It is against this limitation.
2049                        Remove it form the current sifted state.  */
2050                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2051                                                  candidates);
2052                     if (__glibc_unlikely (err != REG_NOERROR))
2053                       return err;
2054                     --node_idx;
2055                   }
2056               }
2057         }
2058       else /* (ent->subexp_to != str_idx)  */
2059         {
2060           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2061             {
2062               Idx node = dest_nodes->elems[node_idx];
2063               re_token_type_t type = dfa->nodes[node].type;
2064               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2065                 {
2066                   if (subexp_idx != dfa->nodes[node].opr.idx)
2067                     continue;
2068                   /* It is against this limitation.
2069                      Remove it form the current sifted state.  */
2070                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2071                                                candidates);
2072                   if (__glibc_unlikely (err != REG_NOERROR))
2073                     return err;
2074                 }
2075             }
2076         }
2077     }
2078   return REG_NOERROR;
2079 }
2080
2081 static reg_errcode_t
2082 __attribute_warn_unused_result__
2083 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2084                    Idx str_idx, const re_node_set *candidates)
2085 {
2086   const re_dfa_t *const dfa = mctx->dfa;
2087   reg_errcode_t err;
2088   Idx node_idx, node;
2089   re_sift_context_t local_sctx;
2090   Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2091
2092   if (first_idx == -1)
2093     return REG_NOERROR;
2094
2095   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2096
2097   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2098     {
2099       Idx enabled_idx;
2100       re_token_type_t type;
2101       struct re_backref_cache_entry *entry;
2102       node = candidates->elems[node_idx];
2103       type = dfa->nodes[node].type;
2104       /* Avoid infinite loop for the REs like "()\1+".  */
2105       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2106         continue;
2107       if (type != OP_BACK_REF)
2108         continue;
2109
2110       entry = mctx->bkref_ents + first_idx;
2111       enabled_idx = first_idx;
2112       do
2113         {
2114           Idx subexp_len;
2115           Idx to_idx;
2116           Idx dst_node;
2117           bool ok;
2118           re_dfastate_t *cur_state;
2119
2120           if (entry->node != node)
2121             continue;
2122           subexp_len = entry->subexp_to - entry->subexp_from;
2123           to_idx = str_idx + subexp_len;
2124           dst_node = (subexp_len ? dfa->nexts[node]
2125                       : dfa->edests[node].elems[0]);
2126
2127           if (to_idx > sctx->last_str_idx
2128               || sctx->sifted_states[to_idx] == NULL
2129               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2130               || check_dst_limits (mctx, &sctx->limits, node,
2131                                    str_idx, dst_node, to_idx))
2132             continue;
2133
2134           if (local_sctx.sifted_states == NULL)
2135             {
2136               local_sctx = *sctx;
2137               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2138               if (__glibc_unlikely (err != REG_NOERROR))
2139                 goto free_return;
2140             }
2141           local_sctx.last_node = node;
2142           local_sctx.last_str_idx = str_idx;
2143           ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2144           if (__glibc_unlikely (! ok))
2145             {
2146               err = REG_ESPACE;
2147               goto free_return;
2148             }
2149           cur_state = local_sctx.sifted_states[str_idx];
2150           err = sift_states_backward (mctx, &local_sctx);
2151           if (__glibc_unlikely (err != REG_NOERROR))
2152             goto free_return;
2153           if (sctx->limited_states != NULL)
2154             {
2155               err = merge_state_array (dfa, sctx->limited_states,
2156                                        local_sctx.sifted_states,
2157                                        str_idx + 1);
2158               if (__glibc_unlikely (err != REG_NOERROR))
2159                 goto free_return;
2160             }
2161           local_sctx.sifted_states[str_idx] = cur_state;
2162           re_node_set_remove (&local_sctx.limits, enabled_idx);
2163
2164           /* mctx->bkref_ents may have changed, reload the pointer.  */
2165           entry = mctx->bkref_ents + enabled_idx;
2166         }
2167       while (enabled_idx++, entry++->more);
2168     }
2169   err = REG_NOERROR;
2170  free_return:
2171   if (local_sctx.sifted_states != NULL)
2172     {
2173       re_node_set_free (&local_sctx.limits);
2174     }
2175
2176   return err;
2177 }
2178
2179
2180 #ifdef RE_ENABLE_I18N
2181 static int
2182 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2183                      Idx node_idx, Idx str_idx, Idx max_str_idx)
2184 {
2185   const re_dfa_t *const dfa = mctx->dfa;
2186   int naccepted;
2187   /* Check the node can accept "multi byte".  */
2188   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2189   if (naccepted > 0 && str_idx + naccepted <= max_str_idx
2190       && !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2191                                dfa->nexts[node_idx]))
2192     /* The node can't accept the "multi byte", or the
2193        destination was already thrown away, then the node
2194        couldn't accept the current input "multi byte".   */
2195     naccepted = 0;
2196   /* Otherwise, it is sure that the node could accept
2197      'naccepted' bytes input.  */
2198   return naccepted;
2199 }
2200 #endif /* RE_ENABLE_I18N */
2201
2202 \f
2203 /* Functions for state transition.  */
2204
2205 /* Return the next state to which the current state STATE will transit by
2206    accepting the current input byte, and update STATE_LOG if necessary.
2207    Return NULL on failure.
2208    If STATE can accept a multibyte char/collating element/back reference
2209    update the destination of STATE_LOG.  */
2210
2211 static re_dfastate_t *
2212 __attribute_warn_unused_result__
2213 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2214                re_dfastate_t *state)
2215 {
2216   re_dfastate_t **trtable;
2217   unsigned char ch;
2218
2219 #ifdef RE_ENABLE_I18N
2220   /* If the current state can accept multibyte.  */
2221   if (__glibc_unlikely (state->accept_mb))
2222     {
2223       *err = transit_state_mb (mctx, state);
2224       if (__glibc_unlikely (*err != REG_NOERROR))
2225         return NULL;
2226     }
2227 #endif /* RE_ENABLE_I18N */
2228
2229   /* Then decide the next state with the single byte.  */
2230 #if 0
2231   if (0)
2232     /* don't use transition table  */
2233     return transit_state_sb (err, mctx, state);
2234 #endif
2235
2236   /* Use transition table  */
2237   ch = re_string_fetch_byte (&mctx->input);
2238   for (;;)
2239     {
2240       trtable = state->trtable;
2241       if (__glibc_likely (trtable != NULL))
2242         return trtable[ch];
2243
2244       trtable = state->word_trtable;
2245       if (__glibc_likely (trtable != NULL))
2246         {
2247           unsigned int context;
2248           context
2249             = re_string_context_at (&mctx->input,
2250                                     re_string_cur_idx (&mctx->input) - 1,
2251                                     mctx->eflags);
2252           if (IS_WORD_CONTEXT (context))
2253             return trtable[ch + SBC_MAX];
2254           else
2255             return trtable[ch];
2256         }
2257
2258       if (!build_trtable (mctx->dfa, state))
2259         {
2260           *err = REG_ESPACE;
2261           return NULL;
2262         }
2263
2264       /* Retry, we now have a transition table.  */
2265     }
2266 }
2267
2268 /* Update the state_log if we need */
2269 static re_dfastate_t *
2270 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2271                       re_dfastate_t *next_state)
2272 {
2273   const re_dfa_t *const dfa = mctx->dfa;
2274   Idx cur_idx = re_string_cur_idx (&mctx->input);
2275
2276   if (cur_idx > mctx->state_log_top)
2277     {
2278       mctx->state_log[cur_idx] = next_state;
2279       mctx->state_log_top = cur_idx;
2280     }
2281   else if (mctx->state_log[cur_idx] == 0)
2282     {
2283       mctx->state_log[cur_idx] = next_state;
2284     }
2285   else
2286     {
2287       re_dfastate_t *pstate;
2288       unsigned int context;
2289       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2290       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2291          the destination of a multibyte char/collating element/
2292          back reference.  Then the next state is the union set of
2293          these destinations and the results of the transition table.  */
2294       pstate = mctx->state_log[cur_idx];
2295       log_nodes = pstate->entrance_nodes;
2296       if (next_state != NULL)
2297         {
2298           table_nodes = next_state->entrance_nodes;
2299           *err = re_node_set_init_union (&next_nodes, table_nodes,
2300                                              log_nodes);
2301           if (__glibc_unlikely (*err != REG_NOERROR))
2302             return NULL;
2303         }
2304       else
2305         next_nodes = *log_nodes;
2306       /* Note: We already add the nodes of the initial state,
2307          then we don't need to add them here.  */
2308
2309       context = re_string_context_at (&mctx->input,
2310                                       re_string_cur_idx (&mctx->input) - 1,
2311                                       mctx->eflags);
2312       next_state = mctx->state_log[cur_idx]
2313         = re_acquire_state_context (err, dfa, &next_nodes, context);
2314       /* We don't need to check errors here, since the return value of
2315          this function is next_state and ERR is already set.  */
2316
2317       if (table_nodes != NULL)
2318         re_node_set_free (&next_nodes);
2319     }
2320
2321   if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
2322     {
2323       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2324          later.  We must check them here, since the back references in the
2325          next state might use them.  */
2326       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2327                                         cur_idx);
2328       if (__glibc_unlikely (*err != REG_NOERROR))
2329         return NULL;
2330
2331       /* If the next state has back references.  */
2332       if (next_state->has_backref)
2333         {
2334           *err = transit_state_bkref (mctx, &next_state->nodes);
2335           if (__glibc_unlikely (*err != REG_NOERROR))
2336             return NULL;
2337           next_state = mctx->state_log[cur_idx];
2338         }
2339     }
2340
2341   return next_state;
2342 }
2343
2344 /* Skip bytes in the input that correspond to part of a
2345    multi-byte match, then look in the log for a state
2346    from which to restart matching.  */
2347 static re_dfastate_t *
2348 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2349 {
2350   re_dfastate_t *cur_state;
2351   do
2352     {
2353       Idx max = mctx->state_log_top;
2354       Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2355
2356       do
2357         {
2358           if (++cur_str_idx > max)
2359             return NULL;
2360           re_string_skip_bytes (&mctx->input, 1);
2361         }
2362       while (mctx->state_log[cur_str_idx] == NULL);
2363
2364       cur_state = merge_state_with_log (err, mctx, NULL);
2365     }
2366   while (*err == REG_NOERROR && cur_state == NULL);
2367   return cur_state;
2368 }
2369
2370 /* Helper functions for transit_state.  */
2371
2372 /* From the node set CUR_NODES, pick up the nodes whose types are
2373    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2374    expression. And register them to use them later for evaluating the
2375    corresponding back references.  */
2376
2377 static reg_errcode_t
2378 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2379                            Idx str_idx)
2380 {
2381   const re_dfa_t *const dfa = mctx->dfa;
2382   Idx node_idx;
2383   reg_errcode_t err;
2384
2385   /* TODO: This isn't efficient.
2386            Because there might be more than one nodes whose types are
2387            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2388            nodes.
2389            E.g. RE: (a){2}  */
2390   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2391     {
2392       Idx node = cur_nodes->elems[node_idx];
2393       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2394           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2395           && (dfa->used_bkref_map
2396               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2397         {
2398           err = match_ctx_add_subtop (mctx, node, str_idx);
2399           if (__glibc_unlikely (err != REG_NOERROR))
2400             return err;
2401         }
2402     }
2403   return REG_NOERROR;
2404 }
2405
2406 #if 0
2407 /* Return the next state to which the current state STATE will transit by
2408    accepting the current input byte.  Return NULL on failure.  */
2409
2410 static re_dfastate_t *
2411 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2412                   re_dfastate_t *state)
2413 {
2414   const re_dfa_t *const dfa = mctx->dfa;
2415   re_node_set next_nodes;
2416   re_dfastate_t *next_state;
2417   Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2418   unsigned int context;
2419
2420   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2421   if (__glibc_unlikely (*err != REG_NOERROR))
2422     return NULL;
2423   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2424     {
2425       Idx cur_node = state->nodes.elems[node_cnt];
2426       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2427         {
2428           *err = re_node_set_merge (&next_nodes,
2429                                     dfa->eclosures + dfa->nexts[cur_node]);
2430           if (__glibc_unlikely (*err != REG_NOERROR))
2431             {
2432               re_node_set_free (&next_nodes);
2433               return NULL;
2434             }
2435         }
2436     }
2437   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2438   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2439   /* We don't need to check errors here, since the return value of
2440      this function is next_state and ERR is already set.  */
2441
2442   re_node_set_free (&next_nodes);
2443   re_string_skip_bytes (&mctx->input, 1);
2444   return next_state;
2445 }
2446 #endif
2447
2448 #ifdef RE_ENABLE_I18N
2449 static reg_errcode_t
2450 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2451 {
2452   const re_dfa_t *const dfa = mctx->dfa;
2453   reg_errcode_t err;
2454   Idx i;
2455
2456   for (i = 0; i < pstate->nodes.nelem; ++i)
2457     {
2458       re_node_set dest_nodes, *new_nodes;
2459       Idx cur_node_idx = pstate->nodes.elems[i];
2460       int naccepted;
2461       Idx dest_idx;
2462       unsigned int context;
2463       re_dfastate_t *dest_state;
2464
2465       if (!dfa->nodes[cur_node_idx].accept_mb)
2466         continue;
2467
2468       if (dfa->nodes[cur_node_idx].constraint)
2469         {
2470           context = re_string_context_at (&mctx->input,
2471                                           re_string_cur_idx (&mctx->input),
2472                                           mctx->eflags);
2473           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2474                                            context))
2475             continue;
2476         }
2477
2478       /* How many bytes the node can accept?  */
2479       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2480                                            re_string_cur_idx (&mctx->input));
2481       if (naccepted == 0)
2482         continue;
2483
2484       /* The node can accepts 'naccepted' bytes.  */
2485       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2486       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2487                                : mctx->max_mb_elem_len);
2488       err = clean_state_log_if_needed (mctx, dest_idx);
2489       if (__glibc_unlikely (err != REG_NOERROR))
2490         return err;
2491       DEBUG_ASSERT (dfa->nexts[cur_node_idx] != -1);
2492       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2493
2494       dest_state = mctx->state_log[dest_idx];
2495       if (dest_state == NULL)
2496         dest_nodes = *new_nodes;
2497       else
2498         {
2499           err = re_node_set_init_union (&dest_nodes,
2500                                         dest_state->entrance_nodes, new_nodes);
2501           if (__glibc_unlikely (err != REG_NOERROR))
2502             return err;
2503         }
2504       context = re_string_context_at (&mctx->input, dest_idx - 1,
2505                                       mctx->eflags);
2506       mctx->state_log[dest_idx]
2507         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2508       if (dest_state != NULL)
2509         re_node_set_free (&dest_nodes);
2510       if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
2511                             && err != REG_NOERROR))
2512         return err;
2513     }
2514   return REG_NOERROR;
2515 }
2516 #endif /* RE_ENABLE_I18N */
2517
2518 static reg_errcode_t
2519 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2520 {
2521   const re_dfa_t *const dfa = mctx->dfa;
2522   reg_errcode_t err;
2523   Idx i;
2524   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2525
2526   for (i = 0; i < nodes->nelem; ++i)
2527     {
2528       Idx dest_str_idx, prev_nelem, bkc_idx;
2529       Idx node_idx = nodes->elems[i];
2530       unsigned int context;
2531       const re_token_t *node = dfa->nodes + node_idx;
2532       re_node_set *new_dest_nodes;
2533
2534       /* Check whether 'node' is a backreference or not.  */
2535       if (node->type != OP_BACK_REF)
2536         continue;
2537
2538       if (node->constraint)
2539         {
2540           context = re_string_context_at (&mctx->input, cur_str_idx,
2541                                           mctx->eflags);
2542           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2543             continue;
2544         }
2545
2546       /* 'node' is a backreference.
2547          Check the substring which the substring matched.  */
2548       bkc_idx = mctx->nbkref_ents;
2549       err = get_subexp (mctx, node_idx, cur_str_idx);
2550       if (__glibc_unlikely (err != REG_NOERROR))
2551         goto free_return;
2552
2553       /* And add the epsilon closures (which is 'new_dest_nodes') of
2554          the backreference to appropriate state_log.  */
2555       DEBUG_ASSERT (dfa->nexts[node_idx] != -1);
2556       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2557         {
2558           Idx subexp_len;
2559           re_dfastate_t *dest_state;
2560           struct re_backref_cache_entry *bkref_ent;
2561           bkref_ent = mctx->bkref_ents + bkc_idx;
2562           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2563             continue;
2564           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2565           new_dest_nodes = (subexp_len == 0
2566                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2567                             : dfa->eclosures + dfa->nexts[node_idx]);
2568           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2569                           - bkref_ent->subexp_from);
2570           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2571                                           mctx->eflags);
2572           dest_state = mctx->state_log[dest_str_idx];
2573           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2574                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2575           /* Add 'new_dest_node' to state_log.  */
2576           if (dest_state == NULL)
2577             {
2578               mctx->state_log[dest_str_idx]
2579                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2580                                             context);
2581               if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2582                                     && err != REG_NOERROR))
2583                 goto free_return;
2584             }
2585           else
2586             {
2587               re_node_set dest_nodes;
2588               err = re_node_set_init_union (&dest_nodes,
2589                                             dest_state->entrance_nodes,
2590                                             new_dest_nodes);
2591               if (__glibc_unlikely (err != REG_NOERROR))
2592                 {
2593                   re_node_set_free (&dest_nodes);
2594                   goto free_return;
2595                 }
2596               mctx->state_log[dest_str_idx]
2597                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2598               re_node_set_free (&dest_nodes);
2599               if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2600                                     && err != REG_NOERROR))
2601                 goto free_return;
2602             }
2603           /* We need to check recursively if the backreference can epsilon
2604              transit.  */
2605           if (subexp_len == 0
2606               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2607             {
2608               err = check_subexp_matching_top (mctx, new_dest_nodes,
2609                                                cur_str_idx);
2610               if (__glibc_unlikely (err != REG_NOERROR))
2611                 goto free_return;
2612               err = transit_state_bkref (mctx, new_dest_nodes);
2613               if (__glibc_unlikely (err != REG_NOERROR))
2614                 goto free_return;
2615             }
2616         }
2617     }
2618   err = REG_NOERROR;
2619  free_return:
2620   return err;
2621 }
2622
2623 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2624    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2625    Note that we might collect inappropriate candidates here.
2626    However, the cost of checking them strictly here is too high, then we
2627    delay these checking for prune_impossible_nodes().  */
2628
2629 static reg_errcode_t
2630 __attribute_warn_unused_result__
2631 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2632 {
2633   const re_dfa_t *const dfa = mctx->dfa;
2634   Idx subexp_num, sub_top_idx;
2635   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2636   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2637   Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2638   if (cache_idx != -1)
2639     {
2640       const struct re_backref_cache_entry *entry
2641         = mctx->bkref_ents + cache_idx;
2642       do
2643         if (entry->node == bkref_node)
2644           return REG_NOERROR; /* We already checked it.  */
2645       while (entry++->more);
2646     }
2647
2648   subexp_num = dfa->nodes[bkref_node].opr.idx;
2649
2650   /* For each sub expression  */
2651   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2652     {
2653       reg_errcode_t err;
2654       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2655       re_sub_match_last_t *sub_last;
2656       Idx sub_last_idx, sl_str, bkref_str_off;
2657
2658       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2659         continue; /* It isn't related.  */
2660
2661       sl_str = sub_top->str_idx;
2662       bkref_str_off = bkref_str_idx;
2663       /* At first, check the last node of sub expressions we already
2664          evaluated.  */
2665       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2666         {
2667           regoff_t sl_str_diff;
2668           sub_last = sub_top->lasts[sub_last_idx];
2669           sl_str_diff = sub_last->str_idx - sl_str;
2670           /* The matched string by the sub expression match with the substring
2671              at the back reference?  */
2672           if (sl_str_diff > 0)
2673             {
2674               if (__glibc_unlikely (bkref_str_off + sl_str_diff
2675                                     > mctx->input.valid_len))
2676                 {
2677                   /* Not enough chars for a successful match.  */
2678                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2679                     break;
2680
2681                   err = clean_state_log_if_needed (mctx,
2682                                                    bkref_str_off
2683                                                    + sl_str_diff);
2684                   if (__glibc_unlikely (err != REG_NOERROR))
2685                     return err;
2686                   buf = (const char *) re_string_get_buffer (&mctx->input);
2687                 }
2688               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2689                 /* We don't need to search this sub expression any more.  */
2690                 break;
2691             }
2692           bkref_str_off += sl_str_diff;
2693           sl_str += sl_str_diff;
2694           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2695                                 bkref_str_idx);
2696
2697           /* Reload buf, since the preceding call might have reallocated
2698              the buffer.  */
2699           buf = (const char *) re_string_get_buffer (&mctx->input);
2700
2701           if (err == REG_NOMATCH)
2702             continue;
2703           if (__glibc_unlikely (err != REG_NOERROR))
2704             return err;
2705         }
2706
2707       if (sub_last_idx < sub_top->nlasts)
2708         continue;
2709       if (sub_last_idx > 0)
2710         ++sl_str;
2711       /* Then, search for the other last nodes of the sub expression.  */
2712       for (; sl_str <= bkref_str_idx; ++sl_str)
2713         {
2714           Idx cls_node;
2715           regoff_t sl_str_off;
2716           const re_node_set *nodes;
2717           sl_str_off = sl_str - sub_top->str_idx;
2718           /* The matched string by the sub expression match with the substring
2719              at the back reference?  */
2720           if (sl_str_off > 0)
2721             {
2722               if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
2723                 {
2724                   /* If we are at the end of the input, we cannot match.  */
2725                   if (bkref_str_off >= mctx->input.len)
2726                     break;
2727
2728                   err = extend_buffers (mctx, bkref_str_off + 1);
2729                   if (__glibc_unlikely (err != REG_NOERROR))
2730                     return err;
2731
2732                   buf = (const char *) re_string_get_buffer (&mctx->input);
2733                 }
2734               if (buf [bkref_str_off++] != buf[sl_str - 1])
2735                 break; /* We don't need to search this sub expression
2736                           any more.  */
2737             }
2738           if (mctx->state_log[sl_str] == NULL)
2739             continue;
2740           /* Does this state have a ')' of the sub expression?  */
2741           nodes = &mctx->state_log[sl_str]->nodes;
2742           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2743                                        OP_CLOSE_SUBEXP);
2744           if (cls_node == -1)
2745             continue; /* No.  */
2746           if (sub_top->path == NULL)
2747             {
2748               sub_top->path = calloc (sizeof (state_array_t),
2749                                       sl_str - sub_top->str_idx + 1);
2750               if (sub_top->path == NULL)
2751                 return REG_ESPACE;
2752             }
2753           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2754              in the current context?  */
2755           err = check_arrival (mctx, sub_top->path, sub_top->node,
2756                                sub_top->str_idx, cls_node, sl_str,
2757                                OP_CLOSE_SUBEXP);
2758           if (err == REG_NOMATCH)
2759               continue;
2760           if (__glibc_unlikely (err != REG_NOERROR))
2761               return err;
2762           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2763           if (__glibc_unlikely (sub_last == NULL))
2764             return REG_ESPACE;
2765           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2766                                 bkref_str_idx);
2767           buf = (const char *) re_string_get_buffer (&mctx->input);
2768           if (err == REG_NOMATCH)
2769             continue;
2770           if (__glibc_unlikely (err != REG_NOERROR))
2771             return err;
2772         }
2773     }
2774   return REG_NOERROR;
2775 }
2776
2777 /* Helper functions for get_subexp().  */
2778
2779 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2780    If it can arrive, register the sub expression expressed with SUB_TOP
2781    and SUB_LAST.  */
2782
2783 static reg_errcode_t
2784 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2785                 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2786 {
2787   reg_errcode_t err;
2788   Idx to_idx;
2789   /* Can the subexpression arrive the back reference?  */
2790   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2791                        sub_last->str_idx, bkref_node, bkref_str,
2792                        OP_OPEN_SUBEXP);
2793   if (err != REG_NOERROR)
2794     return err;
2795   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2796                              sub_last->str_idx);
2797   if (__glibc_unlikely (err != REG_NOERROR))
2798     return err;
2799   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2800   return clean_state_log_if_needed (mctx, to_idx);
2801 }
2802
2803 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2804    Search '(' if FL_OPEN, or search ')' otherwise.
2805    TODO: This function isn't efficient...
2806          Because there might be more than one nodes whose types are
2807          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2808          nodes.
2809          E.g. RE: (a){2}  */
2810
2811 static Idx
2812 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2813                   Idx subexp_idx, int type)
2814 {
2815   Idx cls_idx;
2816   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2817     {
2818       Idx cls_node = nodes->elems[cls_idx];
2819       const re_token_t *node = dfa->nodes + cls_node;
2820       if (node->type == type
2821           && node->opr.idx == subexp_idx)
2822         return cls_node;
2823     }
2824   return -1;
2825 }
2826
2827 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2828    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2829    heavily reused.
2830    Return REG_NOERROR if it can arrive, REG_NOMATCH if it cannot,
2831    REG_ESPACE if memory is exhausted.  */
2832
2833 static reg_errcode_t
2834 __attribute_warn_unused_result__
2835 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2836                Idx top_str, Idx last_node, Idx last_str, int type)
2837 {
2838   const re_dfa_t *const dfa = mctx->dfa;
2839   reg_errcode_t err = REG_NOERROR;
2840   Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2841   re_dfastate_t *cur_state = NULL;
2842   re_node_set *cur_nodes, next_nodes;
2843   re_dfastate_t **backup_state_log;
2844   unsigned int context;
2845
2846   subexp_num = dfa->nodes[top_node].opr.idx;
2847   /* Extend the buffer if we need.  */
2848   if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
2849     {
2850       re_dfastate_t **new_array;
2851       Idx old_alloc = path->alloc;
2852       Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2853       Idx new_alloc;
2854       if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
2855         return REG_ESPACE;
2856       new_alloc = old_alloc + incr_alloc;
2857       if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
2858         return REG_ESPACE;
2859       new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2860       if (__glibc_unlikely (new_array == NULL))
2861         return REG_ESPACE;
2862       path->array = new_array;
2863       path->alloc = new_alloc;
2864       memset (new_array + old_alloc, '\0',
2865               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2866     }
2867
2868   str_idx = path->next_idx ? path->next_idx : top_str;
2869
2870   /* Temporary modify MCTX.  */
2871   backup_state_log = mctx->state_log;
2872   backup_cur_idx = mctx->input.cur_idx;
2873   mctx->state_log = path->array;
2874   mctx->input.cur_idx = str_idx;
2875
2876   /* Setup initial node set.  */
2877   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2878   if (str_idx == top_str)
2879     {
2880       err = re_node_set_init_1 (&next_nodes, top_node);
2881       if (__glibc_unlikely (err != REG_NOERROR))
2882         return err;
2883       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2884       if (__glibc_unlikely (err != REG_NOERROR))
2885         {
2886           re_node_set_free (&next_nodes);
2887           return err;
2888         }
2889     }
2890   else
2891     {
2892       cur_state = mctx->state_log[str_idx];
2893       if (cur_state && cur_state->has_backref)
2894         {
2895           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2896           if (__glibc_unlikely (err != REG_NOERROR))
2897             return err;
2898         }
2899       else
2900         re_node_set_init_empty (&next_nodes);
2901     }
2902   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2903     {
2904       if (next_nodes.nelem)
2905         {
2906           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2907                                     subexp_num, type);
2908           if (__glibc_unlikely (err != REG_NOERROR))
2909             {
2910               re_node_set_free (&next_nodes);
2911               return err;
2912             }
2913         }
2914       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2915       if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2916         {
2917           re_node_set_free (&next_nodes);
2918           return err;
2919         }
2920       mctx->state_log[str_idx] = cur_state;
2921     }
2922
2923   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2924     {
2925       re_node_set_empty (&next_nodes);
2926       if (mctx->state_log[str_idx + 1])
2927         {
2928           err = re_node_set_merge (&next_nodes,
2929                                    &mctx->state_log[str_idx + 1]->nodes);
2930           if (__glibc_unlikely (err != REG_NOERROR))
2931             {
2932               re_node_set_free (&next_nodes);
2933               return err;
2934             }
2935         }
2936       if (cur_state)
2937         {
2938           err = check_arrival_add_next_nodes (mctx, str_idx,
2939                                               &cur_state->non_eps_nodes,
2940                                               &next_nodes);
2941           if (__glibc_unlikely (err != REG_NOERROR))
2942             {
2943               re_node_set_free (&next_nodes);
2944               return err;
2945             }
2946         }
2947       ++str_idx;
2948       if (next_nodes.nelem)
2949         {
2950           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2951           if (__glibc_unlikely (err != REG_NOERROR))
2952             {
2953               re_node_set_free (&next_nodes);
2954               return err;
2955             }
2956           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2957                                     subexp_num, type);
2958           if (__glibc_unlikely (err != REG_NOERROR))
2959             {
2960               re_node_set_free (&next_nodes);
2961               return err;
2962             }
2963         }
2964       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2965       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2966       if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2967         {
2968           re_node_set_free (&next_nodes);
2969           return err;
2970         }
2971       mctx->state_log[str_idx] = cur_state;
2972       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2973     }
2974   re_node_set_free (&next_nodes);
2975   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2976                : &mctx->state_log[last_str]->nodes);
2977   path->next_idx = str_idx;
2978
2979   /* Fix MCTX.  */
2980   mctx->state_log = backup_state_log;
2981   mctx->input.cur_idx = backup_cur_idx;
2982
2983   /* Then check the current node set has the node LAST_NODE.  */
2984   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2985     return REG_NOERROR;
2986
2987   return REG_NOMATCH;
2988 }
2989
2990 /* Helper functions for check_arrival.  */
2991
2992 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2993    to NEXT_NODES.
2994    TODO: This function is similar to the functions transit_state*(),
2995          however this function has many additional works.
2996          Can't we unify them?  */
2997
2998 static reg_errcode_t
2999 __attribute_warn_unused_result__
3000 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3001                               re_node_set *cur_nodes, re_node_set *next_nodes)
3002 {
3003   const re_dfa_t *const dfa = mctx->dfa;
3004   bool ok;
3005   Idx cur_idx;
3006 #ifdef RE_ENABLE_I18N
3007   reg_errcode_t err = REG_NOERROR;
3008 #endif
3009   re_node_set union_set;
3010   re_node_set_init_empty (&union_set);
3011   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3012     {
3013       int naccepted = 0;
3014       Idx cur_node = cur_nodes->elems[cur_idx];
3015       DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[cur_node].type));
3016
3017 #ifdef RE_ENABLE_I18N
3018       /* If the node may accept "multi byte".  */
3019       if (dfa->nodes[cur_node].accept_mb)
3020         {
3021           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3022                                                str_idx);
3023           if (naccepted > 1)
3024             {
3025               re_dfastate_t *dest_state;
3026               Idx next_node = dfa->nexts[cur_node];
3027               Idx next_idx = str_idx + naccepted;
3028               dest_state = mctx->state_log[next_idx];
3029               re_node_set_empty (&union_set);
3030               if (dest_state)
3031                 {
3032                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3033                   if (__glibc_unlikely (err != REG_NOERROR))
3034                     {
3035                       re_node_set_free (&union_set);
3036                       return err;
3037                     }
3038                 }
3039               ok = re_node_set_insert (&union_set, next_node);
3040               if (__glibc_unlikely (! ok))
3041                 {
3042                   re_node_set_free (&union_set);
3043                   return REG_ESPACE;
3044                 }
3045               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3046                                                             &union_set);
3047               if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
3048                                     && err != REG_NOERROR))
3049                 {
3050                   re_node_set_free (&union_set);
3051                   return err;
3052                 }
3053             }
3054         }
3055 #endif /* RE_ENABLE_I18N */
3056       if (naccepted
3057           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3058         {
3059           ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3060           if (__glibc_unlikely (! ok))
3061             {
3062               re_node_set_free (&union_set);
3063               return REG_ESPACE;
3064             }
3065         }
3066     }
3067   re_node_set_free (&union_set);
3068   return REG_NOERROR;
3069 }
3070
3071 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3072    CUR_NODES, however exclude the nodes which are:
3073     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3074     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3075 */
3076
3077 static reg_errcode_t
3078 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3079                           Idx ex_subexp, int type)
3080 {
3081   reg_errcode_t err;
3082   Idx idx, outside_node;
3083   re_node_set new_nodes;
3084   DEBUG_ASSERT (cur_nodes->nelem);
3085   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3086   if (__glibc_unlikely (err != REG_NOERROR))
3087     return err;
3088   /* Create a new node set NEW_NODES with the nodes which are epsilon
3089      closures of the node in CUR_NODES.  */
3090
3091   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3092     {
3093       Idx cur_node = cur_nodes->elems[idx];
3094       const re_node_set *eclosure = dfa->eclosures + cur_node;
3095       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3096       if (outside_node == -1)
3097         {
3098           /* There are no problematic nodes, just merge them.  */
3099           err = re_node_set_merge (&new_nodes, eclosure);
3100           if (__glibc_unlikely (err != REG_NOERROR))
3101             {
3102               re_node_set_free (&new_nodes);
3103               return err;
3104             }
3105         }
3106       else
3107         {
3108           /* There are problematic nodes, re-calculate incrementally.  */
3109           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3110                                               ex_subexp, type);
3111           if (__glibc_unlikely (err != REG_NOERROR))
3112             {
3113               re_node_set_free (&new_nodes);
3114               return err;
3115             }
3116         }
3117     }
3118   re_node_set_free (cur_nodes);
3119   *cur_nodes = new_nodes;
3120   return REG_NOERROR;
3121 }
3122
3123 /* Helper function for check_arrival_expand_ecl.
3124    Check incrementally the epsilon closure of TARGET, and if it isn't
3125    problematic append it to DST_NODES.  */
3126
3127 static reg_errcode_t
3128 __attribute_warn_unused_result__
3129 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3130                               Idx target, Idx ex_subexp, int type)
3131 {
3132   Idx cur_node;
3133   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3134     {
3135       bool ok;
3136
3137       if (dfa->nodes[cur_node].type == type
3138           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3139         {
3140           if (type == OP_CLOSE_SUBEXP)
3141             {
3142               ok = re_node_set_insert (dst_nodes, cur_node);
3143               if (__glibc_unlikely (! ok))
3144                 return REG_ESPACE;
3145             }
3146           break;
3147         }
3148       ok = re_node_set_insert (dst_nodes, cur_node);
3149       if (__glibc_unlikely (! ok))
3150         return REG_ESPACE;
3151       if (dfa->edests[cur_node].nelem == 0)
3152         break;
3153       if (dfa->edests[cur_node].nelem == 2)
3154         {
3155           reg_errcode_t err;
3156           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3157                                               dfa->edests[cur_node].elems[1],
3158                                               ex_subexp, type);
3159           if (__glibc_unlikely (err != REG_NOERROR))
3160             return err;
3161         }
3162       cur_node = dfa->edests[cur_node].elems[0];
3163     }
3164   return REG_NOERROR;
3165 }
3166
3167
3168 /* For all the back references in the current state, calculate the
3169    destination of the back references by the appropriate entry
3170    in MCTX->BKREF_ENTS.  */
3171
3172 static reg_errcode_t
3173 __attribute_warn_unused_result__
3174 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3175                     Idx cur_str, Idx subexp_num, int type)
3176 {
3177   const re_dfa_t *const dfa = mctx->dfa;
3178   reg_errcode_t err;
3179   Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3180   struct re_backref_cache_entry *ent;
3181
3182   if (cache_idx_start == -1)
3183     return REG_NOERROR;
3184
3185  restart:
3186   ent = mctx->bkref_ents + cache_idx_start;
3187   do
3188     {
3189       Idx to_idx, next_node;
3190
3191       /* Is this entry ENT is appropriate?  */
3192       if (!re_node_set_contains (cur_nodes, ent->node))
3193         continue; /* No.  */
3194
3195       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3196       /* Calculate the destination of the back reference, and append it
3197          to MCTX->STATE_LOG.  */
3198       if (to_idx == cur_str)
3199         {
3200           /* The backreference did epsilon transit, we must re-check all the
3201              node in the current state.  */
3202           re_node_set new_dests;
3203           reg_errcode_t err2, err3;
3204           next_node = dfa->edests[ent->node].elems[0];
3205           if (re_node_set_contains (cur_nodes, next_node))
3206             continue;
3207           err = re_node_set_init_1 (&new_dests, next_node);
3208           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3209           err3 = re_node_set_merge (cur_nodes, &new_dests);
3210           re_node_set_free (&new_dests);
3211           if (__glibc_unlikely (err != REG_NOERROR || err2 != REG_NOERROR
3212                                 || err3 != REG_NOERROR))
3213             {
3214               err = (err != REG_NOERROR ? err
3215                      : (err2 != REG_NOERROR ? err2 : err3));
3216               return err;
3217             }
3218           /* TODO: It is still inefficient...  */
3219           goto restart;
3220         }
3221       else
3222         {
3223           re_node_set union_set;
3224           next_node = dfa->nexts[ent->node];
3225           if (mctx->state_log[to_idx])
3226             {
3227               bool ok;
3228               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3229                                         next_node))
3230                 continue;
3231               err = re_node_set_init_copy (&union_set,
3232                                            &mctx->state_log[to_idx]->nodes);
3233               ok = re_node_set_insert (&union_set, next_node);
3234               if (__glibc_unlikely (err != REG_NOERROR || ! ok))
3235                 {
3236                   re_node_set_free (&union_set);
3237                   err = err != REG_NOERROR ? err : REG_ESPACE;
3238                   return err;
3239                 }
3240             }
3241           else
3242             {
3243               err = re_node_set_init_1 (&union_set, next_node);
3244               if (__glibc_unlikely (err != REG_NOERROR))
3245                 return err;
3246             }
3247           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3248           re_node_set_free (&union_set);
3249           if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
3250                                 && err != REG_NOERROR))
3251             return err;
3252         }
3253     }
3254   while (ent++->more);
3255   return REG_NOERROR;
3256 }
3257
3258 /* Build transition table for the state.
3259    Return true if successful.  */
3260
3261 static bool __attribute_noinline__
3262 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3263 {
3264   reg_errcode_t err;
3265   Idx i, j;
3266   int ch;
3267   bool need_word_trtable = false;
3268   bitset_word_t elem, mask;
3269   Idx ndests; /* Number of the destination states from 'state'.  */
3270   re_dfastate_t **trtable;
3271   re_dfastate_t *dest_states[SBC_MAX];
3272   re_dfastate_t *dest_states_word[SBC_MAX];
3273   re_dfastate_t *dest_states_nl[SBC_MAX];
3274   re_node_set follows;
3275   bitset_t acceptable;
3276
3277   /* We build DFA states which corresponds to the destination nodes
3278      from 'state'.  'dests_node[i]' represents the nodes which i-th
3279      destination state contains, and 'dests_ch[i]' represents the
3280      characters which i-th destination state accepts.  */
3281   re_node_set dests_node[SBC_MAX];
3282   bitset_t dests_ch[SBC_MAX];
3283
3284   /* Initialize transition table.  */
3285   state->word_trtable = state->trtable = NULL;
3286
3287   /* At first, group all nodes belonging to 'state' into several
3288      destinations.  */
3289   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3290   if (__glibc_unlikely (ndests <= 0))
3291     {
3292       /* Return false in case of an error, true otherwise.  */
3293       if (ndests == 0)
3294         {
3295           state->trtable = (re_dfastate_t **)
3296             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3297           if (__glibc_unlikely (state->trtable == NULL))
3298             return false;
3299           return true;
3300         }
3301       return false;
3302     }
3303
3304   err = re_node_set_alloc (&follows, ndests + 1);
3305   if (__glibc_unlikely (err != REG_NOERROR))
3306     {
3307     out_free:
3308       re_node_set_free (&follows);
3309       for (i = 0; i < ndests; ++i)
3310         re_node_set_free (dests_node + i);
3311       return false;
3312     }
3313
3314   bitset_empty (acceptable);
3315
3316   /* Then build the states for all destinations.  */
3317   for (i = 0; i < ndests; ++i)
3318     {
3319       Idx next_node;
3320       re_node_set_empty (&follows);
3321       /* Merge the follows of this destination states.  */
3322       for (j = 0; j < dests_node[i].nelem; ++j)
3323         {
3324           next_node = dfa->nexts[dests_node[i].elems[j]];
3325           if (next_node != -1)
3326             {
3327               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3328               if (__glibc_unlikely (err != REG_NOERROR))
3329                 goto out_free;
3330             }
3331         }
3332       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3333       if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
3334         goto out_free;
3335       /* If the new state has context constraint,
3336          build appropriate states for these contexts.  */
3337       if (dest_states[i]->has_constraint)
3338         {
3339           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3340                                                           CONTEXT_WORD);
3341           if (__glibc_unlikely (dest_states_word[i] == NULL
3342                                 && err != REG_NOERROR))
3343             goto out_free;
3344
3345           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3346             need_word_trtable = true;
3347
3348           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3349                                                         CONTEXT_NEWLINE);
3350           if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
3351             goto out_free;
3352         }
3353       else
3354         {
3355           dest_states_word[i] = dest_states[i];
3356           dest_states_nl[i] = dest_states[i];
3357         }
3358       bitset_merge (acceptable, dests_ch[i]);
3359     }
3360
3361   if (!__glibc_unlikely (need_word_trtable))
3362     {
3363       /* We don't care about whether the following character is a word
3364          character, or we are in a single-byte character set so we can
3365          discern by looking at the character code: allocate a
3366          256-entry transition table.  */
3367       trtable = state->trtable =
3368         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3369       if (__glibc_unlikely (trtable == NULL))
3370         goto out_free;
3371
3372       /* For all characters ch...:  */
3373       for (i = 0; i < BITSET_WORDS; ++i)
3374         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3375              elem;
3376              mask <<= 1, elem >>= 1, ++ch)
3377           if (__glibc_unlikely (elem & 1))
3378             {
3379               /* There must be exactly one destination which accepts
3380                  character ch.  See group_nodes_into_DFAstates.  */
3381               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3382                 ;
3383
3384               /* j-th destination accepts the word character ch.  */
3385               if (dfa->word_char[i] & mask)
3386                 trtable[ch] = dest_states_word[j];
3387               else
3388                 trtable[ch] = dest_states[j];
3389             }
3390     }
3391   else
3392     {
3393       /* We care about whether the following character is a word
3394          character, and we are in a multi-byte character set: discern
3395          by looking at the character code: build two 256-entry
3396          transition tables, one starting at trtable[0] and one
3397          starting at trtable[SBC_MAX].  */
3398       trtable = state->word_trtable =
3399         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3400       if (__glibc_unlikely (trtable == NULL))
3401         goto out_free;
3402
3403       /* For all characters ch...:  */
3404       for (i = 0; i < BITSET_WORDS; ++i)
3405         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3406              elem;
3407              mask <<= 1, elem >>= 1, ++ch)
3408           if (__glibc_unlikely (elem & 1))
3409             {
3410               /* There must be exactly one destination which accepts
3411                  character ch.  See group_nodes_into_DFAstates.  */
3412               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3413                 ;
3414
3415               /* j-th destination accepts the word character ch.  */
3416               trtable[ch] = dest_states[j];
3417               trtable[ch + SBC_MAX] = dest_states_word[j];
3418             }
3419     }
3420
3421   /* new line */
3422   if (bitset_contain (acceptable, NEWLINE_CHAR))
3423     {
3424       /* The current state accepts newline character.  */
3425       for (j = 0; j < ndests; ++j)
3426         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3427           {
3428             /* k-th destination accepts newline character.  */
3429             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3430             if (need_word_trtable)
3431               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3432             /* There must be only one destination which accepts
3433                newline.  See group_nodes_into_DFAstates.  */
3434             break;
3435           }
3436     }
3437
3438   re_node_set_free (&follows);
3439   for (i = 0; i < ndests; ++i)
3440     re_node_set_free (dests_node + i);
3441   return true;
3442 }
3443
3444 /* Group all nodes belonging to STATE into several destinations.
3445    Then for all destinations, set the nodes belonging to the destination
3446    to DESTS_NODE[i] and set the characters accepted by the destination
3447    to DEST_CH[i].  Return the number of destinations if successful,
3448    -1 on internal error.  */
3449
3450 static Idx
3451 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3452                             re_node_set *dests_node, bitset_t *dests_ch)
3453 {
3454   reg_errcode_t err;
3455   bool ok;
3456   Idx i, j, k;
3457   Idx ndests; /* Number of the destinations from 'state'.  */
3458   bitset_t accepts; /* Characters a node can accept.  */
3459   const re_node_set *cur_nodes = &state->nodes;
3460   bitset_empty (accepts);
3461   ndests = 0;
3462
3463   /* For all the nodes belonging to 'state',  */
3464   for (i = 0; i < cur_nodes->nelem; ++i)
3465     {
3466       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3467       re_token_type_t type = node->type;
3468       unsigned int constraint = node->constraint;
3469
3470       /* Enumerate all single byte character this node can accept.  */
3471       if (type == CHARACTER)
3472         bitset_set (accepts, node->opr.c);
3473       else if (type == SIMPLE_BRACKET)
3474         {
3475           bitset_merge (accepts, node->opr.sbcset);
3476         }
3477       else if (type == OP_PERIOD)
3478         {
3479 #ifdef RE_ENABLE_I18N
3480           if (dfa->mb_cur_max > 1)
3481             bitset_merge (accepts, dfa->sb_char);
3482           else
3483 #endif
3484             bitset_set_all (accepts);
3485           if (!(dfa->syntax & RE_DOT_NEWLINE))
3486             bitset_clear (accepts, '\n');
3487           if (dfa->syntax & RE_DOT_NOT_NULL)
3488             bitset_clear (accepts, '\0');
3489         }
3490 #ifdef RE_ENABLE_I18N
3491       else if (type == OP_UTF8_PERIOD)
3492         {
3493           if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3494             memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3495           else
3496             bitset_merge (accepts, utf8_sb_map);
3497           if (!(dfa->syntax & RE_DOT_NEWLINE))
3498             bitset_clear (accepts, '\n');
3499           if (dfa->syntax & RE_DOT_NOT_NULL)
3500             bitset_clear (accepts, '\0');
3501         }
3502 #endif
3503       else
3504         continue;
3505
3506       /* Check the 'accepts' and sift the characters which are not
3507          match it the context.  */
3508       if (constraint)
3509         {
3510           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3511             {
3512               bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3513               bitset_empty (accepts);
3514               if (accepts_newline)
3515                 bitset_set (accepts, NEWLINE_CHAR);
3516               else
3517                 continue;
3518             }
3519           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3520             {
3521               bitset_empty (accepts);
3522               continue;
3523             }
3524
3525           if (constraint & NEXT_WORD_CONSTRAINT)
3526             {
3527               bitset_word_t any_set = 0;
3528               if (type == CHARACTER && !node->word_char)
3529                 {
3530                   bitset_empty (accepts);
3531                   continue;
3532                 }
3533 #ifdef RE_ENABLE_I18N
3534               if (dfa->mb_cur_max > 1)
3535                 for (j = 0; j < BITSET_WORDS; ++j)
3536                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3537               else
3538 #endif
3539                 for (j = 0; j < BITSET_WORDS; ++j)
3540                   any_set |= (accepts[j] &= dfa->word_char[j]);
3541               if (!any_set)
3542                 continue;
3543             }
3544           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3545             {
3546               bitset_word_t any_set = 0;
3547               if (type == CHARACTER && node->word_char)
3548                 {
3549                   bitset_empty (accepts);
3550                   continue;
3551                 }
3552 #ifdef RE_ENABLE_I18N
3553               if (dfa->mb_cur_max > 1)
3554                 for (j = 0; j < BITSET_WORDS; ++j)
3555                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3556               else
3557 #endif
3558                 for (j = 0; j < BITSET_WORDS; ++j)
3559                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3560               if (!any_set)
3561                 continue;
3562             }
3563         }
3564
3565       /* Then divide 'accepts' into DFA states, or create a new
3566          state.  Above, we make sure that accepts is not empty.  */
3567       for (j = 0; j < ndests; ++j)
3568         {
3569           bitset_t intersec; /* Intersection sets, see below.  */
3570           bitset_t remains;
3571           /* Flags, see below.  */
3572           bitset_word_t has_intersec, not_subset, not_consumed;
3573
3574           /* Optimization, skip if this state doesn't accept the character.  */
3575           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3576             continue;
3577
3578           /* Enumerate the intersection set of this state and 'accepts'.  */
3579           has_intersec = 0;
3580           for (k = 0; k < BITSET_WORDS; ++k)
3581             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3582           /* And skip if the intersection set is empty.  */
3583           if (!has_intersec)
3584             continue;
3585
3586           /* Then check if this state is a subset of 'accepts'.  */
3587           not_subset = not_consumed = 0;
3588           for (k = 0; k < BITSET_WORDS; ++k)
3589             {
3590               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3591               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3592             }
3593
3594           /* If this state isn't a subset of 'accepts', create a
3595              new group state, which has the 'remains'. */
3596           if (not_subset)
3597             {
3598               bitset_copy (dests_ch[ndests], remains);
3599               bitset_copy (dests_ch[j], intersec);
3600               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3601               if (__glibc_unlikely (err != REG_NOERROR))
3602                 goto error_return;
3603               ++ndests;
3604             }
3605
3606           /* Put the position in the current group. */
3607           ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3608           if (__glibc_unlikely (! ok))
3609             goto error_return;
3610
3611           /* If all characters are consumed, go to next node. */
3612           if (!not_consumed)
3613             break;
3614         }
3615       /* Some characters remain, create a new group. */
3616       if (j == ndests)
3617         {
3618           bitset_copy (dests_ch[ndests], accepts);
3619           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3620           if (__glibc_unlikely (err != REG_NOERROR))
3621             goto error_return;
3622           ++ndests;
3623           bitset_empty (accepts);
3624         }
3625     }
3626   assume (ndests <= SBC_MAX);
3627   return ndests;
3628  error_return:
3629   for (j = 0; j < ndests; ++j)
3630     re_node_set_free (dests_node + j);
3631   return -1;
3632 }
3633
3634 #ifdef RE_ENABLE_I18N
3635 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3636    Return the number of the bytes the node accepts.
3637    STR_IDX is the current index of the input string.
3638
3639    This function handles the nodes which can accept one character, or
3640    one collating element like '.', '[a-z]', opposite to the other nodes
3641    can only accept one byte.  */
3642
3643 # ifdef _LIBC
3644 #  include <locale/weight.h>
3645 # endif
3646
3647 static int
3648 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3649                          const re_string_t *input, Idx str_idx)
3650 {
3651   const re_token_t *node = dfa->nodes + node_idx;
3652   int char_len, elem_len;
3653   Idx i;
3654
3655   if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
3656     {
3657       unsigned char c = re_string_byte_at (input, str_idx), d;
3658       if (__glibc_likely (c < 0xc2))
3659         return 0;
3660
3661       if (str_idx + 2 > input->len)
3662         return 0;
3663
3664       d = re_string_byte_at (input, str_idx + 1);
3665       if (c < 0xe0)
3666         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3667       else if (c < 0xf0)
3668         {
3669           char_len = 3;
3670           if (c == 0xe0 && d < 0xa0)
3671             return 0;
3672         }
3673       else if (c < 0xf8)
3674         {
3675           char_len = 4;
3676           if (c == 0xf0 && d < 0x90)
3677             return 0;
3678         }
3679       else if (c < 0xfc)
3680         {
3681           char_len = 5;
3682           if (c == 0xf8 && d < 0x88)
3683             return 0;
3684         }
3685       else if (c < 0xfe)
3686         {
3687           char_len = 6;
3688           if (c == 0xfc && d < 0x84)
3689             return 0;
3690         }
3691       else
3692         return 0;
3693
3694       if (str_idx + char_len > input->len)
3695         return 0;
3696
3697       for (i = 1; i < char_len; ++i)
3698         {
3699           d = re_string_byte_at (input, str_idx + i);
3700           if (d < 0x80 || d > 0xbf)
3701             return 0;
3702         }
3703       return char_len;
3704     }
3705
3706   char_len = re_string_char_size_at (input, str_idx);
3707   if (node->type == OP_PERIOD)
3708     {
3709       if (char_len <= 1)
3710         return 0;
3711       /* FIXME: I don't think this if is needed, as both '\n'
3712          and '\0' are char_len == 1.  */
3713       /* '.' accepts any one character except the following two cases.  */
3714       if ((!(dfa->syntax & RE_DOT_NEWLINE)
3715            && re_string_byte_at (input, str_idx) == '\n')
3716           || ((dfa->syntax & RE_DOT_NOT_NULL)
3717               && re_string_byte_at (input, str_idx) == '\0'))
3718         return 0;
3719       return char_len;
3720     }
3721
3722   elem_len = re_string_elem_size_at (input, str_idx);
3723   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3724     return 0;
3725
3726   if (node->type == COMPLEX_BRACKET)
3727     {
3728       const re_charset_t *cset = node->opr.mbcset;
3729 # ifdef _LIBC
3730       const unsigned char *pin
3731         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3732       Idx j;
3733       uint32_t nrules;
3734 # endif /* _LIBC */
3735       int match_len = 0;
3736       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3737                     ? re_string_wchar_at (input, str_idx) : 0);
3738
3739       /* match with multibyte character?  */
3740       for (i = 0; i < cset->nmbchars; ++i)
3741         if (wc == cset->mbchars[i])
3742           {
3743             match_len = char_len;
3744             goto check_node_accept_bytes_match;
3745           }
3746       /* match with character_class?  */
3747       for (i = 0; i < cset->nchar_classes; ++i)
3748         {
3749           wctype_t wt = cset->char_classes[i];
3750           if (__iswctype (wc, wt))
3751             {
3752               match_len = char_len;
3753               goto check_node_accept_bytes_match;
3754             }
3755         }
3756
3757 # ifdef _LIBC
3758       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3759       if (nrules != 0)
3760         {
3761           unsigned int in_collseq = 0;
3762           const int32_t *table, *indirect;
3763           const unsigned char *weights, *extra;
3764           const char *collseqwc;
3765
3766           /* match with collating_symbol?  */
3767           if (cset->ncoll_syms)
3768             extra = (const unsigned char *)
3769               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3770           for (i = 0; i < cset->ncoll_syms; ++i)
3771             {
3772               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3773               /* Compare the length of input collating element and
3774                  the length of current collating element.  */
3775               if (*coll_sym != elem_len)
3776                 continue;
3777               /* Compare each bytes.  */
3778               for (j = 0; j < *coll_sym; j++)
3779                 if (pin[j] != coll_sym[1 + j])
3780                   break;
3781               if (j == *coll_sym)
3782                 {
3783                   /* Match if every bytes is equal.  */
3784                   match_len = j;
3785                   goto check_node_accept_bytes_match;
3786                 }
3787             }
3788
3789           if (cset->nranges)
3790             {
3791               if (elem_len <= char_len)
3792                 {
3793                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3794                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3795                 }
3796               else
3797                 in_collseq = find_collation_sequence_value (pin, elem_len);
3798             }
3799           /* match with range expression?  */
3800           /* FIXME: Implement rational ranges here, too.  */
3801           for (i = 0; i < cset->nranges; ++i)
3802             if (cset->range_starts[i] <= in_collseq
3803                 && in_collseq <= cset->range_ends[i])
3804               {
3805                 match_len = elem_len;
3806                 goto check_node_accept_bytes_match;
3807               }
3808
3809           /* match with equivalence_class?  */
3810           if (cset->nequiv_classes)
3811             {
3812               const unsigned char *cp = pin;
3813               table = (const int32_t *)
3814                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3815               weights = (const unsigned char *)
3816                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3817               extra = (const unsigned char *)
3818                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3819               indirect = (const int32_t *)
3820                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3821               int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3822               int32_t rule = idx >> 24;
3823               idx &= 0xffffff;
3824               if (idx > 0)
3825                 {
3826                   size_t weight_len = weights[idx];
3827                   for (i = 0; i < cset->nequiv_classes; ++i)
3828                     {
3829                       int32_t equiv_class_idx = cset->equiv_classes[i];
3830                       int32_t equiv_class_rule = equiv_class_idx >> 24;
3831                       equiv_class_idx &= 0xffffff;
3832                       if (weights[equiv_class_idx] == weight_len
3833                           && equiv_class_rule == rule
3834                           && memcmp (weights + idx + 1,
3835                                      weights + equiv_class_idx + 1,
3836                                      weight_len) == 0)
3837                         {
3838                           match_len = elem_len;
3839                           goto check_node_accept_bytes_match;
3840                         }
3841                     }
3842                 }
3843             }
3844         }
3845       else
3846 # endif /* _LIBC */
3847         {
3848           /* match with range expression?  */
3849           for (i = 0; i < cset->nranges; ++i)
3850             {
3851               if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3852                 {
3853                   match_len = char_len;
3854                   goto check_node_accept_bytes_match;
3855                 }
3856             }
3857         }
3858     check_node_accept_bytes_match:
3859       if (!cset->non_match)
3860         return match_len;
3861       else
3862         {
3863           if (match_len > 0)
3864             return 0;
3865           else
3866             return (elem_len > char_len) ? elem_len : char_len;
3867         }
3868     }
3869   return 0;
3870 }
3871
3872 # ifdef _LIBC
3873 static unsigned int
3874 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3875 {
3876   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3877   if (nrules == 0)
3878     {
3879       if (mbs_len == 1)
3880         {
3881           /* No valid character.  Match it as a single byte character.  */
3882           const unsigned char *collseq = (const unsigned char *)
3883             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3884           return collseq[mbs[0]];
3885         }
3886       return UINT_MAX;
3887     }
3888   else
3889     {
3890       int32_t idx;
3891       const unsigned char *extra = (const unsigned char *)
3892         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3893       int32_t extrasize = (const unsigned char *)
3894         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3895
3896       for (idx = 0; idx < extrasize;)
3897         {
3898           int mbs_cnt;
3899           bool found = false;
3900           int32_t elem_mbs_len;
3901           /* Skip the name of collating element name.  */
3902           idx = idx + extra[idx] + 1;
3903           elem_mbs_len = extra[idx++];
3904           if (mbs_len == elem_mbs_len)
3905             {
3906               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3907                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3908                   break;
3909               if (mbs_cnt == elem_mbs_len)
3910                 /* Found the entry.  */
3911                 found = true;
3912             }
3913           /* Skip the byte sequence of the collating element.  */
3914           idx += elem_mbs_len;
3915           /* Adjust for the alignment.  */
3916           idx = (idx + 3) & ~3;
3917           /* Skip the collation sequence value.  */
3918           idx += sizeof (uint32_t);
3919           /* Skip the wide char sequence of the collating element.  */
3920           idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3921           /* If we found the entry, return the sequence value.  */
3922           if (found)
3923             return *(uint32_t *) (extra + idx);
3924           /* Skip the collation sequence value.  */
3925           idx += sizeof (uint32_t);
3926         }
3927       return UINT_MAX;
3928     }
3929 }
3930 # endif /* _LIBC */
3931 #endif /* RE_ENABLE_I18N */
3932
3933 /* Check whether the node accepts the byte which is IDX-th
3934    byte of the INPUT.  */
3935
3936 static bool
3937 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3938                    Idx idx)
3939 {
3940   unsigned char ch;
3941   ch = re_string_byte_at (&mctx->input, idx);
3942   switch (node->type)
3943     {
3944     case CHARACTER:
3945       if (node->opr.c != ch)
3946         return false;
3947       break;
3948
3949     case SIMPLE_BRACKET:
3950       if (!bitset_contain (node->opr.sbcset, ch))
3951         return false;
3952       break;
3953
3954 #ifdef RE_ENABLE_I18N
3955     case OP_UTF8_PERIOD:
3956       if (ch >= ASCII_CHARS)
3957         return false;
3958       FALLTHROUGH;
3959 #endif
3960     case OP_PERIOD:
3961       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3962           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3963         return false;
3964       break;
3965
3966     default:
3967       return false;
3968     }
3969
3970   if (node->constraint)
3971     {
3972       /* The node has constraints.  Check whether the current context
3973          satisfies the constraints.  */
3974       unsigned int context = re_string_context_at (&mctx->input, idx,
3975                                                    mctx->eflags);
3976       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3977         return false;
3978     }
3979
3980   return true;
3981 }
3982
3983 /* Extend the buffers, if the buffers have run out.  */
3984
3985 static reg_errcode_t
3986 __attribute_warn_unused_result__
3987 extend_buffers (re_match_context_t *mctx, int min_len)
3988 {
3989   reg_errcode_t ret;
3990   re_string_t *pstr = &mctx->input;
3991
3992   /* Avoid overflow.  */
3993   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
3994                         <= pstr->bufs_len))
3995     return REG_ESPACE;
3996
3997   /* Double the lengths of the buffers, but allocate at least MIN_LEN.  */
3998   ret = re_string_realloc_buffers (pstr,
3999                                    MAX (min_len,
4000                                         MIN (pstr->len, pstr->bufs_len * 2)));
4001   if (__glibc_unlikely (ret != REG_NOERROR))
4002     return ret;
4003
4004   if (mctx->state_log != NULL)
4005     {
4006       /* And double the length of state_log.  */
4007       /* XXX We have no indication of the size of this buffer.  If this
4008          allocation fail we have no indication that the state_log array
4009          does not have the right size.  */
4010       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4011                                               pstr->bufs_len + 1);
4012       if (__glibc_unlikely (new_array == NULL))
4013         return REG_ESPACE;
4014       mctx->state_log = new_array;
4015     }
4016
4017   /* Then reconstruct the buffers.  */
4018   if (pstr->icase)
4019     {
4020 #ifdef RE_ENABLE_I18N
4021       if (pstr->mb_cur_max > 1)
4022         {
4023           ret = build_wcs_upper_buffer (pstr);
4024           if (__glibc_unlikely (ret != REG_NOERROR))
4025             return ret;
4026         }
4027       else
4028 #endif /* RE_ENABLE_I18N  */
4029         build_upper_buffer (pstr);
4030     }
4031   else
4032     {
4033 #ifdef RE_ENABLE_I18N
4034       if (pstr->mb_cur_max > 1)
4035         build_wcs_buffer (pstr);
4036       else
4037 #endif /* RE_ENABLE_I18N  */
4038         {
4039           if (pstr->trans != NULL)
4040             re_string_translate_buffer (pstr);
4041         }
4042     }
4043   return REG_NOERROR;
4044 }
4045
4046 \f
4047 /* Functions for matching context.  */
4048
4049 /* Initialize MCTX.  */
4050
4051 static reg_errcode_t
4052 __attribute_warn_unused_result__
4053 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4054 {
4055   mctx->eflags = eflags;
4056   mctx->match_last = -1;
4057   if (n > 0)
4058     {
4059       /* Avoid overflow.  */
4060       size_t max_object_size =
4061         MAX (sizeof (struct re_backref_cache_entry),
4062              sizeof (re_sub_match_top_t *));
4063       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
4064         return REG_ESPACE;
4065
4066       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4067       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4068       if (__glibc_unlikely (mctx->bkref_ents == NULL || mctx->sub_tops == NULL))
4069         return REG_ESPACE;
4070     }
4071   /* Already zero-ed by the caller.
4072      else
4073        mctx->bkref_ents = NULL;
4074      mctx->nbkref_ents = 0;
4075      mctx->nsub_tops = 0;  */
4076   mctx->abkref_ents = n;
4077   mctx->max_mb_elem_len = 1;
4078   mctx->asub_tops = n;
4079   return REG_NOERROR;
4080 }
4081
4082 /* Clean the entries which depend on the current input in MCTX.
4083    This function must be invoked when the matcher changes the start index
4084    of the input, or changes the input string.  */
4085
4086 static void
4087 match_ctx_clean (re_match_context_t *mctx)
4088 {
4089   Idx st_idx;
4090   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4091     {
4092       Idx sl_idx;
4093       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4094       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4095         {
4096           re_sub_match_last_t *last = top->lasts[sl_idx];
4097           re_free (last->path.array);
4098           re_free (last);
4099         }
4100       re_free (top->lasts);
4101       if (top->path)
4102         {
4103           re_free (top->path->array);
4104           re_free (top->path);
4105         }
4106       re_free (top);
4107     }
4108
4109   mctx->nsub_tops = 0;
4110   mctx->nbkref_ents = 0;
4111 }
4112
4113 /* Free all the memory associated with MCTX.  */
4114
4115 static void
4116 match_ctx_free (re_match_context_t *mctx)
4117 {
4118   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4119   match_ctx_clean (mctx);
4120   re_free (mctx->sub_tops);
4121   re_free (mctx->bkref_ents);
4122 }
4123
4124 /* Add a new backreference entry to MCTX.
4125    Note that we assume that caller never call this function with duplicate
4126    entry, and call with STR_IDX which isn't smaller than any existing entry.
4127 */
4128
4129 static reg_errcode_t
4130 __attribute_warn_unused_result__
4131 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4132                      Idx to)
4133 {
4134   if (mctx->nbkref_ents >= mctx->abkref_ents)
4135     {
4136       struct re_backref_cache_entry* new_entry;
4137       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4138                               mctx->abkref_ents * 2);
4139       if (__glibc_unlikely (new_entry == NULL))
4140         {
4141           re_free (mctx->bkref_ents);
4142           return REG_ESPACE;
4143         }
4144       mctx->bkref_ents = new_entry;
4145       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4146               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4147       mctx->abkref_ents *= 2;
4148     }
4149   if (mctx->nbkref_ents > 0
4150       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4151     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4152
4153   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4154   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4155   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4156   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4157
4158   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4159      If bit N is clear, means that this entry won't epsilon-transition to
4160      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4161      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4162      such node.
4163
4164      A backreference does not epsilon-transition unless it is empty, so set
4165      to all zeros if FROM != TO.  */
4166   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4167     = (from == to ? -1 : 0);
4168
4169   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4170   if (mctx->max_mb_elem_len < to - from)
4171     mctx->max_mb_elem_len = to - from;
4172   return REG_NOERROR;
4173 }
4174
4175 /* Return the first entry with the same str_idx, or -1 if none is
4176    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4177
4178 static Idx
4179 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4180 {
4181   Idx left, right, mid, last;
4182   last = right = mctx->nbkref_ents;
4183   for (left = 0; left < right;)
4184     {
4185       mid = (left + right) / 2;
4186       if (mctx->bkref_ents[mid].str_idx < str_idx)
4187         left = mid + 1;
4188       else
4189         right = mid;
4190     }
4191   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4192     return left;
4193   else
4194     return -1;
4195 }
4196
4197 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4198    at STR_IDX.  */
4199
4200 static reg_errcode_t
4201 __attribute_warn_unused_result__
4202 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4203 {
4204   DEBUG_ASSERT (mctx->sub_tops != NULL);
4205   DEBUG_ASSERT (mctx->asub_tops > 0);
4206   if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
4207     {
4208       Idx new_asub_tops = mctx->asub_tops * 2;
4209       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4210                                                    re_sub_match_top_t *,
4211                                                    new_asub_tops);
4212       if (__glibc_unlikely (new_array == NULL))
4213         return REG_ESPACE;
4214       mctx->sub_tops = new_array;
4215       mctx->asub_tops = new_asub_tops;
4216     }
4217   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4218   if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
4219     return REG_ESPACE;
4220   mctx->sub_tops[mctx->nsub_tops]->node = node;
4221   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4222   return REG_NOERROR;
4223 }
4224
4225 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4226    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.
4227    Return the new entry if successful, NULL if memory is exhausted.  */
4228
4229 static re_sub_match_last_t *
4230 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4231 {
4232   re_sub_match_last_t *new_entry;
4233   if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
4234     {
4235       Idx new_alasts = 2 * subtop->alasts + 1;
4236       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4237                                                     re_sub_match_last_t *,
4238                                                     new_alasts);
4239       if (__glibc_unlikely (new_array == NULL))
4240         return NULL;
4241       subtop->lasts = new_array;
4242       subtop->alasts = new_alasts;
4243     }
4244   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4245   if (__glibc_likely (new_entry != NULL))
4246     {
4247       subtop->lasts[subtop->nlasts] = new_entry;
4248       new_entry->node = node;
4249       new_entry->str_idx = str_idx;
4250       ++subtop->nlasts;
4251     }
4252   return new_entry;
4253 }
4254
4255 static void
4256 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4257                re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4258 {
4259   sctx->sifted_states = sifted_sts;
4260   sctx->limited_states = limited_sts;
4261   sctx->last_node = last_node;
4262   sctx->last_str_idx = last_str_idx;
4263   re_node_set_init_empty (&sctx->limits);
4264 }