lib/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2021 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU General Public
   8    License as published by the Free Software Foundation; either
   9    version 3 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  21                                      Idx n);
  22 static void match_ctx_clean (re_match_context_t *mctx);
  23 static void match_ctx_free (re_match_context_t *cache);
  24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
  25                                           Idx str_idx, Idx from, Idx to);
  26 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
  27 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
  28                                            Idx str_idx);
  29 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  30                                                     Idx node, Idx str_idx);
  31 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  32                            re_dfastate_t **limited_sts, Idx last_node,
  33                            Idx last_str_idx);
  34 static reg_errcode_t re_search_internal (const regex_t *preg,
  35                                          const char *string, Idx length,
  36                                          Idx start, Idx last_start, Idx stop,
  37                                          size_t nmatch, regmatch_t pmatch[],
  38                                          int eflags);
  39 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
  40                                   const char *string1, Idx length1,
  41                                   const char *string2, Idx length2,
  42                                   Idx start, regoff_t range,
  43                                   struct re_registers *regs,
  44                                   Idx stop, bool ret_len);
  45 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
  46                                 const char *string, Idx length, Idx start,
  47                                 regoff_t range, Idx stop,
  48                                 struct re_registers *regs,
  49                                 bool ret_len);
  50 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  51                               Idx nregs, int regs_allocated);
  52 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
  53 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
  54                            Idx *p_match_first);
  55 static Idx check_halt_state_context (const re_match_context_t *mctx,
  56                                      const re_dfastate_t *state, Idx idx);
  57 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  58                          regmatch_t *prev_idx_match, Idx cur_node,
  59                          Idx cur_idx, Idx nmatch);
  60 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  61                                       Idx str_idx, Idx dest_node, Idx nregs,
  62                                       regmatch_t *regs, regmatch_t *prevregs,
  63                                       re_node_set *eps_via_nodes);
  64 static reg_errcode_t set_regs (const regex_t *preg,
  65                                const re_match_context_t *mctx,
  66                                size_t nmatch, regmatch_t *pmatch,
  67                                bool fl_backtrack);
  68 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
  69
  70 #ifdef RE_ENABLE_I18N
  71 static int sift_states_iter_mb (const re_match_context_t *mctx,
  72                                 re_sift_context_t *sctx,
  73                                 Idx node_idx, Idx str_idx, Idx max_str_idx);
  74 #endif /* RE_ENABLE_I18N */
  75 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  76                                            re_sift_context_t *sctx);
  77 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  78                                           re_sift_context_t *sctx, Idx str_idx,
  79                                           re_node_set *cur_dest);
  80 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  81                                               re_sift_context_t *sctx,
  82                                               Idx str_idx,
  83                                               re_node_set *dest_nodes);
  84 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  85                                             re_node_set *dest_nodes,
  86                                             const re_node_set *candidates);
  87 static bool check_dst_limits (const re_match_context_t *mctx,
  88                               const re_node_set *limits,
  89                               Idx dst_node, Idx dst_idx, Idx src_node,
  90                               Idx src_idx);
  91 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
  92                                         int boundaries, Idx subexp_idx,
  93                                         Idx from_node, Idx bkref_idx);
  94 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
  95                                       Idx limit, Idx subexp_idx,
  96                                       Idx node, Idx str_idx,
  97                                       Idx bkref_idx);
  98 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
  99                                           re_node_set *dest_nodes,
 100                                           const re_node_set *candidates,
 101                                           re_node_set *limits,
 102                                           struct re_backref_cache_entry *bkref_ents,
 103                                           Idx str_idx);
 104 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 105                                         re_sift_context_t *sctx,
 106                                         Idx str_idx, const re_node_set *candidates);
 107 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 108                                         re_dfastate_t **dst,
 109                                         re_dfastate_t **src, Idx num);
 110 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 111                                          re_match_context_t *mctx);
 112 static re_dfastate_t *transit_state (reg_errcode_t *err,
 113                                      re_match_context_t *mctx,
 114                                      re_dfastate_t *state);
 115 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 116                                             re_match_context_t *mctx,
 117                                             re_dfastate_t *next_state);
 118 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 119                                                 re_node_set *cur_nodes,
 120                                                 Idx str_idx);
 121 #if 0
 122 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 123                                         re_match_context_t *mctx,
 124                                         re_dfastate_t *pstate);
 125 #endif
 126 #ifdef RE_ENABLE_I18N
 127 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 128                                        re_dfastate_t *pstate);
 129 #endif /* RE_ENABLE_I18N */
 130 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 131                                           const re_node_set *nodes);
 132 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 133                                  Idx bkref_node, Idx bkref_str_idx);
 134 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 135                                      const re_sub_match_top_t *sub_top,
 136                                      re_sub_match_last_t *sub_last,
 137                                      Idx bkref_node, Idx bkref_str);
 138 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 139                              Idx subexp_idx, int type);
 140 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 141                                     state_array_t *path, Idx top_node,
 142                                     Idx top_str, Idx last_node, Idx last_str,
 143                                     int type);
 144 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 145                                                    Idx str_idx,
 146                                                    re_node_set *cur_nodes,
 147                                                    re_node_set *next_nodes);
 148 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 149                                                re_node_set *cur_nodes,
 150                                                Idx ex_subexp, int type);
 151 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 152                                                    re_node_set *dst_nodes,
 153                                                    Idx target, Idx ex_subexp,
 154                                                    int type);
 155 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 156                                          re_node_set *cur_nodes, Idx cur_str,
 157                                          Idx subexp_num, int type);
 158 static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
 159 #ifdef RE_ENABLE_I18N
 160 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
 161                                     const re_string_t *input, Idx idx);
 162 # ifdef _LIBC
 163 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 164                                                    size_t name_len);
 165 # endif /* _LIBC */
 166 #endif /* RE_ENABLE_I18N */
 167 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
 168                                        const re_dfastate_t *state,
 169                                        re_node_set *states_node,
 170                                        bitset_t *states_ch);
 171 static bool check_node_accept (const re_match_context_t *mctx,
 172                                const re_token_t *node, Idx idx);
 173 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
 174 \f
 175 /* Entry point for POSIX code.  */
 176
 177 /* regexec searches for a given pattern, specified by PREG, in the
 178    string STRING.
 179
 180    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 181    'regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 182    least NMATCH elements, and we set them to the offsets of the
 183    corresponding matched substrings.
 184
 185    EFLAGS specifies "execution flags" which affect matching: if
 186    REG_NOTBOL is set, then ^ does not match at the beginning of the
 187    string; if REG_NOTEOL is set, then $ does not match at the end.
 188
 189    Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
 190    EFLAGS is invalid.  */
 191
 192 int
 193 regexec (const regex_t *__restrict preg, const char *__restrict string,
 194          size_t nmatch, regmatch_t pmatch[], int eflags)
 195 {
 196   reg_errcode_t err;
 197   Idx start, length;
 198   re_dfa_t *dfa = preg->buffer;
 199
 200   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 201     return REG_BADPAT;
 202
 203   if (eflags & REG_STARTEND)
 204     {
 205       start = pmatch[0].rm_so;
 206       length = pmatch[0].rm_eo;
 207     }
 208   else
 209     {
 210       start = 0;
 211       length = strlen (string);
 212     }
 213
 214   lock_lock (dfa->lock);
 215   if (preg->no_sub)
 216     err = re_search_internal (preg, string, length, start, length,
 217                               length, 0, NULL, eflags);
 218   else
 219     err = re_search_internal (preg, string, length, start, length,
 220                               length, nmatch, pmatch, eflags);
 221   lock_unlock (dfa->lock);
 222   return err != REG_NOERROR;
 223 }
 224
 225 #ifdef _LIBC
 226 libc_hidden_def (__regexec)
 227
 228 # include <shlib-compat.h>
 229 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 230
 231 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 232 __typeof__ (__regexec) __compat_regexec;
 233
 234 int
 235 attribute_compat_text_section
 236 __compat_regexec (const regex_t *__restrict preg,
 237                   const char *__restrict string, size_t nmatch,
 238                   regmatch_t pmatch[], int eflags)
 239 {
 240   return regexec (preg, string, nmatch, pmatch,
 241                   eflags & (REG_NOTBOL | REG_NOTEOL));
 242 }
 243 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 244 # endif
 245 #endif
 246
 247 /* Entry points for GNU code.  */
 248
 249 /* re_match, re_search, re_match_2, re_search_2
 250
 251    The former two functions operate on STRING with length LENGTH,
 252    while the later two operate on concatenation of STRING1 and STRING2
 253    with lengths LENGTH1 and LENGTH2, respectively.
 254
 255    re_match() matches the compiled pattern in BUFP against the string,
 256    starting at index START.
 257
 258    re_search() first tries matching at index START, then it tries to match
 259    starting from index START + 1, and so on.  The last start position tried
 260    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 261    way as re_match().)
 262
 263    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 264    the first STOP characters of the concatenation of the strings should be
 265    concerned.
 266
 267    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 268    and all groups is stored in REGS.  (For the "_2" variants, the offsets are
 269    computed relative to the concatenation, not relative to the individual
 270    strings.)
 271
 272    On success, re_match* functions return the length of the match, re_search*
 273    return the position of the start of the match.  They return -1 on
 274    match failure, -2 on error.  */
 275
 276 regoff_t
 277 re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
 278           Idx start, struct re_registers *regs)
 279 {
 280   return re_search_stub (bufp, string, length, start, 0, length, regs, true);
 281 }
 282 #ifdef _LIBC
 283 weak_alias (__re_match, re_match)
 284 #endif
 285
 286 regoff_t
 287 re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
 288            Idx start, regoff_t range, struct re_registers *regs)
 289 {
 290   return re_search_stub (bufp, string, length, start, range, length, regs,
 291                          false);
 292 }
 293 #ifdef _LIBC
 294 weak_alias (__re_search, re_search)
 295 #endif
 296
 297 regoff_t
 298 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
 299             const char *string2, Idx length2, Idx start,
 300             struct re_registers *regs, Idx stop)
 301 {
 302   return re_search_2_stub (bufp, string1, length1, string2, length2,
 303                            start, 0, regs, stop, true);
 304 }
 305 #ifdef _LIBC
 306 weak_alias (__re_match_2, re_match_2)
 307 #endif
 308
 309 regoff_t
 310 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
 311              const char *string2, Idx length2, Idx start, regoff_t range,
 312              struct re_registers *regs, Idx stop)
 313 {
 314   return re_search_2_stub (bufp, string1, length1, string2, length2,
 315                            start, range, regs, stop, false);
 316 }
 317 #ifdef _LIBC
 318 weak_alias (__re_search_2, re_search_2)
 319 #endif
 320
 321 static regoff_t
 322 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
 323                   Idx length1, const char *string2, Idx length2, Idx start,
 324                   regoff_t range, struct re_registers *regs,
 325                   Idx stop, bool ret_len)
 326 {
 327   const char *str;
 328   regoff_t rval;
 329   Idx len;
 330   char *s = NULL;
 331
 332   if (__glibc_unlikely ((length1 < 0 || length2 < 0 || stop < 0
 333                          || INT_ADD_WRAPV (length1, length2, &len))))
 334     return -2;
 335
 336   /* Concatenate the strings.  */
 337   if (length2 > 0)
 338     if (length1 > 0)
 339       {
 340         s = re_malloc (char, len);
 341
 342         if (__glibc_unlikely (s == NULL))
 343           return -2;
 344 #ifdef _LIBC
 345         memcpy (__mempcpy (s, string1, length1), string2, length2);
 346 #else
 347         memcpy (s, string1, length1);
 348         memcpy (s + length1, string2, length2);
 349 #endif
 350         str = s;
 351       }
 352     else
 353       str = string2;
 354   else
 355     str = string1;
 356
 357   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 358                          ret_len);
 359   re_free (s);
 360   return rval;
 361 }
 362
 363 /* The parameters have the same meaning as those of re_search.
 364    Additional parameters:
 365    If RET_LEN is true the length of the match is returned (re_match style);
 366    otherwise the position of the match is returned.  */
 367
 368 static regoff_t
 369 re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
 370                 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
 371                 bool ret_len)
 372 {
 373   reg_errcode_t result;
 374   regmatch_t *pmatch;
 375   Idx nregs;
 376   regoff_t rval;
 377   int eflags = 0;
 378   re_dfa_t *dfa = bufp->buffer;
 379   Idx last_start = start + range;
 380
 381   /* Check for out-of-range.  */
 382   if (__glibc_unlikely (start < 0 || start > length))
 383     return -1;
 384   if (__glibc_unlikely (length < last_start
 385                         || (0 <= range && last_start < start)))
 386     last_start = length;
 387   else if (__glibc_unlikely (last_start < 0
 388                              || (range < 0 && start <= last_start)))
 389     last_start = 0;
 390
 391   lock_lock (dfa->lock);
 392
 393   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 394   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 395
 396   /* Compile fastmap if we haven't yet.  */
 397   if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 398     re_compile_fastmap (bufp);
 399
 400   if (__glibc_unlikely (bufp->no_sub))
 401     regs = NULL;
 402
 403   /* We need at least 1 register.  */
 404   if (regs == NULL)
 405     nregs = 1;
 406   else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
 407                              && regs->num_regs <= bufp->re_nsub))
 408     {
 409       nregs = regs->num_regs;
 410       if (__glibc_unlikely (nregs < 1))
 411         {
 412           /* Nothing can be copied to regs.  */
 413           regs = NULL;
 414           nregs = 1;
 415         }
 416     }
 417   else
 418     nregs = bufp->re_nsub + 1;
 419   pmatch = re_malloc (regmatch_t, nregs);
 420   if (__glibc_unlikely (pmatch == NULL))
 421     {
 422       rval = -2;
 423       goto out;
 424     }
 425
 426   result = re_search_internal (bufp, string, length, start, last_start, stop,
 427                                nregs, pmatch, eflags);
 428
 429   rval = 0;
 430
 431   /* I hope we needn't fill their regs with -1's when no match was found.  */
 432   if (result != REG_NOERROR)
 433     rval = result == REG_NOMATCH ? -1 : -2;
 434   else if (regs != NULL)
 435     {
 436       /* If caller wants register contents data back, copy them.  */
 437       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 438                                            bufp->regs_allocated);
 439       if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
 440         rval = -2;
 441     }
 442
 443   if (__glibc_likely (rval == 0))
 444     {
 445       if (ret_len)
 446         {
 447           DEBUG_ASSERT (pmatch[0].rm_so == start);
 448           rval = pmatch[0].rm_eo - start;
 449         }
 450       else
 451         rval = pmatch[0].rm_so;
 452     }
 453   re_free (pmatch);
 454  out:
 455   lock_unlock (dfa->lock);
 456   return rval;
 457 }
 458
 459 static unsigned
 460 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
 461               int regs_allocated)
 462 {
 463   int rval = REGS_REALLOCATE;
 464   Idx i;
 465   Idx need_regs = nregs + 1;
 466   /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
 467      uses.  */
 468
 469   /* Have the register data arrays been allocated?  */
 470   if (regs_allocated == REGS_UNALLOCATED)
 471     { /* No.  So allocate them with malloc.  */
 472       regs->start = re_malloc (regoff_t, need_regs);
 473       if (__glibc_unlikely (regs->start == NULL))
 474         return REGS_UNALLOCATED;
 475       regs->end = re_malloc (regoff_t, need_regs);
 476       if (__glibc_unlikely (regs->end == NULL))
 477         {
 478           re_free (regs->start);
 479           return REGS_UNALLOCATED;
 480         }
 481       regs->num_regs = need_regs;
 482     }
 483   else if (regs_allocated == REGS_REALLOCATE)
 484     { /* Yes.  If we need more elements than were already
 485          allocated, reallocate them.  If we need fewer, just
 486          leave it alone.  */
 487       if (__glibc_unlikely (need_regs > regs->num_regs))
 488         {
 489           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 490           regoff_t *new_end;
 491           if (__glibc_unlikely (new_start == NULL))
 492             return REGS_UNALLOCATED;
 493           new_end = re_realloc (regs->end, regoff_t, need_regs);
 494           if (__glibc_unlikely (new_end == NULL))
 495             {
 496               re_free (new_start);
 497               return REGS_UNALLOCATED;
 498             }
 499           regs->start = new_start;
 500           regs->end = new_end;
 501           regs->num_regs = need_regs;
 502         }
 503     }
 504   else
 505     {
 506       DEBUG_ASSERT (regs_allocated == REGS_FIXED);
 507       /* This function may not be called with REGS_FIXED and nregs too big.  */
 508       DEBUG_ASSERT (nregs <= regs->num_regs);
 509       rval = REGS_FIXED;
 510     }
 511
 512   /* Copy the regs.  */
 513   for (i = 0; i < nregs; ++i)
 514     {
 515       regs->start[i] = pmatch[i].rm_so;
 516       regs->end[i] = pmatch[i].rm_eo;
 517     }
 518   for ( ; i < regs->num_regs; ++i)
 519     regs->start[i] = regs->end[i] = -1;
 520
 521   return rval;
 522 }
 523
 524 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 525    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 526    this memory for recording register information.  STARTS and ENDS
 527    must be allocated using the malloc library routine, and must each
 528    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 529
 530    If NUM_REGS == 0, then subsequent matches should allocate their own
 531    register data.
 532
 533    Unless this function is called, the first search or match using
 534    PATTERN_BUFFER will allocate its own register data, without
 535    freeing the old data.  */
 536
 537 void
 538 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
 539                   __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
 540 {
 541   if (num_regs)
 542     {
 543       bufp->regs_allocated = REGS_REALLOCATE;
 544       regs->num_regs = num_regs;
 545       regs->start = starts;
 546       regs->end = ends;
 547     }
 548   else
 549     {
 550       bufp->regs_allocated = REGS_UNALLOCATED;
 551       regs->num_regs = 0;
 552       regs->start = regs->end = NULL;
 553     }
 554 }
 555 #ifdef _LIBC
 556 weak_alias (__re_set_registers, re_set_registers)
 557 #endif
 558 \f
 559 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 560    them unless specifically requested.  */
 561
 562 #if defined _REGEX_RE_COMP || defined _LIBC
 563 int
 564 # ifdef _LIBC
 565 weak_function
 566 # endif
 567 re_exec (const char *s)
 568 {
 569   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 570 }
 571 #endif /* _REGEX_RE_COMP */
 572 \f
 573 /* Internal entry point.  */
 574
 575 /* Searches for a compiled pattern PREG in the string STRING, whose
 576    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 577    meaning as with regexec.  LAST_START is START + RANGE, where
 578    START and RANGE have the same meaning as with re_search.
 579    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 580    otherwise return the error code.
 581    Note: We assume front end functions already check ranges.
 582    (0 <= LAST_START && LAST_START <= LENGTH)  */
 583
 584 static reg_errcode_t
 585 __attribute_warn_unused_result__
 586 re_search_internal (const regex_t *preg, const char *string, Idx length,
 587                     Idx start, Idx last_start, Idx stop, size_t nmatch,
 588                     regmatch_t pmatch[], int eflags)
 589 {
 590   reg_errcode_t err;
 591   const re_dfa_t *dfa = preg->buffer;
 592   Idx left_lim, right_lim;
 593   int incr;
 594   bool fl_longest_match;
 595   int match_kind;
 596   Idx match_first;
 597   Idx match_last = -1;
 598   Idx extra_nmatch;
 599   bool sb;
 600   int ch;
 601   re_match_context_t mctx = { .dfa = dfa };
 602   char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
 603                     && start != last_start && !preg->can_be_null)
 604                    ? preg->fastmap : NULL);
 605   RE_TRANSLATE_TYPE t = preg->translate;
 606
 607   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 608   nmatch -= extra_nmatch;
 609
 610   /* Check if the DFA haven't been compiled.  */
 611   if (__glibc_unlikely (preg->used == 0 || dfa->init_state == NULL
 612                         || dfa->init_state_word == NULL
 613                         || dfa->init_state_nl == NULL
 614                         || dfa->init_state_begbuf == NULL))
 615     return REG_NOMATCH;
 616
 617   /* We assume front-end functions already check them.  */
 618   DEBUG_ASSERT (0 <= last_start && last_start <= length);
 619
 620   /* If initial states with non-begbuf contexts have no elements,
 621      the regex must be anchored.  If preg->newline_anchor is set,
 622      we'll never use init_state_nl, so do not check it.  */
 623   if (dfa->init_state->nodes.nelem == 0
 624       && dfa->init_state_word->nodes.nelem == 0
 625       && (dfa->init_state_nl->nodes.nelem == 0
 626           || !preg->newline_anchor))
 627     {
 628       if (start != 0 && last_start != 0)
 629         return REG_NOMATCH;
 630       start = last_start = 0;
 631     }
 632
 633   /* We must check the longest matching, if nmatch > 0.  */
 634   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 635
 636   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 637                             preg->translate, (preg->syntax & RE_ICASE) != 0,
 638                             dfa);
 639   if (__glibc_unlikely (err != REG_NOERROR))
 640     goto free_return;
 641   mctx.input.stop = stop;
 642   mctx.input.raw_stop = stop;
 643   mctx.input.newline_anchor = preg->newline_anchor;
 644
 645   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 646   if (__glibc_unlikely (err != REG_NOERROR))
 647     goto free_return;
 648
 649   /* We will log all the DFA states through which the dfa pass,
 650      if nmatch > 1, or this dfa has "multibyte node", which is a
 651      back-reference or a node which can accept multibyte character or
 652      multi character collating element.  */
 653   if (nmatch > 1 || dfa->has_mb_node)
 654     {
 655       /* Avoid overflow.  */
 656       if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
 657                              <= mctx.input.bufs_len)))
 658         {
 659           err = REG_ESPACE;
 660           goto free_return;
 661         }
 662
 663       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 664       if (__glibc_unlikely (mctx.state_log == NULL))
 665         {
 666           err = REG_ESPACE;
 667           goto free_return;
 668         }
 669     }
 670
 671   match_first = start;
 672   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 673                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 674
 675   /* Check incrementally whether the input string matches.  */
 676   incr = (last_start < start) ? -1 : 1;
 677   left_lim = (last_start < start) ? last_start : start;
 678   right_lim = (last_start < start) ? start : last_start;
 679   sb = dfa->mb_cur_max == 1;
 680   match_kind =
 681     (fastmap
 682      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 683         | (start <= last_start ? 2 : 0)
 684         | (t != NULL ? 1 : 0))
 685      : 8);
 686
 687   for (;; match_first += incr)
 688     {
 689       err = REG_NOMATCH;
 690       if (match_first < left_lim || right_lim < match_first)
 691         goto free_return;
 692
 693       /* Advance as rapidly as possible through the string, until we
 694          find a plausible place to start matching.  This may be done
 695          with varying efficiency, so there are various possibilities:
 696          only the most common of them are specialized, in order to
 697          save on code size.  We use a switch statement for speed.  */
 698       switch (match_kind)
 699         {
 700         case 8:
 701           /* No fastmap.  */
 702           break;
 703
 704         case 7:
 705           /* Fastmap with single-byte translation, match forward.  */
 706           while (__glibc_likely (match_first < right_lim)
 707                  && !fastmap[t[(unsigned char) string[match_first]]])
 708             ++match_first;
 709           goto forward_match_found_start_or_reached_end;
 710
 711         case 6:
 712           /* Fastmap without translation, match forward.  */
 713           while (__glibc_likely (match_first < right_lim)
 714                  && !fastmap[(unsigned char) string[match_first]])
 715             ++match_first;
 716
 717         forward_match_found_start_or_reached_end:
 718           if (__glibc_unlikely (match_first == right_lim))
 719             {
 720               ch = match_first >= length
 721                        ? 0 : (unsigned char) string[match_first];
 722               if (!fastmap[t ? t[ch] : ch])
 723                 goto free_return;
 724             }
 725           break;
 726
 727         case 4:
 728         case 5:
 729           /* Fastmap without multi-byte translation, match backwards.  */
 730           while (match_first >= left_lim)
 731             {
 732               ch = match_first >= length
 733                        ? 0 : (unsigned char) string[match_first];
 734               if (fastmap[t ? t[ch] : ch])
 735                 break;
 736               --match_first;
 737             }
 738           if (match_first < left_lim)
 739             goto free_return;
 740           break;
 741
 742         default:
 743           /* In this case, we can't determine easily the current byte,
 744              since it might be a component byte of a multibyte
 745              character.  Then we use the constructed buffer instead.  */
 746           for (;;)
 747             {
 748               /* If MATCH_FIRST is out of the valid range, reconstruct the
 749                  buffers.  */
 750               __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
 751               if (__glibc_unlikely (offset
 752                                     >= (__re_size_t) mctx.input.valid_raw_len))
 753                 {
 754                   err = re_string_reconstruct (&mctx.input, match_first,
 755                                                eflags);
 756                   if (__glibc_unlikely (err != REG_NOERROR))
 757                     goto free_return;
 758
 759                   offset = match_first - mctx.input.raw_mbs_idx;
 760                 }
 761               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 762                  Note that MATCH_FIRST must not be smaller than 0.  */
 763               ch = (match_first >= length
 764                     ? 0 : re_string_byte_at (&mctx.input, offset));
 765               if (fastmap[ch])
 766                 break;
 767               match_first += incr;
 768               if (match_first < left_lim || match_first > right_lim)
 769                 {
 770                   err = REG_NOMATCH;
 771                   goto free_return;
 772                 }
 773             }
 774           break;
 775         }
 776
 777       /* Reconstruct the buffers so that the matcher can assume that
 778          the matching starts from the beginning of the buffer.  */
 779       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 780       if (__glibc_unlikely (err != REG_NOERROR))
 781         goto free_return;
 782
 783 #ifdef RE_ENABLE_I18N
 784      /* Don't consider this char as a possible match start if it part,
 785         yet isn't the head, of a multibyte character.  */
 786       if (!sb && !re_string_first_byte (&mctx.input, 0))
 787         continue;
 788 #endif
 789
 790       /* It seems to be appropriate one, then use the matcher.  */
 791       /* We assume that the matching starts from 0.  */
 792       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 793       match_last = check_matching (&mctx, fl_longest_match,
 794                                    start <= last_start ? &match_first : NULL);
 795       if (match_last != -1)
 796         {
 797           if (__glibc_unlikely (match_last == -2))
 798             {
 799               err = REG_ESPACE;
 800               goto free_return;
 801             }
 802           else
 803             {
 804               mctx.match_last = match_last;
 805               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 806                 {
 807                   re_dfastate_t *pstate = mctx.state_log[match_last];
 808                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 809                                                              match_last);
 810                 }
 811               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 812                   || dfa->nbackref)
 813                 {
 814                   err = prune_impossible_nodes (&mctx);
 815                   if (err == REG_NOERROR)
 816                     break;
 817                   if (__glibc_unlikely (err != REG_NOMATCH))
 818                     goto free_return;
 819                   match_last = -1;
 820                 }
 821               else
 822                 break; /* We found a match.  */
 823             }
 824         }
 825
 826       match_ctx_clean (&mctx);
 827     }
 828
 829   DEBUG_ASSERT (match_last != -1);
 830   DEBUG_ASSERT (err == REG_NOERROR);
 831
 832   /* Set pmatch[] if we need.  */
 833   if (nmatch > 0)
 834     {
 835       Idx reg_idx;
 836
 837       /* Initialize registers.  */
 838       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 839         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 840
 841       /* Set the points where matching start/end.  */
 842       pmatch[0].rm_so = 0;
 843       pmatch[0].rm_eo = mctx.match_last;
 844       /* FIXME: This function should fail if mctx.match_last exceeds
 845          the maximum possible regoff_t value.  We need a new error
 846          code REG_OVERFLOW.  */
 847
 848       if (!preg->no_sub && nmatch > 1)
 849         {
 850           err = set_regs (preg, &mctx, nmatch, pmatch,
 851                           dfa->has_plural_match && dfa->nbackref > 0);
 852           if (__glibc_unlikely (err != REG_NOERROR))
 853             goto free_return;
 854         }
 855
 856       /* At last, add the offset to each register, since we slid
 857          the buffers so that we could assume that the matching starts
 858          from 0.  */
 859       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 860         if (pmatch[reg_idx].rm_so != -1)
 861           {
 862 #ifdef RE_ENABLE_I18N
 863             if (__glibc_unlikely (mctx.input.offsets_needed != 0))
 864               {
 865                 pmatch[reg_idx].rm_so =
 866                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 867                    ? mctx.input.valid_raw_len
 868                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 869                 pmatch[reg_idx].rm_eo =
 870                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 871                    ? mctx.input.valid_raw_len
 872                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 873               }
 874 #else
 875             DEBUG_ASSERT (mctx.input.offsets_needed == 0);
 876 #endif
 877             pmatch[reg_idx].rm_so += match_first;
 878             pmatch[reg_idx].rm_eo += match_first;
 879           }
 880       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 881         {
 882           pmatch[nmatch + reg_idx].rm_so = -1;
 883           pmatch[nmatch + reg_idx].rm_eo = -1;
 884         }
 885
 886       if (dfa->subexp_map)
 887         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 888           if (dfa->subexp_map[reg_idx] != reg_idx)
 889             {
 890               pmatch[reg_idx + 1].rm_so
 891                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 892               pmatch[reg_idx + 1].rm_eo
 893                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 894             }
 895     }
 896
 897  free_return:
 898   re_free (mctx.state_log);
 899   if (dfa->nbackref)
 900     match_ctx_free (&mctx);
 901   re_string_destruct (&mctx.input);
 902   return err;
 903 }
 904
 905 static reg_errcode_t
 906 __attribute_warn_unused_result__
 907 prune_impossible_nodes (re_match_context_t *mctx)
 908 {
 909   const re_dfa_t *const dfa = mctx->dfa;
 910   Idx halt_node, match_last;
 911   reg_errcode_t ret;
 912   re_dfastate_t **sifted_states;
 913   re_dfastate_t **lim_states = NULL;
 914   re_sift_context_t sctx;
 915   DEBUG_ASSERT (mctx->state_log != NULL);
 916   match_last = mctx->match_last;
 917   halt_node = mctx->last_node;
 918
 919   /* Avoid overflow.  */
 920   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
 921                         <= match_last))
 922     return REG_ESPACE;
 923
 924   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 925   if (__glibc_unlikely (sifted_states == NULL))
 926     {
 927       ret = REG_ESPACE;
 928       goto free_return;
 929     }
 930   if (dfa->nbackref)
 931     {
 932       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 933       if (__glibc_unlikely (lim_states == NULL))
 934         {
 935           ret = REG_ESPACE;
 936           goto free_return;
 937         }
 938       while (1)
 939         {
 940           memset (lim_states, '\0',
 941                   sizeof (re_dfastate_t *) * (match_last + 1));
 942           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 943                          match_last);
 944           ret = sift_states_backward (mctx, &sctx);
 945           re_node_set_free (&sctx.limits);
 946           if (__glibc_unlikely (ret != REG_NOERROR))
 947               goto free_return;
 948           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 949             break;
 950           do
 951             {
 952               --match_last;
 953               if (match_last < 0)
 954                 {
 955                   ret = REG_NOMATCH;
 956                   goto free_return;
 957                 }
 958             } while (mctx->state_log[match_last] == NULL
 959                      || !mctx->state_log[match_last]->halt);
 960           halt_node = check_halt_state_context (mctx,
 961                                                 mctx->state_log[match_last],
 962                                                 match_last);
 963         }
 964       ret = merge_state_array (dfa, sifted_states, lim_states,
 965                                match_last + 1);
 966       re_free (lim_states);
 967       lim_states = NULL;
 968       if (__glibc_unlikely (ret != REG_NOERROR))
 969         goto free_return;
 970     }
 971   else
 972     {
 973       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 974       ret = sift_states_backward (mctx, &sctx);
 975       re_node_set_free (&sctx.limits);
 976       if (__glibc_unlikely (ret != REG_NOERROR))
 977         goto free_return;
 978       if (sifted_states[0] == NULL)
 979         {
 980           ret = REG_NOMATCH;
 981           goto free_return;
 982         }
 983     }
 984   re_free (mctx->state_log);
 985   mctx->state_log = sifted_states;
 986   sifted_states = NULL;
 987   mctx->last_node = halt_node;
 988   mctx->match_last = match_last;
 989   ret = REG_NOERROR;
 990  free_return:
 991   re_free (sifted_states);
 992   re_free (lim_states);
 993   return ret;
 994 }
 995
 996 /* Acquire an initial state and return it.
 997    We must select appropriate initial state depending on the context,
 998    since initial states may have constraints like "\<", "^", etc..  */
 999
1000 static inline re_dfastate_t *
1001 __attribute__ ((always_inline))
1002 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1003                             Idx idx)
1004 {
1005   const re_dfa_t *const dfa = mctx->dfa;
1006   if (dfa->init_state->has_constraint)
1007     {
1008       unsigned int context;
1009       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1010       if (IS_WORD_CONTEXT (context))
1011         return dfa->init_state_word;
1012       else if (IS_ORDINARY_CONTEXT (context))
1013         return dfa->init_state;
1014       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1015         return dfa->init_state_begbuf;
1016       else if (IS_NEWLINE_CONTEXT (context))
1017         return dfa->init_state_nl;
1018       else if (IS_BEGBUF_CONTEXT (context))
1019         {
1020           /* It is relatively rare case, then calculate on demand.  */
1021           return re_acquire_state_context (err, dfa,
1022                                            dfa->init_state->entrance_nodes,
1023                                            context);
1024         }
1025       else
1026         /* Must not happen?  */
1027         return dfa->init_state;
1028     }
1029   else
1030     return dfa->init_state;
1031 }
1032
1033 /* Check whether the regular expression match input string INPUT or not,
1034    and return the index where the matching end.  Return -1 if
1035    there is no match, and return -2 in case of an error.
1036    FL_LONGEST_MATCH means we want the POSIX longest matching.
1037    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1038    next place where we may want to try matching.
1039    Note that the matcher assumes that the matching starts from the current
1040    index of the buffer.  */
1041
1042 static Idx
1043 __attribute_warn_unused_result__
1044 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1045                 Idx *p_match_first)
1046 {
1047   const re_dfa_t *const dfa = mctx->dfa;
1048   reg_errcode_t err;
1049   Idx match = 0;
1050   Idx match_last = -1;
1051   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1052   re_dfastate_t *cur_state;
1053   bool at_init_state = p_match_first != NULL;
1054   Idx next_start_idx = cur_str_idx;
1055
1056   err = REG_NOERROR;
1057   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1058   /* An initial state must not be NULL (invalid).  */
1059   if (__glibc_unlikely (cur_state == NULL))
1060     {
1061       DEBUG_ASSERT (err == REG_ESPACE);
1062       return -2;
1063     }
1064
1065   if (mctx->state_log != NULL)
1066     {
1067       mctx->state_log[cur_str_idx] = cur_state;
1068
1069       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1070          later.  E.g. Processing back references.  */
1071       if (__glibc_unlikely (dfa->nbackref))
1072         {
1073           at_init_state = false;
1074           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1075           if (__glibc_unlikely (err != REG_NOERROR))
1076             return err;
1077
1078           if (cur_state->has_backref)
1079             {
1080               err = transit_state_bkref (mctx, &cur_state->nodes);
1081               if (__glibc_unlikely (err != REG_NOERROR))
1082                 return err;
1083             }
1084         }
1085     }
1086
1087   /* If the RE accepts NULL string.  */
1088   if (__glibc_unlikely (cur_state->halt))
1089     {
1090       if (!cur_state->has_constraint
1091           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1092         {
1093           if (!fl_longest_match)
1094             return cur_str_idx;
1095           else
1096             {
1097               match_last = cur_str_idx;
1098               match = 1;
1099             }
1100         }
1101     }
1102
1103   while (!re_string_eoi (&mctx->input))
1104     {
1105       re_dfastate_t *old_state = cur_state;
1106       Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1107
1108       if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
1109            && mctx->input.bufs_len < mctx->input.len)
1110           || (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
1111               && mctx->input.valid_len < mctx->input.len))
1112         {
1113           err = extend_buffers (mctx, next_char_idx + 1);
1114           if (__glibc_unlikely (err != REG_NOERROR))
1115             {
1116               DEBUG_ASSERT (err == REG_ESPACE);
1117               return -2;
1118             }
1119         }
1120
1121       cur_state = transit_state (&err, mctx, cur_state);
1122       if (mctx->state_log != NULL)
1123         cur_state = merge_state_with_log (&err, mctx, cur_state);
1124
1125       if (cur_state == NULL)
1126         {
1127           /* Reached the invalid state or an error.  Try to recover a valid
1128              state using the state log, if available and if we have not
1129              already found a valid (even if not the longest) match.  */
1130           if (__glibc_unlikely (err != REG_NOERROR))
1131             return -2;
1132
1133           if (mctx->state_log == NULL
1134               || (match && !fl_longest_match)
1135               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1136             break;
1137         }
1138
1139       if (__glibc_unlikely (at_init_state))
1140         {
1141           if (old_state == cur_state)
1142             next_start_idx = next_char_idx;
1143           else
1144             at_init_state = false;
1145         }
1146
1147       if (cur_state->halt)
1148         {
1149           /* Reached a halt state.
1150              Check the halt state can satisfy the current context.  */
1151           if (!cur_state->has_constraint
1152               || check_halt_state_context (mctx, cur_state,
1153                                            re_string_cur_idx (&mctx->input)))
1154             {
1155               /* We found an appropriate halt state.  */
1156               match_last = re_string_cur_idx (&mctx->input);
1157               match = 1;
1158
1159               /* We found a match, do not modify match_first below.  */
1160               p_match_first = NULL;
1161               if (!fl_longest_match)
1162                 break;
1163             }
1164         }
1165     }
1166
1167   if (p_match_first)
1168     *p_match_first += next_start_idx;
1169
1170   return match_last;
1171 }
1172
1173 /* Check NODE match the current context.  */
1174
1175 static bool
1176 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1177 {
1178   re_token_type_t type = dfa->nodes[node].type;
1179   unsigned int constraint = dfa->nodes[node].constraint;
1180   if (type != END_OF_RE)
1181     return false;
1182   if (!constraint)
1183     return true;
1184   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1185     return false;
1186   return true;
1187 }
1188
1189 /* Check the halt state STATE match the current context.
1190    Return 0 if not match, if the node, STATE has, is a halt node and
1191    match the context, return the node.  */
1192
1193 static Idx
1194 check_halt_state_context (const re_match_context_t *mctx,
1195                           const re_dfastate_t *state, Idx idx)
1196 {
1197   Idx i;
1198   unsigned int context;
1199   DEBUG_ASSERT (state->halt);
1200   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1201   for (i = 0; i < state->nodes.nelem; ++i)
1202     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1203       return state->nodes.elems[i];
1204   return 0;
1205 }
1206
1207 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1208    corresponding to the DFA).
1209    Return the destination node, and update EPS_VIA_NODES;
1210    return -1 on match failure, -2 on error.  */
1211
1212 static Idx
1213 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1214                    regmatch_t *prevregs,
1215                    Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1216                    struct re_fail_stack_t *fs)
1217 {
1218   const re_dfa_t *const dfa = mctx->dfa;
1219   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1220     {
1221       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1222       re_node_set *edests = &dfa->edests[node];
1223       bool ok = re_node_set_insert (eps_via_nodes, node);
1224       if (__glibc_unlikely (! ok))
1225         return -2;
1226
1227       /* Pick a valid destination, or return -1 if none is found.  */
1228       Idx dest_node = -1;
1229       for (Idx i = 0; i < edests->nelem; i++)
1230         {
1231           Idx candidate = edests->elems[i];
1232           if (!re_node_set_contains (cur_nodes, candidate))
1233             continue;
1234           if (dest_node == -1)
1235             dest_node = candidate;
1236
1237           else
1238             {
1239               /* In order to avoid infinite loop like "(a*)*", return the second
1240                  epsilon-transition if the first was already considered.  */
1241               if (re_node_set_contains (eps_via_nodes, dest_node))
1242                 return candidate;
1243
1244               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1245               else if (fs != NULL
1246                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1247                                            prevregs, eps_via_nodes))
1248                 return -2;
1249
1250               /* We know we are going to exit.  */
1251               break;
1252             }
1253         }
1254       return dest_node;
1255     }
1256   else
1257     {
1258       Idx naccepted = 0;
1259       re_token_type_t type = dfa->nodes[node].type;
1260
1261 #ifdef RE_ENABLE_I18N
1262       if (dfa->nodes[node].accept_mb)
1263         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1264       else
1265 #endif /* RE_ENABLE_I18N */
1266       if (type == OP_BACK_REF)
1267         {
1268           Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1269           if (subexp_idx < nregs)
1270             naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1271           if (fs != NULL)
1272             {
1273               if (subexp_idx >= nregs
1274                   || regs[subexp_idx].rm_so == -1
1275                   || regs[subexp_idx].rm_eo == -1)
1276                 return -1;
1277               else if (naccepted)
1278                 {
1279                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1280                   if (mctx->input.valid_len - *pidx < naccepted
1281                       || (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1282                                   naccepted)
1283                           != 0))
1284                     return -1;
1285                 }
1286             }
1287
1288           if (naccepted == 0)
1289             {
1290               Idx dest_node;
1291               bool ok = re_node_set_insert (eps_via_nodes, node);
1292               if (__glibc_unlikely (! ok))
1293                 return -2;
1294               dest_node = dfa->edests[node].elems[0];
1295               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1296                                         dest_node))
1297                 return dest_node;
1298             }
1299         }
1300
1301       if (naccepted != 0
1302           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1303         {
1304           Idx dest_node = dfa->nexts[node];
1305           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1306           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1307                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1308                                                dest_node)))
1309             return -1;
1310           re_node_set_empty (eps_via_nodes);
1311           return dest_node;
1312         }
1313     }
1314   return -1;
1315 }
1316
1317 static reg_errcode_t
1318 __attribute_warn_unused_result__
1319 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1320                  Idx nregs, regmatch_t *regs, regmatch_t *prevregs,
1321                  re_node_set *eps_via_nodes)
1322 {
1323   reg_errcode_t err;
1324   Idx num = fs->num++;
1325   if (fs->num == fs->alloc)
1326     {
1327       struct re_fail_stack_ent_t *new_array;
1328       new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1329                               fs->alloc * 2);
1330       if (new_array == NULL)
1331         return REG_ESPACE;
1332       fs->alloc *= 2;
1333       fs->stack = new_array;
1334     }
1335   fs->stack[num].idx = str_idx;
1336   fs->stack[num].node = dest_node;
1337   fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
1338   if (fs->stack[num].regs == NULL)
1339     return REG_ESPACE;
1340   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1341   memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
1342   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1343   return err;
1344 }
1345
1346 static Idx
1347 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1348                 regmatch_t *regs, regmatch_t *prevregs,
1349                 re_node_set *eps_via_nodes)
1350 {
1351   if (fs == NULL || fs->num == 0)
1352     return -1;
1353   Idx num = --fs->num;
1354   *pidx = fs->stack[num].idx;
1355   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1356   memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
1357   re_node_set_free (eps_via_nodes);
1358   re_free (fs->stack[num].regs);
1359   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1360   DEBUG_ASSERT (0 <= fs->stack[num].node);
1361   return fs->stack[num].node;
1362 }
1363
1364
1365 #define DYNARRAY_STRUCT  regmatch_list
1366 #define DYNARRAY_ELEMENT regmatch_t
1367 #define DYNARRAY_PREFIX  regmatch_list_
1368 #include <malloc/dynarray-skeleton.c>
1369
1370 /* Set the positions where the subexpressions are starts/ends to registers
1371    PMATCH.
1372    Note: We assume that pmatch[0] is already set, and
1373    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1374
1375 static reg_errcode_t
1376 __attribute_warn_unused_result__
1377 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1378           regmatch_t *pmatch, bool fl_backtrack)
1379 {
1380   const re_dfa_t *dfa = preg->buffer;
1381   Idx idx, cur_node;
1382   re_node_set eps_via_nodes;
1383   struct re_fail_stack_t *fs;
1384   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1385   struct regmatch_list prev_match;
1386   regmatch_list_init (&prev_match);
1387
1388   DEBUG_ASSERT (nmatch > 1);
1389   DEBUG_ASSERT (mctx->state_log != NULL);
1390   if (fl_backtrack)
1391     {
1392       fs = &fs_body;
1393       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1394       if (fs->stack == NULL)
1395         return REG_ESPACE;
1396     }
1397   else
1398     fs = NULL;
1399
1400   cur_node = dfa->init_node;
1401   re_node_set_init_empty (&eps_via_nodes);
1402
1403   if (!regmatch_list_resize (&prev_match, nmatch))
1404     {
1405       regmatch_list_free (&prev_match);
1406       free_fail_stack_return (fs);
1407       return REG_ESPACE;
1408     }
1409   regmatch_t *prev_idx_match = regmatch_list_begin (&prev_match);
1410   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1411
1412   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1413     {
1414       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1415
1416       if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1417           || re_node_set_contains (&eps_via_nodes, cur_node))
1418         {
1419           Idx reg_idx;
1420           cur_node = -1;
1421           if (fs)
1422             {
1423               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1424                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1425                   {
1426                     cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1427                                                prev_idx_match, &eps_via_nodes);
1428                     break;
1429                   }
1430             }
1431           if (cur_node < 0)
1432             {
1433               re_node_set_free (&eps_via_nodes);
1434               regmatch_list_free (&prev_match);
1435               return free_fail_stack_return (fs);
1436             }
1437         }
1438
1439       /* Proceed to next node.  */
1440       cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
1441                                     &idx, cur_node,
1442                                     &eps_via_nodes, fs);
1443
1444       if (__glibc_unlikely (cur_node < 0))
1445         {
1446           if (__glibc_unlikely (cur_node == -2))
1447             {
1448               re_node_set_free (&eps_via_nodes);
1449               regmatch_list_free (&prev_match);
1450               free_fail_stack_return (fs);
1451               return REG_ESPACE;
1452             }
1453           cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1454                                      prev_idx_match, &eps_via_nodes);
1455           if (cur_node < 0)
1456             {
1457               re_node_set_free (&eps_via_nodes);
1458               regmatch_list_free (&prev_match);
1459               free_fail_stack_return (fs);
1460               return REG_NOMATCH;
1461             }
1462         }
1463     }
1464   re_node_set_free (&eps_via_nodes);
1465   regmatch_list_free (&prev_match);
1466   return free_fail_stack_return (fs);
1467 }
1468
1469 static reg_errcode_t
1470 free_fail_stack_return (struct re_fail_stack_t *fs)
1471 {
1472   if (fs)
1473     {
1474       Idx fs_idx;
1475       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1476         {
1477           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1478           re_free (fs->stack[fs_idx].regs);
1479         }
1480       re_free (fs->stack);
1481     }
1482   return REG_NOERROR;
1483 }
1484
1485 static void
1486 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1487              regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1488 {
1489   int type = dfa->nodes[cur_node].type;
1490   if (type == OP_OPEN_SUBEXP)
1491     {
1492       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1493
1494       /* We are at the first node of this sub expression.  */
1495       if (reg_num < nmatch)
1496         {
1497           pmatch[reg_num].rm_so = cur_idx;
1498           pmatch[reg_num].rm_eo = -1;
1499         }
1500     }
1501   else if (type == OP_CLOSE_SUBEXP)
1502     {
1503       /* We are at the last node of this sub expression.  */
1504       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1505       if (reg_num < nmatch)
1506         {
1507           if (pmatch[reg_num].rm_so < cur_idx)
1508             {
1509               pmatch[reg_num].rm_eo = cur_idx;
1510               /* This is a non-empty match or we are not inside an optional
1511                  subexpression.  Accept this right away.  */
1512               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1513             }
1514           else
1515             {
1516               if (dfa->nodes[cur_node].opt_subexp
1517                   && prev_idx_match[reg_num].rm_so != -1)
1518                 /* We transited through an empty match for an optional
1519                    subexpression, like (a?)*, and this is not the subexp's
1520                    first match.  Copy back the old content of the registers
1521                    so that matches of an inner subexpression are undone as
1522                    well, like in ((a?))*.  */
1523                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1524               else
1525                 /* We completed a subexpression, but it may be part of
1526                    an optional one, so do not update PREV_IDX_MATCH.  */
1527                 pmatch[reg_num].rm_eo = cur_idx;
1528             }
1529         }
1530     }
1531 }
1532
1533 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1534    and sift the nodes in each states according to the following rules.
1535    Updated state_log will be wrote to STATE_LOG.
1536
1537    Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1538      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1539         If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1540         the LAST_NODE, we throw away the node 'a'.
1541      2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1542         string 's' and transit to 'b':
1543         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1544            away the node 'a'.
1545         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1546             thrown away, we throw away the node 'a'.
1547      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1548         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1549            node 'a'.
1550         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1551             we throw away the node 'a'.  */
1552
1553 #define STATE_NODE_CONTAINS(state,node) \
1554   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1555
1556 static reg_errcode_t
1557 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1558 {
1559   reg_errcode_t err;
1560   int null_cnt = 0;
1561   Idx str_idx = sctx->last_str_idx;
1562   re_node_set cur_dest;
1563
1564   DEBUG_ASSERT (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1565
1566   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1567      transit to the last_node and the last_node itself.  */
1568   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1569   if (__glibc_unlikely (err != REG_NOERROR))
1570     return err;
1571   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1572   if (__glibc_unlikely (err != REG_NOERROR))
1573     goto free_return;
1574
1575   /* Then check each states in the state_log.  */
1576   while (str_idx > 0)
1577     {
1578       /* Update counters.  */
1579       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1580       if (null_cnt > mctx->max_mb_elem_len)
1581         {
1582           memset (sctx->sifted_states, '\0',
1583                   sizeof (re_dfastate_t *) * str_idx);
1584           re_node_set_free (&cur_dest);
1585           return REG_NOERROR;
1586         }
1587       re_node_set_empty (&cur_dest);
1588       --str_idx;
1589
1590       if (mctx->state_log[str_idx])
1591         {
1592           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1593           if (__glibc_unlikely (err != REG_NOERROR))
1594             goto free_return;
1595         }
1596
1597       /* Add all the nodes which satisfy the following conditions:
1598          - It can epsilon transit to a node in CUR_DEST.
1599          - It is in CUR_SRC.
1600          And update state_log.  */
1601       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1602       if (__glibc_unlikely (err != REG_NOERROR))
1603         goto free_return;
1604     }
1605   err = REG_NOERROR;
1606  free_return:
1607   re_node_set_free (&cur_dest);
1608   return err;
1609 }
1610
1611 static reg_errcode_t
1612 __attribute_warn_unused_result__
1613 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1614                      Idx str_idx, re_node_set *cur_dest)
1615 {
1616   const re_dfa_t *const dfa = mctx->dfa;
1617   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1618   Idx i;
1619
1620   /* Then build the next sifted state.
1621      We build the next sifted state on 'cur_dest', and update
1622      'sifted_states[str_idx]' with 'cur_dest'.
1623      Note:
1624      'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1625      'cur_src' points the node_set of the old 'state_log[str_idx]'
1626      (with the epsilon nodes pre-filtered out).  */
1627   for (i = 0; i < cur_src->nelem; i++)
1628     {
1629       Idx prev_node = cur_src->elems[i];
1630       int naccepted = 0;
1631       bool ok;
1632       DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[prev_node].type));
1633
1634 #ifdef RE_ENABLE_I18N
1635       /* If the node may accept "multi byte".  */
1636       if (dfa->nodes[prev_node].accept_mb)
1637         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1638                                          str_idx, sctx->last_str_idx);
1639 #endif /* RE_ENABLE_I18N */
1640
1641       /* We don't check backreferences here.
1642          See update_cur_sifted_state().  */
1643       if (!naccepted
1644           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1645           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1646                                   dfa->nexts[prev_node]))
1647         naccepted = 1;
1648
1649       if (naccepted == 0)
1650         continue;
1651
1652       if (sctx->limits.nelem)
1653         {
1654           Idx to_idx = str_idx + naccepted;
1655           if (check_dst_limits (mctx, &sctx->limits,
1656                                 dfa->nexts[prev_node], to_idx,
1657                                 prev_node, str_idx))
1658             continue;
1659         }
1660       ok = re_node_set_insert (cur_dest, prev_node);
1661       if (__glibc_unlikely (! ok))
1662         return REG_ESPACE;
1663     }
1664
1665   return REG_NOERROR;
1666 }
1667
1668 /* Helper functions.  */
1669
1670 static reg_errcode_t
1671 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1672 {
1673   Idx top = mctx->state_log_top;
1674
1675   if ((next_state_log_idx >= mctx->input.bufs_len
1676        && mctx->input.bufs_len < mctx->input.len)
1677       || (next_state_log_idx >= mctx->input.valid_len
1678           && mctx->input.valid_len < mctx->input.len))
1679     {
1680       reg_errcode_t err;
1681       err = extend_buffers (mctx, next_state_log_idx + 1);
1682       if (__glibc_unlikely (err != REG_NOERROR))
1683         return err;
1684     }
1685
1686   if (top < next_state_log_idx)
1687     {
1688       memset (mctx->state_log + top + 1, '\0',
1689               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1690       mctx->state_log_top = next_state_log_idx;
1691     }
1692   return REG_NOERROR;
1693 }
1694
1695 static reg_errcode_t
1696 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1697                    re_dfastate_t **src, Idx num)
1698 {
1699   Idx st_idx;
1700   reg_errcode_t err;
1701   for (st_idx = 0; st_idx < num; ++st_idx)
1702     {
1703       if (dst[st_idx] == NULL)
1704         dst[st_idx] = src[st_idx];
1705       else if (src[st_idx] != NULL)
1706         {
1707           re_node_set merged_set;
1708           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1709                                         &src[st_idx]->nodes);
1710           if (__glibc_unlikely (err != REG_NOERROR))
1711             return err;
1712           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1713           re_node_set_free (&merged_set);
1714           if (__glibc_unlikely (err != REG_NOERROR))
1715             return err;
1716         }
1717     }
1718   return REG_NOERROR;
1719 }
1720
1721 static reg_errcode_t
1722 update_cur_sifted_state (const re_match_context_t *mctx,
1723                          re_sift_context_t *sctx, Idx str_idx,
1724                          re_node_set *dest_nodes)
1725 {
1726   const re_dfa_t *const dfa = mctx->dfa;
1727   reg_errcode_t err = REG_NOERROR;
1728   const re_node_set *candidates;
1729   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1730                 : &mctx->state_log[str_idx]->nodes);
1731
1732   if (dest_nodes->nelem == 0)
1733     sctx->sifted_states[str_idx] = NULL;
1734   else
1735     {
1736       if (candidates)
1737         {
1738           /* At first, add the nodes which can epsilon transit to a node in
1739              DEST_NODE.  */
1740           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1741           if (__glibc_unlikely (err != REG_NOERROR))
1742             return err;
1743
1744           /* Then, check the limitations in the current sift_context.  */
1745           if (sctx->limits.nelem)
1746             {
1747               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1748                                          mctx->bkref_ents, str_idx);
1749               if (__glibc_unlikely (err != REG_NOERROR))
1750                 return err;
1751             }
1752         }
1753
1754       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1755       if (__glibc_unlikely (err != REG_NOERROR))
1756         return err;
1757     }
1758
1759   if (candidates && mctx->state_log[str_idx]->has_backref)
1760     {
1761       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1762       if (__glibc_unlikely (err != REG_NOERROR))
1763         return err;
1764     }
1765   return REG_NOERROR;
1766 }
1767
1768 static reg_errcode_t
1769 __attribute_warn_unused_result__
1770 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1771                        const re_node_set *candidates)
1772 {
1773   reg_errcode_t err = REG_NOERROR;
1774   Idx i;
1775
1776   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1777   if (__glibc_unlikely (err != REG_NOERROR))
1778     return err;
1779
1780   if (!state->inveclosure.alloc)
1781     {
1782       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1783       if (__glibc_unlikely (err != REG_NOERROR))
1784         return REG_ESPACE;
1785       for (i = 0; i < dest_nodes->nelem; i++)
1786         {
1787           err = re_node_set_merge (&state->inveclosure,
1788                                    dfa->inveclosures + dest_nodes->elems[i]);
1789           if (__glibc_unlikely (err != REG_NOERROR))
1790             return REG_ESPACE;
1791         }
1792     }
1793   return re_node_set_add_intersect (dest_nodes, candidates,
1794                                     &state->inveclosure);
1795 }
1796
1797 static reg_errcode_t
1798 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1799                        const re_node_set *candidates)
1800 {
1801     Idx ecl_idx;
1802     reg_errcode_t err;
1803     re_node_set *inv_eclosure = dfa->inveclosures + node;
1804     re_node_set except_nodes;
1805     re_node_set_init_empty (&except_nodes);
1806     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1807       {
1808         Idx cur_node = inv_eclosure->elems[ecl_idx];
1809         if (cur_node == node)
1810           continue;
1811         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1812           {
1813             Idx edst1 = dfa->edests[cur_node].elems[0];
1814             Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1815                          ? dfa->edests[cur_node].elems[1] : -1);
1816             if ((!re_node_set_contains (inv_eclosure, edst1)
1817                  && re_node_set_contains (dest_nodes, edst1))
1818                 || (edst2 > 0
1819                     && !re_node_set_contains (inv_eclosure, edst2)
1820                     && re_node_set_contains (dest_nodes, edst2)))
1821               {
1822                 err = re_node_set_add_intersect (&except_nodes, candidates,
1823                                                  dfa->inveclosures + cur_node);
1824                 if (__glibc_unlikely (err != REG_NOERROR))
1825                   {
1826                     re_node_set_free (&except_nodes);
1827                     return err;
1828                   }
1829               }
1830           }
1831       }
1832     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1833       {
1834         Idx cur_node = inv_eclosure->elems[ecl_idx];
1835         if (!re_node_set_contains (&except_nodes, cur_node))
1836           {
1837             Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1838             re_node_set_remove_at (dest_nodes, idx);
1839           }
1840       }
1841     re_node_set_free (&except_nodes);
1842     return REG_NOERROR;
1843 }
1844
1845 static bool
1846 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1847                   Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1848 {
1849   const re_dfa_t *const dfa = mctx->dfa;
1850   Idx lim_idx, src_pos, dst_pos;
1851
1852   Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1853   Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1854   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1855     {
1856       Idx subexp_idx;
1857       struct re_backref_cache_entry *ent;
1858       ent = mctx->bkref_ents + limits->elems[lim_idx];
1859       subexp_idx = dfa->nodes[ent->node].opr.idx;
1860
1861       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1862                                            subexp_idx, dst_node, dst_idx,
1863                                            dst_bkref_idx);
1864       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1865                                            subexp_idx, src_node, src_idx,
1866                                            src_bkref_idx);
1867
1868       /* In case of:
1869          <src> <dst> ( <subexp> )
1870          ( <subexp> ) <src> <dst>
1871          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1872       if (src_pos == dst_pos)
1873         continue; /* This is unrelated limitation.  */
1874       else
1875         return true;
1876     }
1877   return false;
1878 }
1879
1880 static int
1881 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1882                              Idx subexp_idx, Idx from_node, Idx bkref_idx)
1883 {
1884   const re_dfa_t *const dfa = mctx->dfa;
1885   const re_node_set *eclosures = dfa->eclosures + from_node;
1886   Idx node_idx;
1887
1888   /* Else, we are on the boundary: examine the nodes on the epsilon
1889      closure.  */
1890   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1891     {
1892       Idx node = eclosures->elems[node_idx];
1893       switch (dfa->nodes[node].type)
1894         {
1895         case OP_BACK_REF:
1896           if (bkref_idx != -1)
1897             {
1898               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1899               do
1900                 {
1901                   Idx dst;
1902                   int cpos;
1903
1904                   if (ent->node != node)
1905                     continue;
1906
1907                   if (subexp_idx < BITSET_WORD_BITS
1908                       && !(ent->eps_reachable_subexps_map
1909                            & ((bitset_word_t) 1 << subexp_idx)))
1910                     continue;
1911
1912                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1913                      OP_CLOSE_SUBEXP cases below.  But, if the
1914                      destination node is the same node as the source
1915                      node, don't recurse because it would cause an
1916                      infinite loop: a regex that exhibits this behavior
1917                      is ()\1*\1*  */
1918                   dst = dfa->edests[node].elems[0];
1919                   if (dst == from_node)
1920                     {
1921                       if (boundaries & 1)
1922                         return -1;
1923                       else /* if (boundaries & 2) */
1924                         return 0;
1925                     }
1926
1927                   cpos =
1928                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1929                                                  dst, bkref_idx);
1930                   if (cpos == -1 /* && (boundaries & 1) */)
1931                     return -1;
1932                   if (cpos == 0 && (boundaries & 2))
1933                     return 0;
1934
1935                   if (subexp_idx < BITSET_WORD_BITS)
1936                     ent->eps_reachable_subexps_map
1937                       &= ~((bitset_word_t) 1 << subexp_idx);
1938                 }
1939               while (ent++->more);
1940             }
1941           break;
1942
1943         case OP_OPEN_SUBEXP:
1944           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1945             return -1;
1946           break;
1947
1948         case OP_CLOSE_SUBEXP:
1949           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1950             return 0;
1951           break;
1952
1953         default:
1954             break;
1955         }
1956     }
1957
1958   return (boundaries & 2) ? 1 : 0;
1959 }
1960
1961 static int
1962 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1963                            Idx subexp_idx, Idx from_node, Idx str_idx,
1964                            Idx bkref_idx)
1965 {
1966   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1967   int boundaries;
1968
1969   /* If we are outside the range of the subexpression, return -1 or 1.  */
1970   if (str_idx < lim->subexp_from)
1971     return -1;
1972
1973   if (lim->subexp_to < str_idx)
1974     return 1;
1975
1976   /* If we are within the subexpression, return 0.  */
1977   boundaries = (str_idx == lim->subexp_from);
1978   boundaries |= (str_idx == lim->subexp_to) << 1;
1979   if (boundaries == 0)
1980     return 0;
1981
1982   /* Else, examine epsilon closure.  */
1983   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1984                                       from_node, bkref_idx);
1985 }
1986
1987 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1988    which are against limitations from DEST_NODES. */
1989
1990 static reg_errcode_t
1991 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1992                      const re_node_set *candidates, re_node_set *limits,
1993                      struct re_backref_cache_entry *bkref_ents, Idx str_idx)
1994 {
1995   reg_errcode_t err;
1996   Idx node_idx, lim_idx;
1997
1998   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1999     {
2000       Idx subexp_idx;
2001       struct re_backref_cache_entry *ent;
2002       ent = bkref_ents + limits->elems[lim_idx];
2003
2004       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2005         continue; /* This is unrelated limitation.  */
2006
2007       subexp_idx = dfa->nodes[ent->node].opr.idx;
2008       if (ent->subexp_to == str_idx)
2009         {
2010           Idx ops_node = -1;
2011           Idx cls_node = -1;
2012           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2013             {
2014               Idx node = dest_nodes->elems[node_idx];
2015               re_token_type_t type = dfa->nodes[node].type;
2016               if (type == OP_OPEN_SUBEXP
2017                   && subexp_idx == dfa->nodes[node].opr.idx)
2018                 ops_node = node;
2019               else if (type == OP_CLOSE_SUBEXP
2020                        && subexp_idx == dfa->nodes[node].opr.idx)
2021                 cls_node = node;
2022             }
2023
2024           /* Check the limitation of the open subexpression.  */
2025           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2026           if (ops_node >= 0)
2027             {
2028               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2029                                            candidates);
2030               if (__glibc_unlikely (err != REG_NOERROR))
2031                 return err;
2032             }
2033
2034           /* Check the limitation of the close subexpression.  */
2035           if (cls_node >= 0)
2036             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2037               {
2038                 Idx node = dest_nodes->elems[node_idx];
2039                 if (!re_node_set_contains (dfa->inveclosures + node,
2040                                            cls_node)
2041                     && !re_node_set_contains (dfa->eclosures + node,
2042                                               cls_node))
2043                   {
2044                     /* It is against this limitation.
2045                        Remove it form the current sifted state.  */
2046                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2047                                                  candidates);
2048                     if (__glibc_unlikely (err != REG_NOERROR))
2049                       return err;
2050                     --node_idx;
2051                   }
2052               }
2053         }
2054       else /* (ent->subexp_to != str_idx)  */
2055         {
2056           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2057             {
2058               Idx node = dest_nodes->elems[node_idx];
2059               re_token_type_t type = dfa->nodes[node].type;
2060               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2061                 {
2062                   if (subexp_idx != dfa->nodes[node].opr.idx)
2063                     continue;
2064                   /* It is against this limitation.
2065                      Remove it form the current sifted state.  */
2066                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2067                                                candidates);
2068                   if (__glibc_unlikely (err != REG_NOERROR))
2069                     return err;
2070                 }
2071             }
2072         }
2073     }
2074   return REG_NOERROR;
2075 }
2076
2077 static reg_errcode_t
2078 __attribute_warn_unused_result__
2079 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2080                    Idx str_idx, const re_node_set *candidates)
2081 {
2082   const re_dfa_t *const dfa = mctx->dfa;
2083   reg_errcode_t err;
2084   Idx node_idx, node;
2085   re_sift_context_t local_sctx;
2086   Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2087
2088   if (first_idx == -1)
2089     return REG_NOERROR;
2090
2091   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2092
2093   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2094     {
2095       Idx enabled_idx;
2096       re_token_type_t type;
2097       struct re_backref_cache_entry *entry;
2098       node = candidates->elems[node_idx];
2099       type = dfa->nodes[node].type;
2100       /* Avoid infinite loop for the REs like "()\1+".  */
2101       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2102         continue;
2103       if (type != OP_BACK_REF)
2104         continue;
2105
2106       entry = mctx->bkref_ents + first_idx;
2107       enabled_idx = first_idx;
2108       do
2109         {
2110           Idx subexp_len;
2111           Idx to_idx;
2112           Idx dst_node;
2113           bool ok;
2114           re_dfastate_t *cur_state;
2115
2116           if (entry->node != node)
2117             continue;
2118           subexp_len = entry->subexp_to - entry->subexp_from;
2119           to_idx = str_idx + subexp_len;
2120           dst_node = (subexp_len ? dfa->nexts[node]
2121                       : dfa->edests[node].elems[0]);
2122
2123           if (to_idx > sctx->last_str_idx
2124               || sctx->sifted_states[to_idx] == NULL
2125               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2126               || check_dst_limits (mctx, &sctx->limits, node,
2127                                    str_idx, dst_node, to_idx))
2128             continue;
2129
2130           if (local_sctx.sifted_states == NULL)
2131             {
2132               local_sctx = *sctx;
2133               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2134               if (__glibc_unlikely (err != REG_NOERROR))
2135                 goto free_return;
2136             }
2137           local_sctx.last_node = node;
2138           local_sctx.last_str_idx = str_idx;
2139           ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2140           if (__glibc_unlikely (! ok))
2141             {
2142               err = REG_ESPACE;
2143               goto free_return;
2144             }
2145           cur_state = local_sctx.sifted_states[str_idx];
2146           err = sift_states_backward (mctx, &local_sctx);
2147           if (__glibc_unlikely (err != REG_NOERROR))
2148             goto free_return;
2149           if (sctx->limited_states != NULL)
2150             {
2151               err = merge_state_array (dfa, sctx->limited_states,
2152                                        local_sctx.sifted_states,
2153                                        str_idx + 1);
2154               if (__glibc_unlikely (err != REG_NOERROR))
2155                 goto free_return;
2156             }
2157           local_sctx.sifted_states[str_idx] = cur_state;
2158           re_node_set_remove (&local_sctx.limits, enabled_idx);
2159
2160           /* mctx->bkref_ents may have changed, reload the pointer.  */
2161           entry = mctx->bkref_ents + enabled_idx;
2162         }
2163       while (enabled_idx++, entry++->more);
2164     }
2165   err = REG_NOERROR;
2166  free_return:
2167   if (local_sctx.sifted_states != NULL)
2168     {
2169       re_node_set_free (&local_sctx.limits);
2170     }
2171
2172   return err;
2173 }
2174
2175
2176 #ifdef RE_ENABLE_I18N
2177 static int
2178 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2179                      Idx node_idx, Idx str_idx, Idx max_str_idx)
2180 {
2181   const re_dfa_t *const dfa = mctx->dfa;
2182   int naccepted;
2183   /* Check the node can accept "multi byte".  */
2184   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2185   if (naccepted > 0 && str_idx + naccepted <= max_str_idx
2186       && !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2187                                dfa->nexts[node_idx]))
2188     /* The node can't accept the "multi byte", or the
2189        destination was already thrown away, then the node
2190        couldn't accept the current input "multi byte".   */
2191     naccepted = 0;
2192   /* Otherwise, it is sure that the node could accept
2193      'naccepted' bytes input.  */
2194   return naccepted;
2195 }
2196 #endif /* RE_ENABLE_I18N */
2197
2198 \f
2199 /* Functions for state transition.  */
2200
2201 /* Return the next state to which the current state STATE will transit by
2202    accepting the current input byte, and update STATE_LOG if necessary.
2203    Return NULL on failure.
2204    If STATE can accept a multibyte char/collating element/back reference
2205    update the destination of STATE_LOG.  */
2206
2207 static re_dfastate_t *
2208 __attribute_warn_unused_result__
2209 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2210                re_dfastate_t *state)
2211 {
2212   re_dfastate_t **trtable;
2213   unsigned char ch;
2214
2215 #ifdef RE_ENABLE_I18N
2216   /* If the current state can accept multibyte.  */
2217   if (__glibc_unlikely (state->accept_mb))
2218     {
2219       *err = transit_state_mb (mctx, state);
2220       if (__glibc_unlikely (*err != REG_NOERROR))
2221         return NULL;
2222     }
2223 #endif /* RE_ENABLE_I18N */
2224
2225   /* Then decide the next state with the single byte.  */
2226 #if 0
2227   if (0)
2228     /* don't use transition table  */
2229     return transit_state_sb (err, mctx, state);
2230 #endif
2231
2232   /* Use transition table  */
2233   ch = re_string_fetch_byte (&mctx->input);
2234   for (;;)
2235     {
2236       trtable = state->trtable;
2237       if (__glibc_likely (trtable != NULL))
2238         return trtable[ch];
2239
2240       trtable = state->word_trtable;
2241       if (__glibc_likely (trtable != NULL))
2242         {
2243           unsigned int context;
2244           context
2245             = re_string_context_at (&mctx->input,
2246                                     re_string_cur_idx (&mctx->input) - 1,
2247                                     mctx->eflags);
2248           if (IS_WORD_CONTEXT (context))
2249             return trtable[ch + SBC_MAX];
2250           else
2251             return trtable[ch];
2252         }
2253
2254       if (!build_trtable (mctx->dfa, state))
2255         {
2256           *err = REG_ESPACE;
2257           return NULL;
2258         }
2259
2260       /* Retry, we now have a transition table.  */
2261     }
2262 }
2263
2264 /* Update the state_log if we need */
2265 static re_dfastate_t *
2266 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2267                       re_dfastate_t *next_state)
2268 {
2269   const re_dfa_t *const dfa = mctx->dfa;
2270   Idx cur_idx = re_string_cur_idx (&mctx->input);
2271
2272   if (cur_idx > mctx->state_log_top)
2273     {
2274       mctx->state_log[cur_idx] = next_state;
2275       mctx->state_log_top = cur_idx;
2276     }
2277   else if (mctx->state_log[cur_idx] == 0)
2278     {
2279       mctx->state_log[cur_idx] = next_state;
2280     }
2281   else
2282     {
2283       re_dfastate_t *pstate;
2284       unsigned int context;
2285       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2286       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2287          the destination of a multibyte char/collating element/
2288          back reference.  Then the next state is the union set of
2289          these destinations and the results of the transition table.  */
2290       pstate = mctx->state_log[cur_idx];
2291       log_nodes = pstate->entrance_nodes;
2292       if (next_state != NULL)
2293         {
2294           table_nodes = next_state->entrance_nodes;
2295           *err = re_node_set_init_union (&next_nodes, table_nodes,
2296                                              log_nodes);
2297           if (__glibc_unlikely (*err != REG_NOERROR))
2298             return NULL;
2299         }
2300       else
2301         next_nodes = *log_nodes;
2302       /* Note: We already add the nodes of the initial state,
2303          then we don't need to add them here.  */
2304
2305       context = re_string_context_at (&mctx->input,
2306                                       re_string_cur_idx (&mctx->input) - 1,
2307                                       mctx->eflags);
2308       next_state = mctx->state_log[cur_idx]
2309         = re_acquire_state_context (err, dfa, &next_nodes, context);
2310       /* We don't need to check errors here, since the return value of
2311          this function is next_state and ERR is already set.  */
2312
2313       if (table_nodes != NULL)
2314         re_node_set_free (&next_nodes);
2315     }
2316
2317   if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
2318     {
2319       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2320          later.  We must check them here, since the back references in the
2321          next state might use them.  */
2322       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2323                                         cur_idx);
2324       if (__glibc_unlikely (*err != REG_NOERROR))
2325         return NULL;
2326
2327       /* If the next state has back references.  */
2328       if (next_state->has_backref)
2329         {
2330           *err = transit_state_bkref (mctx, &next_state->nodes);
2331           if (__glibc_unlikely (*err != REG_NOERROR))
2332             return NULL;
2333           next_state = mctx->state_log[cur_idx];
2334         }
2335     }
2336
2337   return next_state;
2338 }
2339
2340 /* Skip bytes in the input that correspond to part of a
2341    multi-byte match, then look in the log for a state
2342    from which to restart matching.  */
2343 static re_dfastate_t *
2344 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2345 {
2346   re_dfastate_t *cur_state;
2347   do
2348     {
2349       Idx max = mctx->state_log_top;
2350       Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2351
2352       do
2353         {
2354           if (++cur_str_idx > max)
2355             return NULL;
2356           re_string_skip_bytes (&mctx->input, 1);
2357         }
2358       while (mctx->state_log[cur_str_idx] == NULL);
2359
2360       cur_state = merge_state_with_log (err, mctx, NULL);
2361     }
2362   while (*err == REG_NOERROR && cur_state == NULL);
2363   return cur_state;
2364 }
2365
2366 /* Helper functions for transit_state.  */
2367
2368 /* From the node set CUR_NODES, pick up the nodes whose types are
2369    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2370    expression. And register them to use them later for evaluating the
2371    corresponding back references.  */
2372
2373 static reg_errcode_t
2374 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2375                            Idx str_idx)
2376 {
2377   const re_dfa_t *const dfa = mctx->dfa;
2378   Idx node_idx;
2379   reg_errcode_t err;
2380
2381   /* TODO: This isn't efficient.
2382            Because there might be more than one nodes whose types are
2383            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2384            nodes.
2385            E.g. RE: (a){2}  */
2386   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2387     {
2388       Idx node = cur_nodes->elems[node_idx];
2389       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2390           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2391           && (dfa->used_bkref_map
2392               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2393         {
2394           err = match_ctx_add_subtop (mctx, node, str_idx);
2395           if (__glibc_unlikely (err != REG_NOERROR))
2396             return err;
2397         }
2398     }
2399   return REG_NOERROR;
2400 }
2401
2402 #if 0
2403 /* Return the next state to which the current state STATE will transit by
2404    accepting the current input byte.  Return NULL on failure.  */
2405
2406 static re_dfastate_t *
2407 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2408                   re_dfastate_t *state)
2409 {
2410   const re_dfa_t *const dfa = mctx->dfa;
2411   re_node_set next_nodes;
2412   re_dfastate_t *next_state;
2413   Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2414   unsigned int context;
2415
2416   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2417   if (__glibc_unlikely (*err != REG_NOERROR))
2418     return NULL;
2419   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2420     {
2421       Idx cur_node = state->nodes.elems[node_cnt];
2422       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2423         {
2424           *err = re_node_set_merge (&next_nodes,
2425                                     dfa->eclosures + dfa->nexts[cur_node]);
2426           if (__glibc_unlikely (*err != REG_NOERROR))
2427             {
2428               re_node_set_free (&next_nodes);
2429               return NULL;
2430             }
2431         }
2432     }
2433   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2434   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2435   /* We don't need to check errors here, since the return value of
2436      this function is next_state and ERR is already set.  */
2437
2438   re_node_set_free (&next_nodes);
2439   re_string_skip_bytes (&mctx->input, 1);
2440   return next_state;
2441 }
2442 #endif
2443
2444 #ifdef RE_ENABLE_I18N
2445 static reg_errcode_t
2446 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2447 {
2448   const re_dfa_t *const dfa = mctx->dfa;
2449   reg_errcode_t err;
2450   Idx i;
2451
2452   for (i = 0; i < pstate->nodes.nelem; ++i)
2453     {
2454       re_node_set dest_nodes, *new_nodes;
2455       Idx cur_node_idx = pstate->nodes.elems[i];
2456       int naccepted;
2457       Idx dest_idx;
2458       unsigned int context;
2459       re_dfastate_t *dest_state;
2460
2461       if (!dfa->nodes[cur_node_idx].accept_mb)
2462         continue;
2463
2464       if (dfa->nodes[cur_node_idx].constraint)
2465         {
2466           context = re_string_context_at (&mctx->input,
2467                                           re_string_cur_idx (&mctx->input),
2468                                           mctx->eflags);
2469           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2470                                            context))
2471             continue;
2472         }
2473
2474       /* How many bytes the node can accept?  */
2475       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2476                                            re_string_cur_idx (&mctx->input));
2477       if (naccepted == 0)
2478         continue;
2479
2480       /* The node can accepts 'naccepted' bytes.  */
2481       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2482       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2483                                : mctx->max_mb_elem_len);
2484       err = clean_state_log_if_needed (mctx, dest_idx);
2485       if (__glibc_unlikely (err != REG_NOERROR))
2486         return err;
2487       DEBUG_ASSERT (dfa->nexts[cur_node_idx] != -1);
2488       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2489
2490       dest_state = mctx->state_log[dest_idx];
2491       if (dest_state == NULL)
2492         dest_nodes = *new_nodes;
2493       else
2494         {
2495           err = re_node_set_init_union (&dest_nodes,
2496                                         dest_state->entrance_nodes, new_nodes);
2497           if (__glibc_unlikely (err != REG_NOERROR))
2498             return err;
2499         }
2500       context = re_string_context_at (&mctx->input, dest_idx - 1,
2501                                       mctx->eflags);
2502       mctx->state_log[dest_idx]
2503         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2504       if (dest_state != NULL)
2505         re_node_set_free (&dest_nodes);
2506       if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
2507                             && err != REG_NOERROR))
2508         return err;
2509     }
2510   return REG_NOERROR;
2511 }
2512 #endif /* RE_ENABLE_I18N */
2513
2514 static reg_errcode_t
2515 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2516 {
2517   const re_dfa_t *const dfa = mctx->dfa;
2518   reg_errcode_t err;
2519   Idx i;
2520   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2521
2522   for (i = 0; i < nodes->nelem; ++i)
2523     {
2524       Idx dest_str_idx, prev_nelem, bkc_idx;
2525       Idx node_idx = nodes->elems[i];
2526       unsigned int context;
2527       const re_token_t *node = dfa->nodes + node_idx;
2528       re_node_set *new_dest_nodes;
2529
2530       /* Check whether 'node' is a backreference or not.  */
2531       if (node->type != OP_BACK_REF)
2532         continue;
2533
2534       if (node->constraint)
2535         {
2536           context = re_string_context_at (&mctx->input, cur_str_idx,
2537                                           mctx->eflags);
2538           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2539             continue;
2540         }
2541
2542       /* 'node' is a backreference.
2543          Check the substring which the substring matched.  */
2544       bkc_idx = mctx->nbkref_ents;
2545       err = get_subexp (mctx, node_idx, cur_str_idx);
2546       if (__glibc_unlikely (err != REG_NOERROR))
2547         goto free_return;
2548
2549       /* And add the epsilon closures (which is 'new_dest_nodes') of
2550          the backreference to appropriate state_log.  */
2551       DEBUG_ASSERT (dfa->nexts[node_idx] != -1);
2552       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2553         {
2554           Idx subexp_len;
2555           re_dfastate_t *dest_state;
2556           struct re_backref_cache_entry *bkref_ent;
2557           bkref_ent = mctx->bkref_ents + bkc_idx;
2558           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2559             continue;
2560           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2561           new_dest_nodes = (subexp_len == 0
2562                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2563                             : dfa->eclosures + dfa->nexts[node_idx]);
2564           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2565                           - bkref_ent->subexp_from);
2566           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2567                                           mctx->eflags);
2568           dest_state = mctx->state_log[dest_str_idx];
2569           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2570                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2571           /* Add 'new_dest_node' to state_log.  */
2572           if (dest_state == NULL)
2573             {
2574               mctx->state_log[dest_str_idx]
2575                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2576                                             context);
2577               if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2578                                     && err != REG_NOERROR))
2579                 goto free_return;
2580             }
2581           else
2582             {
2583               re_node_set dest_nodes;
2584               err = re_node_set_init_union (&dest_nodes,
2585                                             dest_state->entrance_nodes,
2586                                             new_dest_nodes);
2587               if (__glibc_unlikely (err != REG_NOERROR))
2588                 {
2589                   re_node_set_free (&dest_nodes);
2590                   goto free_return;
2591                 }
2592               mctx->state_log[dest_str_idx]
2593                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2594               re_node_set_free (&dest_nodes);
2595               if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2596                                     && err != REG_NOERROR))
2597                 goto free_return;
2598             }
2599           /* We need to check recursively if the backreference can epsilon
2600              transit.  */
2601           if (subexp_len == 0
2602               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2603             {
2604               err = check_subexp_matching_top (mctx, new_dest_nodes,
2605                                                cur_str_idx);
2606               if (__glibc_unlikely (err != REG_NOERROR))
2607                 goto free_return;
2608               err = transit_state_bkref (mctx, new_dest_nodes);
2609               if (__glibc_unlikely (err != REG_NOERROR))
2610                 goto free_return;
2611             }
2612         }
2613     }
2614   err = REG_NOERROR;
2615  free_return:
2616   return err;
2617 }
2618
2619 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2620    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2621    Note that we might collect inappropriate candidates here.
2622    However, the cost of checking them strictly here is too high, then we
2623    delay these checking for prune_impossible_nodes().  */
2624
2625 static reg_errcode_t
2626 __attribute_warn_unused_result__
2627 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2628 {
2629   const re_dfa_t *const dfa = mctx->dfa;
2630   Idx subexp_num, sub_top_idx;
2631   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2632   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2633   Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2634   if (cache_idx != -1)
2635     {
2636       const struct re_backref_cache_entry *entry
2637         = mctx->bkref_ents + cache_idx;
2638       do
2639         if (entry->node == bkref_node)
2640           return REG_NOERROR; /* We already checked it.  */
2641       while (entry++->more);
2642     }
2643
2644   subexp_num = dfa->nodes[bkref_node].opr.idx;
2645
2646   /* For each sub expression  */
2647   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2648     {
2649       reg_errcode_t err;
2650       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2651       re_sub_match_last_t *sub_last;
2652       Idx sub_last_idx, sl_str, bkref_str_off;
2653
2654       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2655         continue; /* It isn't related.  */
2656
2657       sl_str = sub_top->str_idx;
2658       bkref_str_off = bkref_str_idx;
2659       /* At first, check the last node of sub expressions we already
2660          evaluated.  */
2661       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2662         {
2663           regoff_t sl_str_diff;
2664           sub_last = sub_top->lasts[sub_last_idx];
2665           sl_str_diff = sub_last->str_idx - sl_str;
2666           /* The matched string by the sub expression match with the substring
2667              at the back reference?  */
2668           if (sl_str_diff > 0)
2669             {
2670               if (__glibc_unlikely (bkref_str_off + sl_str_diff
2671                                     > mctx->input.valid_len))
2672                 {
2673                   /* Not enough chars for a successful match.  */
2674                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2675                     break;
2676
2677                   err = clean_state_log_if_needed (mctx,
2678                                                    bkref_str_off
2679                                                    + sl_str_diff);
2680                   if (__glibc_unlikely (err != REG_NOERROR))
2681                     return err;
2682                   buf = (const char *) re_string_get_buffer (&mctx->input);
2683                 }
2684               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2685                 /* We don't need to search this sub expression any more.  */
2686                 break;
2687             }
2688           bkref_str_off += sl_str_diff;
2689           sl_str += sl_str_diff;
2690           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2691                                 bkref_str_idx);
2692
2693           /* Reload buf, since the preceding call might have reallocated
2694              the buffer.  */
2695           buf = (const char *) re_string_get_buffer (&mctx->input);
2696
2697           if (err == REG_NOMATCH)
2698             continue;
2699           if (__glibc_unlikely (err != REG_NOERROR))
2700             return err;
2701         }
2702
2703       if (sub_last_idx < sub_top->nlasts)
2704         continue;
2705       if (sub_last_idx > 0)
2706         ++sl_str;
2707       /* Then, search for the other last nodes of the sub expression.  */
2708       for (; sl_str <= bkref_str_idx; ++sl_str)
2709         {
2710           Idx cls_node;
2711           regoff_t sl_str_off;
2712           const re_node_set *nodes;
2713           sl_str_off = sl_str - sub_top->str_idx;
2714           /* The matched string by the sub expression match with the substring
2715              at the back reference?  */
2716           if (sl_str_off > 0)
2717             {
2718               if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
2719                 {
2720                   /* If we are at the end of the input, we cannot match.  */
2721                   if (bkref_str_off >= mctx->input.len)
2722                     break;
2723
2724                   err = extend_buffers (mctx, bkref_str_off + 1);
2725                   if (__glibc_unlikely (err != REG_NOERROR))
2726                     return err;
2727
2728                   buf = (const char *) re_string_get_buffer (&mctx->input);
2729                 }
2730               if (buf [bkref_str_off++] != buf[sl_str - 1])
2731                 break; /* We don't need to search this sub expression
2732                           any more.  */
2733             }
2734           if (mctx->state_log[sl_str] == NULL)
2735             continue;
2736           /* Does this state have a ')' of the sub expression?  */
2737           nodes = &mctx->state_log[sl_str]->nodes;
2738           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2739                                        OP_CLOSE_SUBEXP);
2740           if (cls_node == -1)
2741             continue; /* No.  */
2742           if (sub_top->path == NULL)
2743             {
2744               sub_top->path = calloc (sizeof (state_array_t),
2745                                       sl_str - sub_top->str_idx + 1);
2746               if (sub_top->path == NULL)
2747                 return REG_ESPACE;
2748             }
2749           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2750              in the current context?  */
2751           err = check_arrival (mctx, sub_top->path, sub_top->node,
2752                                sub_top->str_idx, cls_node, sl_str,
2753                                OP_CLOSE_SUBEXP);
2754           if (err == REG_NOMATCH)
2755               continue;
2756           if (__glibc_unlikely (err != REG_NOERROR))
2757               return err;
2758           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2759           if (__glibc_unlikely (sub_last == NULL))
2760             return REG_ESPACE;
2761           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2762                                 bkref_str_idx);
2763           buf = (const char *) re_string_get_buffer (&mctx->input);
2764           if (err == REG_NOMATCH)
2765             continue;
2766           if (__glibc_unlikely (err != REG_NOERROR))
2767             return err;
2768         }
2769     }
2770   return REG_NOERROR;
2771 }
2772
2773 /* Helper functions for get_subexp().  */
2774
2775 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2776    If it can arrive, register the sub expression expressed with SUB_TOP
2777    and SUB_LAST.  */
2778
2779 static reg_errcode_t
2780 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2781                 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2782 {
2783   reg_errcode_t err;
2784   Idx to_idx;
2785   /* Can the subexpression arrive the back reference?  */
2786   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2787                        sub_last->str_idx, bkref_node, bkref_str,
2788                        OP_OPEN_SUBEXP);
2789   if (err != REG_NOERROR)
2790     return err;
2791   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2792                              sub_last->str_idx);
2793   if (__glibc_unlikely (err != REG_NOERROR))
2794     return err;
2795   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2796   return clean_state_log_if_needed (mctx, to_idx);
2797 }
2798
2799 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2800    Search '(' if FL_OPEN, or search ')' otherwise.
2801    TODO: This function isn't efficient...
2802          Because there might be more than one nodes whose types are
2803          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2804          nodes.
2805          E.g. RE: (a){2}  */
2806
2807 static Idx
2808 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2809                   Idx subexp_idx, int type)
2810 {
2811   Idx cls_idx;
2812   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2813     {
2814       Idx cls_node = nodes->elems[cls_idx];
2815       const re_token_t *node = dfa->nodes + cls_node;
2816       if (node->type == type
2817           && node->opr.idx == subexp_idx)
2818         return cls_node;
2819     }
2820   return -1;
2821 }
2822
2823 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2824    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2825    heavily reused.
2826    Return REG_NOERROR if it can arrive, REG_NOMATCH if it cannot,
2827    REG_ESPACE if memory is exhausted.  */
2828
2829 static reg_errcode_t
2830 __attribute_warn_unused_result__
2831 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2832                Idx top_str, Idx last_node, Idx last_str, int type)
2833 {
2834   const re_dfa_t *const dfa = mctx->dfa;
2835   reg_errcode_t err = REG_NOERROR;
2836   Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2837   re_dfastate_t *cur_state = NULL;
2838   re_node_set *cur_nodes, next_nodes;
2839   re_dfastate_t **backup_state_log;
2840   unsigned int context;
2841
2842   subexp_num = dfa->nodes[top_node].opr.idx;
2843   /* Extend the buffer if we need.  */
2844   if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
2845     {
2846       re_dfastate_t **new_array;
2847       Idx old_alloc = path->alloc;
2848       Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2849       Idx new_alloc;
2850       if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
2851         return REG_ESPACE;
2852       new_alloc = old_alloc + incr_alloc;
2853       if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
2854         return REG_ESPACE;
2855       new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2856       if (__glibc_unlikely (new_array == NULL))
2857         return REG_ESPACE;
2858       path->array = new_array;
2859       path->alloc = new_alloc;
2860       memset (new_array + old_alloc, '\0',
2861               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2862     }
2863
2864   str_idx = path->next_idx ? path->next_idx : top_str;
2865
2866   /* Temporary modify MCTX.  */
2867   backup_state_log = mctx->state_log;
2868   backup_cur_idx = mctx->input.cur_idx;
2869   mctx->state_log = path->array;
2870   mctx->input.cur_idx = str_idx;
2871
2872   /* Setup initial node set.  */
2873   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2874   if (str_idx == top_str)
2875     {
2876       err = re_node_set_init_1 (&next_nodes, top_node);
2877       if (__glibc_unlikely (err != REG_NOERROR))
2878         return err;
2879       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2880       if (__glibc_unlikely (err != REG_NOERROR))
2881         {
2882           re_node_set_free (&next_nodes);
2883           return err;
2884         }
2885     }
2886   else
2887     {
2888       cur_state = mctx->state_log[str_idx];
2889       if (cur_state && cur_state->has_backref)
2890         {
2891           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2892           if (__glibc_unlikely (err != REG_NOERROR))
2893             return err;
2894         }
2895       else
2896         re_node_set_init_empty (&next_nodes);
2897     }
2898   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2899     {
2900       if (next_nodes.nelem)
2901         {
2902           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2903                                     subexp_num, type);
2904           if (__glibc_unlikely (err != REG_NOERROR))
2905             {
2906               re_node_set_free (&next_nodes);
2907               return err;
2908             }
2909         }
2910       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2911       if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2912         {
2913           re_node_set_free (&next_nodes);
2914           return err;
2915         }
2916       mctx->state_log[str_idx] = cur_state;
2917     }
2918
2919   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2920     {
2921       re_node_set_empty (&next_nodes);
2922       if (mctx->state_log[str_idx + 1])
2923         {
2924           err = re_node_set_merge (&next_nodes,
2925                                    &mctx->state_log[str_idx + 1]->nodes);
2926           if (__glibc_unlikely (err != REG_NOERROR))
2927             {
2928               re_node_set_free (&next_nodes);
2929               return err;
2930             }
2931         }
2932       if (cur_state)
2933         {
2934           err = check_arrival_add_next_nodes (mctx, str_idx,
2935                                               &cur_state->non_eps_nodes,
2936                                               &next_nodes);
2937           if (__glibc_unlikely (err != REG_NOERROR))
2938             {
2939               re_node_set_free (&next_nodes);
2940               return err;
2941             }
2942         }
2943       ++str_idx;
2944       if (next_nodes.nelem)
2945         {
2946           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2947           if (__glibc_unlikely (err != REG_NOERROR))
2948             {
2949               re_node_set_free (&next_nodes);
2950               return err;
2951             }
2952           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2953                                     subexp_num, type);
2954           if (__glibc_unlikely (err != REG_NOERROR))
2955             {
2956               re_node_set_free (&next_nodes);
2957               return err;
2958             }
2959         }
2960       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2961       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2962       if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2963         {
2964           re_node_set_free (&next_nodes);
2965           return err;
2966         }
2967       mctx->state_log[str_idx] = cur_state;
2968       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2969     }
2970   re_node_set_free (&next_nodes);
2971   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2972                : &mctx->state_log[last_str]->nodes);
2973   path->next_idx = str_idx;
2974
2975   /* Fix MCTX.  */
2976   mctx->state_log = backup_state_log;
2977   mctx->input.cur_idx = backup_cur_idx;
2978
2979   /* Then check the current node set has the node LAST_NODE.  */
2980   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2981     return REG_NOERROR;
2982
2983   return REG_NOMATCH;
2984 }
2985
2986 /* Helper functions for check_arrival.  */
2987
2988 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2989    to NEXT_NODES.
2990    TODO: This function is similar to the functions transit_state*(),
2991          however this function has many additional works.
2992          Can't we unify them?  */
2993
2994 static reg_errcode_t
2995 __attribute_warn_unused_result__
2996 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
2997                               re_node_set *cur_nodes, re_node_set *next_nodes)
2998 {
2999   const re_dfa_t *const dfa = mctx->dfa;
3000   bool ok;
3001   Idx cur_idx;
3002 #ifdef RE_ENABLE_I18N
3003   reg_errcode_t err = REG_NOERROR;
3004 #endif
3005   re_node_set union_set;
3006   re_node_set_init_empty (&union_set);
3007   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3008     {
3009       int naccepted = 0;
3010       Idx cur_node = cur_nodes->elems[cur_idx];
3011       DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[cur_node].type));
3012
3013 #ifdef RE_ENABLE_I18N
3014       /* If the node may accept "multi byte".  */
3015       if (dfa->nodes[cur_node].accept_mb)
3016         {
3017           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3018                                                str_idx);
3019           if (naccepted > 1)
3020             {
3021               re_dfastate_t *dest_state;
3022               Idx next_node = dfa->nexts[cur_node];
3023               Idx next_idx = str_idx + naccepted;
3024               dest_state = mctx->state_log[next_idx];
3025               re_node_set_empty (&union_set);
3026               if (dest_state)
3027                 {
3028                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3029                   if (__glibc_unlikely (err != REG_NOERROR))
3030                     {
3031                       re_node_set_free (&union_set);
3032                       return err;
3033                     }
3034                 }
3035               ok = re_node_set_insert (&union_set, next_node);
3036               if (__glibc_unlikely (! ok))
3037                 {
3038                   re_node_set_free (&union_set);
3039                   return REG_ESPACE;
3040                 }
3041               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3042                                                             &union_set);
3043               if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
3044                                     && err != REG_NOERROR))
3045                 {
3046                   re_node_set_free (&union_set);
3047                   return err;
3048                 }
3049             }
3050         }
3051 #endif /* RE_ENABLE_I18N */
3052       if (naccepted
3053           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3054         {
3055           ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3056           if (__glibc_unlikely (! ok))
3057             {
3058               re_node_set_free (&union_set);
3059               return REG_ESPACE;
3060             }
3061         }
3062     }
3063   re_node_set_free (&union_set);
3064   return REG_NOERROR;
3065 }
3066
3067 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3068    CUR_NODES, however exclude the nodes which are:
3069     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3070     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3071 */
3072
3073 static reg_errcode_t
3074 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3075                           Idx ex_subexp, int type)
3076 {
3077   reg_errcode_t err;
3078   Idx idx, outside_node;
3079   re_node_set new_nodes;
3080   DEBUG_ASSERT (cur_nodes->nelem);
3081   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3082   if (__glibc_unlikely (err != REG_NOERROR))
3083     return err;
3084   /* Create a new node set NEW_NODES with the nodes which are epsilon
3085      closures of the node in CUR_NODES.  */
3086
3087   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3088     {
3089       Idx cur_node = cur_nodes->elems[idx];
3090       const re_node_set *eclosure = dfa->eclosures + cur_node;
3091       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3092       if (outside_node == -1)
3093         {
3094           /* There are no problematic nodes, just merge them.  */
3095           err = re_node_set_merge (&new_nodes, eclosure);
3096           if (__glibc_unlikely (err != REG_NOERROR))
3097             {
3098               re_node_set_free (&new_nodes);
3099               return err;
3100             }
3101         }
3102       else
3103         {
3104           /* There are problematic nodes, re-calculate incrementally.  */
3105           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3106                                               ex_subexp, type);
3107           if (__glibc_unlikely (err != REG_NOERROR))
3108             {
3109               re_node_set_free (&new_nodes);
3110               return err;
3111             }
3112         }
3113     }
3114   re_node_set_free (cur_nodes);
3115   *cur_nodes = new_nodes;
3116   return REG_NOERROR;
3117 }
3118
3119 /* Helper function for check_arrival_expand_ecl.
3120    Check incrementally the epsilon closure of TARGET, and if it isn't
3121    problematic append it to DST_NODES.  */
3122
3123 static reg_errcode_t
3124 __attribute_warn_unused_result__
3125 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3126                               Idx target, Idx ex_subexp, int type)
3127 {
3128   Idx cur_node;
3129   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3130     {
3131       bool ok;
3132
3133       if (dfa->nodes[cur_node].type == type
3134           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3135         {
3136           if (type == OP_CLOSE_SUBEXP)
3137             {
3138               ok = re_node_set_insert (dst_nodes, cur_node);
3139               if (__glibc_unlikely (! ok))
3140                 return REG_ESPACE;
3141             }
3142           break;
3143         }
3144       ok = re_node_set_insert (dst_nodes, cur_node);
3145       if (__glibc_unlikely (! ok))
3146         return REG_ESPACE;
3147       if (dfa->edests[cur_node].nelem == 0)
3148         break;
3149       if (dfa->edests[cur_node].nelem == 2)
3150         {
3151           reg_errcode_t err;
3152           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3153                                               dfa->edests[cur_node].elems[1],
3154                                               ex_subexp, type);
3155           if (__glibc_unlikely (err != REG_NOERROR))
3156             return err;
3157         }
3158       cur_node = dfa->edests[cur_node].elems[0];
3159     }
3160   return REG_NOERROR;
3161 }
3162
3163
3164 /* For all the back references in the current state, calculate the
3165    destination of the back references by the appropriate entry
3166    in MCTX->BKREF_ENTS.  */
3167
3168 static reg_errcode_t
3169 __attribute_warn_unused_result__
3170 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3171                     Idx cur_str, Idx subexp_num, int type)
3172 {
3173   const re_dfa_t *const dfa = mctx->dfa;
3174   reg_errcode_t err;
3175   Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3176   struct re_backref_cache_entry *ent;
3177
3178   if (cache_idx_start == -1)
3179     return REG_NOERROR;
3180
3181  restart:
3182   ent = mctx->bkref_ents + cache_idx_start;
3183   do
3184     {
3185       Idx to_idx, next_node;
3186
3187       /* Is this entry ENT is appropriate?  */
3188       if (!re_node_set_contains (cur_nodes, ent->node))
3189         continue; /* No.  */
3190
3191       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3192       /* Calculate the destination of the back reference, and append it
3193          to MCTX->STATE_LOG.  */
3194       if (to_idx == cur_str)
3195         {
3196           /* The backreference did epsilon transit, we must re-check all the
3197              node in the current state.  */
3198           re_node_set new_dests;
3199           reg_errcode_t err2, err3;
3200           next_node = dfa->edests[ent->node].elems[0];
3201           if (re_node_set_contains (cur_nodes, next_node))
3202             continue;
3203           err = re_node_set_init_1 (&new_dests, next_node);
3204           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3205           err3 = re_node_set_merge (cur_nodes, &new_dests);
3206           re_node_set_free (&new_dests);
3207           if (__glibc_unlikely (err != REG_NOERROR || err2 != REG_NOERROR
3208                                 || err3 != REG_NOERROR))
3209             {
3210               err = (err != REG_NOERROR ? err
3211                      : (err2 != REG_NOERROR ? err2 : err3));
3212               return err;
3213             }
3214           /* TODO: It is still inefficient...  */
3215           goto restart;
3216         }
3217       else
3218         {
3219           re_node_set union_set;
3220           next_node = dfa->nexts[ent->node];
3221           if (mctx->state_log[to_idx])
3222             {
3223               bool ok;
3224               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3225                                         next_node))
3226                 continue;
3227               err = re_node_set_init_copy (&union_set,
3228                                            &mctx->state_log[to_idx]->nodes);
3229               ok = re_node_set_insert (&union_set, next_node);
3230               if (__glibc_unlikely (err != REG_NOERROR || ! ok))
3231                 {
3232                   re_node_set_free (&union_set);
3233                   err = err != REG_NOERROR ? err : REG_ESPACE;
3234                   return err;
3235                 }
3236             }
3237           else
3238             {
3239               err = re_node_set_init_1 (&union_set, next_node);
3240               if (__glibc_unlikely (err != REG_NOERROR))
3241                 return err;
3242             }
3243           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3244           re_node_set_free (&union_set);
3245           if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
3246                                 && err != REG_NOERROR))
3247             return err;
3248         }
3249     }
3250   while (ent++->more);
3251   return REG_NOERROR;
3252 }
3253
3254 /* Build transition table for the state.
3255    Return true if successful.  */
3256
3257 static bool __attribute_noinline__
3258 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3259 {
3260   reg_errcode_t err;
3261   Idx i, j;
3262   int ch;
3263   bool need_word_trtable = false;
3264   bitset_word_t elem, mask;
3265   Idx ndests; /* Number of the destination states from 'state'.  */
3266   re_dfastate_t **trtable;
3267   re_dfastate_t *dest_states[SBC_MAX];
3268   re_dfastate_t *dest_states_word[SBC_MAX];
3269   re_dfastate_t *dest_states_nl[SBC_MAX];
3270   re_node_set follows;
3271   bitset_t acceptable;
3272
3273   /* We build DFA states which corresponds to the destination nodes
3274      from 'state'.  'dests_node[i]' represents the nodes which i-th
3275      destination state contains, and 'dests_ch[i]' represents the
3276      characters which i-th destination state accepts.  */
3277   re_node_set dests_node[SBC_MAX];
3278   bitset_t dests_ch[SBC_MAX];
3279
3280   /* Initialize transition table.  */
3281   state->word_trtable = state->trtable = NULL;
3282
3283   /* At first, group all nodes belonging to 'state' into several
3284      destinations.  */
3285   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3286   if (__glibc_unlikely (ndests <= 0))
3287     {
3288       /* Return false in case of an error, true otherwise.  */
3289       if (ndests == 0)
3290         {
3291           state->trtable = (re_dfastate_t **)
3292             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3293           if (__glibc_unlikely (state->trtable == NULL))
3294             return false;
3295           return true;
3296         }
3297       return false;
3298     }
3299
3300   err = re_node_set_alloc (&follows, ndests + 1);
3301   if (__glibc_unlikely (err != REG_NOERROR))
3302     {
3303     out_free:
3304       re_node_set_free (&follows);
3305       for (i = 0; i < ndests; ++i)
3306         re_node_set_free (dests_node + i);
3307       return false;
3308     }
3309
3310   bitset_empty (acceptable);
3311
3312   /* Then build the states for all destinations.  */
3313   for (i = 0; i < ndests; ++i)
3314     {
3315       Idx next_node;
3316       re_node_set_empty (&follows);
3317       /* Merge the follows of this destination states.  */
3318       for (j = 0; j < dests_node[i].nelem; ++j)
3319         {
3320           next_node = dfa->nexts[dests_node[i].elems[j]];
3321           if (next_node != -1)
3322             {
3323               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3324               if (__glibc_unlikely (err != REG_NOERROR))
3325                 goto out_free;
3326             }
3327         }
3328       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3329       if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
3330         goto out_free;
3331       /* If the new state has context constraint,
3332          build appropriate states for these contexts.  */
3333       if (dest_states[i]->has_constraint)
3334         {
3335           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3336                                                           CONTEXT_WORD);
3337           if (__glibc_unlikely (dest_states_word[i] == NULL
3338                                 && err != REG_NOERROR))
3339             goto out_free;
3340
3341           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3342             need_word_trtable = true;
3343
3344           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3345                                                         CONTEXT_NEWLINE);
3346           if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
3347             goto out_free;
3348         }
3349       else
3350         {
3351           dest_states_word[i] = dest_states[i];
3352           dest_states_nl[i] = dest_states[i];
3353         }
3354       bitset_merge (acceptable, dests_ch[i]);
3355     }
3356
3357   if (!__glibc_unlikely (need_word_trtable))
3358     {
3359       /* We don't care about whether the following character is a word
3360          character, or we are in a single-byte character set so we can
3361          discern by looking at the character code: allocate a
3362          256-entry transition table.  */
3363       trtable = state->trtable =
3364         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3365       if (__glibc_unlikely (trtable == NULL))
3366         goto out_free;
3367
3368       /* For all characters ch...:  */
3369       for (i = 0; i < BITSET_WORDS; ++i)
3370         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3371              elem;
3372              mask <<= 1, elem >>= 1, ++ch)
3373           if (__glibc_unlikely (elem & 1))
3374             {
3375               /* There must be exactly one destination which accepts
3376                  character ch.  See group_nodes_into_DFAstates.  */
3377               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3378                 ;
3379
3380               /* j-th destination accepts the word character ch.  */
3381               if (dfa->word_char[i] & mask)
3382                 trtable[ch] = dest_states_word[j];
3383               else
3384                 trtable[ch] = dest_states[j];
3385             }
3386     }
3387   else
3388     {
3389       /* We care about whether the following character is a word
3390          character, and we are in a multi-byte character set: discern
3391          by looking at the character code: build two 256-entry
3392          transition tables, one starting at trtable[0] and one
3393          starting at trtable[SBC_MAX].  */
3394       trtable = state->word_trtable =
3395         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3396       if (__glibc_unlikely (trtable == NULL))
3397         goto out_free;
3398
3399       /* For all characters ch...:  */
3400       for (i = 0; i < BITSET_WORDS; ++i)
3401         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3402              elem;
3403              mask <<= 1, elem >>= 1, ++ch)
3404           if (__glibc_unlikely (elem & 1))
3405             {
3406               /* There must be exactly one destination which accepts
3407                  character ch.  See group_nodes_into_DFAstates.  */
3408               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3409                 ;
3410
3411               /* j-th destination accepts the word character ch.  */
3412               trtable[ch] = dest_states[j];
3413               trtable[ch + SBC_MAX] = dest_states_word[j];
3414             }
3415     }
3416
3417   /* new line */
3418   if (bitset_contain (acceptable, NEWLINE_CHAR))
3419     {
3420       /* The current state accepts newline character.  */
3421       for (j = 0; j < ndests; ++j)
3422         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3423           {
3424             /* k-th destination accepts newline character.  */
3425             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3426             if (need_word_trtable)
3427               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3428             /* There must be only one destination which accepts
3429                newline.  See group_nodes_into_DFAstates.  */
3430             break;
3431           }
3432     }
3433
3434   re_node_set_free (&follows);
3435   for (i = 0; i < ndests; ++i)
3436     re_node_set_free (dests_node + i);
3437   return true;
3438 }
3439
3440 /* Group all nodes belonging to STATE into several destinations.
3441    Then for all destinations, set the nodes belonging to the destination
3442    to DESTS_NODE[i] and set the characters accepted by the destination
3443    to DEST_CH[i].  Return the number of destinations if successful,
3444    -1 on internal error.  */
3445
3446 static Idx
3447 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3448                             re_node_set *dests_node, bitset_t *dests_ch)
3449 {
3450   reg_errcode_t err;
3451   bool ok;
3452   Idx i, j, k;
3453   Idx ndests; /* Number of the destinations from 'state'.  */
3454   bitset_t accepts; /* Characters a node can accept.  */
3455   const re_node_set *cur_nodes = &state->nodes;
3456   bitset_empty (accepts);
3457   ndests = 0;
3458
3459   /* For all the nodes belonging to 'state',  */
3460   for (i = 0; i < cur_nodes->nelem; ++i)
3461     {
3462       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3463       re_token_type_t type = node->type;
3464       unsigned int constraint = node->constraint;
3465
3466       /* Enumerate all single byte character this node can accept.  */
3467       if (type == CHARACTER)
3468         bitset_set (accepts, node->opr.c);
3469       else if (type == SIMPLE_BRACKET)
3470         {
3471           bitset_merge (accepts, node->opr.sbcset);
3472         }
3473       else if (type == OP_PERIOD)
3474         {
3475 #ifdef RE_ENABLE_I18N
3476           if (dfa->mb_cur_max > 1)
3477             bitset_merge (accepts, dfa->sb_char);
3478           else
3479 #endif
3480             bitset_set_all (accepts);
3481           if (!(dfa->syntax & RE_DOT_NEWLINE))
3482             bitset_clear (accepts, '\n');
3483           if (dfa->syntax & RE_DOT_NOT_NULL)
3484             bitset_clear (accepts, '\0');
3485         }
3486 #ifdef RE_ENABLE_I18N
3487       else if (type == OP_UTF8_PERIOD)
3488         {
3489           if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3490             memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3491           else
3492             bitset_merge (accepts, utf8_sb_map);
3493           if (!(dfa->syntax & RE_DOT_NEWLINE))
3494             bitset_clear (accepts, '\n');
3495           if (dfa->syntax & RE_DOT_NOT_NULL)
3496             bitset_clear (accepts, '\0');
3497         }
3498 #endif
3499       else
3500         continue;
3501
3502       /* Check the 'accepts' and sift the characters which are not
3503          match it the context.  */
3504       if (constraint)
3505         {
3506           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3507             {
3508               bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3509               bitset_empty (accepts);
3510               if (accepts_newline)
3511                 bitset_set (accepts, NEWLINE_CHAR);
3512               else
3513                 continue;
3514             }
3515           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3516             {
3517               bitset_empty (accepts);
3518               continue;
3519             }
3520
3521           if (constraint & NEXT_WORD_CONSTRAINT)
3522             {
3523               bitset_word_t any_set = 0;
3524               if (type == CHARACTER && !node->word_char)
3525                 {
3526                   bitset_empty (accepts);
3527                   continue;
3528                 }
3529 #ifdef RE_ENABLE_I18N
3530               if (dfa->mb_cur_max > 1)
3531                 for (j = 0; j < BITSET_WORDS; ++j)
3532                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3533               else
3534 #endif
3535                 for (j = 0; j < BITSET_WORDS; ++j)
3536                   any_set |= (accepts[j] &= dfa->word_char[j]);
3537               if (!any_set)
3538                 continue;
3539             }
3540           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3541             {
3542               bitset_word_t any_set = 0;
3543               if (type == CHARACTER && node->word_char)
3544                 {
3545                   bitset_empty (accepts);
3546                   continue;
3547                 }
3548 #ifdef RE_ENABLE_I18N
3549               if (dfa->mb_cur_max > 1)
3550                 for (j = 0; j < BITSET_WORDS; ++j)
3551                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3552               else
3553 #endif
3554                 for (j = 0; j < BITSET_WORDS; ++j)
3555                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3556               if (!any_set)
3557                 continue;
3558             }
3559         }
3560
3561       /* Then divide 'accepts' into DFA states, or create a new
3562          state.  Above, we make sure that accepts is not empty.  */
3563       for (j = 0; j < ndests; ++j)
3564         {
3565           bitset_t intersec; /* Intersection sets, see below.  */
3566           bitset_t remains;
3567           /* Flags, see below.  */
3568           bitset_word_t has_intersec, not_subset, not_consumed;
3569
3570           /* Optimization, skip if this state doesn't accept the character.  */
3571           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3572             continue;
3573
3574           /* Enumerate the intersection set of this state and 'accepts'.  */
3575           has_intersec = 0;
3576           for (k = 0; k < BITSET_WORDS; ++k)
3577             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3578           /* And skip if the intersection set is empty.  */
3579           if (!has_intersec)
3580             continue;
3581
3582           /* Then check if this state is a subset of 'accepts'.  */
3583           not_subset = not_consumed = 0;
3584           for (k = 0; k < BITSET_WORDS; ++k)
3585             {
3586               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3587               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3588             }
3589
3590           /* If this state isn't a subset of 'accepts', create a
3591              new group state, which has the 'remains'. */
3592           if (not_subset)
3593             {
3594               bitset_copy (dests_ch[ndests], remains);
3595               bitset_copy (dests_ch[j], intersec);
3596               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3597               if (__glibc_unlikely (err != REG_NOERROR))
3598                 goto error_return;
3599               ++ndests;
3600             }
3601
3602           /* Put the position in the current group. */
3603           ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3604           if (__glibc_unlikely (! ok))
3605             goto error_return;
3606
3607           /* If all characters are consumed, go to next node. */
3608           if (!not_consumed)
3609             break;
3610         }
3611       /* Some characters remain, create a new group. */
3612       if (j == ndests)
3613         {
3614           bitset_copy (dests_ch[ndests], accepts);
3615           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3616           if (__glibc_unlikely (err != REG_NOERROR))
3617             goto error_return;
3618           ++ndests;
3619           bitset_empty (accepts);
3620         }
3621     }
3622   assume (ndests <= SBC_MAX);
3623   return ndests;
3624  error_return:
3625   for (j = 0; j < ndests; ++j)
3626     re_node_set_free (dests_node + j);
3627   return -1;
3628 }
3629
3630 #ifdef RE_ENABLE_I18N
3631 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3632    Return the number of the bytes the node accepts.
3633    STR_IDX is the current index of the input string.
3634
3635    This function handles the nodes which can accept one character, or
3636    one collating element like '.', '[a-z]', opposite to the other nodes
3637    can only accept one byte.  */
3638
3639 # ifdef _LIBC
3640 #  include <locale/weight.h>
3641 # endif
3642
3643 static int
3644 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3645                          const re_string_t *input, Idx str_idx)
3646 {
3647   const re_token_t *node = dfa->nodes + node_idx;
3648   int char_len, elem_len;
3649   Idx i;
3650
3651   if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
3652     {
3653       unsigned char c = re_string_byte_at (input, str_idx), d;
3654       if (__glibc_likely (c < 0xc2))
3655         return 0;
3656
3657       if (str_idx + 2 > input->len)
3658         return 0;
3659
3660       d = re_string_byte_at (input, str_idx + 1);
3661       if (c < 0xe0)
3662         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3663       else if (c < 0xf0)
3664         {
3665           char_len = 3;
3666           if (c == 0xe0 && d < 0xa0)
3667             return 0;
3668         }
3669       else if (c < 0xf8)
3670         {
3671           char_len = 4;
3672           if (c == 0xf0 && d < 0x90)
3673             return 0;
3674         }
3675       else if (c < 0xfc)
3676         {
3677           char_len = 5;
3678           if (c == 0xf8 && d < 0x88)
3679             return 0;
3680         }
3681       else if (c < 0xfe)
3682         {
3683           char_len = 6;
3684           if (c == 0xfc && d < 0x84)
3685             return 0;
3686         }
3687       else
3688         return 0;
3689
3690       if (str_idx + char_len > input->len)
3691         return 0;
3692
3693       for (i = 1; i < char_len; ++i)
3694         {
3695           d = re_string_byte_at (input, str_idx + i);
3696           if (d < 0x80 || d > 0xbf)
3697             return 0;
3698         }
3699       return char_len;
3700     }
3701
3702   char_len = re_string_char_size_at (input, str_idx);
3703   if (node->type == OP_PERIOD)
3704     {
3705       if (char_len <= 1)
3706         return 0;
3707       /* FIXME: I don't think this if is needed, as both '\n'
3708          and '\0' are char_len == 1.  */
3709       /* '.' accepts any one character except the following two cases.  */
3710       if ((!(dfa->syntax & RE_DOT_NEWLINE)
3711            && re_string_byte_at (input, str_idx) == '\n')
3712           || ((dfa->syntax & RE_DOT_NOT_NULL)
3713               && re_string_byte_at (input, str_idx) == '\0'))
3714         return 0;
3715       return char_len;
3716     }
3717
3718   elem_len = re_string_elem_size_at (input, str_idx);
3719   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3720     return 0;
3721
3722   if (node->type == COMPLEX_BRACKET)
3723     {
3724       const re_charset_t *cset = node->opr.mbcset;
3725 # ifdef _LIBC
3726       const unsigned char *pin
3727         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3728       Idx j;
3729       uint32_t nrules;
3730 # endif /* _LIBC */
3731       int match_len = 0;
3732       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3733                     ? re_string_wchar_at (input, str_idx) : 0);
3734
3735       /* match with multibyte character?  */
3736       for (i = 0; i < cset->nmbchars; ++i)
3737         if (wc == cset->mbchars[i])
3738           {
3739             match_len = char_len;
3740             goto check_node_accept_bytes_match;
3741           }
3742       /* match with character_class?  */
3743       for (i = 0; i < cset->nchar_classes; ++i)
3744         {
3745           wctype_t wt = cset->char_classes[i];
3746           if (__iswctype (wc, wt))
3747             {
3748               match_len = char_len;
3749               goto check_node_accept_bytes_match;
3750             }
3751         }
3752
3753 # ifdef _LIBC
3754       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3755       if (nrules != 0)
3756         {
3757           unsigned int in_collseq = 0;
3758           const int32_t *table, *indirect;
3759           const unsigned char *weights, *extra;
3760           const char *collseqwc;
3761
3762           /* match with collating_symbol?  */
3763           if (cset->ncoll_syms)
3764             extra = (const unsigned char *)
3765               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3766           for (i = 0; i < cset->ncoll_syms; ++i)
3767             {
3768               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3769               /* Compare the length of input collating element and
3770                  the length of current collating element.  */
3771               if (*coll_sym != elem_len)
3772                 continue;
3773               /* Compare each bytes.  */
3774               for (j = 0; j < *coll_sym; j++)
3775                 if (pin[j] != coll_sym[1 + j])
3776                   break;
3777               if (j == *coll_sym)
3778                 {
3779                   /* Match if every bytes is equal.  */
3780                   match_len = j;
3781                   goto check_node_accept_bytes_match;
3782                 }
3783             }
3784
3785           if (cset->nranges)
3786             {
3787               if (elem_len <= char_len)
3788                 {
3789                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3790                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3791                 }
3792               else
3793                 in_collseq = find_collation_sequence_value (pin, elem_len);
3794             }
3795           /* match with range expression?  */
3796           /* FIXME: Implement rational ranges here, too.  */
3797           for (i = 0; i < cset->nranges; ++i)
3798             if (cset->range_starts[i] <= in_collseq
3799                 && in_collseq <= cset->range_ends[i])
3800               {
3801                 match_len = elem_len;
3802                 goto check_node_accept_bytes_match;
3803               }
3804
3805           /* match with equivalence_class?  */
3806           if (cset->nequiv_classes)
3807             {
3808               const unsigned char *cp = pin;
3809               table = (const int32_t *)
3810                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3811               weights = (const unsigned char *)
3812                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3813               extra = (const unsigned char *)
3814                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3815               indirect = (const int32_t *)
3816                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3817               int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3818               int32_t rule = idx >> 24;
3819               idx &= 0xffffff;
3820               if (idx > 0)
3821                 {
3822                   size_t weight_len = weights[idx];
3823                   for (i = 0; i < cset->nequiv_classes; ++i)
3824                     {
3825                       int32_t equiv_class_idx = cset->equiv_classes[i];
3826                       int32_t equiv_class_rule = equiv_class_idx >> 24;
3827                       equiv_class_idx &= 0xffffff;
3828                       if (weights[equiv_class_idx] == weight_len
3829                           && equiv_class_rule == rule
3830                           && memcmp (weights + idx + 1,
3831                                      weights + equiv_class_idx + 1,
3832                                      weight_len) == 0)
3833                         {
3834                           match_len = elem_len;
3835                           goto check_node_accept_bytes_match;
3836                         }
3837                     }
3838                 }
3839             }
3840         }
3841       else
3842 # endif /* _LIBC */
3843         {
3844           /* match with range expression?  */
3845           for (i = 0; i < cset->nranges; ++i)
3846             {
3847               if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3848                 {
3849                   match_len = char_len;
3850                   goto check_node_accept_bytes_match;
3851                 }
3852             }
3853         }
3854     check_node_accept_bytes_match:
3855       if (!cset->non_match)
3856         return match_len;
3857       else
3858         {
3859           if (match_len > 0)
3860             return 0;
3861           else
3862             return (elem_len > char_len) ? elem_len : char_len;
3863         }
3864     }
3865   return 0;
3866 }
3867
3868 # ifdef _LIBC
3869 static unsigned int
3870 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3871 {
3872   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3873   if (nrules == 0)
3874     {
3875       if (mbs_len == 1)
3876         {
3877           /* No valid character.  Match it as a single byte character.  */
3878           const unsigned char *collseq = (const unsigned char *)
3879             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3880           return collseq[mbs[0]];
3881         }
3882       return UINT_MAX;
3883     }
3884   else
3885     {
3886       int32_t idx;
3887       const unsigned char *extra = (const unsigned char *)
3888         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3889       int32_t extrasize = (const unsigned char *)
3890         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3891
3892       for (idx = 0; idx < extrasize;)
3893         {
3894           int mbs_cnt;
3895           bool found = false;
3896           int32_t elem_mbs_len;
3897           /* Skip the name of collating element name.  */
3898           idx = idx + extra[idx] + 1;
3899           elem_mbs_len = extra[idx++];
3900           if (mbs_len == elem_mbs_len)
3901             {
3902               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3903                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3904                   break;
3905               if (mbs_cnt == elem_mbs_len)
3906                 /* Found the entry.  */
3907                 found = true;
3908             }
3909           /* Skip the byte sequence of the collating element.  */
3910           idx += elem_mbs_len;
3911           /* Adjust for the alignment.  */
3912           idx = (idx + 3) & ~3;
3913           /* Skip the collation sequence value.  */
3914           idx += sizeof (uint32_t);
3915           /* Skip the wide char sequence of the collating element.  */
3916           idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3917           /* If we found the entry, return the sequence value.  */
3918           if (found)
3919             return *(uint32_t *) (extra + idx);
3920           /* Skip the collation sequence value.  */
3921           idx += sizeof (uint32_t);
3922         }
3923       return UINT_MAX;
3924     }
3925 }
3926 # endif /* _LIBC */
3927 #endif /* RE_ENABLE_I18N */
3928
3929 /* Check whether the node accepts the byte which is IDX-th
3930    byte of the INPUT.  */
3931
3932 static bool
3933 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3934                    Idx idx)
3935 {
3936   unsigned char ch;
3937   ch = re_string_byte_at (&mctx->input, idx);
3938   switch (node->type)
3939     {
3940     case CHARACTER:
3941       if (node->opr.c != ch)
3942         return false;
3943       break;
3944
3945     case SIMPLE_BRACKET:
3946       if (!bitset_contain (node->opr.sbcset, ch))
3947         return false;
3948       break;
3949
3950 #ifdef RE_ENABLE_I18N
3951     case OP_UTF8_PERIOD:
3952       if (ch >= ASCII_CHARS)
3953         return false;
3954       FALLTHROUGH;
3955 #endif
3956     case OP_PERIOD:
3957       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3958           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3959         return false;
3960       break;
3961
3962     default:
3963       return false;
3964     }
3965
3966   if (node->constraint)
3967     {
3968       /* The node has constraints.  Check whether the current context
3969          satisfies the constraints.  */
3970       unsigned int context = re_string_context_at (&mctx->input, idx,
3971                                                    mctx->eflags);
3972       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3973         return false;
3974     }
3975
3976   return true;
3977 }
3978
3979 /* Extend the buffers, if the buffers have run out.  */
3980
3981 static reg_errcode_t
3982 __attribute_warn_unused_result__
3983 extend_buffers (re_match_context_t *mctx, int min_len)
3984 {
3985   reg_errcode_t ret;
3986   re_string_t *pstr = &mctx->input;
3987
3988   /* Avoid overflow.  */
3989   if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
3990                         <= pstr->bufs_len))
3991     return REG_ESPACE;
3992
3993   /* Double the lengths of the buffers, but allocate at least MIN_LEN.  */
3994   ret = re_string_realloc_buffers (pstr,
3995                                    MAX (min_len,
3996                                         MIN (pstr->len, pstr->bufs_len * 2)));
3997   if (__glibc_unlikely (ret != REG_NOERROR))
3998     return ret;
3999
4000   if (mctx->state_log != NULL)
4001     {
4002       /* And double the length of state_log.  */
4003       /* XXX We have no indication of the size of this buffer.  If this
4004          allocation fail we have no indication that the state_log array
4005          does not have the right size.  */
4006       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4007                                               pstr->bufs_len + 1);
4008       if (__glibc_unlikely (new_array == NULL))
4009         return REG_ESPACE;
4010       mctx->state_log = new_array;
4011     }
4012
4013   /* Then reconstruct the buffers.  */
4014   if (pstr->icase)
4015     {
4016 #ifdef RE_ENABLE_I18N
4017       if (pstr->mb_cur_max > 1)
4018         {
4019           ret = build_wcs_upper_buffer (pstr);
4020           if (__glibc_unlikely (ret != REG_NOERROR))
4021             return ret;
4022         }
4023       else
4024 #endif /* RE_ENABLE_I18N  */
4025         build_upper_buffer (pstr);
4026     }
4027   else
4028     {
4029 #ifdef RE_ENABLE_I18N
4030       if (pstr->mb_cur_max > 1)
4031         build_wcs_buffer (pstr);
4032       else
4033 #endif /* RE_ENABLE_I18N  */
4034         {
4035           if (pstr->trans != NULL)
4036             re_string_translate_buffer (pstr);
4037         }
4038     }
4039   return REG_NOERROR;
4040 }
4041
4042 \f
4043 /* Functions for matching context.  */
4044
4045 /* Initialize MCTX.  */
4046
4047 static reg_errcode_t
4048 __attribute_warn_unused_result__
4049 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4050 {
4051   mctx->eflags = eflags;
4052   mctx->match_last = -1;
4053   if (n > 0)
4054     {
4055       /* Avoid overflow.  */
4056       size_t max_object_size =
4057         MAX (sizeof (struct re_backref_cache_entry),
4058              sizeof (re_sub_match_top_t *));
4059       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
4060         return REG_ESPACE;
4061
4062       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4063       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4064       if (__glibc_unlikely (mctx->bkref_ents == NULL || mctx->sub_tops == NULL))
4065         return REG_ESPACE;
4066     }
4067   /* Already zero-ed by the caller.
4068      else
4069        mctx->bkref_ents = NULL;
4070      mctx->nbkref_ents = 0;
4071      mctx->nsub_tops = 0;  */
4072   mctx->abkref_ents = n;
4073   mctx->max_mb_elem_len = 1;
4074   mctx->asub_tops = n;
4075   return REG_NOERROR;
4076 }
4077
4078 /* Clean the entries which depend on the current input in MCTX.
4079    This function must be invoked when the matcher changes the start index
4080    of the input, or changes the input string.  */
4081
4082 static void
4083 match_ctx_clean (re_match_context_t *mctx)
4084 {
4085   Idx st_idx;
4086   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4087     {
4088       Idx sl_idx;
4089       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4090       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4091         {
4092           re_sub_match_last_t *last = top->lasts[sl_idx];
4093           re_free (last->path.array);
4094           re_free (last);
4095         }
4096       re_free (top->lasts);
4097       if (top->path)
4098         {
4099           re_free (top->path->array);
4100           re_free (top->path);
4101         }
4102       re_free (top);
4103     }
4104
4105   mctx->nsub_tops = 0;
4106   mctx->nbkref_ents = 0;
4107 }
4108
4109 /* Free all the memory associated with MCTX.  */
4110
4111 static void
4112 match_ctx_free (re_match_context_t *mctx)
4113 {
4114   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4115   match_ctx_clean (mctx);
4116   re_free (mctx->sub_tops);
4117   re_free (mctx->bkref_ents);
4118 }
4119
4120 /* Add a new backreference entry to MCTX.
4121    Note that we assume that caller never call this function with duplicate
4122    entry, and call with STR_IDX which isn't smaller than any existing entry.
4123 */
4124
4125 static reg_errcode_t
4126 __attribute_warn_unused_result__
4127 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4128                      Idx to)
4129 {
4130   if (mctx->nbkref_ents >= mctx->abkref_ents)
4131     {
4132       struct re_backref_cache_entry* new_entry;
4133       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4134                               mctx->abkref_ents * 2);
4135       if (__glibc_unlikely (new_entry == NULL))
4136         {
4137           re_free (mctx->bkref_ents);
4138           return REG_ESPACE;
4139         }
4140       mctx->bkref_ents = new_entry;
4141       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4142               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4143       mctx->abkref_ents *= 2;
4144     }
4145   if (mctx->nbkref_ents > 0
4146       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4147     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4148
4149   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4150   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4151   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4152   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4153
4154   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4155      If bit N is clear, means that this entry won't epsilon-transition to
4156      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4157      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4158      such node.
4159
4160      A backreference does not epsilon-transition unless it is empty, so set
4161      to all zeros if FROM != TO.  */
4162   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4163     = (from == to ? -1 : 0);
4164
4165   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4166   if (mctx->max_mb_elem_len < to - from)
4167     mctx->max_mb_elem_len = to - from;
4168   return REG_NOERROR;
4169 }
4170
4171 /* Return the first entry with the same str_idx, or -1 if none is
4172    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4173
4174 static Idx
4175 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4176 {
4177   Idx left, right, mid, last;
4178   last = right = mctx->nbkref_ents;
4179   for (left = 0; left < right;)
4180     {
4181       mid = (left + right) / 2;
4182       if (mctx->bkref_ents[mid].str_idx < str_idx)
4183         left = mid + 1;
4184       else
4185         right = mid;
4186     }
4187   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4188     return left;
4189   else
4190     return -1;
4191 }
4192
4193 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4194    at STR_IDX.  */
4195
4196 static reg_errcode_t
4197 __attribute_warn_unused_result__
4198 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4199 {
4200   DEBUG_ASSERT (mctx->sub_tops != NULL);
4201   DEBUG_ASSERT (mctx->asub_tops > 0);
4202   if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
4203     {
4204       Idx new_asub_tops = mctx->asub_tops * 2;
4205       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4206                                                    re_sub_match_top_t *,
4207                                                    new_asub_tops);
4208       if (__glibc_unlikely (new_array == NULL))
4209         return REG_ESPACE;
4210       mctx->sub_tops = new_array;
4211       mctx->asub_tops = new_asub_tops;
4212     }
4213   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4214   if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
4215     return REG_ESPACE;
4216   mctx->sub_tops[mctx->nsub_tops]->node = node;
4217   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4218   return REG_NOERROR;
4219 }
4220
4221 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4222    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.
4223    Return the new entry if successful, NULL if memory is exhausted.  */
4224
4225 static re_sub_match_last_t *
4226 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4227 {
4228   re_sub_match_last_t *new_entry;
4229   if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
4230     {
4231       Idx new_alasts = 2 * subtop->alasts + 1;
4232       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4233                                                     re_sub_match_last_t *,
4234                                                     new_alasts);
4235       if (__glibc_unlikely (new_array == NULL))
4236         return NULL;
4237       subtop->lasts = new_array;
4238       subtop->alasts = new_alasts;
4239     }
4240   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4241   if (__glibc_likely (new_entry != NULL))
4242     {
4243       subtop->lasts[subtop->nlasts] = new_entry;
4244       new_entry->node = node;
4245       new_entry->str_idx = str_idx;
4246       ++subtop->nlasts;
4247     }
4248   return new_entry;
4249 }
4250
4251 static void
4252 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4253                re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4254 {
4255   sctx->sifted_states = sifted_sts;
4256   sctx->limited_states = limited_sts;
4257   sctx->last_node = last_node;
4258   sctx->last_str_idx = last_str_idx;
4259   re_node_set_init_empty (&sctx->limits);
4260 }