regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2005,2007,2009,2010,2011 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  21                                      int n) internal_function;
  22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  23 static void match_ctx_free (re_match_context_t *cache) internal_function;
  24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  25                                           int str_idx, int from, int to)
  26      internal_function;
  27 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  28      internal_function;
  29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  30                                            int str_idx) internal_function;
  31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  32                                                    int node, int str_idx)
  33      internal_function;
  34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  35                            re_dfastate_t **limited_sts, int last_node,
  36                            int last_str_idx)
  37      internal_function;
  38 static reg_errcode_t re_search_internal (const regex_t *preg,
  39                                          const char *string, int length,
  40                                          int start, int range, int stop,
  41                                          size_t nmatch, regmatch_t pmatch[],
  42                                          int eflags) internal_function;
  43 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  44                              const char *string1, int length1,
  45                              const char *string2, int length2,
  46                              int start, int range, struct re_registers *regs,
  47                              int stop, int ret_len) internal_function;
  48 static int re_search_stub (struct re_pattern_buffer *bufp,
  49                            const char *string, int length, int start,
  50                            int range, int stop, struct re_registers *regs,
  51                            int ret_len) internal_function;
  52 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  53                               int nregs, int regs_allocated) internal_function;
  54 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  55      internal_function;
  56 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  57                            int *p_match_first) internal_function;
  58 static int check_halt_state_context (const re_match_context_t *mctx,
  59                                      const re_dfastate_t *state, int idx)
  60      internal_function;
  61 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  62                          regmatch_t *prev_idx_match, int cur_node,
  63                          int cur_idx, int nmatch) internal_function;
  64 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  65                                       int str_idx, int dest_node, int nregs,
  66                                       regmatch_t *regs,
  67                                       re_node_set *eps_via_nodes)
  68      internal_function;
  69 static reg_errcode_t set_regs (const regex_t *preg,
  70                                const re_match_context_t *mctx,
  71                                size_t nmatch, regmatch_t *pmatch,
  72                                int fl_backtrack) internal_function;
  73 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  74      internal_function;
  75
  76 #ifdef RE_ENABLE_I18N
  77 static int sift_states_iter_mb (const re_match_context_t *mctx,
  78                                 re_sift_context_t *sctx,
  79                                 int node_idx, int str_idx, int max_str_idx)
  80      internal_function;
  81 #endif /* RE_ENABLE_I18N */
  82 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  83                                            re_sift_context_t *sctx)
  84      internal_function;
  85 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  86                                           re_sift_context_t *sctx, int str_idx,
  87                                           re_node_set *cur_dest)
  88      internal_function;
  89 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  90                                               re_sift_context_t *sctx,
  91                                               int str_idx,
  92                                               re_node_set *dest_nodes)
  93      internal_function;
  94 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  95                                             re_node_set *dest_nodes,
  96                                             const re_node_set *candidates)
  97      internal_function;
  98 static int check_dst_limits (const re_match_context_t *mctx,
  99                              re_node_set *limits,
 100                              int dst_node, int dst_idx, int src_node,
 101                              int src_idx) internal_function;
 102 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 103                                         int boundaries, int subexp_idx,
 104                                         int from_node, int bkref_idx)
 105      internal_function;
 106 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 107                                       int limit, int subexp_idx,
 108                                       int node, int str_idx,
 109                                       int bkref_idx) internal_function;
 110 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 111                                           re_node_set *dest_nodes,
 112                                           const re_node_set *candidates,
 113                                           re_node_set *limits,
 114                                           struct re_backref_cache_entry *bkref_ents,
 115                                           int str_idx) internal_function;
 116 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 117                                         re_sift_context_t *sctx,
 118                                         int str_idx, const re_node_set *candidates)
 119      internal_function;
 120 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 121                                         re_dfastate_t **dst,
 122                                         re_dfastate_t **src, int num)
 123      internal_function;
 124 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 125                                          re_match_context_t *mctx) internal_function;
 126 static re_dfastate_t *transit_state (reg_errcode_t *err,
 127                                      re_match_context_t *mctx,
 128                                      re_dfastate_t *state) internal_function;
 129 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 130                                             re_match_context_t *mctx,
 131                                             re_dfastate_t *next_state)
 132      internal_function;
 133 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 134                                                 re_node_set *cur_nodes,
 135                                                 int str_idx) internal_function;
 136 #if 0
 137 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 138                                         re_match_context_t *mctx,
 139                                         re_dfastate_t *pstate)
 140      internal_function;
 141 #endif
 142 #ifdef RE_ENABLE_I18N
 143 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 144                                        re_dfastate_t *pstate)
 145      internal_function;
 146 #endif /* RE_ENABLE_I18N */
 147 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 148                                           const re_node_set *nodes)
 149      internal_function;
 150 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 151                                  int bkref_node, int bkref_str_idx)
 152      internal_function;
 153 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 154                                      const re_sub_match_top_t *sub_top,
 155                                      re_sub_match_last_t *sub_last,
 156                                      int bkref_node, int bkref_str)
 157      internal_function;
 158 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 159                              int subexp_idx, int type) internal_function;
 160 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 161                                     state_array_t *path, int top_node,
 162                                     int top_str, int last_node, int last_str,
 163                                     int type) internal_function;
 164 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 165                                                    int str_idx,
 166                                                    re_node_set *cur_nodes,
 167                                                    re_node_set *next_nodes)
 168      internal_function;
 169 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 170                                                re_node_set *cur_nodes,
 171                                                int ex_subexp, int type)
 172      internal_function;
 173 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 174                                                    re_node_set *dst_nodes,
 175                                                    int target, int ex_subexp,
 176                                                    int type) internal_function;
 177 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 178                                          re_node_set *cur_nodes, int cur_str,
 179                                          int subexp_num, int type)
 180      internal_function;
 181 static int build_trtable (const re_dfa_t *dfa,
 182                           re_dfastate_t *state) internal_function;
 183 #ifdef RE_ENABLE_I18N
 184 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 185                                     const re_string_t *input, int idx)
 186      internal_function;
 187 # ifdef _LIBC
 188 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 189                                                    size_t name_len)
 190      internal_function;
 191 # endif /* _LIBC */
 192 #endif /* RE_ENABLE_I18N */
 193 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 194                                        const re_dfastate_t *state,
 195                                        re_node_set *states_node,
 196                                        bitset_t *states_ch) internal_function;
 197 static int check_node_accept (const re_match_context_t *mctx,
 198                               const re_token_t *node, int idx)
 199      internal_function;
 200 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
 201      internal_function;
 202
 203 #ifdef GAWK
 204 #undef MIN      /* safety */
 205 static int
 206 MIN(size_t a, size_t b)
 207 {
 208         return (a < b ? a : b);
 209 }
 210 #endif
 211 \f
 212 /* Entry point for POSIX code.  */
 213
 214 /* regexec searches for a given pattern, specified by PREG, in the
 215    string STRING.
 216
 217    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 218    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 219    least NMATCH elements, and we set them to the offsets of the
 220    corresponding matched substrings.
 221
 222    EFLAGS specifies `execution flags' which affect matching: if
 223    REG_NOTBOL is set, then ^ does not match at the beginning of the
 224    string; if REG_NOTEOL is set, then $ does not match at the end.
 225
 226    We return 0 if we find a match and REG_NOMATCH if not.  */
 227
 228 int
 229 regexec (preg, string, nmatch, pmatch, eflags)
 230     const regex_t *__restrict preg;
 231     const char *__restrict string;
 232     size_t nmatch;
 233     regmatch_t pmatch[];
 234     int eflags;
 235 {
 236   reg_errcode_t err;
 237   int start, length;
 238
 239   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 240     return REG_BADPAT;
 241
 242   if (eflags & REG_STARTEND)
 243     {
 244       start = pmatch[0].rm_so;
 245       length = pmatch[0].rm_eo;
 246     }
 247   else
 248     {
 249       start = 0;
 250       length = strlen (string);
 251     }
 252
 253   __libc_lock_lock (dfa->lock);
 254   if (preg->no_sub)
 255     err = re_search_internal (preg, string, length, start, length - start,
 256                               length, 0, NULL, eflags);
 257   else
 258     err = re_search_internal (preg, string, length, start, length - start,
 259                               length, nmatch, pmatch, eflags);
 260   __libc_lock_unlock (dfa->lock);
 261   return err != REG_NOERROR;
 262 }
 263
 264 #ifdef _LIBC
 265 # include <shlib-compat.h>
 266 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 267
 268 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 269 __typeof__ (__regexec) __compat_regexec;
 270
 271 int
 272 attribute_compat_text_section
 273 __compat_regexec (const regex_t *__restrict preg,
 274                   const char *__restrict string, size_t nmatch,
 275                   regmatch_t pmatch[], int eflags)
 276 {
 277   return regexec (preg, string, nmatch, pmatch,
 278                   eflags & (REG_NOTBOL | REG_NOTEOL));
 279 }
 280 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 281 # endif
 282 #endif
 283
 284 /* Entry points for GNU code.  */
 285
 286 /* re_match, re_search, re_match_2, re_search_2
 287
 288    The former two functions operate on STRING with length LENGTH,
 289    while the later two operate on concatenation of STRING1 and STRING2
 290    with lengths LENGTH1 and LENGTH2, respectively.
 291
 292    re_match() matches the compiled pattern in BUFP against the string,
 293    starting at index START.
 294
 295    re_search() first tries matching at index START, then it tries to match
 296    starting from index START + 1, and so on.  The last start position tried
 297    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 298    way as re_match().)
 299
 300    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 301    the first STOP characters of the concatenation of the strings should be
 302    concerned.
 303
 304    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 305    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 306    computed relative to the concatenation, not relative to the individual
 307    strings.)
 308
 309    On success, re_match* functions return the length of the match, re_search*
 310    return the position of the start of the match.  Return value -1 means no
 311    match was found and -2 indicates an internal error.  */
 312
 313 int
 314 re_match (bufp, string, length, start, regs)
 315     struct re_pattern_buffer *bufp;
 316     const char *string;
 317     int length, start;
 318     struct re_registers *regs;
 319 {
 320   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 321 }
 322 #ifdef _LIBC
 323 weak_alias (__re_match, re_match)
 324 #endif
 325
 326 int
 327 re_search (bufp, string, length, start, range, regs)
 328     struct re_pattern_buffer *bufp;
 329     const char *string;
 330     int length, start, range;
 331     struct re_registers *regs;
 332 {
 333   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 334 }
 335 #ifdef _LIBC
 336 weak_alias (__re_search, re_search)
 337 #endif
 338
 339 int
 340 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 341     struct re_pattern_buffer *bufp;
 342     const char *string1, *string2;
 343     int length1, length2, start, stop;
 344     struct re_registers *regs;
 345 {
 346   return re_search_2_stub (bufp, string1, length1, string2, length2,
 347                            start, 0, regs, stop, 1);
 348 }
 349 #ifdef _LIBC
 350 weak_alias (__re_match_2, re_match_2)
 351 #endif
 352
 353 int
 354 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 355     struct re_pattern_buffer *bufp;
 356     const char *string1, *string2;
 357     int length1, length2, start, range, stop;
 358     struct re_registers *regs;
 359 {
 360   return re_search_2_stub (bufp, string1, length1, string2, length2,
 361                            start, range, regs, stop, 0);
 362 }
 363 #ifdef _LIBC
 364 weak_alias (__re_search_2, re_search_2)
 365 #endif
 366
 367 static int
 368 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 369                   stop, ret_len)
 370     struct re_pattern_buffer *bufp;
 371     const char *string1, *string2;
 372     int length1, length2, start, range, stop, ret_len;
 373     struct re_registers *regs;
 374 {
 375   const char *str;
 376   int rval;
 377   int len = length1 + length2;
 378   int free_str = 0;
 379
 380   if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
 381     return -2;
 382
 383   /* Concatenate the strings.  */
 384   if (length2 > 0)
 385     if (length1 > 0)
 386       {
 387         char *s = re_malloc (char, len);
 388
 389         if (BE (s == NULL, 0))
 390           return -2;
 391 #ifdef _LIBC
 392         memcpy (__mempcpy (s, string1, length1), string2, length2);
 393 #else
 394         memcpy (s, string1, length1);
 395         memcpy (s + length1, string2, length2);
 396 #endif
 397         str = s;
 398         free_str = 1;
 399       }
 400     else
 401       str = string2;
 402   else
 403     str = string1;
 404
 405   rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
 406   if (free_str)
 407     re_free ((char *) str);
 408   return rval;
 409 }
 410
 411 /* The parameters have the same meaning as those of re_search.
 412    Additional parameters:
 413    If RET_LEN is nonzero the length of the match is returned (re_match style);
 414    otherwise the position of the match is returned.  */
 415
 416 static int
 417 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 418     struct re_pattern_buffer *bufp;
 419     const char *string;
 420     int length, start, range, stop, ret_len;
 421     struct re_registers *regs;
 422 {
 423   reg_errcode_t result;
 424   regmatch_t *pmatch;
 425   int nregs, rval;
 426   int eflags = 0;
 427
 428   /* Check for out-of-range.  */
 429   if (BE (start < 0 || start > length, 0))
 430     return -1;
 431   if (BE (start + range > length, 0))
 432     range = length - start;
 433   else if (BE (start + range < 0, 0))
 434     range = -start;
 435
 436   __libc_lock_lock (dfa->lock);
 437
 438   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 439   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 440
 441   /* Compile fastmap if we haven't yet.  */
 442   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 443     re_compile_fastmap (bufp);
 444
 445   if (BE (bufp->no_sub, 0))
 446     regs = NULL;
 447
 448   /* We need at least 1 register.  */
 449   if (regs == NULL)
 450     nregs = 1;
 451   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 452                regs->num_regs < bufp->re_nsub + 1, 0))
 453     {
 454       nregs = regs->num_regs;
 455       if (BE (nregs < 1, 0))
 456         {
 457           /* Nothing can be copied to regs.  */
 458           regs = NULL;
 459           nregs = 1;
 460         }
 461     }
 462   else
 463     nregs = bufp->re_nsub + 1;
 464   pmatch = re_malloc (regmatch_t, nregs);
 465   if (BE (pmatch == NULL, 0))
 466     {
 467       rval = -2;
 468       goto out;
 469     }
 470
 471   result = re_search_internal (bufp, string, length, start, range, stop,
 472                                nregs, pmatch, eflags);
 473
 474   rval = 0;
 475
 476   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 477   if (result != REG_NOERROR)
 478     rval = -1;
 479   else if (regs != NULL)
 480     {
 481       /* If caller wants register contents data back, copy them.  */
 482       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 483                                            bufp->regs_allocated);
 484       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 485         rval = -2;
 486     }
 487
 488   if (BE (rval == 0, 1))
 489     {
 490       if (ret_len)
 491         {
 492           assert (pmatch[0].rm_so == start);
 493           rval = pmatch[0].rm_eo - start;
 494         }
 495       else
 496         rval = pmatch[0].rm_so;
 497     }
 498   re_free (pmatch);
 499  out:
 500   __libc_lock_unlock (dfa->lock);
 501   return rval;
 502 }
 503
 504 static unsigned
 505 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 506     struct re_registers *regs;
 507     regmatch_t *pmatch;
 508     int nregs, regs_allocated;
 509 {
 510   int rval = REGS_REALLOCATE;
 511   int i;
 512   int need_regs = nregs + 1;
 513   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 514      uses.  */
 515
 516   /* Have the register data arrays been allocated?  */
 517   if (regs_allocated == REGS_UNALLOCATED)
 518     { /* No.  So allocate them with malloc.  */
 519       regs->start = re_malloc (regoff_t, need_regs);
 520       if (BE (regs->start == NULL, 0))
 521         return REGS_UNALLOCATED;
 522       regs->end = re_malloc (regoff_t, need_regs);
 523       if (BE (regs->end == NULL, 0))
 524         {
 525           re_free (regs->start);
 526           return REGS_UNALLOCATED;
 527         }
 528       regs->num_regs = need_regs;
 529     }
 530   else if (regs_allocated == REGS_REALLOCATE)
 531     { /* Yes.  If we need more elements than were already
 532          allocated, reallocate them.  If we need fewer, just
 533          leave it alone.  */
 534       if (BE (need_regs > regs->num_regs, 0))
 535         {
 536           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 537           regoff_t *new_end;
 538           if (BE (new_start == NULL, 0))
 539             return REGS_UNALLOCATED;
 540           new_end = re_realloc (regs->end, regoff_t, need_regs);
 541           if (BE (new_end == NULL, 0))
 542             {
 543               re_free (new_start);
 544               return REGS_UNALLOCATED;
 545             }
 546           regs->start = new_start;
 547           regs->end = new_end;
 548           regs->num_regs = need_regs;
 549         }
 550     }
 551   else
 552     {
 553       assert (regs_allocated == REGS_FIXED);
 554       /* This function may not be called with REGS_FIXED and nregs too big.  */
 555       assert (regs->num_regs >= nregs);
 556       rval = REGS_FIXED;
 557     }
 558
 559   /* Copy the regs.  */
 560   for (i = 0; i < nregs; ++i)
 561     {
 562       regs->start[i] = pmatch[i].rm_so;
 563       regs->end[i] = pmatch[i].rm_eo;
 564     }
 565   for ( ; i < regs->num_regs; ++i)
 566     regs->start[i] = regs->end[i] = -1;
 567
 568   return rval;
 569 }
 570
 571 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 572    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 573    this memory for recording register information.  STARTS and ENDS
 574    must be allocated using the malloc library routine, and must each
 575    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 576
 577    If NUM_REGS == 0, then subsequent matches should allocate their own
 578    register data.
 579
 580    Unless this function is called, the first search or match using
 581    PATTERN_BUFFER will allocate its own register data, without
 582    freeing the old data.  */
 583
 584 void
 585 re_set_registers (bufp, regs, num_regs, starts, ends)
 586     struct re_pattern_buffer *bufp;
 587     struct re_registers *regs;
 588     unsigned num_regs;
 589     regoff_t *starts, *ends;
 590 {
 591   if (num_regs)
 592     {
 593       bufp->regs_allocated = REGS_REALLOCATE;
 594       regs->num_regs = num_regs;
 595       regs->start = starts;
 596       regs->end = ends;
 597     }
 598   else
 599     {
 600       bufp->regs_allocated = REGS_UNALLOCATED;
 601       regs->num_regs = 0;
 602       regs->start = regs->end = (regoff_t *) 0;
 603     }
 604 }
 605 #ifdef _LIBC
 606 weak_alias (__re_set_registers, re_set_registers)
 607 #endif
 608 \f
 609 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 610    them unless specifically requested.  */
 611
 612 #if defined _REGEX_RE_COMP || defined _LIBC
 613 int
 614 # ifdef _LIBC
 615 weak_function
 616 # endif
 617 re_exec (s)
 618      const char *s;
 619 {
 620   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 621 }
 622 #endif /* _REGEX_RE_COMP */
 623 \f
 624 /* Internal entry point.  */
 625
 626 /* Searches for a compiled pattern PREG in the string STRING, whose
 627    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 628    mingings with regexec.  START, and RANGE have the same meanings
 629    with re_search.
 630    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 631    otherwise return the error code.
 632    Note: We assume front end functions already check ranges.
 633    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 634
 635 static reg_errcode_t
 636 __attribute_warn_unused_result__
 637 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 638                     eflags)
 639     const regex_t *preg;
 640     const char *string;
 641     int length, start, range, stop, eflags;
 642     size_t nmatch;
 643     regmatch_t pmatch[];
 644 {
 645   reg_errcode_t err;
 646   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 647   int left_lim, right_lim, incr;
 648   int fl_longest_match, match_first, match_kind, match_last = -1;
 649   int extra_nmatch;
 650   int sb, ch;
 651 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 652   re_match_context_t mctx = { .dfa = dfa };
 653 #else
 654   re_match_context_t mctx;
 655 #endif
 656   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 657                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 658   RE_TRANSLATE_TYPE t = preg->translate;
 659
 660 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 661   memset (&mctx, '\0', sizeof (re_match_context_t));
 662   mctx.dfa = dfa;
 663 #endif
 664
 665   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 666   nmatch -= extra_nmatch;
 667
 668   /* Check if the DFA haven't been compiled.  */
 669   if (BE (preg->used == 0 || dfa->init_state == NULL
 670           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 671           || dfa->init_state_begbuf == NULL, 0))
 672     return REG_NOMATCH;
 673
 674 #ifdef DEBUG
 675   /* We assume front-end functions already check them.  */
 676   assert (start + range >= 0 && start + range <= length);
 677 #endif
 678
 679   /* If initial states with non-begbuf contexts have no elements,
 680      the regex must be anchored.  If preg->newline_anchor is set,
 681      we'll never use init_state_nl, so do not check it.  */
 682   if (dfa->init_state->nodes.nelem == 0
 683       && dfa->init_state_word->nodes.nelem == 0
 684       && (dfa->init_state_nl->nodes.nelem == 0
 685           || !preg->newline_anchor))
 686     {
 687       if (start != 0 && start + range != 0)
 688         return REG_NOMATCH;
 689       start = range = 0;
 690     }
 691
 692   /* We must check the longest matching, if nmatch > 0.  */
 693   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 694
 695   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 696                             preg->translate, preg->syntax & RE_ICASE, dfa);
 697   if (BE (err != REG_NOERROR, 0))
 698     goto free_return;
 699   mctx.input.stop = stop;
 700   mctx.input.raw_stop = stop;
 701   mctx.input.newline_anchor = preg->newline_anchor;
 702
 703   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 704   if (BE (err != REG_NOERROR, 0))
 705     goto free_return;
 706
 707   /* We will log all the DFA states through which the dfa pass,
 708      if nmatch > 1, or this dfa has "multibyte node", which is a
 709      back-reference or a node which can accept multibyte character or
 710      multi character collating element.  */
 711   if (nmatch > 1 || dfa->has_mb_node)
 712     {
 713       /* Avoid overflow.  */
 714       if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
 715         {
 716           err = REG_ESPACE;
 717           goto free_return;
 718         }
 719
 720       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 721       if (BE (mctx.state_log == NULL, 0))
 722         {
 723           err = REG_ESPACE;
 724           goto free_return;
 725         }
 726     }
 727   else
 728     mctx.state_log = NULL;
 729
 730   match_first = start;
 731   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 732                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 733
 734   /* Check incrementally whether of not the input string match.  */
 735   incr = (range < 0) ? -1 : 1;
 736   left_lim = (range < 0) ? start + range : start;
 737   right_lim = (range < 0) ? start : start + range;
 738   sb = dfa->mb_cur_max == 1;
 739   match_kind =
 740     (fastmap
 741      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 742         | (range >= 0 ? 2 : 0)
 743         | (t != NULL ? 1 : 0))
 744      : 8);
 745
 746   for (;; match_first += incr)
 747     {
 748       err = REG_NOMATCH;
 749       if (match_first < left_lim || right_lim < match_first)
 750         goto free_return;
 751
 752       /* Advance as rapidly as possible through the string, until we
 753          find a plausible place to start matching.  This may be done
 754          with varying efficiency, so there are various possibilities:
 755          only the most common of them are specialized, in order to
 756          save on code size.  We use a switch statement for speed.  */
 757       switch (match_kind)
 758         {
 759         case 8:
 760           /* No fastmap.  */
 761           break;
 762
 763         case 7:
 764           /* Fastmap with single-byte translation, match forward.  */
 765           while (BE (match_first < right_lim, 1)
 766                  && !fastmap[t[(unsigned char) string[match_first]]])
 767             ++match_first;
 768           goto forward_match_found_start_or_reached_end;
 769
 770         case 6:
 771           /* Fastmap without translation, match forward.  */
 772           while (BE (match_first < right_lim, 1)
 773                  && !fastmap[(unsigned char) string[match_first]])
 774             ++match_first;
 775
 776         forward_match_found_start_or_reached_end:
 777           if (BE (match_first == right_lim, 0))
 778             {
 779               ch = match_first >= length
 780                        ? 0 : (unsigned char) string[match_first];
 781               if (!fastmap[t ? t[ch] : ch])
 782                 goto free_return;
 783             }
 784           break;
 785
 786         case 4:
 787         case 5:
 788           /* Fastmap without multi-byte translation, match backwards.  */
 789           while (match_first >= left_lim)
 790             {
 791               ch = match_first >= length
 792                        ? 0 : (unsigned char) string[match_first];
 793               if (fastmap[t ? t[ch] : ch])
 794                 break;
 795               --match_first;
 796             }
 797           if (match_first < left_lim)
 798             goto free_return;
 799           break;
 800
 801         default:
 802           /* In this case, we can't determine easily the current byte,
 803              since it might be a component byte of a multibyte
 804              character.  Then we use the constructed buffer instead.  */
 805           for (;;)
 806             {
 807               /* If MATCH_FIRST is out of the valid range, reconstruct the
 808                  buffers.  */
 809               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 810               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 811                 {
 812                   err = re_string_reconstruct (&mctx.input, match_first,
 813                                                eflags);
 814                   if (BE (err != REG_NOERROR, 0))
 815                     goto free_return;
 816
 817                   offset = match_first - mctx.input.raw_mbs_idx;
 818                 }
 819               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 820                  Note that MATCH_FIRST must not be smaller than 0.  */
 821               ch = (match_first >= length
 822                     ? 0 : re_string_byte_at (&mctx.input, offset));
 823               if (fastmap[ch])
 824                 break;
 825               match_first += incr;
 826               if (match_first < left_lim || match_first > right_lim)
 827                 {
 828                   err = REG_NOMATCH;
 829                   goto free_return;
 830                 }
 831             }
 832           break;
 833         }
 834
 835       /* Reconstruct the buffers so that the matcher can assume that
 836          the matching starts from the beginning of the buffer.  */
 837       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 838       if (BE (err != REG_NOERROR, 0))
 839         goto free_return;
 840
 841 #ifdef RE_ENABLE_I18N
 842      /* Don't consider this char as a possible match start if it part,
 843         yet isn't the head, of a multibyte character.  */
 844       if (!sb && !re_string_first_byte (&mctx.input, 0))
 845         continue;
 846 #endif
 847
 848       /* It seems to be appropriate one, then use the matcher.  */
 849       /* We assume that the matching starts from 0.  */
 850       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 851       match_last = check_matching (&mctx, fl_longest_match,
 852                                    range >= 0 ? &match_first : NULL);
 853       if (match_last != -1)
 854         {
 855           if (BE (match_last == -2, 0))
 856             {
 857               err = REG_ESPACE;
 858               goto free_return;
 859             }
 860           else
 861             {
 862               mctx.match_last = match_last;
 863               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 864                 {
 865                   re_dfastate_t *pstate = mctx.state_log[match_last];
 866                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 867                                                              match_last);
 868                 }
 869               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 870                   || dfa->nbackref)
 871                 {
 872                   err = prune_impossible_nodes (&mctx);
 873                   if (err == REG_NOERROR)
 874                     break;
 875                   if (BE (err != REG_NOMATCH, 0))
 876                     goto free_return;
 877                   match_last = -1;
 878                 }
 879               else
 880                 break; /* We found a match.  */
 881             }
 882         }
 883
 884       match_ctx_clean (&mctx);
 885     }
 886
 887 #ifdef DEBUG
 888   assert (match_last != -1);
 889   assert (err == REG_NOERROR);
 890 #endif
 891
 892   /* Set pmatch[] if we need.  */
 893   if (nmatch > 0)
 894     {
 895       int reg_idx;
 896
 897       /* Initialize registers.  */
 898       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 899         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 900
 901       /* Set the points where matching start/end.  */
 902       pmatch[0].rm_so = 0;
 903       pmatch[0].rm_eo = mctx.match_last;
 904
 905       if (!preg->no_sub && nmatch > 1)
 906         {
 907           err = set_regs (preg, &mctx, nmatch, pmatch,
 908                           dfa->has_plural_match && dfa->nbackref > 0);
 909           if (BE (err != REG_NOERROR, 0))
 910             goto free_return;
 911         }
 912
 913       /* At last, add the offset to the each registers, since we slided
 914          the buffers so that we could assume that the matching starts
 915          from 0.  */
 916       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 917         if (pmatch[reg_idx].rm_so != -1)
 918           {
 919 #ifdef RE_ENABLE_I18N
 920             if (BE (mctx.input.offsets_needed != 0, 0))
 921               {
 922                 pmatch[reg_idx].rm_so =
 923                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 924                    ? mctx.input.valid_raw_len
 925                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 926                 pmatch[reg_idx].rm_eo =
 927                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 928                    ? mctx.input.valid_raw_len
 929                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 930               }
 931 #else
 932             assert (mctx.input.offsets_needed == 0);
 933 #endif
 934             pmatch[reg_idx].rm_so += match_first;
 935             pmatch[reg_idx].rm_eo += match_first;
 936           }
 937       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 938         {
 939           pmatch[nmatch + reg_idx].rm_so = -1;
 940           pmatch[nmatch + reg_idx].rm_eo = -1;
 941         }
 942
 943       if (dfa->subexp_map)
 944         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 945           if (dfa->subexp_map[reg_idx] != reg_idx)
 946             {
 947               pmatch[reg_idx + 1].rm_so
 948                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 949               pmatch[reg_idx + 1].rm_eo
 950                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 951             }
 952     }
 953
 954  free_return:
 955   re_free (mctx.state_log);
 956   if (dfa->nbackref)
 957     match_ctx_free (&mctx);
 958   re_string_destruct (&mctx.input);
 959   return err;
 960 }
 961
 962 static reg_errcode_t
 963 __attribute_warn_unused_result__
 964 prune_impossible_nodes (mctx)
 965      re_match_context_t *mctx;
 966 {
 967   const re_dfa_t *const dfa = mctx->dfa;
 968   int halt_node, match_last;
 969   reg_errcode_t ret;
 970   re_dfastate_t **sifted_states;
 971   re_dfastate_t **lim_states = NULL;
 972   re_sift_context_t sctx;
 973 #ifdef DEBUG
 974   assert (mctx->state_log != NULL);
 975 #endif
 976   match_last = mctx->match_last;
 977   halt_node = mctx->last_node;
 978
 979   /* Avoid overflow.  */
 980   if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
 981     return REG_ESPACE;
 982
 983   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 984   if (BE (sifted_states == NULL, 0))
 985     {
 986       ret = REG_ESPACE;
 987       goto free_return;
 988     }
 989   if (dfa->nbackref)
 990     {
 991       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 992       if (BE (lim_states == NULL, 0))
 993         {
 994           ret = REG_ESPACE;
 995           goto free_return;
 996         }
 997       while (1)
 998         {
 999           memset (lim_states, '\0',
1000                   sizeof (re_dfastate_t *) * (match_last + 1));
1001           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
1002                          match_last);
1003           ret = sift_states_backward (mctx, &sctx);
1004           re_node_set_free (&sctx.limits);
1005           if (BE (ret != REG_NOERROR, 0))
1006               goto free_return;
1007           if (sifted_states[0] != NULL || lim_states[0] != NULL)
1008             break;
1009           do
1010             {
1011               --match_last;
1012               if (match_last < 0)
1013                 {
1014                   ret = REG_NOMATCH;
1015                   goto free_return;
1016                 }
1017             } while (mctx->state_log[match_last] == NULL
1018                      || !mctx->state_log[match_last]->halt);
1019           halt_node = check_halt_state_context (mctx,
1020                                                 mctx->state_log[match_last],
1021                                                 match_last);
1022         }
1023       ret = merge_state_array (dfa, sifted_states, lim_states,
1024                                match_last + 1);
1025       re_free (lim_states);
1026       lim_states = NULL;
1027       if (BE (ret != REG_NOERROR, 0))
1028         goto free_return;
1029     }
1030   else
1031     {
1032       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1033       ret = sift_states_backward (mctx, &sctx);
1034       re_node_set_free (&sctx.limits);
1035       if (BE (ret != REG_NOERROR, 0))
1036         goto free_return;
1037       if (sifted_states[0] == NULL)
1038         {
1039           ret = REG_NOMATCH;
1040           goto free_return;
1041         }
1042     }
1043   re_free (mctx->state_log);
1044   mctx->state_log = sifted_states;
1045   sifted_states = NULL;
1046   mctx->last_node = halt_node;
1047   mctx->match_last = match_last;
1048   ret = REG_NOERROR;
1049  free_return:
1050   re_free (sifted_states);
1051   re_free (lim_states);
1052   return ret;
1053 }
1054
1055 /* Acquire an initial state and return it.
1056    We must select appropriate initial state depending on the context,
1057    since initial states may have constraints like "\<", "^", etc..  */
1058
1059 static inline re_dfastate_t *
1060 __attribute ((always_inline)) internal_function
1061 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1062                             int idx)
1063 {
1064   const re_dfa_t *const dfa = mctx->dfa;
1065   if (dfa->init_state->has_constraint)
1066     {
1067       unsigned int context;
1068       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1069       if (IS_WORD_CONTEXT (context))
1070         return dfa->init_state_word;
1071       else if (IS_ORDINARY_CONTEXT (context))
1072         return dfa->init_state;
1073       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1074         return dfa->init_state_begbuf;
1075       else if (IS_NEWLINE_CONTEXT (context))
1076         return dfa->init_state_nl;
1077       else if (IS_BEGBUF_CONTEXT (context))
1078         {
1079           /* It is relatively rare case, then calculate on demand.  */
1080           return re_acquire_state_context (err, dfa,
1081                                            dfa->init_state->entrance_nodes,
1082                                            context);
1083         }
1084       else
1085         /* Must not happen?  */
1086         return dfa->init_state;
1087     }
1088   else
1089     return dfa->init_state;
1090 }
1091
1092 /* Check whether the regular expression match input string INPUT or not,
1093    and return the index where the matching end, return -1 if not match,
1094    or return -2 in case of an error.
1095    FL_LONGEST_MATCH means we want the POSIX longest matching.
1096    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1097    next place where we may want to try matching.
1098    Note that the matcher assume that the maching starts from the current
1099    index of the buffer.  */
1100
1101 static int
1102 internal_function __attribute_warn_unused_result__
1103 check_matching (re_match_context_t *mctx, int fl_longest_match,
1104                 int *p_match_first)
1105 {
1106   const re_dfa_t *const dfa = mctx->dfa;
1107   reg_errcode_t err;
1108   int match = 0;
1109   int match_last = -1;
1110   int cur_str_idx = re_string_cur_idx (&mctx->input);
1111   re_dfastate_t *cur_state;
1112   int at_init_state = p_match_first != NULL;
1113   int next_start_idx = cur_str_idx;
1114
1115   err = REG_NOERROR;
1116   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1117   /* An initial state must not be NULL (invalid).  */
1118   if (BE (cur_state == NULL, 0))
1119     {
1120       assert (err == REG_ESPACE);
1121       return -2;
1122     }
1123
1124   if (mctx->state_log != NULL)
1125     {
1126       mctx->state_log[cur_str_idx] = cur_state;
1127
1128       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1129          later.  E.g. Processing back references.  */
1130       if (BE (dfa->nbackref, 0))
1131         {
1132           at_init_state = 0;
1133           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1134           if (BE (err != REG_NOERROR, 0))
1135             return err;
1136
1137           if (cur_state->has_backref)
1138             {
1139               err = transit_state_bkref (mctx, &cur_state->nodes);
1140               if (BE (err != REG_NOERROR, 0))
1141                 return err;
1142             }
1143         }
1144     }
1145
1146   /* If the RE accepts NULL string.  */
1147   if (BE (cur_state->halt, 0))
1148     {
1149       if (!cur_state->has_constraint
1150           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1151         {
1152           if (!fl_longest_match)
1153             return cur_str_idx;
1154           else
1155             {
1156               match_last = cur_str_idx;
1157               match = 1;
1158             }
1159         }
1160     }
1161
1162   while (!re_string_eoi (&mctx->input))
1163     {
1164       re_dfastate_t *old_state = cur_state;
1165       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1166
1167       if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1168            && mctx->input.bufs_len < mctx->input.len)
1169           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1170               && mctx->input.valid_len < mctx->input.len))
1171         {
1172           err = extend_buffers (mctx);
1173           if (BE (err != REG_NOERROR, 0))
1174             {
1175               assert (err == REG_ESPACE);
1176               return -2;
1177             }
1178         }
1179
1180       cur_state = transit_state (&err, mctx, cur_state);
1181       if (mctx->state_log != NULL)
1182         cur_state = merge_state_with_log (&err, mctx, cur_state);
1183
1184       if (cur_state == NULL)
1185         {
1186           /* Reached the invalid state or an error.  Try to recover a valid
1187              state using the state log, if available and if we have not
1188              already found a valid (even if not the longest) match.  */
1189           if (BE (err != REG_NOERROR, 0))
1190             return -2;
1191
1192           if (mctx->state_log == NULL
1193               || (match && !fl_longest_match)
1194               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1195             break;
1196         }
1197
1198       if (BE (at_init_state, 0))
1199         {
1200           if (old_state == cur_state)
1201             next_start_idx = next_char_idx;
1202           else
1203             at_init_state = 0;
1204         }
1205
1206       if (cur_state->halt)
1207         {
1208           /* Reached a halt state.
1209              Check the halt state can satisfy the current context.  */
1210           if (!cur_state->has_constraint
1211               || check_halt_state_context (mctx, cur_state,
1212                                            re_string_cur_idx (&mctx->input)))
1213             {
1214               /* We found an appropriate halt state.  */
1215               match_last = re_string_cur_idx (&mctx->input);
1216               match = 1;
1217
1218               /* We found a match, do not modify match_first below.  */
1219               p_match_first = NULL;
1220               if (!fl_longest_match)
1221                 break;
1222             }
1223         }
1224     }
1225
1226   if (p_match_first)
1227     *p_match_first += next_start_idx;
1228
1229   return match_last;
1230 }
1231
1232 /* Check NODE match the current context.  */
1233
1234 static int
1235 internal_function
1236 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1237 {
1238   re_token_type_t type = dfa->nodes[node].type;
1239   unsigned int constraint = dfa->nodes[node].constraint;
1240   if (type != END_OF_RE)
1241     return 0;
1242   if (!constraint)
1243     return 1;
1244   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1245     return 0;
1246   return 1;
1247 }
1248
1249 /* Check the halt state STATE match the current context.
1250    Return 0 if not match, if the node, STATE has, is a halt node and
1251    match the context, return the node.  */
1252
1253 static int
1254 internal_function
1255 check_halt_state_context (const re_match_context_t *mctx,
1256                           const re_dfastate_t *state, int idx)
1257 {
1258   int i;
1259   unsigned int context;
1260 #ifdef DEBUG
1261   assert (state->halt);
1262 #endif
1263   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1264   for (i = 0; i < state->nodes.nelem; ++i)
1265     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1266       return state->nodes.elems[i];
1267   return 0;
1268 }
1269
1270 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1271    corresponding to the DFA).
1272    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1273    of errors.  */
1274
1275 static int
1276 internal_function
1277 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1278                    int *pidx, int node, re_node_set *eps_via_nodes,
1279                    struct re_fail_stack_t *fs)
1280 {
1281   const re_dfa_t *const dfa = mctx->dfa;
1282   int i, err;
1283   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1284     {
1285       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1286       re_node_set *edests = &dfa->edests[node];
1287       int dest_node;
1288       err = re_node_set_insert (eps_via_nodes, node);
1289       if (BE (err < 0, 0))
1290         return -2;
1291       /* Pick up a valid destination, or return -1 if none is found.  */
1292       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1293         {
1294           int candidate = edests->elems[i];
1295           if (!re_node_set_contains (cur_nodes, candidate))
1296             continue;
1297           if (dest_node == -1)
1298             dest_node = candidate;
1299
1300           else
1301             {
1302               /* In order to avoid infinite loop like "(a*)*", return the second
1303                  epsilon-transition if the first was already considered.  */
1304               if (re_node_set_contains (eps_via_nodes, dest_node))
1305                 return candidate;
1306
1307               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1308               else if (fs != NULL
1309                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1310                                            eps_via_nodes))
1311                 return -2;
1312
1313               /* We know we are going to exit.  */
1314               break;
1315             }
1316         }
1317       return dest_node;
1318     }
1319   else
1320     {
1321       int naccepted = 0;
1322       re_token_type_t type = dfa->nodes[node].type;
1323
1324 #ifdef RE_ENABLE_I18N
1325       if (dfa->nodes[node].accept_mb)
1326         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1327       else
1328 #endif /* RE_ENABLE_I18N */
1329       if (type == OP_BACK_REF)
1330         {
1331           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1332           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1333           if (fs != NULL)
1334             {
1335               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1336                 return -1;
1337               else if (naccepted)
1338                 {
1339                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1340                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1341                               naccepted) != 0)
1342                     return -1;
1343                 }
1344             }
1345
1346           if (naccepted == 0)
1347             {
1348               int dest_node;
1349               err = re_node_set_insert (eps_via_nodes, node);
1350               if (BE (err < 0, 0))
1351                 return -2;
1352               dest_node = dfa->edests[node].elems[0];
1353               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1354                                         dest_node))
1355                 return dest_node;
1356             }
1357         }
1358
1359       if (naccepted != 0
1360           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1361         {
1362           int dest_node = dfa->nexts[node];
1363           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1364           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1365                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1366                                                dest_node)))
1367             return -1;
1368           re_node_set_empty (eps_via_nodes);
1369           return dest_node;
1370         }
1371     }
1372   return -1;
1373 }
1374
1375 static reg_errcode_t
1376 internal_function __attribute_warn_unused_result__
1377 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1378                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1379 {
1380   reg_errcode_t err;
1381   int num = fs->num++;
1382   if (fs->num == fs->alloc)
1383     {
1384       struct re_fail_stack_ent_t *new_array;
1385       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1386                                        * fs->alloc * 2));
1387       if (new_array == NULL)
1388         return REG_ESPACE;
1389       fs->alloc *= 2;
1390       fs->stack = new_array;
1391     }
1392   fs->stack[num].idx = str_idx;
1393   fs->stack[num].node = dest_node;
1394   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1395   if (fs->stack[num].regs == NULL)
1396     return REG_ESPACE;
1397   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1398   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1399   return err;
1400 }
1401
1402 static int
1403 internal_function
1404 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1405                 regmatch_t *regs, re_node_set *eps_via_nodes)
1406 {
1407   int num = --fs->num;
1408   assert (num >= 0);
1409   *pidx = fs->stack[num].idx;
1410   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1411   re_node_set_free (eps_via_nodes);
1412   re_free (fs->stack[num].regs);
1413   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1414   return fs->stack[num].node;
1415 }
1416
1417 /* Set the positions where the subexpressions are starts/ends to registers
1418    PMATCH.
1419    Note: We assume that pmatch[0] is already set, and
1420    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1421
1422 static reg_errcode_t
1423 internal_function __attribute_warn_unused_result__
1424 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1425           regmatch_t *pmatch, int fl_backtrack)
1426 {
1427   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1428   int idx, cur_node;
1429   re_node_set eps_via_nodes;
1430   struct re_fail_stack_t *fs;
1431   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1432   regmatch_t *prev_idx_match;
1433   int prev_idx_match_malloced = 0;
1434
1435 #ifdef DEBUG
1436   assert (nmatch > 1);
1437   assert (mctx->state_log != NULL);
1438 #endif
1439   if (fl_backtrack)
1440     {
1441       fs = &fs_body;
1442       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1443       if (fs->stack == NULL)
1444         return REG_ESPACE;
1445     }
1446   else
1447     fs = NULL;
1448
1449   cur_node = dfa->init_node;
1450   re_node_set_init_empty (&eps_via_nodes);
1451
1452 #ifdef HAVE_ALLOCA
1453   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1454     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1455   else
1456 #endif
1457     {
1458       prev_idx_match = re_malloc (regmatch_t, nmatch);
1459       if (prev_idx_match == NULL)
1460         {
1461           free_fail_stack_return (fs);
1462           return REG_ESPACE;
1463         }
1464       prev_idx_match_malloced = 1;
1465     }
1466   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1467
1468   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1469     {
1470       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1471
1472       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1473         {
1474           int reg_idx;
1475           if (fs)
1476             {
1477               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1478                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1479                   break;
1480               if (reg_idx == nmatch)
1481                 {
1482                   re_node_set_free (&eps_via_nodes);
1483                   if (prev_idx_match_malloced)
1484                     re_free (prev_idx_match);
1485                   return free_fail_stack_return (fs);
1486                 }
1487               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1488                                          &eps_via_nodes);
1489             }
1490           else
1491             {
1492               re_node_set_free (&eps_via_nodes);
1493               if (prev_idx_match_malloced)
1494                 re_free (prev_idx_match);
1495               return REG_NOERROR;
1496             }
1497         }
1498
1499       /* Proceed to next node.  */
1500       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1501                                     &eps_via_nodes, fs);
1502
1503       if (BE (cur_node < 0, 0))
1504         {
1505           if (BE (cur_node == -2, 0))
1506             {
1507               re_node_set_free (&eps_via_nodes);
1508               if (prev_idx_match_malloced)
1509                 re_free (prev_idx_match);
1510               free_fail_stack_return (fs);
1511               return REG_ESPACE;
1512             }
1513           if (fs)
1514             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1515                                        &eps_via_nodes);
1516           else
1517             {
1518               re_node_set_free (&eps_via_nodes);
1519               if (prev_idx_match_malloced)
1520                 re_free (prev_idx_match);
1521               return REG_NOMATCH;
1522             }
1523         }
1524     }
1525   re_node_set_free (&eps_via_nodes);
1526   if (prev_idx_match_malloced)
1527     re_free (prev_idx_match);
1528   return free_fail_stack_return (fs);
1529 }
1530
1531 static reg_errcode_t
1532 internal_function
1533 free_fail_stack_return (struct re_fail_stack_t *fs)
1534 {
1535   if (fs)
1536     {
1537       int fs_idx;
1538       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1539         {
1540           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1541           re_free (fs->stack[fs_idx].regs);
1542         }
1543       re_free (fs->stack);
1544     }
1545   return REG_NOERROR;
1546 }
1547
1548 static void
1549 internal_function
1550 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1551              regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1552 {
1553   int type = dfa->nodes[cur_node].type;
1554   if (type == OP_OPEN_SUBEXP)
1555     {
1556       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1557
1558       /* We are at the first node of this sub expression.  */
1559       if (reg_num < nmatch)
1560         {
1561           pmatch[reg_num].rm_so = cur_idx;
1562           pmatch[reg_num].rm_eo = -1;
1563         }
1564     }
1565   else if (type == OP_CLOSE_SUBEXP)
1566     {
1567       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1568       if (reg_num < nmatch)
1569         {
1570           /* We are at the last node of this sub expression.  */
1571           if (pmatch[reg_num].rm_so < cur_idx)
1572             {
1573               pmatch[reg_num].rm_eo = cur_idx;
1574               /* This is a non-empty match or we are not inside an optional
1575                  subexpression.  Accept this right away.  */
1576               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1577             }
1578           else
1579             {
1580               if (dfa->nodes[cur_node].opt_subexp
1581                   && prev_idx_match[reg_num].rm_so != -1)
1582                 /* We transited through an empty match for an optional
1583                    subexpression, like (a?)*, and this is not the subexp's
1584                    first match.  Copy back the old content of the registers
1585                    so that matches of an inner subexpression are undone as
1586                    well, like in ((a?))*.  */
1587                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1588               else
1589                 /* We completed a subexpression, but it may be part of
1590                    an optional one, so do not update PREV_IDX_MATCH.  */
1591                 pmatch[reg_num].rm_eo = cur_idx;
1592             }
1593         }
1594     }
1595 }
1596
1597 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1598    and sift the nodes in each states according to the following rules.
1599    Updated state_log will be wrote to STATE_LOG.
1600
1601    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1602      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1603         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1604         the LAST_NODE, we throw away the node `a'.
1605      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1606         string `s' and transit to `b':
1607         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1608            away the node `a'.
1609         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1610             thrown away, we throw away the node `a'.
1611      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1612         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1613            node `a'.
1614         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1615             we throw away the node `a'.  */
1616
1617 #define STATE_NODE_CONTAINS(state,node) \
1618   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1619
1620 static reg_errcode_t
1621 internal_function
1622 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1623 {
1624   reg_errcode_t err;
1625   int null_cnt = 0;
1626   int str_idx = sctx->last_str_idx;
1627   re_node_set cur_dest;
1628
1629 #ifdef DEBUG
1630   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1631 #endif
1632
1633   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1634      transit to the last_node and the last_node itself.  */
1635   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1636   if (BE (err != REG_NOERROR, 0))
1637     return err;
1638   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1639   if (BE (err != REG_NOERROR, 0))
1640     goto free_return;
1641
1642   /* Then check each states in the state_log.  */
1643   while (str_idx > 0)
1644     {
1645       /* Update counters.  */
1646       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1647       if (null_cnt > mctx->max_mb_elem_len)
1648         {
1649           memset (sctx->sifted_states, '\0',
1650                   sizeof (re_dfastate_t *) * str_idx);
1651           re_node_set_free (&cur_dest);
1652           return REG_NOERROR;
1653         }
1654       re_node_set_empty (&cur_dest);
1655       --str_idx;
1656
1657       if (mctx->state_log[str_idx])
1658         {
1659           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1660           if (BE (err != REG_NOERROR, 0))
1661             goto free_return;
1662         }
1663
1664       /* Add all the nodes which satisfy the following conditions:
1665          - It can epsilon transit to a node in CUR_DEST.
1666          - It is in CUR_SRC.
1667          And update state_log.  */
1668       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1669       if (BE (err != REG_NOERROR, 0))
1670         goto free_return;
1671     }
1672   err = REG_NOERROR;
1673  free_return:
1674   re_node_set_free (&cur_dest);
1675   return err;
1676 }
1677
1678 static reg_errcode_t
1679 internal_function __attribute_warn_unused_result__
1680 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1681                      int str_idx, re_node_set *cur_dest)
1682 {
1683   const re_dfa_t *const dfa = mctx->dfa;
1684   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1685   int i;
1686
1687   /* Then build the next sifted state.
1688      We build the next sifted state on `cur_dest', and update
1689      `sifted_states[str_idx]' with `cur_dest'.
1690      Note:
1691      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1692      `cur_src' points the node_set of the old `state_log[str_idx]'
1693      (with the epsilon nodes pre-filtered out).  */
1694   for (i = 0; i < cur_src->nelem; i++)
1695     {
1696       int prev_node = cur_src->elems[i];
1697       int naccepted = 0;
1698       int ret;
1699
1700 #ifdef DEBUG
1701       re_token_type_t type = dfa->nodes[prev_node].type;
1702       assert (!IS_EPSILON_NODE (type));
1703 #endif
1704 #ifdef RE_ENABLE_I18N
1705       /* If the node may accept `multi byte'.  */
1706       if (dfa->nodes[prev_node].accept_mb)
1707         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1708                                          str_idx, sctx->last_str_idx);
1709 #endif /* RE_ENABLE_I18N */
1710
1711       /* We don't check backreferences here.
1712          See update_cur_sifted_state().  */
1713       if (!naccepted
1714           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1715           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1716                                   dfa->nexts[prev_node]))
1717         naccepted = 1;
1718
1719       if (naccepted == 0)
1720         continue;
1721
1722       if (sctx->limits.nelem)
1723         {
1724           int to_idx = str_idx + naccepted;
1725           if (check_dst_limits (mctx, &sctx->limits,
1726                                 dfa->nexts[prev_node], to_idx,
1727                                 prev_node, str_idx))
1728             continue;
1729         }
1730       ret = re_node_set_insert (cur_dest, prev_node);
1731       if (BE (ret == -1, 0))
1732         return REG_ESPACE;
1733     }
1734
1735   return REG_NOERROR;
1736 }
1737
1738 /* Helper functions.  */
1739
1740 static reg_errcode_t
1741 internal_function
1742 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1743 {
1744   int top = mctx->state_log_top;
1745
1746   if ((next_state_log_idx >= mctx->input.bufs_len
1747        && mctx->input.bufs_len < mctx->input.len)
1748       || (next_state_log_idx >= mctx->input.valid_len
1749           && mctx->input.valid_len < mctx->input.len))
1750     {
1751       reg_errcode_t err;
1752       err = extend_buffers (mctx);
1753       if (BE (err != REG_NOERROR, 0))
1754         return err;
1755     }
1756
1757   if (top < next_state_log_idx)
1758     {
1759       memset (mctx->state_log + top + 1, '\0',
1760               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1761       mctx->state_log_top = next_state_log_idx;
1762     }
1763   return REG_NOERROR;
1764 }
1765
1766 static reg_errcode_t
1767 internal_function
1768 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1769                    re_dfastate_t **src, int num)
1770 {
1771   int st_idx;
1772   reg_errcode_t err;
1773   for (st_idx = 0; st_idx < num; ++st_idx)
1774     {
1775       if (dst[st_idx] == NULL)
1776         dst[st_idx] = src[st_idx];
1777       else if (src[st_idx] != NULL)
1778         {
1779           re_node_set merged_set;
1780           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1781                                         &src[st_idx]->nodes);
1782           if (BE (err != REG_NOERROR, 0))
1783             return err;
1784           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1785           re_node_set_free (&merged_set);
1786           if (BE (err != REG_NOERROR, 0))
1787             return err;
1788         }
1789     }
1790   return REG_NOERROR;
1791 }
1792
1793 static reg_errcode_t
1794 internal_function
1795 update_cur_sifted_state (const re_match_context_t *mctx,
1796                          re_sift_context_t *sctx, int str_idx,
1797                          re_node_set *dest_nodes)
1798 {
1799   const re_dfa_t *const dfa = mctx->dfa;
1800   reg_errcode_t err = REG_NOERROR;
1801   const re_node_set *candidates;
1802   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1803                 : &mctx->state_log[str_idx]->nodes);
1804
1805   if (dest_nodes->nelem == 0)
1806     sctx->sifted_states[str_idx] = NULL;
1807   else
1808     {
1809       if (candidates)
1810         {
1811           /* At first, add the nodes which can epsilon transit to a node in
1812              DEST_NODE.  */
1813           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1814           if (BE (err != REG_NOERROR, 0))
1815             return err;
1816
1817           /* Then, check the limitations in the current sift_context.  */
1818           if (sctx->limits.nelem)
1819             {
1820               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1821                                          mctx->bkref_ents, str_idx);
1822               if (BE (err != REG_NOERROR, 0))
1823                 return err;
1824             }
1825         }
1826
1827       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1828       if (BE (err != REG_NOERROR, 0))
1829         return err;
1830     }
1831
1832   if (candidates && mctx->state_log[str_idx]->has_backref)
1833     {
1834       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1835       if (BE (err != REG_NOERROR, 0))
1836         return err;
1837     }
1838   return REG_NOERROR;
1839 }
1840
1841 static reg_errcode_t
1842 internal_function __attribute_warn_unused_result__
1843 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1844                        const re_node_set *candidates)
1845 {
1846   reg_errcode_t err = REG_NOERROR;
1847   int i;
1848
1849   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1850   if (BE (err != REG_NOERROR, 0))
1851     return err;
1852
1853   if (!state->inveclosure.alloc)
1854     {
1855       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1856       if (BE (err != REG_NOERROR, 0))
1857         return REG_ESPACE;
1858       for (i = 0; i < dest_nodes->nelem; i++)
1859         {
1860           err = re_node_set_merge (&state->inveclosure,
1861                                    dfa->inveclosures + dest_nodes->elems[i]);
1862           if (BE (err != REG_NOERROR, 0))
1863             return REG_ESPACE;
1864         }
1865     }
1866   return re_node_set_add_intersect (dest_nodes, candidates,
1867                                     &state->inveclosure);
1868 }
1869
1870 static reg_errcode_t
1871 internal_function
1872 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1873                        const re_node_set *candidates)
1874 {
1875     int ecl_idx;
1876     reg_errcode_t err;
1877     re_node_set *inv_eclosure = dfa->inveclosures + node;
1878     re_node_set except_nodes;
1879     re_node_set_init_empty (&except_nodes);
1880     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1881       {
1882         int cur_node = inv_eclosure->elems[ecl_idx];
1883         if (cur_node == node)
1884           continue;
1885         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1886           {
1887             int edst1 = dfa->edests[cur_node].elems[0];
1888             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1889                          ? dfa->edests[cur_node].elems[1] : -1);
1890             if ((!re_node_set_contains (inv_eclosure, edst1)
1891                  && re_node_set_contains (dest_nodes, edst1))
1892                 || (edst2 > 0
1893                     && !re_node_set_contains (inv_eclosure, edst2)
1894                     && re_node_set_contains (dest_nodes, edst2)))
1895               {
1896                 err = re_node_set_add_intersect (&except_nodes, candidates,
1897                                                  dfa->inveclosures + cur_node);
1898                 if (BE (err != REG_NOERROR, 0))
1899                   {
1900                     re_node_set_free (&except_nodes);
1901                     return err;
1902                   }
1903               }
1904           }
1905       }
1906     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1907       {
1908         int cur_node = inv_eclosure->elems[ecl_idx];
1909         if (!re_node_set_contains (&except_nodes, cur_node))
1910           {
1911             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1912             re_node_set_remove_at (dest_nodes, idx);
1913           }
1914       }
1915     re_node_set_free (&except_nodes);
1916     return REG_NOERROR;
1917 }
1918
1919 static int
1920 internal_function
1921 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1922                   int dst_node, int dst_idx, int src_node, int src_idx)
1923 {
1924   const re_dfa_t *const dfa = mctx->dfa;
1925   int lim_idx, src_pos, dst_pos;
1926
1927   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1928   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1929   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1930     {
1931       int subexp_idx;
1932       struct re_backref_cache_entry *ent;
1933       ent = mctx->bkref_ents + limits->elems[lim_idx];
1934       subexp_idx = dfa->nodes[ent->node].opr.idx;
1935
1936       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1937                                            subexp_idx, dst_node, dst_idx,
1938                                            dst_bkref_idx);
1939       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1940                                            subexp_idx, src_node, src_idx,
1941                                            src_bkref_idx);
1942
1943       /* In case of:
1944          <src> <dst> ( <subexp> )
1945          ( <subexp> ) <src> <dst>
1946          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1947       if (src_pos == dst_pos)
1948         continue; /* This is unrelated limitation.  */
1949       else
1950         return 1;
1951     }
1952   return 0;
1953 }
1954
1955 static int
1956 internal_function
1957 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1958                              int subexp_idx, int from_node, int bkref_idx)
1959 {
1960   const re_dfa_t *const dfa = mctx->dfa;
1961   const re_node_set *eclosures = dfa->eclosures + from_node;
1962   int node_idx;
1963
1964   /* Else, we are on the boundary: examine the nodes on the epsilon
1965      closure.  */
1966   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1967     {
1968       int node = eclosures->elems[node_idx];
1969       switch (dfa->nodes[node].type)
1970         {
1971         case OP_BACK_REF:
1972           if (bkref_idx != -1)
1973             {
1974               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1975               do
1976                 {
1977                   int dst, cpos;
1978
1979                   if (ent->node != node)
1980                     continue;
1981
1982                   if (subexp_idx < BITSET_WORD_BITS
1983                       && !(ent->eps_reachable_subexps_map
1984                            & ((bitset_word_t) 1 << subexp_idx)))
1985                     continue;
1986
1987                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1988                      OP_CLOSE_SUBEXP cases below.  But, if the
1989                      destination node is the same node as the source
1990                      node, don't recurse because it would cause an
1991                      infinite loop: a regex that exhibits this behavior
1992                      is ()\1*\1*  */
1993                   dst = dfa->edests[node].elems[0];
1994                   if (dst == from_node)
1995                     {
1996                       if (boundaries & 1)
1997                         return -1;
1998                       else /* if (boundaries & 2) */
1999                         return 0;
2000                     }
2001
2002                   cpos =
2003                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2004                                                  dst, bkref_idx);
2005                   if (cpos == -1 /* && (boundaries & 1) */)
2006                     return -1;
2007                   if (cpos == 0 && (boundaries & 2))
2008                     return 0;
2009
2010                   if (subexp_idx < BITSET_WORD_BITS)
2011                     ent->eps_reachable_subexps_map
2012                       &= ~((bitset_word_t) 1 << subexp_idx);
2013                 }
2014               while (ent++->more);
2015             }
2016           break;
2017
2018         case OP_OPEN_SUBEXP:
2019           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2020             return -1;
2021           break;
2022
2023         case OP_CLOSE_SUBEXP:
2024           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2025             return 0;
2026           break;
2027
2028         default:
2029             break;
2030         }
2031     }
2032
2033   return (boundaries & 2) ? 1 : 0;
2034 }
2035
2036 static int
2037 internal_function
2038 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2039                            int subexp_idx, int from_node, int str_idx,
2040                            int bkref_idx)
2041 {
2042   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2043   int boundaries;
2044
2045   /* If we are outside the range of the subexpression, return -1 or 1.  */
2046   if (str_idx < lim->subexp_from)
2047     return -1;
2048
2049   if (lim->subexp_to < str_idx)
2050     return 1;
2051
2052   /* If we are within the subexpression, return 0.  */
2053   boundaries = (str_idx == lim->subexp_from);
2054   boundaries |= (str_idx == lim->subexp_to) << 1;
2055   if (boundaries == 0)
2056     return 0;
2057
2058   /* Else, examine epsilon closure.  */
2059   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2060                                       from_node, bkref_idx);
2061 }
2062
2063 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2064    which are against limitations from DEST_NODES. */
2065
2066 static reg_errcode_t
2067 internal_function
2068 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2069                      const re_node_set *candidates, re_node_set *limits,
2070                      struct re_backref_cache_entry *bkref_ents, int str_idx)
2071 {
2072   reg_errcode_t err;
2073   int node_idx, lim_idx;
2074
2075   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2076     {
2077       int subexp_idx;
2078       struct re_backref_cache_entry *ent;
2079       ent = bkref_ents + limits->elems[lim_idx];
2080
2081       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2082         continue; /* This is unrelated limitation.  */
2083
2084       subexp_idx = dfa->nodes[ent->node].opr.idx;
2085       if (ent->subexp_to == str_idx)
2086         {
2087           int ops_node = -1;
2088           int cls_node = -1;
2089           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2090             {
2091               int node = dest_nodes->elems[node_idx];
2092               re_token_type_t type = dfa->nodes[node].type;
2093               if (type == OP_OPEN_SUBEXP
2094                   && subexp_idx == dfa->nodes[node].opr.idx)
2095                 ops_node = node;
2096               else if (type == OP_CLOSE_SUBEXP
2097                        && subexp_idx == dfa->nodes[node].opr.idx)
2098                 cls_node = node;
2099             }
2100
2101           /* Check the limitation of the open subexpression.  */
2102           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2103           if (ops_node >= 0)
2104             {
2105               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2106                                            candidates);
2107               if (BE (err != REG_NOERROR, 0))
2108                 return err;
2109             }
2110
2111           /* Check the limitation of the close subexpression.  */
2112           if (cls_node >= 0)
2113             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2114               {
2115                 int node = dest_nodes->elems[node_idx];
2116                 if (!re_node_set_contains (dfa->inveclosures + node,
2117                                            cls_node)
2118                     && !re_node_set_contains (dfa->eclosures + node,
2119                                               cls_node))
2120                   {
2121                     /* It is against this limitation.
2122                        Remove it form the current sifted state.  */
2123                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2124                                                  candidates);
2125                     if (BE (err != REG_NOERROR, 0))
2126                       return err;
2127                     --node_idx;
2128                   }
2129               }
2130         }
2131       else /* (ent->subexp_to != str_idx)  */
2132         {
2133           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2134             {
2135               int node = dest_nodes->elems[node_idx];
2136               re_token_type_t type = dfa->nodes[node].type;
2137               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2138                 {
2139                   if (subexp_idx != dfa->nodes[node].opr.idx)
2140                     continue;
2141                   /* It is against this limitation.
2142                      Remove it form the current sifted state.  */
2143                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2144                                                candidates);
2145                   if (BE (err != REG_NOERROR, 0))
2146                     return err;
2147                 }
2148             }
2149         }
2150     }
2151   return REG_NOERROR;
2152 }
2153
2154 static reg_errcode_t
2155 internal_function __attribute_warn_unused_result__
2156 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2157                    int str_idx, const re_node_set *candidates)
2158 {
2159   const re_dfa_t *const dfa = mctx->dfa;
2160   reg_errcode_t err;
2161   int node_idx, node;
2162   re_sift_context_t local_sctx;
2163   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2164
2165   if (first_idx == -1)
2166     return REG_NOERROR;
2167
2168   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2169
2170   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2171     {
2172       int enabled_idx;
2173       re_token_type_t type;
2174       struct re_backref_cache_entry *entry;
2175       node = candidates->elems[node_idx];
2176       type = dfa->nodes[node].type;
2177       /* Avoid infinite loop for the REs like "()\1+".  */
2178       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2179         continue;
2180       if (type != OP_BACK_REF)
2181         continue;
2182
2183       entry = mctx->bkref_ents + first_idx;
2184       enabled_idx = first_idx;
2185       do
2186         {
2187           int subexp_len;
2188           int to_idx;
2189           int dst_node;
2190           int ret;
2191           re_dfastate_t *cur_state;
2192
2193           if (entry->node != node)
2194             continue;
2195           subexp_len = entry->subexp_to - entry->subexp_from;
2196           to_idx = str_idx + subexp_len;
2197           dst_node = (subexp_len ? dfa->nexts[node]
2198                       : dfa->edests[node].elems[0]);
2199
2200           if (to_idx > sctx->last_str_idx
2201               || sctx->sifted_states[to_idx] == NULL
2202               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2203               || check_dst_limits (mctx, &sctx->limits, node,
2204                                    str_idx, dst_node, to_idx))
2205             continue;
2206
2207           if (local_sctx.sifted_states == NULL)
2208             {
2209               local_sctx = *sctx;
2210               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2211               if (BE (err != REG_NOERROR, 0))
2212                 goto free_return;
2213             }
2214           local_sctx.last_node = node;
2215           local_sctx.last_str_idx = str_idx;
2216           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2217           if (BE (ret < 0, 0))
2218             {
2219               err = REG_ESPACE;
2220               goto free_return;
2221             }
2222           cur_state = local_sctx.sifted_states[str_idx];
2223           err = sift_states_backward (mctx, &local_sctx);
2224           if (BE (err != REG_NOERROR, 0))
2225             goto free_return;
2226           if (sctx->limited_states != NULL)
2227             {
2228               err = merge_state_array (dfa, sctx->limited_states,
2229                                        local_sctx.sifted_states,
2230                                        str_idx + 1);
2231               if (BE (err != REG_NOERROR, 0))
2232                 goto free_return;
2233             }
2234           local_sctx.sifted_states[str_idx] = cur_state;
2235           re_node_set_remove (&local_sctx.limits, enabled_idx);
2236
2237           /* mctx->bkref_ents may have changed, reload the pointer.  */
2238           entry = mctx->bkref_ents + enabled_idx;
2239         }
2240       while (enabled_idx++, entry++->more);
2241     }
2242   err = REG_NOERROR;
2243  free_return:
2244   if (local_sctx.sifted_states != NULL)
2245     {
2246       re_node_set_free (&local_sctx.limits);
2247     }
2248
2249   return err;
2250 }
2251
2252
2253 #ifdef RE_ENABLE_I18N
2254 static int
2255 internal_function
2256 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2257                      int node_idx, int str_idx, int max_str_idx)
2258 {
2259   const re_dfa_t *const dfa = mctx->dfa;
2260   int naccepted;
2261   /* Check the node can accept `multi byte'.  */
2262   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2263   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2264       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2265                             dfa->nexts[node_idx]))
2266     /* The node can't accept the `multi byte', or the
2267        destination was already thrown away, then the node
2268        could't accept the current input `multi byte'.   */
2269     naccepted = 0;
2270   /* Otherwise, it is sure that the node could accept
2271      `naccepted' bytes input.  */
2272   return naccepted;
2273 }
2274 #endif /* RE_ENABLE_I18N */
2275
2276 \f
2277 /* Functions for state transition.  */
2278
2279 /* Return the next state to which the current state STATE will transit by
2280    accepting the current input byte, and update STATE_LOG if necessary.
2281    If STATE can accept a multibyte char/collating element/back reference
2282    update the destination of STATE_LOG.  */
2283
2284 static re_dfastate_t *
2285 internal_function __attribute_warn_unused_result__
2286 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2287                re_dfastate_t *state)
2288 {
2289   re_dfastate_t **trtable;
2290   unsigned char ch;
2291
2292 #ifdef RE_ENABLE_I18N
2293   /* If the current state can accept multibyte.  */
2294   if (BE (state->accept_mb, 0))
2295     {
2296       *err = transit_state_mb (mctx, state);
2297       if (BE (*err != REG_NOERROR, 0))
2298         return NULL;
2299     }
2300 #endif /* RE_ENABLE_I18N */
2301
2302   /* Then decide the next state with the single byte.  */
2303 #if 0
2304   if (0)
2305     /* don't use transition table  */
2306     return transit_state_sb (err, mctx, state);
2307 #endif
2308
2309   /* Use transition table  */
2310   ch = re_string_fetch_byte (&mctx->input);
2311   for (;;)
2312     {
2313       trtable = state->trtable;
2314       if (BE (trtable != NULL, 1))
2315         return trtable[ch];
2316
2317       trtable = state->word_trtable;
2318       if (BE (trtable != NULL, 1))
2319         {
2320           unsigned int context;
2321           context
2322             = re_string_context_at (&mctx->input,
2323                                     re_string_cur_idx (&mctx->input) - 1,
2324                                     mctx->eflags);
2325           if (IS_WORD_CONTEXT (context))
2326             return trtable[ch + SBC_MAX];
2327           else
2328             return trtable[ch];
2329         }
2330
2331       if (!build_trtable (mctx->dfa, state))
2332         {
2333           *err = REG_ESPACE;
2334           return NULL;
2335         }
2336
2337       /* Retry, we now have a transition table.  */
2338     }
2339 }
2340
2341 /* Update the state_log if we need */
2342 re_dfastate_t *
2343 internal_function
2344 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2345                       re_dfastate_t *next_state)
2346 {
2347   const re_dfa_t *const dfa = mctx->dfa;
2348   int cur_idx = re_string_cur_idx (&mctx->input);
2349
2350   if (cur_idx > mctx->state_log_top)
2351     {
2352       mctx->state_log[cur_idx] = next_state;
2353       mctx->state_log_top = cur_idx;
2354     }
2355   else if (mctx->state_log[cur_idx] == 0)
2356     {
2357       mctx->state_log[cur_idx] = next_state;
2358     }
2359   else
2360     {
2361       re_dfastate_t *pstate;
2362       unsigned int context;
2363       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2364       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2365          the destination of a multibyte char/collating element/
2366          back reference.  Then the next state is the union set of
2367          these destinations and the results of the transition table.  */
2368       pstate = mctx->state_log[cur_idx];
2369       log_nodes = pstate->entrance_nodes;
2370       if (next_state != NULL)
2371         {
2372           table_nodes = next_state->entrance_nodes;
2373           *err = re_node_set_init_union (&next_nodes, table_nodes,
2374                                              log_nodes);
2375           if (BE (*err != REG_NOERROR, 0))
2376             return NULL;
2377         }
2378       else
2379         next_nodes = *log_nodes;
2380       /* Note: We already add the nodes of the initial state,
2381          then we don't need to add them here.  */
2382
2383       context = re_string_context_at (&mctx->input,
2384                                       re_string_cur_idx (&mctx->input) - 1,
2385                                       mctx->eflags);
2386       next_state = mctx->state_log[cur_idx]
2387         = re_acquire_state_context (err, dfa, &next_nodes, context);
2388       /* We don't need to check errors here, since the return value of
2389          this function is next_state and ERR is already set.  */
2390
2391       if (table_nodes != NULL)
2392         re_node_set_free (&next_nodes);
2393     }
2394
2395   if (BE (dfa->nbackref, 0) && next_state != NULL)
2396     {
2397       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2398          later.  We must check them here, since the back references in the
2399          next state might use them.  */
2400       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2401                                         cur_idx);
2402       if (BE (*err != REG_NOERROR, 0))
2403         return NULL;
2404
2405       /* If the next state has back references.  */
2406       if (next_state->has_backref)
2407         {
2408           *err = transit_state_bkref (mctx, &next_state->nodes);
2409           if (BE (*err != REG_NOERROR, 0))
2410             return NULL;
2411           next_state = mctx->state_log[cur_idx];
2412         }
2413     }
2414
2415   return next_state;
2416 }
2417
2418 /* Skip bytes in the input that correspond to part of a
2419    multi-byte match, then look in the log for a state
2420    from which to restart matching.  */
2421 re_dfastate_t *
2422 internal_function
2423 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2424 {
2425   re_dfastate_t *cur_state;
2426   do
2427     {
2428       int max = mctx->state_log_top;
2429       int cur_str_idx = re_string_cur_idx (&mctx->input);
2430
2431       do
2432         {
2433           if (++cur_str_idx > max)
2434             return NULL;
2435           re_string_skip_bytes (&mctx->input, 1);
2436         }
2437       while (mctx->state_log[cur_str_idx] == NULL);
2438
2439       cur_state = merge_state_with_log (err, mctx, NULL);
2440     }
2441   while (*err == REG_NOERROR && cur_state == NULL);
2442   return cur_state;
2443 }
2444
2445 /* Helper functions for transit_state.  */
2446
2447 /* From the node set CUR_NODES, pick up the nodes whose types are
2448    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2449    expression. And register them to use them later for evaluating the
2450    correspoding back references.  */
2451
2452 static reg_errcode_t
2453 internal_function
2454 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2455                            int str_idx)
2456 {
2457   const re_dfa_t *const dfa = mctx->dfa;
2458   int node_idx;
2459   reg_errcode_t err;
2460
2461   /* TODO: This isn't efficient.
2462            Because there might be more than one nodes whose types are
2463            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2464            nodes.
2465            E.g. RE: (a){2}  */
2466   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2467     {
2468       int node = cur_nodes->elems[node_idx];
2469       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2470           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2471           && (dfa->used_bkref_map
2472               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2473         {
2474           err = match_ctx_add_subtop (mctx, node, str_idx);
2475           if (BE (err != REG_NOERROR, 0))
2476             return err;
2477         }
2478     }
2479   return REG_NOERROR;
2480 }
2481
2482 #if 0
2483 /* Return the next state to which the current state STATE will transit by
2484    accepting the current input byte.  */
2485
2486 static re_dfastate_t *
2487 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2488                   re_dfastate_t *state)
2489 {
2490   const re_dfa_t *const dfa = mctx->dfa;
2491   re_node_set next_nodes;
2492   re_dfastate_t *next_state;
2493   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2494   unsigned int context;
2495
2496   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2497   if (BE (*err != REG_NOERROR, 0))
2498     return NULL;
2499   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2500     {
2501       int cur_node = state->nodes.elems[node_cnt];
2502       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2503         {
2504           *err = re_node_set_merge (&next_nodes,
2505                                     dfa->eclosures + dfa->nexts[cur_node]);
2506           if (BE (*err != REG_NOERROR, 0))
2507             {
2508               re_node_set_free (&next_nodes);
2509               return NULL;
2510             }
2511         }
2512     }
2513   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2514   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2515   /* We don't need to check errors here, since the return value of
2516      this function is next_state and ERR is already set.  */
2517
2518   re_node_set_free (&next_nodes);
2519   re_string_skip_bytes (&mctx->input, 1);
2520   return next_state;
2521 }
2522 #endif
2523
2524 #ifdef RE_ENABLE_I18N
2525 static reg_errcode_t
2526 internal_function
2527 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2528 {
2529   const re_dfa_t *const dfa = mctx->dfa;
2530   reg_errcode_t err;
2531   int i;
2532
2533   for (i = 0; i < pstate->nodes.nelem; ++i)
2534     {
2535       re_node_set dest_nodes, *new_nodes;
2536       int cur_node_idx = pstate->nodes.elems[i];
2537       int naccepted, dest_idx;
2538       unsigned int context;
2539       re_dfastate_t *dest_state;
2540
2541       if (!dfa->nodes[cur_node_idx].accept_mb)
2542         continue;
2543
2544       if (dfa->nodes[cur_node_idx].constraint)
2545         {
2546           context = re_string_context_at (&mctx->input,
2547                                           re_string_cur_idx (&mctx->input),
2548                                           mctx->eflags);
2549           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2550                                            context))
2551             continue;
2552         }
2553
2554       /* How many bytes the node can accept?  */
2555       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2556                                            re_string_cur_idx (&mctx->input));
2557       if (naccepted == 0)
2558         continue;
2559
2560       /* The node can accepts `naccepted' bytes.  */
2561       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2562       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2563                                : mctx->max_mb_elem_len);
2564       err = clean_state_log_if_needed (mctx, dest_idx);
2565       if (BE (err != REG_NOERROR, 0))
2566         return err;
2567 #ifdef DEBUG
2568       assert (dfa->nexts[cur_node_idx] != -1);
2569 #endif
2570       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2571
2572       dest_state = mctx->state_log[dest_idx];
2573       if (dest_state == NULL)
2574         dest_nodes = *new_nodes;
2575       else
2576         {
2577           err = re_node_set_init_union (&dest_nodes,
2578                                         dest_state->entrance_nodes, new_nodes);
2579           if (BE (err != REG_NOERROR, 0))
2580             return err;
2581         }
2582       context = re_string_context_at (&mctx->input, dest_idx - 1,
2583                                       mctx->eflags);
2584       mctx->state_log[dest_idx]
2585         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2586       if (dest_state != NULL)
2587         re_node_set_free (&dest_nodes);
2588       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2589         return err;
2590     }
2591   return REG_NOERROR;
2592 }
2593 #endif /* RE_ENABLE_I18N */
2594
2595 static reg_errcode_t
2596 internal_function
2597 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2598 {
2599   const re_dfa_t *const dfa = mctx->dfa;
2600   reg_errcode_t err;
2601   int i;
2602   int cur_str_idx = re_string_cur_idx (&mctx->input);
2603
2604   for (i = 0; i < nodes->nelem; ++i)
2605     {
2606       int dest_str_idx, prev_nelem, bkc_idx;
2607       int node_idx = nodes->elems[i];
2608       unsigned int context;
2609       const re_token_t *node = dfa->nodes + node_idx;
2610       re_node_set *new_dest_nodes;
2611
2612       /* Check whether `node' is a backreference or not.  */
2613       if (node->type != OP_BACK_REF)
2614         continue;
2615
2616       if (node->constraint)
2617         {
2618           context = re_string_context_at (&mctx->input, cur_str_idx,
2619                                           mctx->eflags);
2620           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2621             continue;
2622         }
2623
2624       /* `node' is a backreference.
2625          Check the substring which the substring matched.  */
2626       bkc_idx = mctx->nbkref_ents;
2627       err = get_subexp (mctx, node_idx, cur_str_idx);
2628       if (BE (err != REG_NOERROR, 0))
2629         goto free_return;
2630
2631       /* And add the epsilon closures (which is `new_dest_nodes') of
2632          the backreference to appropriate state_log.  */
2633 #ifdef DEBUG
2634       assert (dfa->nexts[node_idx] != -1);
2635 #endif
2636       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2637         {
2638           int subexp_len;
2639           re_dfastate_t *dest_state;
2640           struct re_backref_cache_entry *bkref_ent;
2641           bkref_ent = mctx->bkref_ents + bkc_idx;
2642           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2643             continue;
2644           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2645           new_dest_nodes = (subexp_len == 0
2646                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2647                             : dfa->eclosures + dfa->nexts[node_idx]);
2648           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2649                           - bkref_ent->subexp_from);
2650           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2651                                           mctx->eflags);
2652           dest_state = mctx->state_log[dest_str_idx];
2653           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2654                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2655           /* Add `new_dest_node' to state_log.  */
2656           if (dest_state == NULL)
2657             {
2658               mctx->state_log[dest_str_idx]
2659                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2660                                             context);
2661               if (BE (mctx->state_log[dest_str_idx] == NULL
2662                       && err != REG_NOERROR, 0))
2663                 goto free_return;
2664             }
2665           else
2666             {
2667               re_node_set dest_nodes;
2668               err = re_node_set_init_union (&dest_nodes,
2669                                             dest_state->entrance_nodes,
2670                                             new_dest_nodes);
2671               if (BE (err != REG_NOERROR, 0))
2672                 {
2673                   re_node_set_free (&dest_nodes);
2674                   goto free_return;
2675                 }
2676               mctx->state_log[dest_str_idx]
2677                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2678               re_node_set_free (&dest_nodes);
2679               if (BE (mctx->state_log[dest_str_idx] == NULL
2680                       && err != REG_NOERROR, 0))
2681                 goto free_return;
2682             }
2683           /* We need to check recursively if the backreference can epsilon
2684              transit.  */
2685           if (subexp_len == 0
2686               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2687             {
2688               err = check_subexp_matching_top (mctx, new_dest_nodes,
2689                                                cur_str_idx);
2690               if (BE (err != REG_NOERROR, 0))
2691                 goto free_return;
2692               err = transit_state_bkref (mctx, new_dest_nodes);
2693               if (BE (err != REG_NOERROR, 0))
2694                 goto free_return;
2695             }
2696         }
2697     }
2698   err = REG_NOERROR;
2699  free_return:
2700   return err;
2701 }
2702
2703 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2704    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2705    Note that we might collect inappropriate candidates here.
2706    However, the cost of checking them strictly here is too high, then we
2707    delay these checking for prune_impossible_nodes().  */
2708
2709 static reg_errcode_t
2710 internal_function __attribute_warn_unused_result__
2711 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2712 {
2713   const re_dfa_t *const dfa = mctx->dfa;
2714   int subexp_num, sub_top_idx;
2715   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2716   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2717   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2718   if (cache_idx != -1)
2719     {
2720       const struct re_backref_cache_entry *entry
2721         = mctx->bkref_ents + cache_idx;
2722       do
2723         if (entry->node == bkref_node)
2724           return REG_NOERROR; /* We already checked it.  */
2725       while (entry++->more);
2726     }
2727
2728   subexp_num = dfa->nodes[bkref_node].opr.idx;
2729
2730   /* For each sub expression  */
2731   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2732     {
2733       reg_errcode_t err;
2734       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2735       re_sub_match_last_t *sub_last;
2736       int sub_last_idx, sl_str, bkref_str_off;
2737
2738       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2739         continue; /* It isn't related.  */
2740
2741       sl_str = sub_top->str_idx;
2742       bkref_str_off = bkref_str_idx;
2743       /* At first, check the last node of sub expressions we already
2744          evaluated.  */
2745       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2746         {
2747           int sl_str_diff;
2748           sub_last = sub_top->lasts[sub_last_idx];
2749           sl_str_diff = sub_last->str_idx - sl_str;
2750           /* The matched string by the sub expression match with the substring
2751              at the back reference?  */
2752           if (sl_str_diff > 0)
2753             {
2754               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2755                 {
2756                   /* Not enough chars for a successful match.  */
2757                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2758                     break;
2759
2760                   err = clean_state_log_if_needed (mctx,
2761                                                    bkref_str_off
2762                                                    + sl_str_diff);
2763                   if (BE (err != REG_NOERROR, 0))
2764                     return err;
2765                   buf = (const char *) re_string_get_buffer (&mctx->input);
2766                 }
2767               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2768                 /* We don't need to search this sub expression any more.  */
2769                 break;
2770             }
2771           bkref_str_off += sl_str_diff;
2772           sl_str += sl_str_diff;
2773           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2774                                 bkref_str_idx);
2775
2776           /* Reload buf, since the preceding call might have reallocated
2777              the buffer.  */
2778           buf = (const char *) re_string_get_buffer (&mctx->input);
2779
2780           if (err == REG_NOMATCH)
2781             continue;
2782           if (BE (err != REG_NOERROR, 0))
2783             return err;
2784         }
2785
2786       if (sub_last_idx < sub_top->nlasts)
2787         continue;
2788       if (sub_last_idx > 0)
2789         ++sl_str;
2790       /* Then, search for the other last nodes of the sub expression.  */
2791       for (; sl_str <= bkref_str_idx; ++sl_str)
2792         {
2793           int cls_node, sl_str_off;
2794           const re_node_set *nodes;
2795           sl_str_off = sl_str - sub_top->str_idx;
2796           /* The matched string by the sub expression match with the substring
2797              at the back reference?  */
2798           if (sl_str_off > 0)
2799             {
2800               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2801                 {
2802                   /* If we are at the end of the input, we cannot match.  */
2803                   if (bkref_str_off >= mctx->input.len)
2804                     break;
2805
2806                   err = extend_buffers (mctx);
2807                   if (BE (err != REG_NOERROR, 0))
2808                     return err;
2809
2810                   buf = (const char *) re_string_get_buffer (&mctx->input);
2811                 }
2812               if (buf [bkref_str_off++] != buf[sl_str - 1])
2813                 break; /* We don't need to search this sub expression
2814                           any more.  */
2815             }
2816           if (mctx->state_log[sl_str] == NULL)
2817             continue;
2818           /* Does this state have a ')' of the sub expression?  */
2819           nodes = &mctx->state_log[sl_str]->nodes;
2820           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2821                                        OP_CLOSE_SUBEXP);
2822           if (cls_node == -1)
2823             continue; /* No.  */
2824           if (sub_top->path == NULL)
2825             {
2826               sub_top->path = calloc (sizeof (state_array_t),
2827                                       sl_str - sub_top->str_idx + 1);
2828               if (sub_top->path == NULL)
2829                 return REG_ESPACE;
2830             }
2831           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2832              in the current context?  */
2833           err = check_arrival (mctx, sub_top->path, sub_top->node,
2834                                sub_top->str_idx, cls_node, sl_str,
2835                                OP_CLOSE_SUBEXP);
2836           if (err == REG_NOMATCH)
2837               continue;
2838           if (BE (err != REG_NOERROR, 0))
2839               return err;
2840           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2841           if (BE (sub_last == NULL, 0))
2842             return REG_ESPACE;
2843           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2844                                 bkref_str_idx);
2845           if (err == REG_NOMATCH)
2846             continue;
2847         }
2848     }
2849   return REG_NOERROR;
2850 }
2851
2852 /* Helper functions for get_subexp().  */
2853
2854 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2855    If it can arrive, register the sub expression expressed with SUB_TOP
2856    and SUB_LAST.  */
2857
2858 static reg_errcode_t
2859 internal_function
2860 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2861                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2862 {
2863   reg_errcode_t err;
2864   int to_idx;
2865   /* Can the subexpression arrive the back reference?  */
2866   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2867                        sub_last->str_idx, bkref_node, bkref_str,
2868                        OP_OPEN_SUBEXP);
2869   if (err != REG_NOERROR)
2870     return err;
2871   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2872                              sub_last->str_idx);
2873   if (BE (err != REG_NOERROR, 0))
2874     return err;
2875   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2876   return clean_state_log_if_needed (mctx, to_idx);
2877 }
2878
2879 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2880    Search '(' if FL_OPEN, or search ')' otherwise.
2881    TODO: This function isn't efficient...
2882          Because there might be more than one nodes whose types are
2883          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2884          nodes.
2885          E.g. RE: (a){2}  */
2886
2887 static int
2888 internal_function
2889 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2890                   int subexp_idx, int type)
2891 {
2892   int cls_idx;
2893   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2894     {
2895       int cls_node = nodes->elems[cls_idx];
2896       const re_token_t *node = dfa->nodes + cls_node;
2897       if (node->type == type
2898           && node->opr.idx == subexp_idx)
2899         return cls_node;
2900     }
2901   return -1;
2902 }
2903
2904 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2905    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2906    heavily reused.
2907    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2908
2909 static reg_errcode_t
2910 internal_function __attribute_warn_unused_result__
2911 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2912                int top_str, int last_node, int last_str, int type)
2913 {
2914   const re_dfa_t *const dfa = mctx->dfa;
2915   reg_errcode_t err = REG_NOERROR;
2916   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2917   re_dfastate_t *cur_state = NULL;
2918   re_node_set *cur_nodes, next_nodes;
2919   re_dfastate_t **backup_state_log;
2920   unsigned int context;
2921
2922   subexp_num = dfa->nodes[top_node].opr.idx;
2923   /* Extend the buffer if we need.  */
2924   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2925     {
2926       re_dfastate_t **new_array;
2927       int old_alloc = path->alloc;
2928       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2929       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2930       if (BE (new_array == NULL, 0))
2931         {
2932           path->alloc = old_alloc;
2933           return REG_ESPACE;
2934         }
2935       path->array = new_array;
2936       memset (new_array + old_alloc, '\0',
2937               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2938     }
2939
2940   str_idx = path->next_idx ? path->next_idx : top_str;
2941
2942   /* Temporary modify MCTX.  */
2943   backup_state_log = mctx->state_log;
2944   backup_cur_idx = mctx->input.cur_idx;
2945   mctx->state_log = path->array;
2946   mctx->input.cur_idx = str_idx;
2947
2948   /* Setup initial node set.  */
2949   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2950   if (str_idx == top_str)
2951     {
2952       err = re_node_set_init_1 (&next_nodes, top_node);
2953       if (BE (err != REG_NOERROR, 0))
2954         return err;
2955       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2956       if (BE (err != REG_NOERROR, 0))
2957         {
2958           re_node_set_free (&next_nodes);
2959           return err;
2960         }
2961     }
2962   else
2963     {
2964       cur_state = mctx->state_log[str_idx];
2965       if (cur_state && cur_state->has_backref)
2966         {
2967           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2968           if (BE (err != REG_NOERROR, 0))
2969             return err;
2970         }
2971       else
2972         re_node_set_init_empty (&next_nodes);
2973     }
2974   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2975     {
2976       if (next_nodes.nelem)
2977         {
2978           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2979                                     subexp_num, type);
2980           if (BE (err != REG_NOERROR, 0))
2981             {
2982               re_node_set_free (&next_nodes);
2983               return err;
2984             }
2985         }
2986       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2987       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2988         {
2989           re_node_set_free (&next_nodes);
2990           return err;
2991         }
2992       mctx->state_log[str_idx] = cur_state;
2993     }
2994
2995   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2996     {
2997       re_node_set_empty (&next_nodes);
2998       if (mctx->state_log[str_idx + 1])
2999         {
3000           err = re_node_set_merge (&next_nodes,
3001                                    &mctx->state_log[str_idx + 1]->nodes);
3002           if (BE (err != REG_NOERROR, 0))
3003             {
3004               re_node_set_free (&next_nodes);
3005               return err;
3006             }
3007         }
3008       if (cur_state)
3009         {
3010           err = check_arrival_add_next_nodes (mctx, str_idx,
3011                                               &cur_state->non_eps_nodes,
3012                                               &next_nodes);
3013           if (BE (err != REG_NOERROR, 0))
3014             {
3015               re_node_set_free (&next_nodes);
3016               return err;
3017             }
3018         }
3019       ++str_idx;
3020       if (next_nodes.nelem)
3021         {
3022           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3023           if (BE (err != REG_NOERROR, 0))
3024             {
3025               re_node_set_free (&next_nodes);
3026               return err;
3027             }
3028           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3029                                     subexp_num, type);
3030           if (BE (err != REG_NOERROR, 0))
3031             {
3032               re_node_set_free (&next_nodes);
3033               return err;
3034             }
3035         }
3036       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3037       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3038       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3039         {
3040           re_node_set_free (&next_nodes);
3041           return err;
3042         }
3043       mctx->state_log[str_idx] = cur_state;
3044       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3045     }
3046   re_node_set_free (&next_nodes);
3047   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3048                : &mctx->state_log[last_str]->nodes);
3049   path->next_idx = str_idx;
3050
3051   /* Fix MCTX.  */
3052   mctx->state_log = backup_state_log;
3053   mctx->input.cur_idx = backup_cur_idx;
3054
3055   /* Then check the current node set has the node LAST_NODE.  */
3056   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3057     return REG_NOERROR;
3058
3059   return REG_NOMATCH;
3060 }
3061
3062 /* Helper functions for check_arrival.  */
3063
3064 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3065    to NEXT_NODES.
3066    TODO: This function is similar to the functions transit_state*(),
3067          however this function has many additional works.
3068          Can't we unify them?  */
3069
3070 static reg_errcode_t
3071 internal_function __attribute_warn_unused_result__
3072 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3073                               re_node_set *cur_nodes, re_node_set *next_nodes)
3074 {
3075   const re_dfa_t *const dfa = mctx->dfa;
3076   int result;
3077   int cur_idx;
3078 #ifdef RE_ENABLE_I18N
3079   reg_errcode_t err = REG_NOERROR;
3080 #endif
3081   re_node_set union_set;
3082   re_node_set_init_empty (&union_set);
3083   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3084     {
3085       int naccepted = 0;
3086       int cur_node = cur_nodes->elems[cur_idx];
3087 #ifdef DEBUG
3088       re_token_type_t type = dfa->nodes[cur_node].type;
3089       assert (!IS_EPSILON_NODE (type));
3090 #endif
3091 #ifdef RE_ENABLE_I18N
3092       /* If the node may accept `multi byte'.  */
3093       if (dfa->nodes[cur_node].accept_mb)
3094         {
3095           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3096                                                str_idx);
3097           if (naccepted > 1)
3098             {
3099               re_dfastate_t *dest_state;
3100               int next_node = dfa->nexts[cur_node];
3101               int next_idx = str_idx + naccepted;
3102               dest_state = mctx->state_log[next_idx];
3103               re_node_set_empty (&union_set);
3104               if (dest_state)
3105                 {
3106                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3107                   if (BE (err != REG_NOERROR, 0))
3108                     {
3109                       re_node_set_free (&union_set);
3110                       return err;
3111                     }
3112                 }
3113               result = re_node_set_insert (&union_set, next_node);
3114               if (BE (result < 0, 0))
3115                 {
3116                   re_node_set_free (&union_set);
3117                   return REG_ESPACE;
3118                 }
3119               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3120                                                             &union_set);
3121               if (BE (mctx->state_log[next_idx] == NULL
3122                       && err != REG_NOERROR, 0))
3123                 {
3124                   re_node_set_free (&union_set);
3125                   return err;
3126                 }
3127             }
3128         }
3129 #endif /* RE_ENABLE_I18N */
3130       if (naccepted
3131           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3132         {
3133           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3134           if (BE (result < 0, 0))
3135             {
3136               re_node_set_free (&union_set);
3137               return REG_ESPACE;
3138             }
3139         }
3140     }
3141   re_node_set_free (&union_set);
3142   return REG_NOERROR;
3143 }
3144
3145 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3146    CUR_NODES, however exclude the nodes which are:
3147     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3148     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3149 */
3150
3151 static reg_errcode_t
3152 internal_function
3153 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3154                           int ex_subexp, int type)
3155 {
3156   reg_errcode_t err;
3157   int idx, outside_node;
3158   re_node_set new_nodes;
3159 #ifdef DEBUG
3160   assert (cur_nodes->nelem);
3161 #endif
3162   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3163   if (BE (err != REG_NOERROR, 0))
3164     return err;
3165   /* Create a new node set NEW_NODES with the nodes which are epsilon
3166      closures of the node in CUR_NODES.  */
3167
3168   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3169     {
3170       int cur_node = cur_nodes->elems[idx];
3171       const re_node_set *eclosure = dfa->eclosures + cur_node;
3172       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3173       if (outside_node == -1)
3174         {
3175           /* There are no problematic nodes, just merge them.  */
3176           err = re_node_set_merge (&new_nodes, eclosure);
3177           if (BE (err != REG_NOERROR, 0))
3178             {
3179               re_node_set_free (&new_nodes);
3180               return err;
3181             }
3182         }
3183       else
3184         {
3185           /* There are problematic nodes, re-calculate incrementally.  */
3186           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3187                                               ex_subexp, type);
3188           if (BE (err != REG_NOERROR, 0))
3189             {
3190               re_node_set_free (&new_nodes);
3191               return err;
3192             }
3193         }
3194     }
3195   re_node_set_free (cur_nodes);
3196   *cur_nodes = new_nodes;
3197   return REG_NOERROR;
3198 }
3199
3200 /* Helper function for check_arrival_expand_ecl.
3201    Check incrementally the epsilon closure of TARGET, and if it isn't
3202    problematic append it to DST_NODES.  */
3203
3204 static reg_errcode_t
3205 internal_function __attribute_warn_unused_result__
3206 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3207                               int target, int ex_subexp, int type)
3208 {
3209   int cur_node;
3210   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3211     {
3212       int err;
3213
3214       if (dfa->nodes[cur_node].type == type
3215           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3216         {
3217           if (type == OP_CLOSE_SUBEXP)
3218             {
3219               err = re_node_set_insert (dst_nodes, cur_node);
3220               if (BE (err == -1, 0))
3221                 return REG_ESPACE;
3222             }
3223           break;
3224         }
3225       err = re_node_set_insert (dst_nodes, cur_node);
3226       if (BE (err == -1, 0))
3227         return REG_ESPACE;
3228       if (dfa->edests[cur_node].nelem == 0)
3229         break;
3230       if (dfa->edests[cur_node].nelem == 2)
3231         {
3232           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3233                                               dfa->edests[cur_node].elems[1],
3234                                               ex_subexp, type);
3235           if (BE (err != REG_NOERROR, 0))
3236             return err;
3237         }
3238       cur_node = dfa->edests[cur_node].elems[0];
3239     }
3240   return REG_NOERROR;
3241 }
3242
3243
3244 /* For all the back references in the current state, calculate the
3245    destination of the back references by the appropriate entry
3246    in MCTX->BKREF_ENTS.  */
3247
3248 static reg_errcode_t
3249 internal_function __attribute_warn_unused_result__
3250 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3251                     int cur_str, int subexp_num, int type)
3252 {
3253   const re_dfa_t *const dfa = mctx->dfa;
3254   reg_errcode_t err;
3255   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3256   struct re_backref_cache_entry *ent;
3257
3258   if (cache_idx_start == -1)
3259     return REG_NOERROR;
3260
3261  restart:
3262   ent = mctx->bkref_ents + cache_idx_start;
3263   do
3264     {
3265       int to_idx, next_node;
3266
3267       /* Is this entry ENT is appropriate?  */
3268       if (!re_node_set_contains (cur_nodes, ent->node))
3269         continue; /* No.  */
3270
3271       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3272       /* Calculate the destination of the back reference, and append it
3273          to MCTX->STATE_LOG.  */
3274       if (to_idx == cur_str)
3275         {
3276           /* The backreference did epsilon transit, we must re-check all the
3277              node in the current state.  */
3278           re_node_set new_dests;
3279           reg_errcode_t err2, err3;
3280           next_node = dfa->edests[ent->node].elems[0];
3281           if (re_node_set_contains (cur_nodes, next_node))
3282             continue;
3283           err = re_node_set_init_1 (&new_dests, next_node);
3284           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3285           err3 = re_node_set_merge (cur_nodes, &new_dests);
3286           re_node_set_free (&new_dests);
3287           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3288                   || err3 != REG_NOERROR, 0))
3289             {
3290               err = (err != REG_NOERROR ? err
3291                      : (err2 != REG_NOERROR ? err2 : err3));
3292               return err;
3293             }
3294           /* TODO: It is still inefficient...  */
3295           goto restart;
3296         }
3297       else
3298         {
3299           re_node_set union_set;
3300           next_node = dfa->nexts[ent->node];
3301           if (mctx->state_log[to_idx])
3302             {
3303               int ret;
3304               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3305                                         next_node))
3306                 continue;
3307               err = re_node_set_init_copy (&union_set,
3308                                            &mctx->state_log[to_idx]->nodes);
3309               ret = re_node_set_insert (&union_set, next_node);
3310               if (BE (err != REG_NOERROR || ret < 0, 0))
3311                 {
3312                   re_node_set_free (&union_set);
3313                   err = err != REG_NOERROR ? err : REG_ESPACE;
3314                   return err;
3315                 }
3316             }
3317           else
3318             {
3319               err = re_node_set_init_1 (&union_set, next_node);
3320               if (BE (err != REG_NOERROR, 0))
3321                 return err;
3322             }
3323           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3324           re_node_set_free (&union_set);
3325           if (BE (mctx->state_log[to_idx] == NULL
3326                   && err != REG_NOERROR, 0))
3327             return err;
3328         }
3329     }
3330   while (ent++->more);
3331   return REG_NOERROR;
3332 }
3333
3334 /* Build transition table for the state.
3335    Return 1 if succeeded, otherwise return NULL.  */
3336
3337 static int
3338 internal_function
3339 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3340 {
3341   reg_errcode_t err;
3342   int i, j, ch, need_word_trtable = 0;
3343   bitset_word_t elem, mask;
3344   bool dests_node_malloced = false;
3345   bool dest_states_malloced = false;
3346   int ndests; /* Number of the destination states from `state'.  */
3347   re_dfastate_t **trtable;
3348   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3349   re_node_set follows, *dests_node;
3350   bitset_t *dests_ch;
3351   bitset_t acceptable;
3352
3353   struct dests_alloc
3354   {
3355     re_node_set dests_node[SBC_MAX];
3356     bitset_t dests_ch[SBC_MAX];
3357   } *dests_alloc;
3358
3359   /* We build DFA states which corresponds to the destination nodes
3360      from `state'.  `dests_node[i]' represents the nodes which i-th
3361      destination state contains, and `dests_ch[i]' represents the
3362      characters which i-th destination state accepts.  */
3363 #ifdef HAVE_ALLOCA
3364   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3365     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3366   else
3367 #endif
3368     {
3369       dests_alloc = re_malloc (struct dests_alloc, 1);
3370       if (BE (dests_alloc == NULL, 0))
3371         return 0;
3372       dests_node_malloced = true;
3373     }
3374   dests_node = dests_alloc->dests_node;
3375   dests_ch = dests_alloc->dests_ch;
3376
3377   /* Initialize transiton table.  */
3378   state->word_trtable = state->trtable = NULL;
3379
3380   /* At first, group all nodes belonging to `state' into several
3381      destinations.  */
3382   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3383   if (BE (ndests <= 0, 0))
3384     {
3385       if (dests_node_malloced)
3386         free (dests_alloc);
3387       /* Return 0 in case of an error, 1 otherwise.  */
3388       if (ndests == 0)
3389         {
3390           state->trtable = (re_dfastate_t **)
3391             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3392           if (BE (state->trtable == NULL, 0))
3393             return 0;
3394           return 1;
3395         }
3396       return 0;
3397     }
3398
3399   err = re_node_set_alloc (&follows, ndests + 1);
3400   if (BE (err != REG_NOERROR, 0))
3401     goto out_free;
3402
3403   /* Avoid arithmetic overflow in size calculation.  */
3404   if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3405             / (3 * sizeof (re_dfastate_t *)))
3406            < ndests),
3407           0))
3408     goto out_free;
3409
3410 #ifdef HAVE_ALLOCA
3411   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3412                          + ndests * 3 * sizeof (re_dfastate_t *)))
3413     dest_states = (re_dfastate_t **)
3414       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3415   else
3416 #endif
3417     {
3418       dest_states = (re_dfastate_t **)
3419         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3420       if (BE (dest_states == NULL, 0))
3421         {
3422 out_free:
3423           if (dest_states_malloced)
3424             free (dest_states);
3425           re_node_set_free (&follows);
3426           for (i = 0; i < ndests; ++i)
3427             re_node_set_free (dests_node + i);
3428           if (dests_node_malloced)
3429             free (dests_alloc);
3430           return 0;
3431         }
3432       dest_states_malloced = true;
3433     }
3434   dest_states_word = dest_states + ndests;
3435   dest_states_nl = dest_states_word + ndests;
3436   bitset_empty (acceptable);
3437
3438   /* Then build the states for all destinations.  */
3439   for (i = 0; i < ndests; ++i)
3440     {
3441       int next_node;
3442       re_node_set_empty (&follows);
3443       /* Merge the follows of this destination states.  */
3444       for (j = 0; j < dests_node[i].nelem; ++j)
3445         {
3446           next_node = dfa->nexts[dests_node[i].elems[j]];
3447           if (next_node != -1)
3448             {
3449               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3450               if (BE (err != REG_NOERROR, 0))
3451                 goto out_free;
3452             }
3453         }
3454       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3455       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3456         goto out_free;
3457       /* If the new state has context constraint,
3458          build appropriate states for these contexts.  */
3459       if (dest_states[i]->has_constraint)
3460         {
3461           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3462                                                           CONTEXT_WORD);
3463           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3464             goto out_free;
3465
3466           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3467             need_word_trtable = 1;
3468
3469           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3470                                                         CONTEXT_NEWLINE);
3471           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3472             goto out_free;
3473         }
3474       else
3475         {
3476           dest_states_word[i] = dest_states[i];
3477           dest_states_nl[i] = dest_states[i];
3478         }
3479       bitset_merge (acceptable, dests_ch[i]);
3480     }
3481
3482   if (!BE (need_word_trtable, 0))
3483     {
3484       /* We don't care about whether the following character is a word
3485          character, or we are in a single-byte character set so we can
3486          discern by looking at the character code: allocate a
3487          256-entry transition table.  */
3488       trtable = state->trtable =
3489         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3490       if (BE (trtable == NULL, 0))
3491         goto out_free;
3492
3493       /* For all characters ch...:  */
3494       for (i = 0; i < BITSET_WORDS; ++i)
3495         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3496              elem;
3497              mask <<= 1, elem >>= 1, ++ch)
3498           if (BE (elem & 1, 0))
3499             {
3500               /* There must be exactly one destination which accepts
3501                  character ch.  See group_nodes_into_DFAstates.  */
3502               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3503                 ;
3504
3505               /* j-th destination accepts the word character ch.  */
3506               if (dfa->word_char[i] & mask)
3507                 trtable[ch] = dest_states_word[j];
3508               else
3509                 trtable[ch] = dest_states[j];
3510             }
3511     }
3512   else
3513     {
3514       /* We care about whether the following character is a word
3515          character, and we are in a multi-byte character set: discern
3516          by looking at the character code: build two 256-entry
3517          transition tables, one starting at trtable[0] and one
3518          starting at trtable[SBC_MAX].  */
3519       trtable = state->word_trtable =
3520         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3521       if (BE (trtable == NULL, 0))
3522         goto out_free;
3523
3524       /* For all characters ch...:  */
3525       for (i = 0; i < BITSET_WORDS; ++i)
3526         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3527              elem;
3528              mask <<= 1, elem >>= 1, ++ch)
3529           if (BE (elem & 1, 0))
3530             {
3531               /* There must be exactly one destination which accepts
3532                  character ch.  See group_nodes_into_DFAstates.  */
3533               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3534                 ;
3535
3536               /* j-th destination accepts the word character ch.  */
3537               trtable[ch] = dest_states[j];
3538               trtable[ch + SBC_MAX] = dest_states_word[j];
3539             }
3540     }
3541
3542   /* new line */
3543   if (bitset_contain (acceptable, NEWLINE_CHAR))
3544     {
3545       /* The current state accepts newline character.  */
3546       for (j = 0; j < ndests; ++j)
3547         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3548           {
3549             /* k-th destination accepts newline character.  */
3550             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3551             if (need_word_trtable)
3552               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3553             /* There must be only one destination which accepts
3554                newline.  See group_nodes_into_DFAstates.  */
3555             break;
3556           }
3557     }
3558
3559   if (dest_states_malloced)
3560     free (dest_states);
3561
3562   re_node_set_free (&follows);
3563   for (i = 0; i < ndests; ++i)
3564     re_node_set_free (dests_node + i);
3565
3566   if (dests_node_malloced)
3567     free (dests_alloc);
3568
3569   return 1;
3570 }
3571
3572 /* Group all nodes belonging to STATE into several destinations.
3573    Then for all destinations, set the nodes belonging to the destination
3574    to DESTS_NODE[i] and set the characters accepted by the destination
3575    to DEST_CH[i].  This function return the number of destinations.  */
3576
3577 static int
3578 internal_function
3579 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3580                             re_node_set *dests_node, bitset_t *dests_ch)
3581 {
3582   reg_errcode_t err;
3583   int result;
3584   int i, j, k;
3585   int ndests; /* Number of the destinations from `state'.  */
3586   bitset_t accepts; /* Characters a node can accept.  */
3587   const re_node_set *cur_nodes = &state->nodes;
3588   bitset_empty (accepts);
3589   ndests = 0;
3590
3591   /* For all the nodes belonging to `state',  */
3592   for (i = 0; i < cur_nodes->nelem; ++i)
3593     {
3594       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3595       re_token_type_t type = node->type;
3596       unsigned int constraint = node->constraint;
3597
3598       /* Enumerate all single byte character this node can accept.  */
3599       if (type == CHARACTER)
3600         bitset_set (accepts, node->opr.c);
3601       else if (type == SIMPLE_BRACKET)
3602         {
3603           bitset_merge (accepts, node->opr.sbcset);
3604         }
3605       else if (type == OP_PERIOD)
3606         {
3607 #ifdef RE_ENABLE_I18N
3608           if (dfa->mb_cur_max > 1)
3609             bitset_merge (accepts, dfa->sb_char);
3610           else
3611 #endif
3612             bitset_set_all (accepts);
3613           if (!(dfa->syntax & RE_DOT_NEWLINE))
3614             bitset_clear (accepts, '\n');
3615           if (dfa->syntax & RE_DOT_NOT_NULL)
3616             bitset_clear (accepts, '\0');
3617         }
3618 #ifdef RE_ENABLE_I18N
3619       else if (type == OP_UTF8_PERIOD)
3620         {
3621           memset (accepts, '\xff', sizeof (bitset_t) / 2);
3622           if (!(dfa->syntax & RE_DOT_NEWLINE))
3623             bitset_clear (accepts, '\n');
3624           if (dfa->syntax & RE_DOT_NOT_NULL)
3625             bitset_clear (accepts, '\0');
3626         }
3627 #endif
3628       else
3629         continue;
3630
3631       /* Check the `accepts' and sift the characters which are not
3632          match it the context.  */
3633       if (constraint)
3634         {
3635           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3636             {
3637               bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3638               bitset_empty (accepts);
3639               if (accepts_newline)
3640                 bitset_set (accepts, NEWLINE_CHAR);
3641               else
3642                 continue;
3643             }
3644           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3645             {
3646               bitset_empty (accepts);
3647               continue;
3648             }
3649
3650           if (constraint & NEXT_WORD_CONSTRAINT)
3651             {
3652               bitset_word_t any_set = 0;
3653               if (type == CHARACTER && !node->word_char)
3654                 {
3655                   bitset_empty (accepts);
3656                   continue;
3657                 }
3658 #ifdef RE_ENABLE_I18N
3659               if (dfa->mb_cur_max > 1)
3660                 for (j = 0; j < BITSET_WORDS; ++j)
3661                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3662               else
3663 #endif
3664                 for (j = 0; j < BITSET_WORDS; ++j)
3665                   any_set |= (accepts[j] &= dfa->word_char[j]);
3666               if (!any_set)
3667                 continue;
3668             }
3669           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3670             {
3671               bitset_word_t any_set = 0;
3672               if (type == CHARACTER && node->word_char)
3673                 {
3674                   bitset_empty (accepts);
3675                   continue;
3676                 }
3677 #ifdef RE_ENABLE_I18N
3678               if (dfa->mb_cur_max > 1)
3679                 for (j = 0; j < BITSET_WORDS; ++j)
3680                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3681               else
3682 #endif
3683                 for (j = 0; j < BITSET_WORDS; ++j)
3684                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3685               if (!any_set)
3686                 continue;
3687             }
3688         }
3689
3690       /* Then divide `accepts' into DFA states, or create a new
3691          state.  Above, we make sure that accepts is not empty.  */
3692       for (j = 0; j < ndests; ++j)
3693         {
3694           bitset_t intersec; /* Intersection sets, see below.  */
3695           bitset_t remains;
3696           /* Flags, see below.  */
3697           bitset_word_t has_intersec, not_subset, not_consumed;
3698
3699           /* Optimization, skip if this state doesn't accept the character.  */
3700           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3701             continue;
3702
3703           /* Enumerate the intersection set of this state and `accepts'.  */
3704           has_intersec = 0;
3705           for (k = 0; k < BITSET_WORDS; ++k)
3706             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3707           /* And skip if the intersection set is empty.  */
3708           if (!has_intersec)
3709             continue;
3710
3711           /* Then check if this state is a subset of `accepts'.  */
3712           not_subset = not_consumed = 0;
3713           for (k = 0; k < BITSET_WORDS; ++k)
3714             {
3715               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3716               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3717             }
3718
3719           /* If this state isn't a subset of `accepts', create a
3720              new group state, which has the `remains'. */
3721           if (not_subset)
3722             {
3723               bitset_copy (dests_ch[ndests], remains);
3724               bitset_copy (dests_ch[j], intersec);
3725               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3726               if (BE (err != REG_NOERROR, 0))
3727                 goto error_return;
3728               ++ndests;
3729             }
3730
3731           /* Put the position in the current group. */
3732           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3733           if (BE (result < 0, 0))
3734             goto error_return;
3735
3736           /* If all characters are consumed, go to next node. */
3737           if (!not_consumed)
3738             break;
3739         }
3740       /* Some characters remain, create a new group. */
3741       if (j == ndests)
3742         {
3743           bitset_copy (dests_ch[ndests], accepts);
3744           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3745           if (BE (err != REG_NOERROR, 0))
3746             goto error_return;
3747           ++ndests;
3748           bitset_empty (accepts);
3749         }
3750     }
3751   return ndests;
3752  error_return:
3753   for (j = 0; j < ndests; ++j)
3754     re_node_set_free (dests_node + j);
3755   return -1;
3756 }
3757
3758 #ifdef RE_ENABLE_I18N
3759 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3760    Return the number of the bytes the node accepts.
3761    STR_IDX is the current index of the input string.
3762
3763    This function handles the nodes which can accept one character, or
3764    one collating element like '.', '[a-z]', opposite to the other nodes
3765    can only accept one byte.  */
3766
3767 static int
3768 internal_function
3769 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3770                          const re_string_t *input, int str_idx)
3771 {
3772   const re_token_t *node = dfa->nodes + node_idx;
3773   int char_len, elem_len;
3774   int i;
3775   wint_t wc;
3776
3777   if (BE (node->type == OP_UTF8_PERIOD, 0))
3778     {
3779       unsigned char c = re_string_byte_at (input, str_idx), d;
3780       if (BE (c < 0xc2, 1))
3781         return 0;
3782
3783       if (str_idx + 2 > input->len)
3784         return 0;
3785
3786       d = re_string_byte_at (input, str_idx + 1);
3787       if (c < 0xe0)
3788         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3789       else if (c < 0xf0)
3790         {
3791           char_len = 3;
3792           if (c == 0xe0 && d < 0xa0)
3793             return 0;
3794         }
3795       else if (c < 0xf8)
3796         {
3797           char_len = 4;
3798           if (c == 0xf0 && d < 0x90)
3799             return 0;
3800         }
3801       else if (c < 0xfc)
3802         {
3803           char_len = 5;
3804           if (c == 0xf8 && d < 0x88)
3805             return 0;
3806         }
3807       else if (c < 0xfe)
3808         {
3809           char_len = 6;
3810           if (c == 0xfc && d < 0x84)
3811             return 0;
3812         }
3813       else
3814         return 0;
3815
3816       if (str_idx + char_len > input->len)
3817         return 0;
3818
3819       for (i = 1; i < char_len; ++i)
3820         {
3821           d = re_string_byte_at (input, str_idx + i);
3822           if (d < 0x80 || d > 0xbf)
3823             return 0;
3824         }
3825       return char_len;
3826     }
3827
3828   char_len = re_string_char_size_at (input, str_idx);
3829   if (node->type == OP_PERIOD)
3830     {
3831       if (char_len <= 1)
3832         return 0;
3833       /* FIXME: I don't think this if is needed, as both '\n'
3834          and '\0' are char_len == 1.  */
3835       /* '.' accepts any one character except the following two cases.  */
3836       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3837            re_string_byte_at (input, str_idx) == '\n') ||
3838           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3839            re_string_byte_at (input, str_idx) == '\0'))
3840         return 0;
3841       return char_len;
3842     }
3843
3844   elem_len = re_string_elem_size_at (input, str_idx);
3845   wc = __btowc(*(input->mbs+str_idx));
3846   if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3847     return 0;
3848
3849   if (node->type == COMPLEX_BRACKET)
3850     {
3851       const re_charset_t *cset = node->opr.mbcset;
3852 # ifdef _LIBC
3853       const unsigned char *pin
3854         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3855       int j;
3856       uint32_t nrules;
3857 # endif /* _LIBC */
3858       int match_len = 0;
3859       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3860                     ? re_string_wchar_at (input, str_idx) : 0);
3861
3862       /* match with multibyte character?  */
3863       for (i = 0; i < cset->nmbchars; ++i)
3864         if (wc == cset->mbchars[i])
3865           {
3866             match_len = char_len;
3867             goto check_node_accept_bytes_match;
3868           }
3869       /* match with character_class?  */
3870       for (i = 0; i < cset->nchar_classes; ++i)
3871         {
3872           wctype_t wt = cset->char_classes[i];
3873           if (__iswctype (wc, wt))
3874             {
3875               match_len = char_len;
3876               goto check_node_accept_bytes_match;
3877             }
3878         }
3879
3880 # ifdef _LIBC
3881       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3882       if (nrules != 0)
3883         {
3884           unsigned int in_collseq = 0;
3885           const int32_t *table, *indirect;
3886           const unsigned char *weights, *extra;
3887           const char *collseqwc;
3888           /* This #include defines a local function!  */
3889 #  include <locale/weight.h>
3890
3891           /* match with collating_symbol?  */
3892           if (cset->ncoll_syms)
3893             extra = (const unsigned char *)
3894               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3895           for (i = 0; i < cset->ncoll_syms; ++i)
3896             {
3897               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3898               /* Compare the length of input collating element and
3899                  the length of current collating element.  */
3900               if (*coll_sym != elem_len)
3901                 continue;
3902               /* Compare each bytes.  */
3903               for (j = 0; j < *coll_sym; j++)
3904                 if (pin[j] != coll_sym[1 + j])
3905                   break;
3906               if (j == *coll_sym)
3907                 {
3908                   /* Match if every bytes is equal.  */
3909                   match_len = j;
3910                   goto check_node_accept_bytes_match;
3911                 }
3912             }
3913
3914           if (cset->nranges)
3915             {
3916               if (elem_len <= char_len)
3917                 {
3918                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3919                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3920                 }
3921               else
3922                 in_collseq = find_collation_sequence_value (pin, elem_len);
3923             }
3924           /* match with range expression?  */
3925           for (i = 0; i < cset->nranges; ++i)
3926             if (cset->range_starts[i] <= in_collseq
3927                 && in_collseq <= cset->range_ends[i])
3928               {
3929                 match_len = elem_len;
3930                 goto check_node_accept_bytes_match;
3931               }
3932
3933           /* match with equivalence_class?  */
3934           if (cset->nequiv_classes)
3935             {
3936               const unsigned char *cp = pin;
3937           int32_t idx;
3938               table = (const int32_t *)
3939                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3940               weights = (const unsigned char *)
3941                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3942               extra = (const unsigned char *)
3943                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3944               indirect = (const int32_t *)
3945                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3946               idx = findidx (&cp, elem_len);
3947               if (idx > 0)
3948                 for (i = 0; i < cset->nequiv_classes; ++i)
3949                   {
3950                     int32_t equiv_class_idx = cset->equiv_classes[i];
3951                     size_t weight_len = weights[idx & 0xffffff];
3952                     if (weight_len == weights[equiv_class_idx & 0xffffff]
3953                         && (idx >> 24) == (equiv_class_idx >> 24))
3954                       {
3955                         int cnt = 0;
3956
3957                         idx &= 0xffffff;
3958                         equiv_class_idx &= 0xffffff;
3959
3960                         while (cnt <= weight_len
3961                                && (weights[equiv_class_idx + 1 + cnt]
3962                                    == weights[idx + 1 + cnt]))
3963                           ++cnt;
3964                         if (cnt > weight_len)
3965                           {
3966                             match_len = elem_len;
3967                             goto check_node_accept_bytes_match;
3968                           }
3969                       }
3970                   }
3971             }
3972         }
3973       else
3974 # endif /* _LIBC */
3975         {
3976           /* match with range expression?  */
3977           for (i = 0; i < cset->nranges; ++i)
3978             {
3979               if (cset->range_starts[i] <= wc
3980                   && wc <= cset->range_ends[i])
3981                 {
3982                   match_len = char_len;
3983                   goto check_node_accept_bytes_match;
3984                 }
3985             }
3986         }
3987     check_node_accept_bytes_match:
3988       if (!cset->non_match)
3989         return match_len;
3990       else
3991         {
3992           if (match_len > 0)
3993             return 0;
3994           else
3995             return (elem_len > char_len) ? elem_len : char_len;
3996         }
3997     }
3998   return 0;
3999 }
4000
4001 # ifdef _LIBC
4002 static unsigned int
4003 internal_function
4004 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
4005 {
4006   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4007   if (nrules == 0)
4008     {
4009       if (mbs_len == 1)
4010         {
4011           /* No valid character.  Match it as a single byte character.  */
4012           const unsigned char *collseq = (const unsigned char *)
4013             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4014           return collseq[mbs[0]];
4015         }
4016       return UINT_MAX;
4017     }
4018   else
4019     {
4020       int32_t idx;
4021       const unsigned char *extra = (const unsigned char *)
4022         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4023       int32_t extrasize = (const unsigned char *)
4024         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4025
4026       for (idx = 0; idx < extrasize;)
4027         {
4028           int mbs_cnt, found = 0;
4029           int32_t elem_mbs_len;
4030           /* Skip the name of collating element name.  */
4031           idx = idx + extra[idx] + 1;
4032           elem_mbs_len = extra[idx++];
4033           if (mbs_len == elem_mbs_len)
4034             {
4035               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4036                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4037                   break;
4038               if (mbs_cnt == elem_mbs_len)
4039                 /* Found the entry.  */
4040                 found = 1;
4041             }
4042           /* Skip the byte sequence of the collating element.  */
4043           idx += elem_mbs_len;
4044           /* Adjust for the alignment.  */
4045           idx = (idx + 3) & ~3;
4046           /* Skip the collation sequence value.  */
4047           idx += sizeof (uint32_t);
4048           /* Skip the wide char sequence of the collating element.  */
4049           idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4050           /* If we found the entry, return the sequence value.  */
4051           if (found)
4052             return *(uint32_t *) (extra + idx);
4053           /* Skip the collation sequence value.  */
4054           idx += sizeof (uint32_t);
4055         }
4056       return UINT_MAX;
4057     }
4058 }
4059 # endif /* _LIBC */
4060 #endif /* RE_ENABLE_I18N */
4061
4062 /* Check whether the node accepts the byte which is IDX-th
4063    byte of the INPUT.  */
4064
4065 static int
4066 internal_function
4067 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4068                    int idx)
4069 {
4070   unsigned char ch;
4071   ch = re_string_byte_at (&mctx->input, idx);
4072   switch (node->type)
4073     {
4074     case CHARACTER:
4075       if (node->opr.c != ch)
4076         return 0;
4077       break;
4078
4079     case SIMPLE_BRACKET:
4080       if (!bitset_contain (node->opr.sbcset, ch))
4081         return 0;
4082       break;
4083
4084 #ifdef RE_ENABLE_I18N
4085     case OP_UTF8_PERIOD:
4086       if (ch >= 0x80)
4087         return 0;
4088       /* FALLTHROUGH */
4089 #endif
4090     case OP_PERIOD:
4091       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4092           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4093         return 0;
4094       break;
4095
4096     default:
4097       return 0;
4098     }
4099
4100   if (node->constraint)
4101     {
4102       /* The node has constraints.  Check whether the current context
4103          satisfies the constraints.  */
4104       unsigned int context = re_string_context_at (&mctx->input, idx,
4105                                                    mctx->eflags);
4106       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4107         return 0;
4108     }
4109
4110   return 1;
4111 }
4112
4113 /* Extend the buffers, if the buffers have run out.  */
4114
4115 static reg_errcode_t
4116 internal_function __attribute_warn_unused_result__
4117 extend_buffers (re_match_context_t *mctx)
4118 {
4119   reg_errcode_t ret;
4120   re_string_t *pstr = &mctx->input;
4121
4122   /* Avoid overflow.  */
4123   if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4124     return REG_ESPACE;
4125
4126   /* Double the lengthes of the buffers.  */
4127   ret = re_string_realloc_buffers (pstr, MIN (pstr->len, pstr->bufs_len * 2));
4128   if (BE (ret != REG_NOERROR, 0))
4129     return ret;
4130
4131   if (mctx->state_log != NULL)
4132     {
4133       /* And double the length of state_log.  */
4134       /* XXX We have no indication of the size of this buffer.  If this
4135          allocation fail we have no indication that the state_log array
4136          does not have the right size.  */
4137       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4138                                               pstr->bufs_len + 1);
4139       if (BE (new_array == NULL, 0))
4140         return REG_ESPACE;
4141       mctx->state_log = new_array;
4142     }
4143
4144   /* Then reconstruct the buffers.  */
4145   if (pstr->icase)
4146     {
4147 #ifdef RE_ENABLE_I18N
4148       if (pstr->mb_cur_max > 1)
4149         {
4150           ret = build_wcs_upper_buffer (pstr);
4151           if (BE (ret != REG_NOERROR, 0))
4152             return ret;
4153         }
4154       else
4155 #endif /* RE_ENABLE_I18N  */
4156         build_upper_buffer (pstr);
4157     }
4158   else
4159     {
4160 #ifdef RE_ENABLE_I18N
4161       if (pstr->mb_cur_max > 1)
4162         build_wcs_buffer (pstr);
4163       else
4164 #endif /* RE_ENABLE_I18N  */
4165         {
4166           if (pstr->trans != NULL)
4167             re_string_translate_buffer (pstr);
4168         }
4169     }
4170   return REG_NOERROR;
4171 }
4172
4173 \f
4174 /* Functions for matching context.  */
4175
4176 /* Initialize MCTX.  */
4177
4178 static reg_errcode_t
4179 internal_function __attribute_warn_unused_result__
4180 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4181 {
4182   mctx->eflags = eflags;
4183   mctx->match_last = -1;
4184   if (n > 0)
4185     {
4186       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4187       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4188       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4189         return REG_ESPACE;
4190     }
4191   /* Already zero-ed by the caller.
4192      else
4193        mctx->bkref_ents = NULL;
4194      mctx->nbkref_ents = 0;
4195      mctx->nsub_tops = 0;  */
4196   mctx->abkref_ents = n;
4197   mctx->max_mb_elem_len = 1;
4198   mctx->asub_tops = n;
4199   return REG_NOERROR;
4200 }
4201
4202 /* Clean the entries which depend on the current input in MCTX.
4203    This function must be invoked when the matcher changes the start index
4204    of the input, or changes the input string.  */
4205
4206 static void
4207 internal_function
4208 match_ctx_clean (re_match_context_t *mctx)
4209 {
4210   int st_idx;
4211   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4212     {
4213       int sl_idx;
4214       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4215       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4216         {
4217           re_sub_match_last_t *last = top->lasts[sl_idx];
4218           re_free (last->path.array);
4219           re_free (last);
4220         }
4221       re_free (top->lasts);
4222       if (top->path)
4223         {
4224           re_free (top->path->array);
4225           re_free (top->path);
4226         }
4227       free (top);
4228     }
4229
4230   mctx->nsub_tops = 0;
4231   mctx->nbkref_ents = 0;
4232 }
4233
4234 /* Free all the memory associated with MCTX.  */
4235
4236 static void
4237 internal_function
4238 match_ctx_free (re_match_context_t *mctx)
4239 {
4240   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4241   match_ctx_clean (mctx);
4242   re_free (mctx->sub_tops);
4243   re_free (mctx->bkref_ents);
4244 }
4245
4246 /* Add a new backreference entry to MCTX.
4247    Note that we assume that caller never call this function with duplicate
4248    entry, and call with STR_IDX which isn't smaller than any existing entry.
4249 */
4250
4251 static reg_errcode_t
4252 internal_function __attribute_warn_unused_result__
4253 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4254                      int to)
4255 {
4256   if (mctx->nbkref_ents >= mctx->abkref_ents)
4257     {
4258       struct re_backref_cache_entry* new_entry;
4259       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4260                               mctx->abkref_ents * 2);
4261       if (BE (new_entry == NULL, 0))
4262         {
4263           re_free (mctx->bkref_ents);
4264           return REG_ESPACE;
4265         }
4266       mctx->bkref_ents = new_entry;
4267       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4268               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4269       mctx->abkref_ents *= 2;
4270     }
4271   if (mctx->nbkref_ents > 0
4272       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4273     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4274
4275   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4276   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4277   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4278   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4279
4280   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4281      If bit N is clear, means that this entry won't epsilon-transition to
4282      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4283      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4284      such node.
4285
4286      A backreference does not epsilon-transition unless it is empty, so set
4287      to all zeros if FROM != TO.  */
4288   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4289     = (from == to ? ~0 : 0);
4290
4291   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4292   if (mctx->max_mb_elem_len < to - from)
4293     mctx->max_mb_elem_len = to - from;
4294   return REG_NOERROR;
4295 }
4296
4297 /* Search for the first entry which has the same str_idx, or -1 if none is
4298    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4299
4300 static int
4301 internal_function
4302 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4303 {
4304   int left, right, mid, last;
4305   last = right = mctx->nbkref_ents;
4306   for (left = 0; left < right;)
4307     {
4308       mid = (left + right) / 2;
4309       if (mctx->bkref_ents[mid].str_idx < str_idx)
4310         left = mid + 1;
4311       else
4312         right = mid;
4313     }
4314   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4315     return left;
4316   else
4317     return -1;
4318 }
4319
4320 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4321    at STR_IDX.  */
4322
4323 static reg_errcode_t
4324 internal_function __attribute_warn_unused_result__
4325 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4326 {
4327 #ifdef DEBUG
4328   assert (mctx->sub_tops != NULL);
4329   assert (mctx->asub_tops > 0);
4330 #endif
4331   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4332     {
4333       int new_asub_tops = mctx->asub_tops * 2;
4334       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4335                                                    re_sub_match_top_t *,
4336                                                    new_asub_tops);
4337       if (BE (new_array == NULL, 0))
4338         return REG_ESPACE;
4339       mctx->sub_tops = new_array;
4340       mctx->asub_tops = new_asub_tops;
4341     }
4342   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4343   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4344     return REG_ESPACE;
4345   mctx->sub_tops[mctx->nsub_tops]->node = node;
4346   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4347   return REG_NOERROR;
4348 }
4349
4350 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4351    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4352
4353 static re_sub_match_last_t *
4354 internal_function
4355 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4356 {
4357   re_sub_match_last_t *new_entry;
4358   if (BE (subtop->nlasts == subtop->alasts, 0))
4359     {
4360       int new_alasts = 2 * subtop->alasts + 1;
4361       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4362                                                     re_sub_match_last_t *,
4363                                                     new_alasts);
4364       if (BE (new_array == NULL, 0))
4365         return NULL;
4366       subtop->lasts = new_array;
4367       subtop->alasts = new_alasts;
4368     }
4369   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4370   if (BE (new_entry != NULL, 1))
4371     {
4372       subtop->lasts[subtop->nlasts] = new_entry;
4373       new_entry->node = node;
4374       new_entry->str_idx = str_idx;
4375       ++subtop->nlasts;
4376     }
4377   return new_entry;
4378 }
4379
4380 static void
4381 internal_function
4382 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4383                re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4384 {
4385   sctx->sifted_states = sifted_sts;
4386   sctx->limited_states = limited_sts;
4387   sctx->last_node = last_node;
4388   sctx->last_str_idx = last_str_idx;
4389   re_node_set_init_empty (&sctx->limits);
4390 }