posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                           int str_idx, int from, int to)
  27      internal_function;
  28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  29      internal_function;
  30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                            int str_idx) internal_function;
  32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                    int node, int str_idx)
  34      internal_function;
  35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                            re_dfastate_t **limited_sts, int last_node,
  37                            int last_str_idx)
  38      internal_function;
  39 static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                          const char *string, int length,
  41                                          int start, int range, int stop,
  42                                          size_t nmatch, regmatch_t pmatch[],
  43                                          int eflags) internal_function;
  44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  45                              const char *string1, int length1,
  46                              const char *string2, int length2,
  47                              int start, int range, struct re_registers *regs,
  48                              int stop, int ret_len) internal_function;
  49 static int re_search_stub (struct re_pattern_buffer *bufp,
  50                            const char *string, int length, int start,
  51                            int range, int stop, struct re_registers *regs,
  52                            int ret_len) internal_function;
  53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  54                               int nregs, int regs_allocated) internal_function;
  55 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  56      internal_function;
  57 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  58                            int *p_match_first) internal_function;
  59 static int check_halt_state_context (const re_match_context_t *mctx,
  60                                      const re_dfastate_t *state, int idx)
  61      internal_function;
  62 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  63                          regmatch_t *prev_idx_match, int cur_node,
  64                          int cur_idx, int nmatch) internal_function;
  65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  66                                       int str_idx, int dest_node, int nregs,
  67                                       regmatch_t *regs,
  68                                       re_node_set *eps_via_nodes)
  69      internal_function;
  70 static reg_errcode_t set_regs (const regex_t *preg,
  71                                const re_match_context_t *mctx,
  72                                size_t nmatch, regmatch_t *pmatch,
  73                                int fl_backtrack) internal_function;
  74 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  75      internal_function;
  76
  77 #ifdef RE_ENABLE_I18N
  78 static int sift_states_iter_mb (const re_match_context_t *mctx,
  79                                 re_sift_context_t *sctx,
  80                                 int node_idx, int str_idx, int max_str_idx)
  81      internal_function;
  82 #endif /* RE_ENABLE_I18N */
  83 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  84                                            re_sift_context_t *sctx)
  85      internal_function;
  86 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  87                                           re_sift_context_t *sctx, int str_idx,
  88                                           re_node_set *cur_dest)
  89      internal_function;
  90 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  91                                               re_sift_context_t *sctx,
  92                                               int str_idx,
  93                                               re_node_set *dest_nodes)
  94      internal_function;
  95 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  96                                             re_node_set *dest_nodes,
  97                                             const re_node_set *candidates)
  98      internal_function;
  99 static int check_dst_limits (const re_match_context_t *mctx,
 100                              re_node_set *limits,
 101                              int dst_node, int dst_idx, int src_node,
 102                              int src_idx) internal_function;
 103 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 104                                         int boundaries, int subexp_idx,
 105                                         int from_node, int bkref_idx)
 106      internal_function;
 107 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 108                                       int limit, int subexp_idx,
 109                                       int node, int str_idx,
 110                                       int bkref_idx) internal_function;
 111 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 112                                           re_node_set *dest_nodes,
 113                                           const re_node_set *candidates,
 114                                           re_node_set *limits,
 115                                           struct re_backref_cache_entry *bkref_ents,
 116                                           int str_idx) internal_function;
 117 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 118                                         re_sift_context_t *sctx,
 119                                         int str_idx, const re_node_set *candidates)
 120      internal_function;
 121 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 122                                         re_dfastate_t **dst,
 123                                         re_dfastate_t **src, int num)
 124      internal_function;
 125 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 126                                          re_match_context_t *mctx) internal_function;
 127 static re_dfastate_t *transit_state (reg_errcode_t *err,
 128                                      re_match_context_t *mctx,
 129                                      re_dfastate_t *state) internal_function;
 130 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 131                                             re_match_context_t *mctx,
 132                                             re_dfastate_t *next_state)
 133      internal_function;
 134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 135                                                 re_node_set *cur_nodes,
 136                                                 int str_idx) internal_function;
 137 #if 0
 138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 139                                         re_match_context_t *mctx,
 140                                         re_dfastate_t *pstate)
 141      internal_function;
 142 #endif
 143 #ifdef RE_ENABLE_I18N
 144 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 145                                        re_dfastate_t *pstate)
 146      internal_function;
 147 #endif /* RE_ENABLE_I18N */
 148 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 149                                           const re_node_set *nodes)
 150      internal_function;
 151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 152                                  int bkref_node, int bkref_str_idx)
 153      internal_function;
 154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 155                                      const re_sub_match_top_t *sub_top,
 156                                      re_sub_match_last_t *sub_last,
 157                                      int bkref_node, int bkref_str)
 158      internal_function;
 159 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 160                              int subexp_idx, int type) internal_function;
 161 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 162                                     state_array_t *path, int top_node,
 163                                     int top_str, int last_node, int last_str,
 164                                     int type) internal_function;
 165 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 166                                                    int str_idx,
 167                                                    re_node_set *cur_nodes,
 168                                                    re_node_set *next_nodes)
 169      internal_function;
 170 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 171                                                re_node_set *cur_nodes,
 172                                                int ex_subexp, int type)
 173      internal_function;
 174 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 175                                                    re_node_set *dst_nodes,
 176                                                    int target, int ex_subexp,
 177                                                    int type) internal_function;
 178 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 179                                          re_node_set *cur_nodes, int cur_str,
 180                                          int subexp_num, int type)
 181      internal_function;
 182 static int build_trtable (const re_dfa_t *dfa,
 183                           re_dfastate_t *state) internal_function;
 184 #ifdef RE_ENABLE_I18N
 185 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 186                                     const re_string_t *input, int idx)
 187      internal_function;
 188 # ifdef _LIBC
 189 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 190                                                    size_t name_len)
 191      internal_function;
 192 # endif /* _LIBC */
 193 #endif /* RE_ENABLE_I18N */
 194 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 195                                        const re_dfastate_t *state,
 196                                        re_node_set *states_node,
 197                                        bitset_t *states_ch) internal_function;
 198 static int check_node_accept (const re_match_context_t *mctx,
 199                               const re_token_t *node, int idx)
 200      internal_function;
 201 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
 202      internal_function;
 203 \f
 204 /* Entry point for POSIX code.  */
 205
 206 /* regexec searches for a given pattern, specified by PREG, in the
 207    string STRING.
 208
 209    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 210    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 211    least NMATCH elements, and we set them to the offsets of the
 212    corresponding matched substrings.
 213
 214    EFLAGS specifies `execution flags' which affect matching: if
 215    REG_NOTBOL is set, then ^ does not match at the beginning of the
 216    string; if REG_NOTEOL is set, then $ does not match at the end.
 217
 218    We return 0 if we find a match and REG_NOMATCH if not.  */
 219
 220 int
 221 regexec (preg, string, nmatch, pmatch, eflags)
 222     const regex_t *__restrict preg;
 223     const char *__restrict string;
 224     size_t nmatch;
 225     regmatch_t pmatch[];
 226     int eflags;
 227 {
 228   reg_errcode_t err;
 229   int start, length;
 230   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 231
 232   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 233     return REG_BADPAT;
 234
 235   if (eflags & REG_STARTEND)
 236     {
 237       start = pmatch[0].rm_so;
 238       length = pmatch[0].rm_eo;
 239     }
 240   else
 241     {
 242       start = 0;
 243       length = strlen (string);
 244     }
 245
 246   __libc_lock_lock (dfa->lock);
 247   if (preg->no_sub)
 248     err = re_search_internal (preg, string, length, start, length - start,
 249                               length, 0, NULL, eflags);
 250   else
 251     err = re_search_internal (preg, string, length, start, length - start,
 252                               length, nmatch, pmatch, eflags);
 253   __libc_lock_unlock (dfa->lock);
 254   return err != REG_NOERROR;
 255 }
 256
 257 #ifdef _LIBC
 258 # include <shlib-compat.h>
 259 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 260
 261 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 262 __typeof__ (__regexec) __compat_regexec;
 263
 264 int
 265 attribute_compat_text_section
 266 __compat_regexec (const regex_t *__restrict preg,
 267                   const char *__restrict string, size_t nmatch,
 268                   regmatch_t pmatch[], int eflags)
 269 {
 270   return regexec (preg, string, nmatch, pmatch,
 271                   eflags & (REG_NOTBOL | REG_NOTEOL));
 272 }
 273 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 274 # endif
 275 #endif
 276
 277 /* Entry points for GNU code.  */
 278
 279 /* re_match, re_search, re_match_2, re_search_2
 280
 281    The former two functions operate on STRING with length LENGTH,
 282    while the later two operate on concatenation of STRING1 and STRING2
 283    with lengths LENGTH1 and LENGTH2, respectively.
 284
 285    re_match() matches the compiled pattern in BUFP against the string,
 286    starting at index START.
 287
 288    re_search() first tries matching at index START, then it tries to match
 289    starting from index START + 1, and so on.  The last start position tried
 290    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 291    way as re_match().)
 292
 293    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 294    the first STOP characters of the concatenation of the strings should be
 295    concerned.
 296
 297    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 298    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 299    computed relative to the concatenation, not relative to the individual
 300    strings.)
 301
 302    On success, re_match* functions return the length of the match, re_search*
 303    return the position of the start of the match.  Return value -1 means no
 304    match was found and -2 indicates an internal error.  */
 305
 306 int
 307 re_match (bufp, string, length, start, regs)
 308     struct re_pattern_buffer *bufp;
 309     const char *string;
 310     int length, start;
 311     struct re_registers *regs;
 312 {
 313   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 314 }
 315 #ifdef _LIBC
 316 weak_alias (__re_match, re_match)
 317 #endif
 318
 319 int
 320 re_search (bufp, string, length, start, range, regs)
 321     struct re_pattern_buffer *bufp;
 322     const char *string;
 323     int length, start, range;
 324     struct re_registers *regs;
 325 {
 326   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 327 }
 328 #ifdef _LIBC
 329 weak_alias (__re_search, re_search)
 330 #endif
 331
 332 int
 333 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 334     struct re_pattern_buffer *bufp;
 335     const char *string1, *string2;
 336     int length1, length2, start, stop;
 337     struct re_registers *regs;
 338 {
 339   return re_search_2_stub (bufp, string1, length1, string2, length2,
 340                            start, 0, regs, stop, 1);
 341 }
 342 #ifdef _LIBC
 343 weak_alias (__re_match_2, re_match_2)
 344 #endif
 345
 346 int
 347 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 348     struct re_pattern_buffer *bufp;
 349     const char *string1, *string2;
 350     int length1, length2, start, range, stop;
 351     struct re_registers *regs;
 352 {
 353   return re_search_2_stub (bufp, string1, length1, string2, length2,
 354                            start, range, regs, stop, 0);
 355 }
 356 #ifdef _LIBC
 357 weak_alias (__re_search_2, re_search_2)
 358 #endif
 359
 360 static int
 361 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 362                   stop, ret_len)
 363     struct re_pattern_buffer *bufp;
 364     const char *string1, *string2;
 365     int length1, length2, start, range, stop, ret_len;
 366     struct re_registers *regs;
 367 {
 368   const char *str;
 369   int rval;
 370   int len = length1 + length2;
 371   char *s = NULL;
 372
 373   if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
 374     return -2;
 375
 376   /* Concatenate the strings.  */
 377   if (length2 > 0)
 378     if (length1 > 0)
 379       {
 380         s = re_malloc (char, len);
 381
 382         if (BE (s == NULL, 0))
 383           return -2;
 384 #ifdef _LIBC
 385         memcpy (__mempcpy (s, string1, length1), string2, length2);
 386 #else
 387         memcpy (s, string1, length1);
 388         memcpy (s + length1, string2, length2);
 389 #endif
 390         str = s;
 391       }
 392     else
 393       str = string2;
 394   else
 395     str = string1;
 396
 397   rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
 398   re_free (s);
 399   return rval;
 400 }
 401
 402 /* The parameters have the same meaning as those of re_search.
 403    Additional parameters:
 404    If RET_LEN is nonzero the length of the match is returned (re_match style);
 405    otherwise the position of the match is returned.  */
 406
 407 static int
 408 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 409     struct re_pattern_buffer *bufp;
 410     const char *string;
 411     int length, start, range, stop, ret_len;
 412     struct re_registers *regs;
 413 {
 414   reg_errcode_t result;
 415   regmatch_t *pmatch;
 416   int nregs, rval;
 417   int eflags = 0;
 418   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 419
 420   /* Check for out-of-range.  */
 421   if (BE (start < 0 || start > length, 0))
 422     return -1;
 423   if (BE (start + range > length, 0))
 424     range = length - start;
 425   else if (BE (start + range < 0, 0))
 426     range = -start;
 427
 428   __libc_lock_lock (dfa->lock);
 429
 430   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 431   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 432
 433   /* Compile fastmap if we haven't yet.  */
 434   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 435     re_compile_fastmap (bufp);
 436
 437   if (BE (bufp->no_sub, 0))
 438     regs = NULL;
 439
 440   /* We need at least 1 register.  */
 441   if (regs == NULL)
 442     nregs = 1;
 443   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 444                regs->num_regs < bufp->re_nsub + 1, 0))
 445     {
 446       nregs = regs->num_regs;
 447       if (BE (nregs < 1, 0))
 448         {
 449           /* Nothing can be copied to regs.  */
 450           regs = NULL;
 451           nregs = 1;
 452         }
 453     }
 454   else
 455     nregs = bufp->re_nsub + 1;
 456   pmatch = re_malloc (regmatch_t, nregs);
 457   if (BE (pmatch == NULL, 0))
 458     {
 459       rval = -2;
 460       goto out;
 461     }
 462
 463   result = re_search_internal (bufp, string, length, start, range, stop,
 464                                nregs, pmatch, eflags);
 465
 466   rval = 0;
 467
 468   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 469   if (result != REG_NOERROR)
 470     rval = -1;
 471   else if (regs != NULL)
 472     {
 473       /* If caller wants register contents data back, copy them.  */
 474       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 475                                            bufp->regs_allocated);
 476       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 477         rval = -2;
 478     }
 479
 480   if (BE (rval == 0, 1))
 481     {
 482       if (ret_len)
 483         {
 484           assert (pmatch[0].rm_so == start);
 485           rval = pmatch[0].rm_eo - start;
 486         }
 487       else
 488         rval = pmatch[0].rm_so;
 489     }
 490   re_free (pmatch);
 491  out:
 492   __libc_lock_unlock (dfa->lock);
 493   return rval;
 494 }
 495
 496 static unsigned
 497 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 498     struct re_registers *regs;
 499     regmatch_t *pmatch;
 500     int nregs, regs_allocated;
 501 {
 502   int rval = REGS_REALLOCATE;
 503   int i;
 504   int need_regs = nregs + 1;
 505   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 506      uses.  */
 507
 508   /* Have the register data arrays been allocated?  */
 509   if (regs_allocated == REGS_UNALLOCATED)
 510     { /* No.  So allocate them with malloc.  */
 511       regs->start = re_malloc (regoff_t, need_regs);
 512       if (BE (regs->start == NULL, 0))
 513         return REGS_UNALLOCATED;
 514       regs->end = re_malloc (regoff_t, need_regs);
 515       if (BE (regs->end == NULL, 0))
 516         {
 517           re_free (regs->start);
 518           return REGS_UNALLOCATED;
 519         }
 520       regs->num_regs = need_regs;
 521     }
 522   else if (regs_allocated == REGS_REALLOCATE)
 523     { /* Yes.  If we need more elements than were already
 524          allocated, reallocate them.  If we need fewer, just
 525          leave it alone.  */
 526       if (BE (need_regs > regs->num_regs, 0))
 527         {
 528           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 529           regoff_t *new_end;
 530           if (BE (new_start == NULL, 0))
 531             return REGS_UNALLOCATED;
 532           new_end = re_realloc (regs->end, regoff_t, need_regs);
 533           if (BE (new_end == NULL, 0))
 534             {
 535               re_free (new_start);
 536               return REGS_UNALLOCATED;
 537             }
 538           regs->start = new_start;
 539           regs->end = new_end;
 540           regs->num_regs = need_regs;
 541         }
 542     }
 543   else
 544     {
 545       assert (regs_allocated == REGS_FIXED);
 546       /* This function may not be called with REGS_FIXED and nregs too big.  */
 547       assert (regs->num_regs >= nregs);
 548       rval = REGS_FIXED;
 549     }
 550
 551   /* Copy the regs.  */
 552   for (i = 0; i < nregs; ++i)
 553     {
 554       regs->start[i] = pmatch[i].rm_so;
 555       regs->end[i] = pmatch[i].rm_eo;
 556     }
 557   for ( ; i < regs->num_regs; ++i)
 558     regs->start[i] = regs->end[i] = -1;
 559
 560   return rval;
 561 }
 562
 563 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 564    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 565    this memory for recording register information.  STARTS and ENDS
 566    must be allocated using the malloc library routine, and must each
 567    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 568
 569    If NUM_REGS == 0, then subsequent matches should allocate their own
 570    register data.
 571
 572    Unless this function is called, the first search or match using
 573    PATTERN_BUFFER will allocate its own register data, without
 574    freeing the old data.  */
 575
 576 void
 577 re_set_registers (bufp, regs, num_regs, starts, ends)
 578     struct re_pattern_buffer *bufp;
 579     struct re_registers *regs;
 580     unsigned num_regs;
 581     regoff_t *starts, *ends;
 582 {
 583   if (num_regs)
 584     {
 585       bufp->regs_allocated = REGS_REALLOCATE;
 586       regs->num_regs = num_regs;
 587       regs->start = starts;
 588       regs->end = ends;
 589     }
 590   else
 591     {
 592       bufp->regs_allocated = REGS_UNALLOCATED;
 593       regs->num_regs = 0;
 594       regs->start = regs->end = (regoff_t *) 0;
 595     }
 596 }
 597 #ifdef _LIBC
 598 weak_alias (__re_set_registers, re_set_registers)
 599 #endif
 600 \f
 601 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 602    them unless specifically requested.  */
 603
 604 #if defined _REGEX_RE_COMP || defined _LIBC
 605 int
 606 # ifdef _LIBC
 607 weak_function
 608 # endif
 609 re_exec (s)
 610      const char *s;
 611 {
 612   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 613 }
 614 #endif /* _REGEX_RE_COMP */
 615 \f
 616 /* Internal entry point.  */
 617
 618 /* Searches for a compiled pattern PREG in the string STRING, whose
 619    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 620    mingings with regexec.  START, and RANGE have the same meanings
 621    with re_search.
 622    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 623    otherwise return the error code.
 624    Note: We assume front end functions already check ranges.
 625    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 626
 627 static reg_errcode_t
 628 __attribute_warn_unused_result__
 629 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 630                     eflags)
 631     const regex_t *preg;
 632     const char *string;
 633     int length, start, range, stop, eflags;
 634     size_t nmatch;
 635     regmatch_t pmatch[];
 636 {
 637   reg_errcode_t err;
 638   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 639   int left_lim, right_lim, incr;
 640   int fl_longest_match, match_first, match_kind, match_last = -1;
 641   int extra_nmatch;
 642   int sb, ch;
 643 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 644   re_match_context_t mctx = { .dfa = dfa };
 645 #else
 646   re_match_context_t mctx;
 647 #endif
 648   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 649                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 650   RE_TRANSLATE_TYPE t = preg->translate;
 651
 652 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 653   memset (&mctx, '\0', sizeof (re_match_context_t));
 654   mctx.dfa = dfa;
 655 #endif
 656
 657   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 658   nmatch -= extra_nmatch;
 659
 660   /* Check if the DFA haven't been compiled.  */
 661   if (BE (preg->used == 0 || dfa->init_state == NULL
 662           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 663           || dfa->init_state_begbuf == NULL, 0))
 664     return REG_NOMATCH;
 665
 666 #ifdef DEBUG
 667   /* We assume front-end functions already check them.  */
 668   assert (start + range >= 0 && start + range <= length);
 669 #endif
 670
 671   /* If initial states with non-begbuf contexts have no elements,
 672      the regex must be anchored.  If preg->newline_anchor is set,
 673      we'll never use init_state_nl, so do not check it.  */
 674   if (dfa->init_state->nodes.nelem == 0
 675       && dfa->init_state_word->nodes.nelem == 0
 676       && (dfa->init_state_nl->nodes.nelem == 0
 677           || !preg->newline_anchor))
 678     {
 679       if (start != 0 && start + range != 0)
 680         return REG_NOMATCH;
 681       start = range = 0;
 682     }
 683
 684   /* We must check the longest matching, if nmatch > 0.  */
 685   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 686
 687   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 688                             preg->translate, preg->syntax & RE_ICASE, dfa);
 689   if (BE (err != REG_NOERROR, 0))
 690     goto free_return;
 691   mctx.input.stop = stop;
 692   mctx.input.raw_stop = stop;
 693   mctx.input.newline_anchor = preg->newline_anchor;
 694
 695   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 696   if (BE (err != REG_NOERROR, 0))
 697     goto free_return;
 698
 699   /* We will log all the DFA states through which the dfa pass,
 700      if nmatch > 1, or this dfa has "multibyte node", which is a
 701      back-reference or a node which can accept multibyte character or
 702      multi character collating element.  */
 703   if (nmatch > 1 || dfa->has_mb_node)
 704     {
 705       /* Avoid overflow.  */
 706       if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
 707         {
 708           err = REG_ESPACE;
 709           goto free_return;
 710         }
 711
 712       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 713       if (BE (mctx.state_log == NULL, 0))
 714         {
 715           err = REG_ESPACE;
 716           goto free_return;
 717         }
 718     }
 719   else
 720     mctx.state_log = NULL;
 721
 722   match_first = start;
 723   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 724                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 725
 726   /* Check incrementally whether of not the input string match.  */
 727   incr = (range < 0) ? -1 : 1;
 728   left_lim = (range < 0) ? start + range : start;
 729   right_lim = (range < 0) ? start : start + range;
 730   sb = dfa->mb_cur_max == 1;
 731   match_kind =
 732     (fastmap
 733      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 734         | (range >= 0 ? 2 : 0)
 735         | (t != NULL ? 1 : 0))
 736      : 8);
 737
 738   for (;; match_first += incr)
 739     {
 740       err = REG_NOMATCH;
 741       if (match_first < left_lim || right_lim < match_first)
 742         goto free_return;
 743
 744       /* Advance as rapidly as possible through the string, until we
 745          find a plausible place to start matching.  This may be done
 746          with varying efficiency, so there are various possibilities:
 747          only the most common of them are specialized, in order to
 748          save on code size.  We use a switch statement for speed.  */
 749       switch (match_kind)
 750         {
 751         case 8:
 752           /* No fastmap.  */
 753           break;
 754
 755         case 7:
 756           /* Fastmap with single-byte translation, match forward.  */
 757           while (BE (match_first < right_lim, 1)
 758                  && !fastmap[t[(unsigned char) string[match_first]]])
 759             ++match_first;
 760           goto forward_match_found_start_or_reached_end;
 761
 762         case 6:
 763           /* Fastmap without translation, match forward.  */
 764           while (BE (match_first < right_lim, 1)
 765                  && !fastmap[(unsigned char) string[match_first]])
 766             ++match_first;
 767
 768         forward_match_found_start_or_reached_end:
 769           if (BE (match_first == right_lim, 0))
 770             {
 771               ch = match_first >= length
 772                        ? 0 : (unsigned char) string[match_first];
 773               if (!fastmap[t ? t[ch] : ch])
 774                 goto free_return;
 775             }
 776           break;
 777
 778         case 4:
 779         case 5:
 780           /* Fastmap without multi-byte translation, match backwards.  */
 781           while (match_first >= left_lim)
 782             {
 783               ch = match_first >= length
 784                        ? 0 : (unsigned char) string[match_first];
 785               if (fastmap[t ? t[ch] : ch])
 786                 break;
 787               --match_first;
 788             }
 789           if (match_first < left_lim)
 790             goto free_return;
 791           break;
 792
 793         default:
 794           /* In this case, we can't determine easily the current byte,
 795              since it might be a component byte of a multibyte
 796              character.  Then we use the constructed buffer instead.  */
 797           for (;;)
 798             {
 799               /* If MATCH_FIRST is out of the valid range, reconstruct the
 800                  buffers.  */
 801               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 802               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 803                 {
 804                   err = re_string_reconstruct (&mctx.input, match_first,
 805                                                eflags);
 806                   if (BE (err != REG_NOERROR, 0))
 807                     goto free_return;
 808
 809                   offset = match_first - mctx.input.raw_mbs_idx;
 810                 }
 811               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 812                  Note that MATCH_FIRST must not be smaller than 0.  */
 813               ch = (match_first >= length
 814                     ? 0 : re_string_byte_at (&mctx.input, offset));
 815               if (fastmap[ch])
 816                 break;
 817               match_first += incr;
 818               if (match_first < left_lim || match_first > right_lim)
 819                 {
 820                   err = REG_NOMATCH;
 821                   goto free_return;
 822                 }
 823             }
 824           break;
 825         }
 826
 827       /* Reconstruct the buffers so that the matcher can assume that
 828          the matching starts from the beginning of the buffer.  */
 829       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 830       if (BE (err != REG_NOERROR, 0))
 831         goto free_return;
 832
 833 #ifdef RE_ENABLE_I18N
 834      /* Don't consider this char as a possible match start if it part,
 835         yet isn't the head, of a multibyte character.  */
 836       if (!sb && !re_string_first_byte (&mctx.input, 0))
 837         continue;
 838 #endif
 839
 840       /* It seems to be appropriate one, then use the matcher.  */
 841       /* We assume that the matching starts from 0.  */
 842       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 843       match_last = check_matching (&mctx, fl_longest_match,
 844                                    range >= 0 ? &match_first : NULL);
 845       if (match_last != -1)
 846         {
 847           if (BE (match_last == -2, 0))
 848             {
 849               err = REG_ESPACE;
 850               goto free_return;
 851             }
 852           else
 853             {
 854               mctx.match_last = match_last;
 855               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 856                 {
 857                   re_dfastate_t *pstate = mctx.state_log[match_last];
 858                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 859                                                              match_last);
 860                 }
 861               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 862                   || dfa->nbackref)
 863                 {
 864                   err = prune_impossible_nodes (&mctx);
 865                   if (err == REG_NOERROR)
 866                     break;
 867                   if (BE (err != REG_NOMATCH, 0))
 868                     goto free_return;
 869                   match_last = -1;
 870                 }
 871               else
 872                 break; /* We found a match.  */
 873             }
 874         }
 875
 876       match_ctx_clean (&mctx);
 877     }
 878
 879 #ifdef DEBUG
 880   assert (match_last != -1);
 881   assert (err == REG_NOERROR);
 882 #endif
 883
 884   /* Set pmatch[] if we need.  */
 885   if (nmatch > 0)
 886     {
 887       int reg_idx;
 888
 889       /* Initialize registers.  */
 890       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 891         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 892
 893       /* Set the points where matching start/end.  */
 894       pmatch[0].rm_so = 0;
 895       pmatch[0].rm_eo = mctx.match_last;
 896
 897       if (!preg->no_sub && nmatch > 1)
 898         {
 899           err = set_regs (preg, &mctx, nmatch, pmatch,
 900                           dfa->has_plural_match && dfa->nbackref > 0);
 901           if (BE (err != REG_NOERROR, 0))
 902             goto free_return;
 903         }
 904
 905       /* At last, add the offset to the each registers, since we slided
 906          the buffers so that we could assume that the matching starts
 907          from 0.  */
 908       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 909         if (pmatch[reg_idx].rm_so != -1)
 910           {
 911 #ifdef RE_ENABLE_I18N
 912             if (BE (mctx.input.offsets_needed != 0, 0))
 913               {
 914                 pmatch[reg_idx].rm_so =
 915                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 916                    ? mctx.input.valid_raw_len
 917                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 918                 pmatch[reg_idx].rm_eo =
 919                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 920                    ? mctx.input.valid_raw_len
 921                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 922               }
 923 #else
 924             assert (mctx.input.offsets_needed == 0);
 925 #endif
 926             pmatch[reg_idx].rm_so += match_first;
 927             pmatch[reg_idx].rm_eo += match_first;
 928           }
 929       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 930         {
 931           pmatch[nmatch + reg_idx].rm_so = -1;
 932           pmatch[nmatch + reg_idx].rm_eo = -1;
 933         }
 934
 935       if (dfa->subexp_map)
 936         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 937           if (dfa->subexp_map[reg_idx] != reg_idx)
 938             {
 939               pmatch[reg_idx + 1].rm_so
 940                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 941               pmatch[reg_idx + 1].rm_eo
 942                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 943             }
 944     }
 945
 946  free_return:
 947   re_free (mctx.state_log);
 948   if (dfa->nbackref)
 949     match_ctx_free (&mctx);
 950   re_string_destruct (&mctx.input);
 951   return err;
 952 }
 953
 954 static reg_errcode_t
 955 __attribute_warn_unused_result__
 956 prune_impossible_nodes (mctx)
 957      re_match_context_t *mctx;
 958 {
 959   const re_dfa_t *const dfa = mctx->dfa;
 960   int halt_node, match_last;
 961   reg_errcode_t ret;
 962   re_dfastate_t **sifted_states;
 963   re_dfastate_t **lim_states = NULL;
 964   re_sift_context_t sctx;
 965 #ifdef DEBUG
 966   assert (mctx->state_log != NULL);
 967 #endif
 968   match_last = mctx->match_last;
 969   halt_node = mctx->last_node;
 970
 971   /* Avoid overflow.  */
 972   if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
 973     return REG_ESPACE;
 974
 975   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 976   if (BE (sifted_states == NULL, 0))
 977     {
 978       ret = REG_ESPACE;
 979       goto free_return;
 980     }
 981   if (dfa->nbackref)
 982     {
 983       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 984       if (BE (lim_states == NULL, 0))
 985         {
 986           ret = REG_ESPACE;
 987           goto free_return;
 988         }
 989       while (1)
 990         {
 991           memset (lim_states, '\0',
 992                   sizeof (re_dfastate_t *) * (match_last + 1));
 993           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 994                          match_last);
 995           ret = sift_states_backward (mctx, &sctx);
 996           re_node_set_free (&sctx.limits);
 997           if (BE (ret != REG_NOERROR, 0))
 998               goto free_return;
 999           if (sifted_states[0] != NULL || lim_states[0] != NULL)
1000             break;
1001           do
1002             {
1003               --match_last;
1004               if (match_last < 0)
1005                 {
1006                   ret = REG_NOMATCH;
1007                   goto free_return;
1008                 }
1009             } while (mctx->state_log[match_last] == NULL
1010                      || !mctx->state_log[match_last]->halt);
1011           halt_node = check_halt_state_context (mctx,
1012                                                 mctx->state_log[match_last],
1013                                                 match_last);
1014         }
1015       ret = merge_state_array (dfa, sifted_states, lim_states,
1016                                match_last + 1);
1017       re_free (lim_states);
1018       lim_states = NULL;
1019       if (BE (ret != REG_NOERROR, 0))
1020         goto free_return;
1021     }
1022   else
1023     {
1024       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1025       ret = sift_states_backward (mctx, &sctx);
1026       re_node_set_free (&sctx.limits);
1027       if (BE (ret != REG_NOERROR, 0))
1028         goto free_return;
1029       if (sifted_states[0] == NULL)
1030         {
1031           ret = REG_NOMATCH;
1032           goto free_return;
1033         }
1034     }
1035   re_free (mctx->state_log);
1036   mctx->state_log = sifted_states;
1037   sifted_states = NULL;
1038   mctx->last_node = halt_node;
1039   mctx->match_last = match_last;
1040   ret = REG_NOERROR;
1041  free_return:
1042   re_free (sifted_states);
1043   re_free (lim_states);
1044   return ret;
1045 }
1046
1047 /* Acquire an initial state and return it.
1048    We must select appropriate initial state depending on the context,
1049    since initial states may have constraints like "\<", "^", etc..  */
1050
1051 static inline re_dfastate_t *
1052 __attribute ((always_inline)) internal_function
1053 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1054                             int idx)
1055 {
1056   const re_dfa_t *const dfa = mctx->dfa;
1057   if (dfa->init_state->has_constraint)
1058     {
1059       unsigned int context;
1060       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1061       if (IS_WORD_CONTEXT (context))
1062         return dfa->init_state_word;
1063       else if (IS_ORDINARY_CONTEXT (context))
1064         return dfa->init_state;
1065       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1066         return dfa->init_state_begbuf;
1067       else if (IS_NEWLINE_CONTEXT (context))
1068         return dfa->init_state_nl;
1069       else if (IS_BEGBUF_CONTEXT (context))
1070         {
1071           /* It is relatively rare case, then calculate on demand.  */
1072           return re_acquire_state_context (err, dfa,
1073                                            dfa->init_state->entrance_nodes,
1074                                            context);
1075         }
1076       else
1077         /* Must not happen?  */
1078         return dfa->init_state;
1079     }
1080   else
1081     return dfa->init_state;
1082 }
1083
1084 /* Check whether the regular expression match input string INPUT or not,
1085    and return the index where the matching end, return -1 if not match,
1086    or return -2 in case of an error.
1087    FL_LONGEST_MATCH means we want the POSIX longest matching.
1088    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1089    next place where we may want to try matching.
1090    Note that the matcher assume that the maching starts from the current
1091    index of the buffer.  */
1092
1093 static int
1094 internal_function __attribute_warn_unused_result__
1095 check_matching (re_match_context_t *mctx, int fl_longest_match,
1096                 int *p_match_first)
1097 {
1098   const re_dfa_t *const dfa = mctx->dfa;
1099   reg_errcode_t err;
1100   int match = 0;
1101   int match_last = -1;
1102   int cur_str_idx = re_string_cur_idx (&mctx->input);
1103   re_dfastate_t *cur_state;
1104   int at_init_state = p_match_first != NULL;
1105   int next_start_idx = cur_str_idx;
1106
1107   err = REG_NOERROR;
1108   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1109   /* An initial state must not be NULL (invalid).  */
1110   if (BE (cur_state == NULL, 0))
1111     {
1112       assert (err == REG_ESPACE);
1113       return -2;
1114     }
1115
1116   if (mctx->state_log != NULL)
1117     {
1118       mctx->state_log[cur_str_idx] = cur_state;
1119
1120       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1121          later.  E.g. Processing back references.  */
1122       if (BE (dfa->nbackref, 0))
1123         {
1124           at_init_state = 0;
1125           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1126           if (BE (err != REG_NOERROR, 0))
1127             return err;
1128
1129           if (cur_state->has_backref)
1130             {
1131               err = transit_state_bkref (mctx, &cur_state->nodes);
1132               if (BE (err != REG_NOERROR, 0))
1133                 return err;
1134             }
1135         }
1136     }
1137
1138   /* If the RE accepts NULL string.  */
1139   if (BE (cur_state->halt, 0))
1140     {
1141       if (!cur_state->has_constraint
1142           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1143         {
1144           if (!fl_longest_match)
1145             return cur_str_idx;
1146           else
1147             {
1148               match_last = cur_str_idx;
1149               match = 1;
1150             }
1151         }
1152     }
1153
1154   while (!re_string_eoi (&mctx->input))
1155     {
1156       re_dfastate_t *old_state = cur_state;
1157       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1158
1159       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1160           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1161               && mctx->input.valid_len < mctx->input.len))
1162         {
1163           err = extend_buffers (mctx);
1164           if (BE (err != REG_NOERROR, 0))
1165             {
1166               assert (err == REG_ESPACE);
1167               return -2;
1168             }
1169         }
1170
1171       cur_state = transit_state (&err, mctx, cur_state);
1172       if (mctx->state_log != NULL)
1173         cur_state = merge_state_with_log (&err, mctx, cur_state);
1174
1175       if (cur_state == NULL)
1176         {
1177           /* Reached the invalid state or an error.  Try to recover a valid
1178              state using the state log, if available and if we have not
1179              already found a valid (even if not the longest) match.  */
1180           if (BE (err != REG_NOERROR, 0))
1181             return -2;
1182
1183           if (mctx->state_log == NULL
1184               || (match && !fl_longest_match)
1185               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1186             break;
1187         }
1188
1189       if (BE (at_init_state, 0))
1190         {
1191           if (old_state == cur_state)
1192             next_start_idx = next_char_idx;
1193           else
1194             at_init_state = 0;
1195         }
1196
1197       if (cur_state->halt)
1198         {
1199           /* Reached a halt state.
1200              Check the halt state can satisfy the current context.  */
1201           if (!cur_state->has_constraint
1202               || check_halt_state_context (mctx, cur_state,
1203                                            re_string_cur_idx (&mctx->input)))
1204             {
1205               /* We found an appropriate halt state.  */
1206               match_last = re_string_cur_idx (&mctx->input);
1207               match = 1;
1208
1209               /* We found a match, do not modify match_first below.  */
1210               p_match_first = NULL;
1211               if (!fl_longest_match)
1212                 break;
1213             }
1214         }
1215     }
1216
1217   if (p_match_first)
1218     *p_match_first += next_start_idx;
1219
1220   return match_last;
1221 }
1222
1223 /* Check NODE match the current context.  */
1224
1225 static int
1226 internal_function
1227 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1228 {
1229   re_token_type_t type = dfa->nodes[node].type;
1230   unsigned int constraint = dfa->nodes[node].constraint;
1231   if (type != END_OF_RE)
1232     return 0;
1233   if (!constraint)
1234     return 1;
1235   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1236     return 0;
1237   return 1;
1238 }
1239
1240 /* Check the halt state STATE match the current context.
1241    Return 0 if not match, if the node, STATE has, is a halt node and
1242    match the context, return the node.  */
1243
1244 static int
1245 internal_function
1246 check_halt_state_context (const re_match_context_t *mctx,
1247                           const re_dfastate_t *state, int idx)
1248 {
1249   int i;
1250   unsigned int context;
1251 #ifdef DEBUG
1252   assert (state->halt);
1253 #endif
1254   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1255   for (i = 0; i < state->nodes.nelem; ++i)
1256     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1257       return state->nodes.elems[i];
1258   return 0;
1259 }
1260
1261 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1262    corresponding to the DFA).
1263    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1264    of errors.  */
1265
1266 static int
1267 internal_function
1268 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1269                    int *pidx, int node, re_node_set *eps_via_nodes,
1270                    struct re_fail_stack_t *fs)
1271 {
1272   const re_dfa_t *const dfa = mctx->dfa;
1273   int i, err;
1274   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1275     {
1276       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1277       re_node_set *edests = &dfa->edests[node];
1278       int dest_node;
1279       err = re_node_set_insert (eps_via_nodes, node);
1280       if (BE (err < 0, 0))
1281         return -2;
1282       /* Pick up a valid destination, or return -1 if none is found.  */
1283       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1284         {
1285           int candidate = edests->elems[i];
1286           if (!re_node_set_contains (cur_nodes, candidate))
1287             continue;
1288           if (dest_node == -1)
1289             dest_node = candidate;
1290
1291           else
1292             {
1293               /* In order to avoid infinite loop like "(a*)*", return the second
1294                  epsilon-transition if the first was already considered.  */
1295               if (re_node_set_contains (eps_via_nodes, dest_node))
1296                 return candidate;
1297
1298               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1299               else if (fs != NULL
1300                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1301                                            eps_via_nodes))
1302                 return -2;
1303
1304               /* We know we are going to exit.  */
1305               break;
1306             }
1307         }
1308       return dest_node;
1309     }
1310   else
1311     {
1312       int naccepted = 0;
1313       re_token_type_t type = dfa->nodes[node].type;
1314
1315 #ifdef RE_ENABLE_I18N
1316       if (dfa->nodes[node].accept_mb)
1317         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1318       else
1319 #endif /* RE_ENABLE_I18N */
1320       if (type == OP_BACK_REF)
1321         {
1322           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1323           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1324           if (fs != NULL)
1325             {
1326               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1327                 return -1;
1328               else if (naccepted)
1329                 {
1330                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1331                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1332                               naccepted) != 0)
1333                     return -1;
1334                 }
1335             }
1336
1337           if (naccepted == 0)
1338             {
1339               int dest_node;
1340               err = re_node_set_insert (eps_via_nodes, node);
1341               if (BE (err < 0, 0))
1342                 return -2;
1343               dest_node = dfa->edests[node].elems[0];
1344               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1345                                         dest_node))
1346                 return dest_node;
1347             }
1348         }
1349
1350       if (naccepted != 0
1351           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1352         {
1353           int dest_node = dfa->nexts[node];
1354           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1355           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1356                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1357                                                dest_node)))
1358             return -1;
1359           re_node_set_empty (eps_via_nodes);
1360           return dest_node;
1361         }
1362     }
1363   return -1;
1364 }
1365
1366 static reg_errcode_t
1367 internal_function __attribute_warn_unused_result__
1368 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1369                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1370 {
1371   reg_errcode_t err;
1372   int num = fs->num++;
1373   if (fs->num == fs->alloc)
1374     {
1375       struct re_fail_stack_ent_t *new_array;
1376       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1377                                        * fs->alloc * 2));
1378       if (new_array == NULL)
1379         return REG_ESPACE;
1380       fs->alloc *= 2;
1381       fs->stack = new_array;
1382     }
1383   fs->stack[num].idx = str_idx;
1384   fs->stack[num].node = dest_node;
1385   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1386   if (fs->stack[num].regs == NULL)
1387     return REG_ESPACE;
1388   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1389   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1390   return err;
1391 }
1392
1393 static int
1394 internal_function
1395 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1396                 regmatch_t *regs, re_node_set *eps_via_nodes)
1397 {
1398   int num = --fs->num;
1399   assert (num >= 0);
1400   *pidx = fs->stack[num].idx;
1401   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1402   re_node_set_free (eps_via_nodes);
1403   re_free (fs->stack[num].regs);
1404   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1405   return fs->stack[num].node;
1406 }
1407
1408 /* Set the positions where the subexpressions are starts/ends to registers
1409    PMATCH.
1410    Note: We assume that pmatch[0] is already set, and
1411    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1412
1413 static reg_errcode_t
1414 internal_function __attribute_warn_unused_result__
1415 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1416           regmatch_t *pmatch, int fl_backtrack)
1417 {
1418   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1419   int idx, cur_node;
1420   re_node_set eps_via_nodes;
1421   struct re_fail_stack_t *fs;
1422   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1423   regmatch_t *prev_idx_match;
1424   int prev_idx_match_malloced = 0;
1425
1426 #ifdef DEBUG
1427   assert (nmatch > 1);
1428   assert (mctx->state_log != NULL);
1429 #endif
1430   if (fl_backtrack)
1431     {
1432       fs = &fs_body;
1433       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1434       if (fs->stack == NULL)
1435         return REG_ESPACE;
1436     }
1437   else
1438     fs = NULL;
1439
1440   cur_node = dfa->init_node;
1441   re_node_set_init_empty (&eps_via_nodes);
1442
1443   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1444     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1445   else
1446     {
1447       prev_idx_match = re_malloc (regmatch_t, nmatch);
1448       if (prev_idx_match == NULL)
1449         {
1450           free_fail_stack_return (fs);
1451           return REG_ESPACE;
1452         }
1453       prev_idx_match_malloced = 1;
1454     }
1455   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1456
1457   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1458     {
1459       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1460
1461       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1462         {
1463           int reg_idx;
1464           if (fs)
1465             {
1466               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1467                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1468                   break;
1469               if (reg_idx == nmatch)
1470                 {
1471                   re_node_set_free (&eps_via_nodes);
1472                   if (prev_idx_match_malloced)
1473                     re_free (prev_idx_match);
1474                   return free_fail_stack_return (fs);
1475                 }
1476               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1477                                          &eps_via_nodes);
1478             }
1479           else
1480             {
1481               re_node_set_free (&eps_via_nodes);
1482               if (prev_idx_match_malloced)
1483                 re_free (prev_idx_match);
1484               return REG_NOERROR;
1485             }
1486         }
1487
1488       /* Proceed to next node.  */
1489       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1490                                     &eps_via_nodes, fs);
1491
1492       if (BE (cur_node < 0, 0))
1493         {
1494           if (BE (cur_node == -2, 0))
1495             {
1496               re_node_set_free (&eps_via_nodes);
1497               if (prev_idx_match_malloced)
1498                 re_free (prev_idx_match);
1499               free_fail_stack_return (fs);
1500               return REG_ESPACE;
1501             }
1502           if (fs)
1503             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1504                                        &eps_via_nodes);
1505           else
1506             {
1507               re_node_set_free (&eps_via_nodes);
1508               if (prev_idx_match_malloced)
1509                 re_free (prev_idx_match);
1510               return REG_NOMATCH;
1511             }
1512         }
1513     }
1514   re_node_set_free (&eps_via_nodes);
1515   if (prev_idx_match_malloced)
1516     re_free (prev_idx_match);
1517   return free_fail_stack_return (fs);
1518 }
1519
1520 static reg_errcode_t
1521 internal_function
1522 free_fail_stack_return (struct re_fail_stack_t *fs)
1523 {
1524   if (fs)
1525     {
1526       int fs_idx;
1527       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1528         {
1529           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1530           re_free (fs->stack[fs_idx].regs);
1531         }
1532       re_free (fs->stack);
1533     }
1534   return REG_NOERROR;
1535 }
1536
1537 static void
1538 internal_function
1539 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1540              regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1541 {
1542   int type = dfa->nodes[cur_node].type;
1543   if (type == OP_OPEN_SUBEXP)
1544     {
1545       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1546
1547       /* We are at the first node of this sub expression.  */
1548       if (reg_num < nmatch)
1549         {
1550           pmatch[reg_num].rm_so = cur_idx;
1551           pmatch[reg_num].rm_eo = -1;
1552         }
1553     }
1554   else if (type == OP_CLOSE_SUBEXP)
1555     {
1556       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1557       if (reg_num < nmatch)
1558         {
1559           /* We are at the last node of this sub expression.  */
1560           if (pmatch[reg_num].rm_so < cur_idx)
1561             {
1562               pmatch[reg_num].rm_eo = cur_idx;
1563               /* This is a non-empty match or we are not inside an optional
1564                  subexpression.  Accept this right away.  */
1565               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1566             }
1567           else
1568             {
1569               if (dfa->nodes[cur_node].opt_subexp
1570                   && prev_idx_match[reg_num].rm_so != -1)
1571                 /* We transited through an empty match for an optional
1572                    subexpression, like (a?)*, and this is not the subexp's
1573                    first match.  Copy back the old content of the registers
1574                    so that matches of an inner subexpression are undone as
1575                    well, like in ((a?))*.  */
1576                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1577               else
1578                 /* We completed a subexpression, but it may be part of
1579                    an optional one, so do not update PREV_IDX_MATCH.  */
1580                 pmatch[reg_num].rm_eo = cur_idx;
1581             }
1582         }
1583     }
1584 }
1585
1586 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1587    and sift the nodes in each states according to the following rules.
1588    Updated state_log will be wrote to STATE_LOG.
1589
1590    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1591      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1592         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1593         the LAST_NODE, we throw away the node `a'.
1594      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1595         string `s' and transit to `b':
1596         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1597            away the node `a'.
1598         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1599             thrown away, we throw away the node `a'.
1600      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1601         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1602            node `a'.
1603         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1604             we throw away the node `a'.  */
1605
1606 #define STATE_NODE_CONTAINS(state,node) \
1607   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1608
1609 static reg_errcode_t
1610 internal_function
1611 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1612 {
1613   reg_errcode_t err;
1614   int null_cnt = 0;
1615   int str_idx = sctx->last_str_idx;
1616   re_node_set cur_dest;
1617
1618 #ifdef DEBUG
1619   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1620 #endif
1621
1622   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1623      transit to the last_node and the last_node itself.  */
1624   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1625   if (BE (err != REG_NOERROR, 0))
1626     return err;
1627   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1628   if (BE (err != REG_NOERROR, 0))
1629     goto free_return;
1630
1631   /* Then check each states in the state_log.  */
1632   while (str_idx > 0)
1633     {
1634       /* Update counters.  */
1635       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1636       if (null_cnt > mctx->max_mb_elem_len)
1637         {
1638           memset (sctx->sifted_states, '\0',
1639                   sizeof (re_dfastate_t *) * str_idx);
1640           re_node_set_free (&cur_dest);
1641           return REG_NOERROR;
1642         }
1643       re_node_set_empty (&cur_dest);
1644       --str_idx;
1645
1646       if (mctx->state_log[str_idx])
1647         {
1648           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1649           if (BE (err != REG_NOERROR, 0))
1650             goto free_return;
1651         }
1652
1653       /* Add all the nodes which satisfy the following conditions:
1654          - It can epsilon transit to a node in CUR_DEST.
1655          - It is in CUR_SRC.
1656          And update state_log.  */
1657       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1658       if (BE (err != REG_NOERROR, 0))
1659         goto free_return;
1660     }
1661   err = REG_NOERROR;
1662  free_return:
1663   re_node_set_free (&cur_dest);
1664   return err;
1665 }
1666
1667 static reg_errcode_t
1668 internal_function __attribute_warn_unused_result__
1669 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1670                      int str_idx, re_node_set *cur_dest)
1671 {
1672   const re_dfa_t *const dfa = mctx->dfa;
1673   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1674   int i;
1675
1676   /* Then build the next sifted state.
1677      We build the next sifted state on `cur_dest', and update
1678      `sifted_states[str_idx]' with `cur_dest'.
1679      Note:
1680      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1681      `cur_src' points the node_set of the old `state_log[str_idx]'
1682      (with the epsilon nodes pre-filtered out).  */
1683   for (i = 0; i < cur_src->nelem; i++)
1684     {
1685       int prev_node = cur_src->elems[i];
1686       int naccepted = 0;
1687       int ret;
1688
1689 #ifdef DEBUG
1690       re_token_type_t type = dfa->nodes[prev_node].type;
1691       assert (!IS_EPSILON_NODE (type));
1692 #endif
1693 #ifdef RE_ENABLE_I18N
1694       /* If the node may accept `multi byte'.  */
1695       if (dfa->nodes[prev_node].accept_mb)
1696         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1697                                          str_idx, sctx->last_str_idx);
1698 #endif /* RE_ENABLE_I18N */
1699
1700       /* We don't check backreferences here.
1701          See update_cur_sifted_state().  */
1702       if (!naccepted
1703           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1704           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1705                                   dfa->nexts[prev_node]))
1706         naccepted = 1;
1707
1708       if (naccepted == 0)
1709         continue;
1710
1711       if (sctx->limits.nelem)
1712         {
1713           int to_idx = str_idx + naccepted;
1714           if (check_dst_limits (mctx, &sctx->limits,
1715                                 dfa->nexts[prev_node], to_idx,
1716                                 prev_node, str_idx))
1717             continue;
1718         }
1719       ret = re_node_set_insert (cur_dest, prev_node);
1720       if (BE (ret == -1, 0))
1721         return REG_ESPACE;
1722     }
1723
1724   return REG_NOERROR;
1725 }
1726
1727 /* Helper functions.  */
1728
1729 static reg_errcode_t
1730 internal_function
1731 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1732 {
1733   int top = mctx->state_log_top;
1734
1735   if (next_state_log_idx >= mctx->input.bufs_len
1736       || (next_state_log_idx >= mctx->input.valid_len
1737           && mctx->input.valid_len < mctx->input.len))
1738     {
1739       reg_errcode_t err;
1740       err = extend_buffers (mctx);
1741       if (BE (err != REG_NOERROR, 0))
1742         return err;
1743     }
1744
1745   if (top < next_state_log_idx)
1746     {
1747       memset (mctx->state_log + top + 1, '\0',
1748               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1749       mctx->state_log_top = next_state_log_idx;
1750     }
1751   return REG_NOERROR;
1752 }
1753
1754 static reg_errcode_t
1755 internal_function
1756 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1757                    re_dfastate_t **src, int num)
1758 {
1759   int st_idx;
1760   reg_errcode_t err;
1761   for (st_idx = 0; st_idx < num; ++st_idx)
1762     {
1763       if (dst[st_idx] == NULL)
1764         dst[st_idx] = src[st_idx];
1765       else if (src[st_idx] != NULL)
1766         {
1767           re_node_set merged_set;
1768           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1769                                         &src[st_idx]->nodes);
1770           if (BE (err != REG_NOERROR, 0))
1771             return err;
1772           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1773           re_node_set_free (&merged_set);
1774           if (BE (err != REG_NOERROR, 0))
1775             return err;
1776         }
1777     }
1778   return REG_NOERROR;
1779 }
1780
1781 static reg_errcode_t
1782 internal_function
1783 update_cur_sifted_state (const re_match_context_t *mctx,
1784                          re_sift_context_t *sctx, int str_idx,
1785                          re_node_set *dest_nodes)
1786 {
1787   const re_dfa_t *const dfa = mctx->dfa;
1788   reg_errcode_t err = REG_NOERROR;
1789   const re_node_set *candidates;
1790   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1791                 : &mctx->state_log[str_idx]->nodes);
1792
1793   if (dest_nodes->nelem == 0)
1794     sctx->sifted_states[str_idx] = NULL;
1795   else
1796     {
1797       if (candidates)
1798         {
1799           /* At first, add the nodes which can epsilon transit to a node in
1800              DEST_NODE.  */
1801           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1802           if (BE (err != REG_NOERROR, 0))
1803             return err;
1804
1805           /* Then, check the limitations in the current sift_context.  */
1806           if (sctx->limits.nelem)
1807             {
1808               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1809                                          mctx->bkref_ents, str_idx);
1810               if (BE (err != REG_NOERROR, 0))
1811                 return err;
1812             }
1813         }
1814
1815       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1816       if (BE (err != REG_NOERROR, 0))
1817         return err;
1818     }
1819
1820   if (candidates && mctx->state_log[str_idx]->has_backref)
1821     {
1822       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1823       if (BE (err != REG_NOERROR, 0))
1824         return err;
1825     }
1826   return REG_NOERROR;
1827 }
1828
1829 static reg_errcode_t
1830 internal_function __attribute_warn_unused_result__
1831 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1832                        const re_node_set *candidates)
1833 {
1834   reg_errcode_t err = REG_NOERROR;
1835   int i;
1836
1837   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1838   if (BE (err != REG_NOERROR, 0))
1839     return err;
1840
1841   if (!state->inveclosure.alloc)
1842     {
1843       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1844       if (BE (err != REG_NOERROR, 0))
1845         return REG_ESPACE;
1846       for (i = 0; i < dest_nodes->nelem; i++)
1847         {
1848           err = re_node_set_merge (&state->inveclosure,
1849                                    dfa->inveclosures + dest_nodes->elems[i]);
1850           if (BE (err != REG_NOERROR, 0))
1851             return REG_ESPACE;
1852         }
1853     }
1854   return re_node_set_add_intersect (dest_nodes, candidates,
1855                                     &state->inveclosure);
1856 }
1857
1858 static reg_errcode_t
1859 internal_function
1860 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1861                        const re_node_set *candidates)
1862 {
1863     int ecl_idx;
1864     reg_errcode_t err;
1865     re_node_set *inv_eclosure = dfa->inveclosures + node;
1866     re_node_set except_nodes;
1867     re_node_set_init_empty (&except_nodes);
1868     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1869       {
1870         int cur_node = inv_eclosure->elems[ecl_idx];
1871         if (cur_node == node)
1872           continue;
1873         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1874           {
1875             int edst1 = dfa->edests[cur_node].elems[0];
1876             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1877                          ? dfa->edests[cur_node].elems[1] : -1);
1878             if ((!re_node_set_contains (inv_eclosure, edst1)
1879                  && re_node_set_contains (dest_nodes, edst1))
1880                 || (edst2 > 0
1881                     && !re_node_set_contains (inv_eclosure, edst2)
1882                     && re_node_set_contains (dest_nodes, edst2)))
1883               {
1884                 err = re_node_set_add_intersect (&except_nodes, candidates,
1885                                                  dfa->inveclosures + cur_node);
1886                 if (BE (err != REG_NOERROR, 0))
1887                   {
1888                     re_node_set_free (&except_nodes);
1889                     return err;
1890                   }
1891               }
1892           }
1893       }
1894     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1895       {
1896         int cur_node = inv_eclosure->elems[ecl_idx];
1897         if (!re_node_set_contains (&except_nodes, cur_node))
1898           {
1899             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1900             re_node_set_remove_at (dest_nodes, idx);
1901           }
1902       }
1903     re_node_set_free (&except_nodes);
1904     return REG_NOERROR;
1905 }
1906
1907 static int
1908 internal_function
1909 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1910                   int dst_node, int dst_idx, int src_node, int src_idx)
1911 {
1912   const re_dfa_t *const dfa = mctx->dfa;
1913   int lim_idx, src_pos, dst_pos;
1914
1915   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1916   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1917   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1918     {
1919       int subexp_idx;
1920       struct re_backref_cache_entry *ent;
1921       ent = mctx->bkref_ents + limits->elems[lim_idx];
1922       subexp_idx = dfa->nodes[ent->node].opr.idx;
1923
1924       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1925                                            subexp_idx, dst_node, dst_idx,
1926                                            dst_bkref_idx);
1927       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1928                                            subexp_idx, src_node, src_idx,
1929                                            src_bkref_idx);
1930
1931       /* In case of:
1932          <src> <dst> ( <subexp> )
1933          ( <subexp> ) <src> <dst>
1934          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1935       if (src_pos == dst_pos)
1936         continue; /* This is unrelated limitation.  */
1937       else
1938         return 1;
1939     }
1940   return 0;
1941 }
1942
1943 static int
1944 internal_function
1945 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1946                              int subexp_idx, int from_node, int bkref_idx)
1947 {
1948   const re_dfa_t *const dfa = mctx->dfa;
1949   const re_node_set *eclosures = dfa->eclosures + from_node;
1950   int node_idx;
1951
1952   /* Else, we are on the boundary: examine the nodes on the epsilon
1953      closure.  */
1954   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1955     {
1956       int node = eclosures->elems[node_idx];
1957       switch (dfa->nodes[node].type)
1958         {
1959         case OP_BACK_REF:
1960           if (bkref_idx != -1)
1961             {
1962               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1963               do
1964                 {
1965                   int dst, cpos;
1966
1967                   if (ent->node != node)
1968                     continue;
1969
1970                   if (subexp_idx < BITSET_WORD_BITS
1971                       && !(ent->eps_reachable_subexps_map
1972                            & ((bitset_word_t) 1 << subexp_idx)))
1973                     continue;
1974
1975                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1976                      OP_CLOSE_SUBEXP cases below.  But, if the
1977                      destination node is the same node as the source
1978                      node, don't recurse because it would cause an
1979                      infinite loop: a regex that exhibits this behavior
1980                      is ()\1*\1*  */
1981                   dst = dfa->edests[node].elems[0];
1982                   if (dst == from_node)
1983                     {
1984                       if (boundaries & 1)
1985                         return -1;
1986                       else /* if (boundaries & 2) */
1987                         return 0;
1988                     }
1989
1990                   cpos =
1991                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1992                                                  dst, bkref_idx);
1993                   if (cpos == -1 /* && (boundaries & 1) */)
1994                     return -1;
1995                   if (cpos == 0 && (boundaries & 2))
1996                     return 0;
1997
1998                   if (subexp_idx < BITSET_WORD_BITS)
1999                     ent->eps_reachable_subexps_map
2000                       &= ~((bitset_word_t) 1 << subexp_idx);
2001                 }
2002               while (ent++->more);
2003             }
2004           break;
2005
2006         case OP_OPEN_SUBEXP:
2007           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2008             return -1;
2009           break;
2010
2011         case OP_CLOSE_SUBEXP:
2012           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2013             return 0;
2014           break;
2015
2016         default:
2017             break;
2018         }
2019     }
2020
2021   return (boundaries & 2) ? 1 : 0;
2022 }
2023
2024 static int
2025 internal_function
2026 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2027                            int subexp_idx, int from_node, int str_idx,
2028                            int bkref_idx)
2029 {
2030   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2031   int boundaries;
2032
2033   /* If we are outside the range of the subexpression, return -1 or 1.  */
2034   if (str_idx < lim->subexp_from)
2035     return -1;
2036
2037   if (lim->subexp_to < str_idx)
2038     return 1;
2039
2040   /* If we are within the subexpression, return 0.  */
2041   boundaries = (str_idx == lim->subexp_from);
2042   boundaries |= (str_idx == lim->subexp_to) << 1;
2043   if (boundaries == 0)
2044     return 0;
2045
2046   /* Else, examine epsilon closure.  */
2047   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2048                                       from_node, bkref_idx);
2049 }
2050
2051 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2052    which are against limitations from DEST_NODES. */
2053
2054 static reg_errcode_t
2055 internal_function
2056 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2057                      const re_node_set *candidates, re_node_set *limits,
2058                      struct re_backref_cache_entry *bkref_ents, int str_idx)
2059 {
2060   reg_errcode_t err;
2061   int node_idx, lim_idx;
2062
2063   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2064     {
2065       int subexp_idx;
2066       struct re_backref_cache_entry *ent;
2067       ent = bkref_ents + limits->elems[lim_idx];
2068
2069       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2070         continue; /* This is unrelated limitation.  */
2071
2072       subexp_idx = dfa->nodes[ent->node].opr.idx;
2073       if (ent->subexp_to == str_idx)
2074         {
2075           int ops_node = -1;
2076           int cls_node = -1;
2077           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2078             {
2079               int node = dest_nodes->elems[node_idx];
2080               re_token_type_t type = dfa->nodes[node].type;
2081               if (type == OP_OPEN_SUBEXP
2082                   && subexp_idx == dfa->nodes[node].opr.idx)
2083                 ops_node = node;
2084               else if (type == OP_CLOSE_SUBEXP
2085                        && subexp_idx == dfa->nodes[node].opr.idx)
2086                 cls_node = node;
2087             }
2088
2089           /* Check the limitation of the open subexpression.  */
2090           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2091           if (ops_node >= 0)
2092             {
2093               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2094                                            candidates);
2095               if (BE (err != REG_NOERROR, 0))
2096                 return err;
2097             }
2098
2099           /* Check the limitation of the close subexpression.  */
2100           if (cls_node >= 0)
2101             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2102               {
2103                 int node = dest_nodes->elems[node_idx];
2104                 if (!re_node_set_contains (dfa->inveclosures + node,
2105                                            cls_node)
2106                     && !re_node_set_contains (dfa->eclosures + node,
2107                                               cls_node))
2108                   {
2109                     /* It is against this limitation.
2110                        Remove it form the current sifted state.  */
2111                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2112                                                  candidates);
2113                     if (BE (err != REG_NOERROR, 0))
2114                       return err;
2115                     --node_idx;
2116                   }
2117               }
2118         }
2119       else /* (ent->subexp_to != str_idx)  */
2120         {
2121           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2122             {
2123               int node = dest_nodes->elems[node_idx];
2124               re_token_type_t type = dfa->nodes[node].type;
2125               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2126                 {
2127                   if (subexp_idx != dfa->nodes[node].opr.idx)
2128                     continue;
2129                   /* It is against this limitation.
2130                      Remove it form the current sifted state.  */
2131                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2132                                                candidates);
2133                   if (BE (err != REG_NOERROR, 0))
2134                     return err;
2135                 }
2136             }
2137         }
2138     }
2139   return REG_NOERROR;
2140 }
2141
2142 static reg_errcode_t
2143 internal_function __attribute_warn_unused_result__
2144 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2145                    int str_idx, const re_node_set *candidates)
2146 {
2147   const re_dfa_t *const dfa = mctx->dfa;
2148   reg_errcode_t err;
2149   int node_idx, node;
2150   re_sift_context_t local_sctx;
2151   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2152
2153   if (first_idx == -1)
2154     return REG_NOERROR;
2155
2156   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2157
2158   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2159     {
2160       int enabled_idx;
2161       re_token_type_t type;
2162       struct re_backref_cache_entry *entry;
2163       node = candidates->elems[node_idx];
2164       type = dfa->nodes[node].type;
2165       /* Avoid infinite loop for the REs like "()\1+".  */
2166       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2167         continue;
2168       if (type != OP_BACK_REF)
2169         continue;
2170
2171       entry = mctx->bkref_ents + first_idx;
2172       enabled_idx = first_idx;
2173       do
2174         {
2175           int subexp_len;
2176           int to_idx;
2177           int dst_node;
2178           int ret;
2179           re_dfastate_t *cur_state;
2180
2181           if (entry->node != node)
2182             continue;
2183           subexp_len = entry->subexp_to - entry->subexp_from;
2184           to_idx = str_idx + subexp_len;
2185           dst_node = (subexp_len ? dfa->nexts[node]
2186                       : dfa->edests[node].elems[0]);
2187
2188           if (to_idx > sctx->last_str_idx
2189               || sctx->sifted_states[to_idx] == NULL
2190               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2191               || check_dst_limits (mctx, &sctx->limits, node,
2192                                    str_idx, dst_node, to_idx))
2193             continue;
2194
2195           if (local_sctx.sifted_states == NULL)
2196             {
2197               local_sctx = *sctx;
2198               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2199               if (BE (err != REG_NOERROR, 0))
2200                 goto free_return;
2201             }
2202           local_sctx.last_node = node;
2203           local_sctx.last_str_idx = str_idx;
2204           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2205           if (BE (ret < 0, 0))
2206             {
2207               err = REG_ESPACE;
2208               goto free_return;
2209             }
2210           cur_state = local_sctx.sifted_states[str_idx];
2211           err = sift_states_backward (mctx, &local_sctx);
2212           if (BE (err != REG_NOERROR, 0))
2213             goto free_return;
2214           if (sctx->limited_states != NULL)
2215             {
2216               err = merge_state_array (dfa, sctx->limited_states,
2217                                        local_sctx.sifted_states,
2218                                        str_idx + 1);
2219               if (BE (err != REG_NOERROR, 0))
2220                 goto free_return;
2221             }
2222           local_sctx.sifted_states[str_idx] = cur_state;
2223           re_node_set_remove (&local_sctx.limits, enabled_idx);
2224
2225           /* mctx->bkref_ents may have changed, reload the pointer.  */
2226           entry = mctx->bkref_ents + enabled_idx;
2227         }
2228       while (enabled_idx++, entry++->more);
2229     }
2230   err = REG_NOERROR;
2231  free_return:
2232   if (local_sctx.sifted_states != NULL)
2233     {
2234       re_node_set_free (&local_sctx.limits);
2235     }
2236
2237   return err;
2238 }
2239
2240
2241 #ifdef RE_ENABLE_I18N
2242 static int
2243 internal_function
2244 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2245                      int node_idx, int str_idx, int max_str_idx)
2246 {
2247   const re_dfa_t *const dfa = mctx->dfa;
2248   int naccepted;
2249   /* Check the node can accept `multi byte'.  */
2250   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2251   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2252       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2253                             dfa->nexts[node_idx]))
2254     /* The node can't accept the `multi byte', or the
2255        destination was already thrown away, then the node
2256        could't accept the current input `multi byte'.   */
2257     naccepted = 0;
2258   /* Otherwise, it is sure that the node could accept
2259      `naccepted' bytes input.  */
2260   return naccepted;
2261 }
2262 #endif /* RE_ENABLE_I18N */
2263
2264 \f
2265 /* Functions for state transition.  */
2266
2267 /* Return the next state to which the current state STATE will transit by
2268    accepting the current input byte, and update STATE_LOG if necessary.
2269    If STATE can accept a multibyte char/collating element/back reference
2270    update the destination of STATE_LOG.  */
2271
2272 static re_dfastate_t *
2273 internal_function __attribute_warn_unused_result__
2274 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2275                re_dfastate_t *state)
2276 {
2277   re_dfastate_t **trtable;
2278   unsigned char ch;
2279
2280 #ifdef RE_ENABLE_I18N
2281   /* If the current state can accept multibyte.  */
2282   if (BE (state->accept_mb, 0))
2283     {
2284       *err = transit_state_mb (mctx, state);
2285       if (BE (*err != REG_NOERROR, 0))
2286         return NULL;
2287     }
2288 #endif /* RE_ENABLE_I18N */
2289
2290   /* Then decide the next state with the single byte.  */
2291 #if 0
2292   if (0)
2293     /* don't use transition table  */
2294     return transit_state_sb (err, mctx, state);
2295 #endif
2296
2297   /* Use transition table  */
2298   ch = re_string_fetch_byte (&mctx->input);
2299   for (;;)
2300     {
2301       trtable = state->trtable;
2302       if (BE (trtable != NULL, 1))
2303         return trtable[ch];
2304
2305       trtable = state->word_trtable;
2306       if (BE (trtable != NULL, 1))
2307         {
2308           unsigned int context;
2309           context
2310             = re_string_context_at (&mctx->input,
2311                                     re_string_cur_idx (&mctx->input) - 1,
2312                                     mctx->eflags);
2313           if (IS_WORD_CONTEXT (context))
2314             return trtable[ch + SBC_MAX];
2315           else
2316             return trtable[ch];
2317         }
2318
2319       if (!build_trtable (mctx->dfa, state))
2320         {
2321           *err = REG_ESPACE;
2322           return NULL;
2323         }
2324
2325       /* Retry, we now have a transition table.  */
2326     }
2327 }
2328
2329 /* Update the state_log if we need */
2330 re_dfastate_t *
2331 internal_function
2332 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2333                       re_dfastate_t *next_state)
2334 {
2335   const re_dfa_t *const dfa = mctx->dfa;
2336   int cur_idx = re_string_cur_idx (&mctx->input);
2337
2338   if (cur_idx > mctx->state_log_top)
2339     {
2340       mctx->state_log[cur_idx] = next_state;
2341       mctx->state_log_top = cur_idx;
2342     }
2343   else if (mctx->state_log[cur_idx] == 0)
2344     {
2345       mctx->state_log[cur_idx] = next_state;
2346     }
2347   else
2348     {
2349       re_dfastate_t *pstate;
2350       unsigned int context;
2351       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2352       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2353          the destination of a multibyte char/collating element/
2354          back reference.  Then the next state is the union set of
2355          these destinations and the results of the transition table.  */
2356       pstate = mctx->state_log[cur_idx];
2357       log_nodes = pstate->entrance_nodes;
2358       if (next_state != NULL)
2359         {
2360           table_nodes = next_state->entrance_nodes;
2361           *err = re_node_set_init_union (&next_nodes, table_nodes,
2362                                              log_nodes);
2363           if (BE (*err != REG_NOERROR, 0))
2364             return NULL;
2365         }
2366       else
2367         next_nodes = *log_nodes;
2368       /* Note: We already add the nodes of the initial state,
2369          then we don't need to add them here.  */
2370
2371       context = re_string_context_at (&mctx->input,
2372                                       re_string_cur_idx (&mctx->input) - 1,
2373                                       mctx->eflags);
2374       next_state = mctx->state_log[cur_idx]
2375         = re_acquire_state_context (err, dfa, &next_nodes, context);
2376       /* We don't need to check errors here, since the return value of
2377          this function is next_state and ERR is already set.  */
2378
2379       if (table_nodes != NULL)
2380         re_node_set_free (&next_nodes);
2381     }
2382
2383   if (BE (dfa->nbackref, 0) && next_state != NULL)
2384     {
2385       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2386          later.  We must check them here, since the back references in the
2387          next state might use them.  */
2388       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2389                                         cur_idx);
2390       if (BE (*err != REG_NOERROR, 0))
2391         return NULL;
2392
2393       /* If the next state has back references.  */
2394       if (next_state->has_backref)
2395         {
2396           *err = transit_state_bkref (mctx, &next_state->nodes);
2397           if (BE (*err != REG_NOERROR, 0))
2398             return NULL;
2399           next_state = mctx->state_log[cur_idx];
2400         }
2401     }
2402
2403   return next_state;
2404 }
2405
2406 /* Skip bytes in the input that correspond to part of a
2407    multi-byte match, then look in the log for a state
2408    from which to restart matching.  */
2409 re_dfastate_t *
2410 internal_function
2411 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2412 {
2413   re_dfastate_t *cur_state;
2414   do
2415     {
2416       int max = mctx->state_log_top;
2417       int cur_str_idx = re_string_cur_idx (&mctx->input);
2418
2419       do
2420         {
2421           if (++cur_str_idx > max)
2422             return NULL;
2423           re_string_skip_bytes (&mctx->input, 1);
2424         }
2425       while (mctx->state_log[cur_str_idx] == NULL);
2426
2427       cur_state = merge_state_with_log (err, mctx, NULL);
2428     }
2429   while (*err == REG_NOERROR && cur_state == NULL);
2430   return cur_state;
2431 }
2432
2433 /* Helper functions for transit_state.  */
2434
2435 /* From the node set CUR_NODES, pick up the nodes whose types are
2436    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2437    expression. And register them to use them later for evaluating the
2438    correspoding back references.  */
2439
2440 static reg_errcode_t
2441 internal_function
2442 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2443                            int str_idx)
2444 {
2445   const re_dfa_t *const dfa = mctx->dfa;
2446   int node_idx;
2447   reg_errcode_t err;
2448
2449   /* TODO: This isn't efficient.
2450            Because there might be more than one nodes whose types are
2451            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2452            nodes.
2453            E.g. RE: (a){2}  */
2454   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2455     {
2456       int node = cur_nodes->elems[node_idx];
2457       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2458           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2459           && (dfa->used_bkref_map
2460               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2461         {
2462           err = match_ctx_add_subtop (mctx, node, str_idx);
2463           if (BE (err != REG_NOERROR, 0))
2464             return err;
2465         }
2466     }
2467   return REG_NOERROR;
2468 }
2469
2470 #if 0
2471 /* Return the next state to which the current state STATE will transit by
2472    accepting the current input byte.  */
2473
2474 static re_dfastate_t *
2475 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2476                   re_dfastate_t *state)
2477 {
2478   const re_dfa_t *const dfa = mctx->dfa;
2479   re_node_set next_nodes;
2480   re_dfastate_t *next_state;
2481   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2482   unsigned int context;
2483
2484   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2485   if (BE (*err != REG_NOERROR, 0))
2486     return NULL;
2487   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2488     {
2489       int cur_node = state->nodes.elems[node_cnt];
2490       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2491         {
2492           *err = re_node_set_merge (&next_nodes,
2493                                     dfa->eclosures + dfa->nexts[cur_node]);
2494           if (BE (*err != REG_NOERROR, 0))
2495             {
2496               re_node_set_free (&next_nodes);
2497               return NULL;
2498             }
2499         }
2500     }
2501   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2502   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2503   /* We don't need to check errors here, since the return value of
2504      this function is next_state and ERR is already set.  */
2505
2506   re_node_set_free (&next_nodes);
2507   re_string_skip_bytes (&mctx->input, 1);
2508   return next_state;
2509 }
2510 #endif
2511
2512 #ifdef RE_ENABLE_I18N
2513 static reg_errcode_t
2514 internal_function
2515 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2516 {
2517   const re_dfa_t *const dfa = mctx->dfa;
2518   reg_errcode_t err;
2519   int i;
2520
2521   for (i = 0; i < pstate->nodes.nelem; ++i)
2522     {
2523       re_node_set dest_nodes, *new_nodes;
2524       int cur_node_idx = pstate->nodes.elems[i];
2525       int naccepted, dest_idx;
2526       unsigned int context;
2527       re_dfastate_t *dest_state;
2528
2529       if (!dfa->nodes[cur_node_idx].accept_mb)
2530         continue;
2531
2532       if (dfa->nodes[cur_node_idx].constraint)
2533         {
2534           context = re_string_context_at (&mctx->input,
2535                                           re_string_cur_idx (&mctx->input),
2536                                           mctx->eflags);
2537           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2538                                            context))
2539             continue;
2540         }
2541
2542       /* How many bytes the node can accept?  */
2543       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2544                                            re_string_cur_idx (&mctx->input));
2545       if (naccepted == 0)
2546         continue;
2547
2548       /* The node can accepts `naccepted' bytes.  */
2549       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2550       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2551                                : mctx->max_mb_elem_len);
2552       err = clean_state_log_if_needed (mctx, dest_idx);
2553       if (BE (err != REG_NOERROR, 0))
2554         return err;
2555 #ifdef DEBUG
2556       assert (dfa->nexts[cur_node_idx] != -1);
2557 #endif
2558       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2559
2560       dest_state = mctx->state_log[dest_idx];
2561       if (dest_state == NULL)
2562         dest_nodes = *new_nodes;
2563       else
2564         {
2565           err = re_node_set_init_union (&dest_nodes,
2566                                         dest_state->entrance_nodes, new_nodes);
2567           if (BE (err != REG_NOERROR, 0))
2568             return err;
2569         }
2570       context = re_string_context_at (&mctx->input, dest_idx - 1,
2571                                       mctx->eflags);
2572       mctx->state_log[dest_idx]
2573         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2574       if (dest_state != NULL)
2575         re_node_set_free (&dest_nodes);
2576       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2577         return err;
2578     }
2579   return REG_NOERROR;
2580 }
2581 #endif /* RE_ENABLE_I18N */
2582
2583 static reg_errcode_t
2584 internal_function
2585 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2586 {
2587   const re_dfa_t *const dfa = mctx->dfa;
2588   reg_errcode_t err;
2589   int i;
2590   int cur_str_idx = re_string_cur_idx (&mctx->input);
2591
2592   for (i = 0; i < nodes->nelem; ++i)
2593     {
2594       int dest_str_idx, prev_nelem, bkc_idx;
2595       int node_idx = nodes->elems[i];
2596       unsigned int context;
2597       const re_token_t *node = dfa->nodes + node_idx;
2598       re_node_set *new_dest_nodes;
2599
2600       /* Check whether `node' is a backreference or not.  */
2601       if (node->type != OP_BACK_REF)
2602         continue;
2603
2604       if (node->constraint)
2605         {
2606           context = re_string_context_at (&mctx->input, cur_str_idx,
2607                                           mctx->eflags);
2608           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2609             continue;
2610         }
2611
2612       /* `node' is a backreference.
2613          Check the substring which the substring matched.  */
2614       bkc_idx = mctx->nbkref_ents;
2615       err = get_subexp (mctx, node_idx, cur_str_idx);
2616       if (BE (err != REG_NOERROR, 0))
2617         goto free_return;
2618
2619       /* And add the epsilon closures (which is `new_dest_nodes') of
2620          the backreference to appropriate state_log.  */
2621 #ifdef DEBUG
2622       assert (dfa->nexts[node_idx] != -1);
2623 #endif
2624       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2625         {
2626           int subexp_len;
2627           re_dfastate_t *dest_state;
2628           struct re_backref_cache_entry *bkref_ent;
2629           bkref_ent = mctx->bkref_ents + bkc_idx;
2630           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2631             continue;
2632           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2633           new_dest_nodes = (subexp_len == 0
2634                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2635                             : dfa->eclosures + dfa->nexts[node_idx]);
2636           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2637                           - bkref_ent->subexp_from);
2638           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2639                                           mctx->eflags);
2640           dest_state = mctx->state_log[dest_str_idx];
2641           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2642                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2643           /* Add `new_dest_node' to state_log.  */
2644           if (dest_state == NULL)
2645             {
2646               mctx->state_log[dest_str_idx]
2647                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2648                                             context);
2649               if (BE (mctx->state_log[dest_str_idx] == NULL
2650                       && err != REG_NOERROR, 0))
2651                 goto free_return;
2652             }
2653           else
2654             {
2655               re_node_set dest_nodes;
2656               err = re_node_set_init_union (&dest_nodes,
2657                                             dest_state->entrance_nodes,
2658                                             new_dest_nodes);
2659               if (BE (err != REG_NOERROR, 0))
2660                 {
2661                   re_node_set_free (&dest_nodes);
2662                   goto free_return;
2663                 }
2664               mctx->state_log[dest_str_idx]
2665                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2666               re_node_set_free (&dest_nodes);
2667               if (BE (mctx->state_log[dest_str_idx] == NULL
2668                       && err != REG_NOERROR, 0))
2669                 goto free_return;
2670             }
2671           /* We need to check recursively if the backreference can epsilon
2672              transit.  */
2673           if (subexp_len == 0
2674               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2675             {
2676               err = check_subexp_matching_top (mctx, new_dest_nodes,
2677                                                cur_str_idx);
2678               if (BE (err != REG_NOERROR, 0))
2679                 goto free_return;
2680               err = transit_state_bkref (mctx, new_dest_nodes);
2681               if (BE (err != REG_NOERROR, 0))
2682                 goto free_return;
2683             }
2684         }
2685     }
2686   err = REG_NOERROR;
2687  free_return:
2688   return err;
2689 }
2690
2691 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2692    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2693    Note that we might collect inappropriate candidates here.
2694    However, the cost of checking them strictly here is too high, then we
2695    delay these checking for prune_impossible_nodes().  */
2696
2697 static reg_errcode_t
2698 internal_function __attribute_warn_unused_result__
2699 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2700 {
2701   const re_dfa_t *const dfa = mctx->dfa;
2702   int subexp_num, sub_top_idx;
2703   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2704   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2705   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2706   if (cache_idx != -1)
2707     {
2708       const struct re_backref_cache_entry *entry
2709         = mctx->bkref_ents + cache_idx;
2710       do
2711         if (entry->node == bkref_node)
2712           return REG_NOERROR; /* We already checked it.  */
2713       while (entry++->more);
2714     }
2715
2716   subexp_num = dfa->nodes[bkref_node].opr.idx;
2717
2718   /* For each sub expression  */
2719   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2720     {
2721       reg_errcode_t err;
2722       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2723       re_sub_match_last_t *sub_last;
2724       int sub_last_idx, sl_str, bkref_str_off;
2725
2726       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2727         continue; /* It isn't related.  */
2728
2729       sl_str = sub_top->str_idx;
2730       bkref_str_off = bkref_str_idx;
2731       /* At first, check the last node of sub expressions we already
2732          evaluated.  */
2733       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2734         {
2735           int sl_str_diff;
2736           sub_last = sub_top->lasts[sub_last_idx];
2737           sl_str_diff = sub_last->str_idx - sl_str;
2738           /* The matched string by the sub expression match with the substring
2739              at the back reference?  */
2740           if (sl_str_diff > 0)
2741             {
2742               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2743                 {
2744                   /* Not enough chars for a successful match.  */
2745                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2746                     break;
2747
2748                   err = clean_state_log_if_needed (mctx,
2749                                                    bkref_str_off
2750                                                    + sl_str_diff);
2751                   if (BE (err != REG_NOERROR, 0))
2752                     return err;
2753                   buf = (const char *) re_string_get_buffer (&mctx->input);
2754                 }
2755               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2756                 /* We don't need to search this sub expression any more.  */
2757                 break;
2758             }
2759           bkref_str_off += sl_str_diff;
2760           sl_str += sl_str_diff;
2761           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2762                                 bkref_str_idx);
2763
2764           /* Reload buf, since the preceding call might have reallocated
2765              the buffer.  */
2766           buf = (const char *) re_string_get_buffer (&mctx->input);
2767
2768           if (err == REG_NOMATCH)
2769             continue;
2770           if (BE (err != REG_NOERROR, 0))
2771             return err;
2772         }
2773
2774       if (sub_last_idx < sub_top->nlasts)
2775         continue;
2776       if (sub_last_idx > 0)
2777         ++sl_str;
2778       /* Then, search for the other last nodes of the sub expression.  */
2779       for (; sl_str <= bkref_str_idx; ++sl_str)
2780         {
2781           int cls_node, sl_str_off;
2782           const re_node_set *nodes;
2783           sl_str_off = sl_str - sub_top->str_idx;
2784           /* The matched string by the sub expression match with the substring
2785              at the back reference?  */
2786           if (sl_str_off > 0)
2787             {
2788               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2789                 {
2790                   /* If we are at the end of the input, we cannot match.  */
2791                   if (bkref_str_off >= mctx->input.len)
2792                     break;
2793
2794                   err = extend_buffers (mctx);
2795                   if (BE (err != REG_NOERROR, 0))
2796                     return err;
2797
2798                   buf = (const char *) re_string_get_buffer (&mctx->input);
2799                 }
2800               if (buf [bkref_str_off++] != buf[sl_str - 1])
2801                 break; /* We don't need to search this sub expression
2802                           any more.  */
2803             }
2804           if (mctx->state_log[sl_str] == NULL)
2805             continue;
2806           /* Does this state have a ')' of the sub expression?  */
2807           nodes = &mctx->state_log[sl_str]->nodes;
2808           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2809                                        OP_CLOSE_SUBEXP);
2810           if (cls_node == -1)
2811             continue; /* No.  */
2812           if (sub_top->path == NULL)
2813             {
2814               sub_top->path = calloc (sizeof (state_array_t),
2815                                       sl_str - sub_top->str_idx + 1);
2816               if (sub_top->path == NULL)
2817                 return REG_ESPACE;
2818             }
2819           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2820              in the current context?  */
2821           err = check_arrival (mctx, sub_top->path, sub_top->node,
2822                                sub_top->str_idx, cls_node, sl_str,
2823                                OP_CLOSE_SUBEXP);
2824           if (err == REG_NOMATCH)
2825               continue;
2826           if (BE (err != REG_NOERROR, 0))
2827               return err;
2828           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2829           if (BE (sub_last == NULL, 0))
2830             return REG_ESPACE;
2831           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2832                                 bkref_str_idx);
2833           if (err == REG_NOMATCH)
2834             continue;
2835         }
2836     }
2837   return REG_NOERROR;
2838 }
2839
2840 /* Helper functions for get_subexp().  */
2841
2842 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2843    If it can arrive, register the sub expression expressed with SUB_TOP
2844    and SUB_LAST.  */
2845
2846 static reg_errcode_t
2847 internal_function
2848 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2849                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2850 {
2851   reg_errcode_t err;
2852   int to_idx;
2853   /* Can the subexpression arrive the back reference?  */
2854   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2855                        sub_last->str_idx, bkref_node, bkref_str,
2856                        OP_OPEN_SUBEXP);
2857   if (err != REG_NOERROR)
2858     return err;
2859   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2860                              sub_last->str_idx);
2861   if (BE (err != REG_NOERROR, 0))
2862     return err;
2863   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2864   return clean_state_log_if_needed (mctx, to_idx);
2865 }
2866
2867 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2868    Search '(' if FL_OPEN, or search ')' otherwise.
2869    TODO: This function isn't efficient...
2870          Because there might be more than one nodes whose types are
2871          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2872          nodes.
2873          E.g. RE: (a){2}  */
2874
2875 static int
2876 internal_function
2877 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2878                   int subexp_idx, int type)
2879 {
2880   int cls_idx;
2881   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2882     {
2883       int cls_node = nodes->elems[cls_idx];
2884       const re_token_t *node = dfa->nodes + cls_node;
2885       if (node->type == type
2886           && node->opr.idx == subexp_idx)
2887         return cls_node;
2888     }
2889   return -1;
2890 }
2891
2892 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2893    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2894    heavily reused.
2895    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2896
2897 static reg_errcode_t
2898 internal_function __attribute_warn_unused_result__
2899 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2900                int top_str, int last_node, int last_str, int type)
2901 {
2902   const re_dfa_t *const dfa = mctx->dfa;
2903   reg_errcode_t err = REG_NOERROR;
2904   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2905   re_dfastate_t *cur_state = NULL;
2906   re_node_set *cur_nodes, next_nodes;
2907   re_dfastate_t **backup_state_log;
2908   unsigned int context;
2909
2910   subexp_num = dfa->nodes[top_node].opr.idx;
2911   /* Extend the buffer if we need.  */
2912   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2913     {
2914       re_dfastate_t **new_array;
2915       int old_alloc = path->alloc;
2916       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2917       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2918       if (BE (new_array == NULL, 0))
2919         {
2920           path->alloc = old_alloc;
2921           return REG_ESPACE;
2922         }
2923       path->array = new_array;
2924       memset (new_array + old_alloc, '\0',
2925               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2926     }
2927
2928   str_idx = path->next_idx ?: top_str;
2929
2930   /* Temporary modify MCTX.  */
2931   backup_state_log = mctx->state_log;
2932   backup_cur_idx = mctx->input.cur_idx;
2933   mctx->state_log = path->array;
2934   mctx->input.cur_idx = str_idx;
2935
2936   /* Setup initial node set.  */
2937   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2938   if (str_idx == top_str)
2939     {
2940       err = re_node_set_init_1 (&next_nodes, top_node);
2941       if (BE (err != REG_NOERROR, 0))
2942         return err;
2943       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2944       if (BE (err != REG_NOERROR, 0))
2945         {
2946           re_node_set_free (&next_nodes);
2947           return err;
2948         }
2949     }
2950   else
2951     {
2952       cur_state = mctx->state_log[str_idx];
2953       if (cur_state && cur_state->has_backref)
2954         {
2955           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2956           if (BE (err != REG_NOERROR, 0))
2957             return err;
2958         }
2959       else
2960         re_node_set_init_empty (&next_nodes);
2961     }
2962   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2963     {
2964       if (next_nodes.nelem)
2965         {
2966           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2967                                     subexp_num, type);
2968           if (BE (err != REG_NOERROR, 0))
2969             {
2970               re_node_set_free (&next_nodes);
2971               return err;
2972             }
2973         }
2974       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2975       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2976         {
2977           re_node_set_free (&next_nodes);
2978           return err;
2979         }
2980       mctx->state_log[str_idx] = cur_state;
2981     }
2982
2983   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2984     {
2985       re_node_set_empty (&next_nodes);
2986       if (mctx->state_log[str_idx + 1])
2987         {
2988           err = re_node_set_merge (&next_nodes,
2989                                    &mctx->state_log[str_idx + 1]->nodes);
2990           if (BE (err != REG_NOERROR, 0))
2991             {
2992               re_node_set_free (&next_nodes);
2993               return err;
2994             }
2995         }
2996       if (cur_state)
2997         {
2998           err = check_arrival_add_next_nodes (mctx, str_idx,
2999                                               &cur_state->non_eps_nodes,
3000                                               &next_nodes);
3001           if (BE (err != REG_NOERROR, 0))
3002             {
3003               re_node_set_free (&next_nodes);
3004               return err;
3005             }
3006         }
3007       ++str_idx;
3008       if (next_nodes.nelem)
3009         {
3010           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3011           if (BE (err != REG_NOERROR, 0))
3012             {
3013               re_node_set_free (&next_nodes);
3014               return err;
3015             }
3016           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3017                                     subexp_num, type);
3018           if (BE (err != REG_NOERROR, 0))
3019             {
3020               re_node_set_free (&next_nodes);
3021               return err;
3022             }
3023         }
3024       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3025       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3026       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3027         {
3028           re_node_set_free (&next_nodes);
3029           return err;
3030         }
3031       mctx->state_log[str_idx] = cur_state;
3032       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3033     }
3034   re_node_set_free (&next_nodes);
3035   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3036                : &mctx->state_log[last_str]->nodes);
3037   path->next_idx = str_idx;
3038
3039   /* Fix MCTX.  */
3040   mctx->state_log = backup_state_log;
3041   mctx->input.cur_idx = backup_cur_idx;
3042
3043   /* Then check the current node set has the node LAST_NODE.  */
3044   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3045     return REG_NOERROR;
3046
3047   return REG_NOMATCH;
3048 }
3049
3050 /* Helper functions for check_arrival.  */
3051
3052 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3053    to NEXT_NODES.
3054    TODO: This function is similar to the functions transit_state*(),
3055          however this function has many additional works.
3056          Can't we unify them?  */
3057
3058 static reg_errcode_t
3059 internal_function __attribute_warn_unused_result__
3060 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3061                               re_node_set *cur_nodes, re_node_set *next_nodes)
3062 {
3063   const re_dfa_t *const dfa = mctx->dfa;
3064   int result;
3065   int cur_idx;
3066   reg_errcode_t err = REG_NOERROR;
3067   re_node_set union_set;
3068   re_node_set_init_empty (&union_set);
3069   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3070     {
3071       int naccepted = 0;
3072       int cur_node = cur_nodes->elems[cur_idx];
3073 #ifdef DEBUG
3074       re_token_type_t type = dfa->nodes[cur_node].type;
3075       assert (!IS_EPSILON_NODE (type));
3076 #endif
3077 #ifdef RE_ENABLE_I18N
3078       /* If the node may accept `multi byte'.  */
3079       if (dfa->nodes[cur_node].accept_mb)
3080         {
3081           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3082                                                str_idx);
3083           if (naccepted > 1)
3084             {
3085               re_dfastate_t *dest_state;
3086               int next_node = dfa->nexts[cur_node];
3087               int next_idx = str_idx + naccepted;
3088               dest_state = mctx->state_log[next_idx];
3089               re_node_set_empty (&union_set);
3090               if (dest_state)
3091                 {
3092                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3093                   if (BE (err != REG_NOERROR, 0))
3094                     {
3095                       re_node_set_free (&union_set);
3096                       return err;
3097                     }
3098                 }
3099               result = re_node_set_insert (&union_set, next_node);
3100               if (BE (result < 0, 0))
3101                 {
3102                   re_node_set_free (&union_set);
3103                   return REG_ESPACE;
3104                 }
3105               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3106                                                             &union_set);
3107               if (BE (mctx->state_log[next_idx] == NULL
3108                       && err != REG_NOERROR, 0))
3109                 {
3110                   re_node_set_free (&union_set);
3111                   return err;
3112                 }
3113             }
3114         }
3115 #endif /* RE_ENABLE_I18N */
3116       if (naccepted
3117           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3118         {
3119           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3120           if (BE (result < 0, 0))
3121             {
3122               re_node_set_free (&union_set);
3123               return REG_ESPACE;
3124             }
3125         }
3126     }
3127   re_node_set_free (&union_set);
3128   return REG_NOERROR;
3129 }
3130
3131 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3132    CUR_NODES, however exclude the nodes which are:
3133     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3134     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3135 */
3136
3137 static reg_errcode_t
3138 internal_function
3139 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3140                           int ex_subexp, int type)
3141 {
3142   reg_errcode_t err;
3143   int idx, outside_node;
3144   re_node_set new_nodes;
3145 #ifdef DEBUG
3146   assert (cur_nodes->nelem);
3147 #endif
3148   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3149   if (BE (err != REG_NOERROR, 0))
3150     return err;
3151   /* Create a new node set NEW_NODES with the nodes which are epsilon
3152      closures of the node in CUR_NODES.  */
3153
3154   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3155     {
3156       int cur_node = cur_nodes->elems[idx];
3157       const re_node_set *eclosure = dfa->eclosures + cur_node;
3158       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3159       if (outside_node == -1)
3160         {
3161           /* There are no problematic nodes, just merge them.  */
3162           err = re_node_set_merge (&new_nodes, eclosure);
3163           if (BE (err != REG_NOERROR, 0))
3164             {
3165               re_node_set_free (&new_nodes);
3166               return err;
3167             }
3168         }
3169       else
3170         {
3171           /* There are problematic nodes, re-calculate incrementally.  */
3172           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3173                                               ex_subexp, type);
3174           if (BE (err != REG_NOERROR, 0))
3175             {
3176               re_node_set_free (&new_nodes);
3177               return err;
3178             }
3179         }
3180     }
3181   re_node_set_free (cur_nodes);
3182   *cur_nodes = new_nodes;
3183   return REG_NOERROR;
3184 }
3185
3186 /* Helper function for check_arrival_expand_ecl.
3187    Check incrementally the epsilon closure of TARGET, and if it isn't
3188    problematic append it to DST_NODES.  */
3189
3190 static reg_errcode_t
3191 internal_function __attribute_warn_unused_result__
3192 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3193                               int target, int ex_subexp, int type)
3194 {
3195   int cur_node;
3196   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3197     {
3198       int err;
3199
3200       if (dfa->nodes[cur_node].type == type
3201           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3202         {
3203           if (type == OP_CLOSE_SUBEXP)
3204             {
3205               err = re_node_set_insert (dst_nodes, cur_node);
3206               if (BE (err == -1, 0))
3207                 return REG_ESPACE;
3208             }
3209           break;
3210         }
3211       err = re_node_set_insert (dst_nodes, cur_node);
3212       if (BE (err == -1, 0))
3213         return REG_ESPACE;
3214       if (dfa->edests[cur_node].nelem == 0)
3215         break;
3216       if (dfa->edests[cur_node].nelem == 2)
3217         {
3218           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3219                                               dfa->edests[cur_node].elems[1],
3220                                               ex_subexp, type);
3221           if (BE (err != REG_NOERROR, 0))
3222             return err;
3223         }
3224       cur_node = dfa->edests[cur_node].elems[0];
3225     }
3226   return REG_NOERROR;
3227 }
3228
3229
3230 /* For all the back references in the current state, calculate the
3231    destination of the back references by the appropriate entry
3232    in MCTX->BKREF_ENTS.  */
3233
3234 static reg_errcode_t
3235 internal_function __attribute_warn_unused_result__
3236 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3237                     int cur_str, int subexp_num, int type)
3238 {
3239   const re_dfa_t *const dfa = mctx->dfa;
3240   reg_errcode_t err;
3241   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3242   struct re_backref_cache_entry *ent;
3243
3244   if (cache_idx_start == -1)
3245     return REG_NOERROR;
3246
3247  restart:
3248   ent = mctx->bkref_ents + cache_idx_start;
3249   do
3250     {
3251       int to_idx, next_node;
3252
3253       /* Is this entry ENT is appropriate?  */
3254       if (!re_node_set_contains (cur_nodes, ent->node))
3255         continue; /* No.  */
3256
3257       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3258       /* Calculate the destination of the back reference, and append it
3259          to MCTX->STATE_LOG.  */
3260       if (to_idx == cur_str)
3261         {
3262           /* The backreference did epsilon transit, we must re-check all the
3263              node in the current state.  */
3264           re_node_set new_dests;
3265           reg_errcode_t err2, err3;
3266           next_node = dfa->edests[ent->node].elems[0];
3267           if (re_node_set_contains (cur_nodes, next_node))
3268             continue;
3269           err = re_node_set_init_1 (&new_dests, next_node);
3270           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3271           err3 = re_node_set_merge (cur_nodes, &new_dests);
3272           re_node_set_free (&new_dests);
3273           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3274                   || err3 != REG_NOERROR, 0))
3275             {
3276               err = (err != REG_NOERROR ? err
3277                      : (err2 != REG_NOERROR ? err2 : err3));
3278               return err;
3279             }
3280           /* TODO: It is still inefficient...  */
3281           goto restart;
3282         }
3283       else
3284         {
3285           re_node_set union_set;
3286           next_node = dfa->nexts[ent->node];
3287           if (mctx->state_log[to_idx])
3288             {
3289               int ret;
3290               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3291                                         next_node))
3292                 continue;
3293               err = re_node_set_init_copy (&union_set,
3294                                            &mctx->state_log[to_idx]->nodes);
3295               ret = re_node_set_insert (&union_set, next_node);
3296               if (BE (err != REG_NOERROR || ret < 0, 0))
3297                 {
3298                   re_node_set_free (&union_set);
3299                   err = err != REG_NOERROR ? err : REG_ESPACE;
3300                   return err;
3301                 }
3302             }
3303           else
3304             {
3305               err = re_node_set_init_1 (&union_set, next_node);
3306               if (BE (err != REG_NOERROR, 0))
3307                 return err;
3308             }
3309           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3310           re_node_set_free (&union_set);
3311           if (BE (mctx->state_log[to_idx] == NULL
3312                   && err != REG_NOERROR, 0))
3313             return err;
3314         }
3315     }
3316   while (ent++->more);
3317   return REG_NOERROR;
3318 }
3319
3320 /* Build transition table for the state.
3321    Return 1 if succeeded, otherwise return NULL.  */
3322
3323 static int
3324 internal_function
3325 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3326 {
3327   reg_errcode_t err;
3328   int i, j, ch, need_word_trtable = 0;
3329   bitset_word_t elem, mask;
3330   bool dests_node_malloced = false;
3331   bool dest_states_malloced = false;
3332   int ndests; /* Number of the destination states from `state'.  */
3333   re_dfastate_t **trtable;
3334   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3335   re_node_set follows, *dests_node;
3336   bitset_t *dests_ch;
3337   bitset_t acceptable;
3338
3339   struct dests_alloc
3340   {
3341     re_node_set dests_node[SBC_MAX];
3342     bitset_t dests_ch[SBC_MAX];
3343   } *dests_alloc;
3344
3345   /* We build DFA states which corresponds to the destination nodes
3346      from `state'.  `dests_node[i]' represents the nodes which i-th
3347      destination state contains, and `dests_ch[i]' represents the
3348      characters which i-th destination state accepts.  */
3349   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3350     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3351   else
3352     {
3353       dests_alloc = re_malloc (struct dests_alloc, 1);
3354       if (BE (dests_alloc == NULL, 0))
3355         return 0;
3356       dests_node_malloced = true;
3357     }
3358   dests_node = dests_alloc->dests_node;
3359   dests_ch = dests_alloc->dests_ch;
3360
3361   /* Initialize transiton table.  */
3362   state->word_trtable = state->trtable = NULL;
3363
3364   /* At first, group all nodes belonging to `state' into several
3365      destinations.  */
3366   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3367   if (BE (ndests <= 0, 0))
3368     {
3369       if (dests_node_malloced)
3370         free (dests_alloc);
3371       /* Return 0 in case of an error, 1 otherwise.  */
3372       if (ndests == 0)
3373         {
3374           state->trtable = (re_dfastate_t **)
3375             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3376           return 1;
3377         }
3378       return 0;
3379     }
3380
3381   err = re_node_set_alloc (&follows, ndests + 1);
3382   if (BE (err != REG_NOERROR, 0))
3383     goto out_free;
3384
3385   /* Avoid arithmetic overflow in size calculation.  */
3386   if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3387             / (3 * sizeof (re_dfastate_t *)))
3388            < ndests),
3389           0))
3390     goto out_free;
3391
3392   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3393                          + ndests * 3 * sizeof (re_dfastate_t *)))
3394     dest_states = (re_dfastate_t **)
3395       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3396   else
3397     {
3398       dest_states = (re_dfastate_t **)
3399         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3400       if (BE (dest_states == NULL, 0))
3401         {
3402 out_free:
3403           if (dest_states_malloced)
3404             free (dest_states);
3405           re_node_set_free (&follows);
3406           for (i = 0; i < ndests; ++i)
3407             re_node_set_free (dests_node + i);
3408           if (dests_node_malloced)
3409             free (dests_alloc);
3410           return 0;
3411         }
3412       dest_states_malloced = true;
3413     }
3414   dest_states_word = dest_states + ndests;
3415   dest_states_nl = dest_states_word + ndests;
3416   bitset_empty (acceptable);
3417
3418   /* Then build the states for all destinations.  */
3419   for (i = 0; i < ndests; ++i)
3420     {
3421       int next_node;
3422       re_node_set_empty (&follows);
3423       /* Merge the follows of this destination states.  */
3424       for (j = 0; j < dests_node[i].nelem; ++j)
3425         {
3426           next_node = dfa->nexts[dests_node[i].elems[j]];
3427           if (next_node != -1)
3428             {
3429               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3430               if (BE (err != REG_NOERROR, 0))
3431                 goto out_free;
3432             }
3433         }
3434       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3435       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3436         goto out_free;
3437       /* If the new state has context constraint,
3438          build appropriate states for these contexts.  */
3439       if (dest_states[i]->has_constraint)
3440         {
3441           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3442                                                           CONTEXT_WORD);
3443           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3444             goto out_free;
3445
3446           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3447             need_word_trtable = 1;
3448
3449           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3450                                                         CONTEXT_NEWLINE);
3451           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3452             goto out_free;
3453         }
3454       else
3455         {
3456           dest_states_word[i] = dest_states[i];
3457           dest_states_nl[i] = dest_states[i];
3458         }
3459       bitset_merge (acceptable, dests_ch[i]);
3460     }
3461
3462   if (!BE (need_word_trtable, 0))
3463     {
3464       /* We don't care about whether the following character is a word
3465          character, or we are in a single-byte character set so we can
3466          discern by looking at the character code: allocate a
3467          256-entry transition table.  */
3468       trtable = state->trtable =
3469         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3470       if (BE (trtable == NULL, 0))
3471         goto out_free;
3472
3473       /* For all characters ch...:  */
3474       for (i = 0; i < BITSET_WORDS; ++i)
3475         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3476              elem;
3477              mask <<= 1, elem >>= 1, ++ch)
3478           if (BE (elem & 1, 0))
3479             {
3480               /* There must be exactly one destination which accepts
3481                  character ch.  See group_nodes_into_DFAstates.  */
3482               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3483                 ;
3484
3485               /* j-th destination accepts the word character ch.  */
3486               if (dfa->word_char[i] & mask)
3487                 trtable[ch] = dest_states_word[j];
3488               else
3489                 trtable[ch] = dest_states[j];
3490             }
3491     }
3492   else
3493     {
3494       /* We care about whether the following character is a word
3495          character, and we are in a multi-byte character set: discern
3496          by looking at the character code: build two 256-entry
3497          transition tables, one starting at trtable[0] and one
3498          starting at trtable[SBC_MAX].  */
3499       trtable = state->word_trtable =
3500         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3501       if (BE (trtable == NULL, 0))
3502         goto out_free;
3503
3504       /* For all characters ch...:  */
3505       for (i = 0; i < BITSET_WORDS; ++i)
3506         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3507              elem;
3508              mask <<= 1, elem >>= 1, ++ch)
3509           if (BE (elem & 1, 0))
3510             {
3511               /* There must be exactly one destination which accepts
3512                  character ch.  See group_nodes_into_DFAstates.  */
3513               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3514                 ;
3515
3516               /* j-th destination accepts the word character ch.  */
3517               trtable[ch] = dest_states[j];
3518               trtable[ch + SBC_MAX] = dest_states_word[j];
3519             }
3520     }
3521
3522   /* new line */
3523   if (bitset_contain (acceptable, NEWLINE_CHAR))
3524     {
3525       /* The current state accepts newline character.  */
3526       for (j = 0; j < ndests; ++j)
3527         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3528           {
3529             /* k-th destination accepts newline character.  */
3530             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3531             if (need_word_trtable)
3532               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3533             /* There must be only one destination which accepts
3534                newline.  See group_nodes_into_DFAstates.  */
3535             break;
3536           }
3537     }
3538
3539   if (dest_states_malloced)
3540     free (dest_states);
3541
3542   re_node_set_free (&follows);
3543   for (i = 0; i < ndests; ++i)
3544     re_node_set_free (dests_node + i);
3545
3546   if (dests_node_malloced)
3547     free (dests_alloc);
3548
3549   return 1;
3550 }
3551
3552 /* Group all nodes belonging to STATE into several destinations.
3553    Then for all destinations, set the nodes belonging to the destination
3554    to DESTS_NODE[i] and set the characters accepted by the destination
3555    to DEST_CH[i].  This function return the number of destinations.  */
3556
3557 static int
3558 internal_function
3559 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3560                             re_node_set *dests_node, bitset_t *dests_ch)
3561 {
3562   reg_errcode_t err;
3563   int result;
3564   int i, j, k;
3565   int ndests; /* Number of the destinations from `state'.  */
3566   bitset_t accepts; /* Characters a node can accept.  */
3567   const re_node_set *cur_nodes = &state->nodes;
3568   bitset_empty (accepts);
3569   ndests = 0;
3570
3571   /* For all the nodes belonging to `state',  */
3572   for (i = 0; i < cur_nodes->nelem; ++i)
3573     {
3574       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3575       re_token_type_t type = node->type;
3576       unsigned int constraint = node->constraint;
3577
3578       /* Enumerate all single byte character this node can accept.  */
3579       if (type == CHARACTER)
3580         bitset_set (accepts, node->opr.c);
3581       else if (type == SIMPLE_BRACKET)
3582         {
3583           bitset_merge (accepts, node->opr.sbcset);
3584         }
3585       else if (type == OP_PERIOD)
3586         {
3587 #ifdef RE_ENABLE_I18N
3588           if (dfa->mb_cur_max > 1)
3589             bitset_merge (accepts, dfa->sb_char);
3590           else
3591 #endif
3592             bitset_set_all (accepts);
3593           if (!(dfa->syntax & RE_DOT_NEWLINE))
3594             bitset_clear (accepts, '\n');
3595           if (dfa->syntax & RE_DOT_NOT_NULL)
3596             bitset_clear (accepts, '\0');
3597         }
3598 #ifdef RE_ENABLE_I18N
3599       else if (type == OP_UTF8_PERIOD)
3600         {
3601           memset (accepts, '\xff', sizeof (bitset_t) / 2);
3602           if (!(dfa->syntax & RE_DOT_NEWLINE))
3603             bitset_clear (accepts, '\n');
3604           if (dfa->syntax & RE_DOT_NOT_NULL)
3605             bitset_clear (accepts, '\0');
3606         }
3607 #endif
3608       else
3609         continue;
3610
3611       /* Check the `accepts' and sift the characters which are not
3612          match it the context.  */
3613       if (constraint)
3614         {
3615           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3616             {
3617               bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3618               bitset_empty (accepts);
3619               if (accepts_newline)
3620                 bitset_set (accepts, NEWLINE_CHAR);
3621               else
3622                 continue;
3623             }
3624           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3625             {
3626               bitset_empty (accepts);
3627               continue;
3628             }
3629
3630           if (constraint & NEXT_WORD_CONSTRAINT)
3631             {
3632               bitset_word_t any_set = 0;
3633               if (type == CHARACTER && !node->word_char)
3634                 {
3635                   bitset_empty (accepts);
3636                   continue;
3637                 }
3638 #ifdef RE_ENABLE_I18N
3639               if (dfa->mb_cur_max > 1)
3640                 for (j = 0; j < BITSET_WORDS; ++j)
3641                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3642               else
3643 #endif
3644                 for (j = 0; j < BITSET_WORDS; ++j)
3645                   any_set |= (accepts[j] &= dfa->word_char[j]);
3646               if (!any_set)
3647                 continue;
3648             }
3649           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3650             {
3651               bitset_word_t any_set = 0;
3652               if (type == CHARACTER && node->word_char)
3653                 {
3654                   bitset_empty (accepts);
3655                   continue;
3656                 }
3657 #ifdef RE_ENABLE_I18N
3658               if (dfa->mb_cur_max > 1)
3659                 for (j = 0; j < BITSET_WORDS; ++j)
3660                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3661               else
3662 #endif
3663                 for (j = 0; j < BITSET_WORDS; ++j)
3664                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3665               if (!any_set)
3666                 continue;
3667             }
3668         }
3669
3670       /* Then divide `accepts' into DFA states, or create a new
3671          state.  Above, we make sure that accepts is not empty.  */
3672       for (j = 0; j < ndests; ++j)
3673         {
3674           bitset_t intersec; /* Intersection sets, see below.  */
3675           bitset_t remains;
3676           /* Flags, see below.  */
3677           bitset_word_t has_intersec, not_subset, not_consumed;
3678
3679           /* Optimization, skip if this state doesn't accept the character.  */
3680           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3681             continue;
3682
3683           /* Enumerate the intersection set of this state and `accepts'.  */
3684           has_intersec = 0;
3685           for (k = 0; k < BITSET_WORDS; ++k)
3686             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3687           /* And skip if the intersection set is empty.  */
3688           if (!has_intersec)
3689             continue;
3690
3691           /* Then check if this state is a subset of `accepts'.  */
3692           not_subset = not_consumed = 0;
3693           for (k = 0; k < BITSET_WORDS; ++k)
3694             {
3695               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3696               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3697             }
3698
3699           /* If this state isn't a subset of `accepts', create a
3700              new group state, which has the `remains'. */
3701           if (not_subset)
3702             {
3703               bitset_copy (dests_ch[ndests], remains);
3704               bitset_copy (dests_ch[j], intersec);
3705               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3706               if (BE (err != REG_NOERROR, 0))
3707                 goto error_return;
3708               ++ndests;
3709             }
3710
3711           /* Put the position in the current group. */
3712           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3713           if (BE (result < 0, 0))
3714             goto error_return;
3715
3716           /* If all characters are consumed, go to next node. */
3717           if (!not_consumed)
3718             break;
3719         }
3720       /* Some characters remain, create a new group. */
3721       if (j == ndests)
3722         {
3723           bitset_copy (dests_ch[ndests], accepts);
3724           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3725           if (BE (err != REG_NOERROR, 0))
3726             goto error_return;
3727           ++ndests;
3728           bitset_empty (accepts);
3729         }
3730     }
3731   return ndests;
3732  error_return:
3733   for (j = 0; j < ndests; ++j)
3734     re_node_set_free (dests_node + j);
3735   return -1;
3736 }
3737
3738 #ifdef RE_ENABLE_I18N
3739 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3740    Return the number of the bytes the node accepts.
3741    STR_IDX is the current index of the input string.
3742
3743    This function handles the nodes which can accept one character, or
3744    one collating element like '.', '[a-z]', opposite to the other nodes
3745    can only accept one byte.  */
3746
3747 static int
3748 internal_function
3749 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3750                          const re_string_t *input, int str_idx)
3751 {
3752   const re_token_t *node = dfa->nodes + node_idx;
3753   int char_len, elem_len;
3754   int i;
3755
3756   if (BE (node->type == OP_UTF8_PERIOD, 0))
3757     {
3758       unsigned char c = re_string_byte_at (input, str_idx), d;
3759       if (BE (c < 0xc2, 1))
3760         return 0;
3761
3762       if (str_idx + 2 > input->len)
3763         return 0;
3764
3765       d = re_string_byte_at (input, str_idx + 1);
3766       if (c < 0xe0)
3767         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3768       else if (c < 0xf0)
3769         {
3770           char_len = 3;
3771           if (c == 0xe0 && d < 0xa0)
3772             return 0;
3773         }
3774       else if (c < 0xf8)
3775         {
3776           char_len = 4;
3777           if (c == 0xf0 && d < 0x90)
3778             return 0;
3779         }
3780       else if (c < 0xfc)
3781         {
3782           char_len = 5;
3783           if (c == 0xf8 && d < 0x88)
3784             return 0;
3785         }
3786       else if (c < 0xfe)
3787         {
3788           char_len = 6;
3789           if (c == 0xfc && d < 0x84)
3790             return 0;
3791         }
3792       else
3793         return 0;
3794
3795       if (str_idx + char_len > input->len)
3796         return 0;
3797
3798       for (i = 1; i < char_len; ++i)
3799         {
3800           d = re_string_byte_at (input, str_idx + i);
3801           if (d < 0x80 || d > 0xbf)
3802             return 0;
3803         }
3804       return char_len;
3805     }
3806
3807   char_len = re_string_char_size_at (input, str_idx);
3808   if (node->type == OP_PERIOD)
3809     {
3810       if (char_len <= 1)
3811         return 0;
3812       /* FIXME: I don't think this if is needed, as both '\n'
3813          and '\0' are char_len == 1.  */
3814       /* '.' accepts any one character except the following two cases.  */
3815       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3816            re_string_byte_at (input, str_idx) == '\n') ||
3817           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3818            re_string_byte_at (input, str_idx) == '\0'))
3819         return 0;
3820       return char_len;
3821     }
3822
3823   elem_len = re_string_elem_size_at (input, str_idx);
3824   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3825     return 0;
3826
3827   if (node->type == COMPLEX_BRACKET)
3828     {
3829       const re_charset_t *cset = node->opr.mbcset;
3830 # ifdef _LIBC
3831       const unsigned char *pin
3832         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3833       int j;
3834       uint32_t nrules;
3835 # endif /* _LIBC */
3836       int match_len = 0;
3837       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3838                     ? re_string_wchar_at (input, str_idx) : 0);
3839
3840       /* match with multibyte character?  */
3841       for (i = 0; i < cset->nmbchars; ++i)
3842         if (wc == cset->mbchars[i])
3843           {
3844             match_len = char_len;
3845             goto check_node_accept_bytes_match;
3846           }
3847       /* match with character_class?  */
3848       for (i = 0; i < cset->nchar_classes; ++i)
3849         {
3850           wctype_t wt = cset->char_classes[i];
3851           if (__iswctype (wc, wt))
3852             {
3853               match_len = char_len;
3854               goto check_node_accept_bytes_match;
3855             }
3856         }
3857
3858 # ifdef _LIBC
3859       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3860       if (nrules != 0)
3861         {
3862           unsigned int in_collseq = 0;
3863           const int32_t *table, *indirect;
3864           const unsigned char *weights, *extra;
3865           const char *collseqwc;
3866           /* This #include defines a local function!  */
3867 #  include <locale/weight.h>
3868
3869           /* match with collating_symbol?  */
3870           if (cset->ncoll_syms)
3871             extra = (const unsigned char *)
3872               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3873           for (i = 0; i < cset->ncoll_syms; ++i)
3874             {
3875               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3876               /* Compare the length of input collating element and
3877                  the length of current collating element.  */
3878               if (*coll_sym != elem_len)
3879                 continue;
3880               /* Compare each bytes.  */
3881               for (j = 0; j < *coll_sym; j++)
3882                 if (pin[j] != coll_sym[1 + j])
3883                   break;
3884               if (j == *coll_sym)
3885                 {
3886                   /* Match if every bytes is equal.  */
3887                   match_len = j;
3888                   goto check_node_accept_bytes_match;
3889                 }
3890             }
3891
3892           if (cset->nranges)
3893             {
3894               if (elem_len <= char_len)
3895                 {
3896                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3897                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3898                 }
3899               else
3900                 in_collseq = find_collation_sequence_value (pin, elem_len);
3901             }
3902           /* match with range expression?  */
3903           for (i = 0; i < cset->nranges; ++i)
3904             if (cset->range_starts[i] <= in_collseq
3905                 && in_collseq <= cset->range_ends[i])
3906               {
3907                 match_len = elem_len;
3908                 goto check_node_accept_bytes_match;
3909               }
3910
3911           /* match with equivalence_class?  */
3912           if (cset->nequiv_classes)
3913             {
3914               const unsigned char *cp = pin;
3915               table = (const int32_t *)
3916                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3917               weights = (const unsigned char *)
3918                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3919               extra = (const unsigned char *)
3920                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3921               indirect = (const int32_t *)
3922                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3923               int32_t idx = findidx (&cp);
3924               if (idx > 0)
3925                 for (i = 0; i < cset->nequiv_classes; ++i)
3926                   {
3927                     int32_t equiv_class_idx = cset->equiv_classes[i];
3928                     size_t weight_len = weights[idx & 0xffffff];
3929                     if (weight_len == weights[equiv_class_idx & 0xffffff]
3930                         && (idx >> 24) == (equiv_class_idx >> 24))
3931                       {
3932                         int cnt = 0;
3933
3934                         idx &= 0xffffff;
3935                         equiv_class_idx &= 0xffffff;
3936
3937                         while (cnt <= weight_len
3938                                && (weights[equiv_class_idx + 1 + cnt]
3939                                    == weights[idx + 1 + cnt]))
3940                           ++cnt;
3941                         if (cnt > weight_len)
3942                           {
3943                             match_len = elem_len;
3944                             goto check_node_accept_bytes_match;
3945                           }
3946                       }
3947                   }
3948             }
3949         }
3950       else
3951 # endif /* _LIBC */
3952         {
3953           /* match with range expression?  */
3954 #if __GNUC__ >= 2
3955           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3956 #else
3957           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3958           cmp_buf[2] = wc;
3959 #endif
3960           for (i = 0; i < cset->nranges; ++i)
3961             {
3962               cmp_buf[0] = cset->range_starts[i];
3963               cmp_buf[4] = cset->range_ends[i];
3964               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3965                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3966                 {
3967                   match_len = char_len;
3968                   goto check_node_accept_bytes_match;
3969                 }
3970             }
3971         }
3972     check_node_accept_bytes_match:
3973       if (!cset->non_match)
3974         return match_len;
3975       else
3976         {
3977           if (match_len > 0)
3978             return 0;
3979           else
3980             return (elem_len > char_len) ? elem_len : char_len;
3981         }
3982     }
3983   return 0;
3984 }
3985
3986 # ifdef _LIBC
3987 static unsigned int
3988 internal_function
3989 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3990 {
3991   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3992   if (nrules == 0)
3993     {
3994       if (mbs_len == 1)
3995         {
3996           /* No valid character.  Match it as a single byte character.  */
3997           const unsigned char *collseq = (const unsigned char *)
3998             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3999           return collseq[mbs[0]];
4000         }
4001       return UINT_MAX;
4002     }
4003   else
4004     {
4005       int32_t idx;
4006       const unsigned char *extra = (const unsigned char *)
4007         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4008       int32_t extrasize = (const unsigned char *)
4009         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4010
4011       for (idx = 0; idx < extrasize;)
4012         {
4013           int mbs_cnt, found = 0;
4014           int32_t elem_mbs_len;
4015           /* Skip the name of collating element name.  */
4016           idx = idx + extra[idx] + 1;
4017           elem_mbs_len = extra[idx++];
4018           if (mbs_len == elem_mbs_len)
4019             {
4020               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4021                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4022                   break;
4023               if (mbs_cnt == elem_mbs_len)
4024                 /* Found the entry.  */
4025                 found = 1;
4026             }
4027           /* Skip the byte sequence of the collating element.  */
4028           idx += elem_mbs_len;
4029           /* Adjust for the alignment.  */
4030           idx = (idx + 3) & ~3;
4031           /* Skip the collation sequence value.  */
4032           idx += sizeof (uint32_t);
4033           /* Skip the wide char sequence of the collating element.  */
4034           idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4035           /* If we found the entry, return the sequence value.  */
4036           if (found)
4037             return *(uint32_t *) (extra + idx);
4038           /* Skip the collation sequence value.  */
4039           idx += sizeof (uint32_t);
4040         }
4041       return UINT_MAX;
4042     }
4043 }
4044 # endif /* _LIBC */
4045 #endif /* RE_ENABLE_I18N */
4046
4047 /* Check whether the node accepts the byte which is IDX-th
4048    byte of the INPUT.  */
4049
4050 static int
4051 internal_function
4052 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4053                    int idx)
4054 {
4055   unsigned char ch;
4056   ch = re_string_byte_at (&mctx->input, idx);
4057   switch (node->type)
4058     {
4059     case CHARACTER:
4060       if (node->opr.c != ch)
4061         return 0;
4062       break;
4063
4064     case SIMPLE_BRACKET:
4065       if (!bitset_contain (node->opr.sbcset, ch))
4066         return 0;
4067       break;
4068
4069 #ifdef RE_ENABLE_I18N
4070     case OP_UTF8_PERIOD:
4071       if (ch >= 0x80)
4072         return 0;
4073       /* FALLTHROUGH */
4074 #endif
4075     case OP_PERIOD:
4076       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4077           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4078         return 0;
4079       break;
4080
4081     default:
4082       return 0;
4083     }
4084
4085   if (node->constraint)
4086     {
4087       /* The node has constraints.  Check whether the current context
4088          satisfies the constraints.  */
4089       unsigned int context = re_string_context_at (&mctx->input, idx,
4090                                                    mctx->eflags);
4091       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4092         return 0;
4093     }
4094
4095   return 1;
4096 }
4097
4098 /* Extend the buffers, if the buffers have run out.  */
4099
4100 static reg_errcode_t
4101 internal_function __attribute_warn_unused_result__
4102 extend_buffers (re_match_context_t *mctx)
4103 {
4104   reg_errcode_t ret;
4105   re_string_t *pstr = &mctx->input;
4106
4107   /* Avoid overflow.  */
4108   if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4109     return REG_ESPACE;
4110
4111   /* Double the lengthes of the buffers.  */
4112   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4113   if (BE (ret != REG_NOERROR, 0))
4114     return ret;
4115
4116   if (mctx->state_log != NULL)
4117     {
4118       /* And double the length of state_log.  */
4119       /* XXX We have no indication of the size of this buffer.  If this
4120          allocation fail we have no indication that the state_log array
4121          does not have the right size.  */
4122       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4123                                               pstr->bufs_len + 1);
4124       if (BE (new_array == NULL, 0))
4125         return REG_ESPACE;
4126       mctx->state_log = new_array;
4127     }
4128
4129   /* Then reconstruct the buffers.  */
4130   if (pstr->icase)
4131     {
4132 #ifdef RE_ENABLE_I18N
4133       if (pstr->mb_cur_max > 1)
4134         {
4135           ret = build_wcs_upper_buffer (pstr);
4136           if (BE (ret != REG_NOERROR, 0))
4137             return ret;
4138         }
4139       else
4140 #endif /* RE_ENABLE_I18N  */
4141         build_upper_buffer (pstr);
4142     }
4143   else
4144     {
4145 #ifdef RE_ENABLE_I18N
4146       if (pstr->mb_cur_max > 1)
4147         build_wcs_buffer (pstr);
4148       else
4149 #endif /* RE_ENABLE_I18N  */
4150         {
4151           if (pstr->trans != NULL)
4152             re_string_translate_buffer (pstr);
4153         }
4154     }
4155   return REG_NOERROR;
4156 }
4157
4158 \f
4159 /* Functions for matching context.  */
4160
4161 /* Initialize MCTX.  */
4162
4163 static reg_errcode_t
4164 internal_function __attribute_warn_unused_result__
4165 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4166 {
4167   mctx->eflags = eflags;
4168   mctx->match_last = -1;
4169   if (n > 0)
4170     {
4171       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4172       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4173       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4174         return REG_ESPACE;
4175     }
4176   /* Already zero-ed by the caller.
4177      else
4178        mctx->bkref_ents = NULL;
4179      mctx->nbkref_ents = 0;
4180      mctx->nsub_tops = 0;  */
4181   mctx->abkref_ents = n;
4182   mctx->max_mb_elem_len = 1;
4183   mctx->asub_tops = n;
4184   return REG_NOERROR;
4185 }
4186
4187 /* Clean the entries which depend on the current input in MCTX.
4188    This function must be invoked when the matcher changes the start index
4189    of the input, or changes the input string.  */
4190
4191 static void
4192 internal_function
4193 match_ctx_clean (re_match_context_t *mctx)
4194 {
4195   int st_idx;
4196   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4197     {
4198       int sl_idx;
4199       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4200       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4201         {
4202           re_sub_match_last_t *last = top->lasts[sl_idx];
4203           re_free (last->path.array);
4204           re_free (last);
4205         }
4206       re_free (top->lasts);
4207       if (top->path)
4208         {
4209           re_free (top->path->array);
4210           re_free (top->path);
4211         }
4212       free (top);
4213     }
4214
4215   mctx->nsub_tops = 0;
4216   mctx->nbkref_ents = 0;
4217 }
4218
4219 /* Free all the memory associated with MCTX.  */
4220
4221 static void
4222 internal_function
4223 match_ctx_free (re_match_context_t *mctx)
4224 {
4225   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4226   match_ctx_clean (mctx);
4227   re_free (mctx->sub_tops);
4228   re_free (mctx->bkref_ents);
4229 }
4230
4231 /* Add a new backreference entry to MCTX.
4232    Note that we assume that caller never call this function with duplicate
4233    entry, and call with STR_IDX which isn't smaller than any existing entry.
4234 */
4235
4236 static reg_errcode_t
4237 internal_function __attribute_warn_unused_result__
4238 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4239                      int to)
4240 {
4241   if (mctx->nbkref_ents >= mctx->abkref_ents)
4242     {
4243       struct re_backref_cache_entry* new_entry;
4244       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4245                               mctx->abkref_ents * 2);
4246       if (BE (new_entry == NULL, 0))
4247         {
4248           re_free (mctx->bkref_ents);
4249           return REG_ESPACE;
4250         }
4251       mctx->bkref_ents = new_entry;
4252       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4253               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4254       mctx->abkref_ents *= 2;
4255     }
4256   if (mctx->nbkref_ents > 0
4257       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4258     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4259
4260   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4261   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4262   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4263   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4264
4265   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4266      If bit N is clear, means that this entry won't epsilon-transition to
4267      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4268      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4269      such node.
4270
4271      A backreference does not epsilon-transition unless it is empty, so set
4272      to all zeros if FROM != TO.  */
4273   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4274     = (from == to ? ~0 : 0);
4275
4276   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4277   if (mctx->max_mb_elem_len < to - from)
4278     mctx->max_mb_elem_len = to - from;
4279   return REG_NOERROR;
4280 }
4281
4282 /* Search for the first entry which has the same str_idx, or -1 if none is
4283    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4284
4285 static int
4286 internal_function
4287 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4288 {
4289   int left, right, mid, last;
4290   last = right = mctx->nbkref_ents;
4291   for (left = 0; left < right;)
4292     {
4293       mid = (left + right) / 2;
4294       if (mctx->bkref_ents[mid].str_idx < str_idx)
4295         left = mid + 1;
4296       else
4297         right = mid;
4298     }
4299   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4300     return left;
4301   else
4302     return -1;
4303 }
4304
4305 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4306    at STR_IDX.  */
4307
4308 static reg_errcode_t
4309 internal_function __attribute_warn_unused_result__
4310 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4311 {
4312 #ifdef DEBUG
4313   assert (mctx->sub_tops != NULL);
4314   assert (mctx->asub_tops > 0);
4315 #endif
4316   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4317     {
4318       int new_asub_tops = mctx->asub_tops * 2;
4319       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4320                                                    re_sub_match_top_t *,
4321                                                    new_asub_tops);
4322       if (BE (new_array == NULL, 0))
4323         return REG_ESPACE;
4324       mctx->sub_tops = new_array;
4325       mctx->asub_tops = new_asub_tops;
4326     }
4327   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4328   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4329     return REG_ESPACE;
4330   mctx->sub_tops[mctx->nsub_tops]->node = node;
4331   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4332   return REG_NOERROR;
4333 }
4334
4335 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4336    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4337
4338 static re_sub_match_last_t *
4339 internal_function
4340 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4341 {
4342   re_sub_match_last_t *new_entry;
4343   if (BE (subtop->nlasts == subtop->alasts, 0))
4344     {
4345       int new_alasts = 2 * subtop->alasts + 1;
4346       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4347                                                     re_sub_match_last_t *,
4348                                                     new_alasts);
4349       if (BE (new_array == NULL, 0))
4350         return NULL;
4351       subtop->lasts = new_array;
4352       subtop->alasts = new_alasts;
4353     }
4354   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4355   if (BE (new_entry != NULL, 1))
4356     {
4357       subtop->lasts[subtop->nlasts] = new_entry;
4358       new_entry->node = node;
4359       new_entry->str_idx = str_idx;
4360       ++subtop->nlasts;
4361     }
4362   return new_entry;
4363 }
4364
4365 static void
4366 internal_function
4367 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4368                re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4369 {
4370   sctx->sifted_states = sifted_sts;
4371   sctx->limited_states = limited_sts;
4372   sctx->last_node = last_node;
4373   sctx->last_str_idx = last_str_idx;
4374   re_node_set_init_empty (&sctx->limits);
4375 }