posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2005,2007,2009,2010,2011 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                           int str_idx, int from, int to)
  27      internal_function;
  28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  29      internal_function;
  30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                            int str_idx) internal_function;
  32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                    int node, int str_idx)
  34      internal_function;
  35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                            re_dfastate_t **limited_sts, int last_node,
  37                            int last_str_idx)
  38      internal_function;
  39 static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                          const char *string, int length,
  41                                          int start, int range, int stop,
  42                                          size_t nmatch, regmatch_t pmatch[],
  43                                          int eflags) internal_function;
  44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  45                              const char *string1, int length1,
  46                              const char *string2, int length2,
  47                              int start, int range, struct re_registers *regs,
  48                              int stop, int ret_len) internal_function;
  49 static int re_search_stub (struct re_pattern_buffer *bufp,
  50                            const char *string, int length, int start,
  51                            int range, int stop, struct re_registers *regs,
  52                            int ret_len) internal_function;
  53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  54                               int nregs, int regs_allocated) internal_function;
  55 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  56      internal_function;
  57 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  58                            int *p_match_first) internal_function;
  59 static int check_halt_state_context (const re_match_context_t *mctx,
  60                                      const re_dfastate_t *state, int idx)
  61      internal_function;
  62 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  63                          regmatch_t *prev_idx_match, int cur_node,
  64                          int cur_idx, int nmatch) internal_function;
  65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  66                                       int str_idx, int dest_node, int nregs,
  67                                       regmatch_t *regs,
  68                                       re_node_set *eps_via_nodes)
  69      internal_function;
  70 static reg_errcode_t set_regs (const regex_t *preg,
  71                                const re_match_context_t *mctx,
  72                                size_t nmatch, regmatch_t *pmatch,
  73                                int fl_backtrack) internal_function;
  74 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  75      internal_function;
  76
  77 #ifdef RE_ENABLE_I18N
  78 static int sift_states_iter_mb (const re_match_context_t *mctx,
  79                                 re_sift_context_t *sctx,
  80                                 int node_idx, int str_idx, int max_str_idx)
  81      internal_function;
  82 #endif /* RE_ENABLE_I18N */
  83 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  84                                            re_sift_context_t *sctx)
  85      internal_function;
  86 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  87                                           re_sift_context_t *sctx, int str_idx,
  88                                           re_node_set *cur_dest)
  89      internal_function;
  90 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  91                                               re_sift_context_t *sctx,
  92                                               int str_idx,
  93                                               re_node_set *dest_nodes)
  94      internal_function;
  95 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  96                                             re_node_set *dest_nodes,
  97                                             const re_node_set *candidates)
  98      internal_function;
  99 static int check_dst_limits (const re_match_context_t *mctx,
 100                              re_node_set *limits,
 101                              int dst_node, int dst_idx, int src_node,
 102                              int src_idx) internal_function;
 103 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 104                                         int boundaries, int subexp_idx,
 105                                         int from_node, int bkref_idx)
 106      internal_function;
 107 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 108                                       int limit, int subexp_idx,
 109                                       int node, int str_idx,
 110                                       int bkref_idx) internal_function;
 111 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 112                                           re_node_set *dest_nodes,
 113                                           const re_node_set *candidates,
 114                                           re_node_set *limits,
 115                                           struct re_backref_cache_entry *bkref_ents,
 116                                           int str_idx) internal_function;
 117 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 118                                         re_sift_context_t *sctx,
 119                                         int str_idx, const re_node_set *candidates)
 120      internal_function;
 121 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 122                                         re_dfastate_t **dst,
 123                                         re_dfastate_t **src, int num)
 124      internal_function;
 125 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 126                                          re_match_context_t *mctx) internal_function;
 127 static re_dfastate_t *transit_state (reg_errcode_t *err,
 128                                      re_match_context_t *mctx,
 129                                      re_dfastate_t *state) internal_function;
 130 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 131                                             re_match_context_t *mctx,
 132                                             re_dfastate_t *next_state)
 133      internal_function;
 134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 135                                                 re_node_set *cur_nodes,
 136                                                 int str_idx) internal_function;
 137 #if 0
 138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 139                                         re_match_context_t *mctx,
 140                                         re_dfastate_t *pstate)
 141      internal_function;
 142 #endif
 143 #ifdef RE_ENABLE_I18N
 144 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 145                                        re_dfastate_t *pstate)
 146      internal_function;
 147 #endif /* RE_ENABLE_I18N */
 148 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 149                                           const re_node_set *nodes)
 150      internal_function;
 151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 152                                  int bkref_node, int bkref_str_idx)
 153      internal_function;
 154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 155                                      const re_sub_match_top_t *sub_top,
 156                                      re_sub_match_last_t *sub_last,
 157                                      int bkref_node, int bkref_str)
 158      internal_function;
 159 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 160                              int subexp_idx, int type) internal_function;
 161 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 162                                     state_array_t *path, int top_node,
 163                                     int top_str, int last_node, int last_str,
 164                                     int type) internal_function;
 165 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 166                                                    int str_idx,
 167                                                    re_node_set *cur_nodes,
 168                                                    re_node_set *next_nodes)
 169      internal_function;
 170 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 171                                                re_node_set *cur_nodes,
 172                                                int ex_subexp, int type)
 173      internal_function;
 174 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 175                                                    re_node_set *dst_nodes,
 176                                                    int target, int ex_subexp,
 177                                                    int type) internal_function;
 178 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 179                                          re_node_set *cur_nodes, int cur_str,
 180                                          int subexp_num, int type)
 181      internal_function;
 182 static int build_trtable (const re_dfa_t *dfa,
 183                           re_dfastate_t *state) internal_function;
 184 #ifdef RE_ENABLE_I18N
 185 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 186                                     const re_string_t *input, int idx)
 187      internal_function;
 188 # ifdef _LIBC
 189 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 190                                                    size_t name_len)
 191      internal_function;
 192 # endif /* _LIBC */
 193 #endif /* RE_ENABLE_I18N */
 194 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 195                                        const re_dfastate_t *state,
 196                                        re_node_set *states_node,
 197                                        bitset_t *states_ch) internal_function;
 198 static int check_node_accept (const re_match_context_t *mctx,
 199                               const re_token_t *node, int idx)
 200      internal_function;
 201 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
 202      internal_function;
 203 \f
 204 /* Entry point for POSIX code.  */
 205
 206 /* regexec searches for a given pattern, specified by PREG, in the
 207    string STRING.
 208
 209    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 210    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 211    least NMATCH elements, and we set them to the offsets of the
 212    corresponding matched substrings.
 213
 214    EFLAGS specifies `execution flags' which affect matching: if
 215    REG_NOTBOL is set, then ^ does not match at the beginning of the
 216    string; if REG_NOTEOL is set, then $ does not match at the end.
 217
 218    We return 0 if we find a match and REG_NOMATCH if not.  */
 219
 220 int
 221 regexec (preg, string, nmatch, pmatch, eflags)
 222     const regex_t *__restrict preg;
 223     const char *__restrict string;
 224     size_t nmatch;
 225     regmatch_t pmatch[];
 226     int eflags;
 227 {
 228   reg_errcode_t err;
 229   int start, length;
 230   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 231
 232   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 233     return REG_BADPAT;
 234
 235   if (eflags & REG_STARTEND)
 236     {
 237       start = pmatch[0].rm_so;
 238       length = pmatch[0].rm_eo;
 239     }
 240   else
 241     {
 242       start = 0;
 243       length = strlen (string);
 244     }
 245
 246   __libc_lock_lock (dfa->lock);
 247   if (preg->no_sub)
 248     err = re_search_internal (preg, string, length, start, length - start,
 249                               length, 0, NULL, eflags);
 250   else
 251     err = re_search_internal (preg, string, length, start, length - start,
 252                               length, nmatch, pmatch, eflags);
 253   __libc_lock_unlock (dfa->lock);
 254   return err != REG_NOERROR;
 255 }
 256
 257 #ifdef _LIBC
 258 # include <shlib-compat.h>
 259 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 260
 261 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 262 __typeof__ (__regexec) __compat_regexec;
 263
 264 int
 265 attribute_compat_text_section
 266 __compat_regexec (const regex_t *__restrict preg,
 267                   const char *__restrict string, size_t nmatch,
 268                   regmatch_t pmatch[], int eflags)
 269 {
 270   return regexec (preg, string, nmatch, pmatch,
 271                   eflags & (REG_NOTBOL | REG_NOTEOL));
 272 }
 273 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 274 # endif
 275 #endif
 276
 277 /* Entry points for GNU code.  */
 278
 279 /* re_match, re_search, re_match_2, re_search_2
 280
 281    The former two functions operate on STRING with length LENGTH,
 282    while the later two operate on concatenation of STRING1 and STRING2
 283    with lengths LENGTH1 and LENGTH2, respectively.
 284
 285    re_match() matches the compiled pattern in BUFP against the string,
 286    starting at index START.
 287
 288    re_search() first tries matching at index START, then it tries to match
 289    starting from index START + 1, and so on.  The last start position tried
 290    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 291    way as re_match().)
 292
 293    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 294    the first STOP characters of the concatenation of the strings should be
 295    concerned.
 296
 297    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 298    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 299    computed relative to the concatenation, not relative to the individual
 300    strings.)
 301
 302    On success, re_match* functions return the length of the match, re_search*
 303    return the position of the start of the match.  Return value -1 means no
 304    match was found and -2 indicates an internal error.  */
 305
 306 int
 307 re_match (bufp, string, length, start, regs)
 308     struct re_pattern_buffer *bufp;
 309     const char *string;
 310     int length, start;
 311     struct re_registers *regs;
 312 {
 313   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 314 }
 315 #ifdef _LIBC
 316 weak_alias (__re_match, re_match)
 317 #endif
 318
 319 int
 320 re_search (bufp, string, length, start, range, regs)
 321     struct re_pattern_buffer *bufp;
 322     const char *string;
 323     int length, start, range;
 324     struct re_registers *regs;
 325 {
 326   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 327 }
 328 #ifdef _LIBC
 329 weak_alias (__re_search, re_search)
 330 #endif
 331
 332 int
 333 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 334     struct re_pattern_buffer *bufp;
 335     const char *string1, *string2;
 336     int length1, length2, start, stop;
 337     struct re_registers *regs;
 338 {
 339   return re_search_2_stub (bufp, string1, length1, string2, length2,
 340                            start, 0, regs, stop, 1);
 341 }
 342 #ifdef _LIBC
 343 weak_alias (__re_match_2, re_match_2)
 344 #endif
 345
 346 int
 347 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 348     struct re_pattern_buffer *bufp;
 349     const char *string1, *string2;
 350     int length1, length2, start, range, stop;
 351     struct re_registers *regs;
 352 {
 353   return re_search_2_stub (bufp, string1, length1, string2, length2,
 354                            start, range, regs, stop, 0);
 355 }
 356 #ifdef _LIBC
 357 weak_alias (__re_search_2, re_search_2)
 358 #endif
 359
 360 static int
 361 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 362                   stop, ret_len)
 363     struct re_pattern_buffer *bufp;
 364     const char *string1, *string2;
 365     int length1, length2, start, range, stop, ret_len;
 366     struct re_registers *regs;
 367 {
 368   const char *str;
 369   int rval;
 370   int len = length1 + length2;
 371   char *s = NULL;
 372
 373   if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
 374     return -2;
 375
 376   /* Concatenate the strings.  */
 377   if (length2 > 0)
 378     if (length1 > 0)
 379       {
 380         s = re_malloc (char, len);
 381
 382         if (BE (s == NULL, 0))
 383           return -2;
 384 #ifdef _LIBC
 385         memcpy (__mempcpy (s, string1, length1), string2, length2);
 386 #else
 387         memcpy (s, string1, length1);
 388         memcpy (s + length1, string2, length2);
 389 #endif
 390         str = s;
 391       }
 392     else
 393       str = string2;
 394   else
 395     str = string1;
 396
 397   rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
 398   re_free (s);
 399   return rval;
 400 }
 401
 402 /* The parameters have the same meaning as those of re_search.
 403    Additional parameters:
 404    If RET_LEN is nonzero the length of the match is returned (re_match style);
 405    otherwise the position of the match is returned.  */
 406
 407 static int
 408 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 409     struct re_pattern_buffer *bufp;
 410     const char *string;
 411     int length, start, range, stop, ret_len;
 412     struct re_registers *regs;
 413 {
 414   reg_errcode_t result;
 415   regmatch_t *pmatch;
 416   int nregs, rval;
 417   int eflags = 0;
 418   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 419
 420   /* Check for out-of-range.  */
 421   if (BE (start < 0 || start > length, 0))
 422     return -1;
 423   if (BE (start + range > length, 0))
 424     range = length - start;
 425   else if (BE (start + range < 0, 0))
 426     range = -start;
 427
 428   __libc_lock_lock (dfa->lock);
 429
 430   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 431   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 432
 433   /* Compile fastmap if we haven't yet.  */
 434   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 435     re_compile_fastmap (bufp);
 436
 437   if (BE (bufp->no_sub, 0))
 438     regs = NULL;
 439
 440   /* We need at least 1 register.  */
 441   if (regs == NULL)
 442     nregs = 1;
 443   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 444                regs->num_regs < bufp->re_nsub + 1, 0))
 445     {
 446       nregs = regs->num_regs;
 447       if (BE (nregs < 1, 0))
 448         {
 449           /* Nothing can be copied to regs.  */
 450           regs = NULL;
 451           nregs = 1;
 452         }
 453     }
 454   else
 455     nregs = bufp->re_nsub + 1;
 456   pmatch = re_malloc (regmatch_t, nregs);
 457   if (BE (pmatch == NULL, 0))
 458     {
 459       rval = -2;
 460       goto out;
 461     }
 462
 463   result = re_search_internal (bufp, string, length, start, range, stop,
 464                                nregs, pmatch, eflags);
 465
 466   rval = 0;
 467
 468   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 469   if (result != REG_NOERROR)
 470     rval = -1;
 471   else if (regs != NULL)
 472     {
 473       /* If caller wants register contents data back, copy them.  */
 474       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 475                                            bufp->regs_allocated);
 476       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 477         rval = -2;
 478     }
 479
 480   if (BE (rval == 0, 1))
 481     {
 482       if (ret_len)
 483         {
 484           assert (pmatch[0].rm_so == start);
 485           rval = pmatch[0].rm_eo - start;
 486         }
 487       else
 488         rval = pmatch[0].rm_so;
 489     }
 490   re_free (pmatch);
 491  out:
 492   __libc_lock_unlock (dfa->lock);
 493   return rval;
 494 }
 495
 496 static unsigned
 497 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 498     struct re_registers *regs;
 499     regmatch_t *pmatch;
 500     int nregs, regs_allocated;
 501 {
 502   int rval = REGS_REALLOCATE;
 503   int i;
 504   int need_regs = nregs + 1;
 505   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 506      uses.  */
 507
 508   /* Have the register data arrays been allocated?  */
 509   if (regs_allocated == REGS_UNALLOCATED)
 510     { /* No.  So allocate them with malloc.  */
 511       regs->start = re_malloc (regoff_t, need_regs);
 512       if (BE (regs->start == NULL, 0))
 513         return REGS_UNALLOCATED;
 514       regs->end = re_malloc (regoff_t, need_regs);
 515       if (BE (regs->end == NULL, 0))
 516         {
 517           re_free (regs->start);
 518           return REGS_UNALLOCATED;
 519         }
 520       regs->num_regs = need_regs;
 521     }
 522   else if (regs_allocated == REGS_REALLOCATE)
 523     { /* Yes.  If we need more elements than were already
 524          allocated, reallocate them.  If we need fewer, just
 525          leave it alone.  */
 526       if (BE (need_regs > regs->num_regs, 0))
 527         {
 528           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 529           regoff_t *new_end;
 530           if (BE (new_start == NULL, 0))
 531             return REGS_UNALLOCATED;
 532           new_end = re_realloc (regs->end, regoff_t, need_regs);
 533           if (BE (new_end == NULL, 0))
 534             {
 535               re_free (new_start);
 536               return REGS_UNALLOCATED;
 537             }
 538           regs->start = new_start;
 539           regs->end = new_end;
 540           regs->num_regs = need_regs;
 541         }
 542     }
 543   else
 544     {
 545       assert (regs_allocated == REGS_FIXED);
 546       /* This function may not be called with REGS_FIXED and nregs too big.  */
 547       assert (regs->num_regs >= nregs);
 548       rval = REGS_FIXED;
 549     }
 550
 551   /* Copy the regs.  */
 552   for (i = 0; i < nregs; ++i)
 553     {
 554       regs->start[i] = pmatch[i].rm_so;
 555       regs->end[i] = pmatch[i].rm_eo;
 556     }
 557   for ( ; i < regs->num_regs; ++i)
 558     regs->start[i] = regs->end[i] = -1;
 559
 560   return rval;
 561 }
 562
 563 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 564    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 565    this memory for recording register information.  STARTS and ENDS
 566    must be allocated using the malloc library routine, and must each
 567    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 568
 569    If NUM_REGS == 0, then subsequent matches should allocate their own
 570    register data.
 571
 572    Unless this function is called, the first search or match using
 573    PATTERN_BUFFER will allocate its own register data, without
 574    freeing the old data.  */
 575
 576 void
 577 re_set_registers (bufp, regs, num_regs, starts, ends)
 578     struct re_pattern_buffer *bufp;
 579     struct re_registers *regs;
 580     unsigned num_regs;
 581     regoff_t *starts, *ends;
 582 {
 583   if (num_regs)
 584     {
 585       bufp->regs_allocated = REGS_REALLOCATE;
 586       regs->num_regs = num_regs;
 587       regs->start = starts;
 588       regs->end = ends;
 589     }
 590   else
 591     {
 592       bufp->regs_allocated = REGS_UNALLOCATED;
 593       regs->num_regs = 0;
 594       regs->start = regs->end = (regoff_t *) 0;
 595     }
 596 }
 597 #ifdef _LIBC
 598 weak_alias (__re_set_registers, re_set_registers)
 599 #endif
 600 \f
 601 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 602    them unless specifically requested.  */
 603
 604 #if defined _REGEX_RE_COMP || defined _LIBC
 605 int
 606 # ifdef _LIBC
 607 weak_function
 608 # endif
 609 re_exec (s)
 610      const char *s;
 611 {
 612   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 613 }
 614 #endif /* _REGEX_RE_COMP */
 615 \f
 616 /* Internal entry point.  */
 617
 618 /* Searches for a compiled pattern PREG in the string STRING, whose
 619    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 620    mingings with regexec.  START, and RANGE have the same meanings
 621    with re_search.
 622    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 623    otherwise return the error code.
 624    Note: We assume front end functions already check ranges.
 625    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 626
 627 static reg_errcode_t
 628 __attribute_warn_unused_result__
 629 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 630                     eflags)
 631     const regex_t *preg;
 632     const char *string;
 633     int length, start, range, stop, eflags;
 634     size_t nmatch;
 635     regmatch_t pmatch[];
 636 {
 637   reg_errcode_t err;
 638   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 639   int left_lim, right_lim, incr;
 640   int fl_longest_match, match_first, match_kind, match_last = -1;
 641   int extra_nmatch;
 642   int sb, ch;
 643 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 644   re_match_context_t mctx = { .dfa = dfa };
 645 #else
 646   re_match_context_t mctx;
 647 #endif
 648   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 649                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 650   RE_TRANSLATE_TYPE t = preg->translate;
 651
 652 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 653   memset (&mctx, '\0', sizeof (re_match_context_t));
 654   mctx.dfa = dfa;
 655 #endif
 656
 657   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 658   nmatch -= extra_nmatch;
 659
 660   /* Check if the DFA haven't been compiled.  */
 661   if (BE (preg->used == 0 || dfa->init_state == NULL
 662           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 663           || dfa->init_state_begbuf == NULL, 0))
 664     return REG_NOMATCH;
 665
 666 #ifdef DEBUG
 667   /* We assume front-end functions already check them.  */
 668   assert (start + range >= 0 && start + range <= length);
 669 #endif
 670
 671   /* If initial states with non-begbuf contexts have no elements,
 672      the regex must be anchored.  If preg->newline_anchor is set,
 673      we'll never use init_state_nl, so do not check it.  */
 674   if (dfa->init_state->nodes.nelem == 0
 675       && dfa->init_state_word->nodes.nelem == 0
 676       && (dfa->init_state_nl->nodes.nelem == 0
 677           || !preg->newline_anchor))
 678     {
 679       if (start != 0 && start + range != 0)
 680         return REG_NOMATCH;
 681       start = range = 0;
 682     }
 683
 684   /* We must check the longest matching, if nmatch > 0.  */
 685   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 686
 687   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 688                             preg->translate, preg->syntax & RE_ICASE, dfa);
 689   if (BE (err != REG_NOERROR, 0))
 690     goto free_return;
 691   mctx.input.stop = stop;
 692   mctx.input.raw_stop = stop;
 693   mctx.input.newline_anchor = preg->newline_anchor;
 694
 695   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 696   if (BE (err != REG_NOERROR, 0))
 697     goto free_return;
 698
 699   /* We will log all the DFA states through which the dfa pass,
 700      if nmatch > 1, or this dfa has "multibyte node", which is a
 701      back-reference or a node which can accept multibyte character or
 702      multi character collating element.  */
 703   if (nmatch > 1 || dfa->has_mb_node)
 704     {
 705       /* Avoid overflow.  */
 706       if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
 707         {
 708           err = REG_ESPACE;
 709           goto free_return;
 710         }
 711
 712       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 713       if (BE (mctx.state_log == NULL, 0))
 714         {
 715           err = REG_ESPACE;
 716           goto free_return;
 717         }
 718     }
 719   else
 720     mctx.state_log = NULL;
 721
 722   match_first = start;
 723   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 724                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 725
 726   /* Check incrementally whether of not the input string match.  */
 727   incr = (range < 0) ? -1 : 1;
 728   left_lim = (range < 0) ? start + range : start;
 729   right_lim = (range < 0) ? start : start + range;
 730   sb = dfa->mb_cur_max == 1;
 731   match_kind =
 732     (fastmap
 733      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 734         | (range >= 0 ? 2 : 0)
 735         | (t != NULL ? 1 : 0))
 736      : 8);
 737
 738   for (;; match_first += incr)
 739     {
 740       err = REG_NOMATCH;
 741       if (match_first < left_lim || right_lim < match_first)
 742         goto free_return;
 743
 744       /* Advance as rapidly as possible through the string, until we
 745          find a plausible place to start matching.  This may be done
 746          with varying efficiency, so there are various possibilities:
 747          only the most common of them are specialized, in order to
 748          save on code size.  We use a switch statement for speed.  */
 749       switch (match_kind)
 750         {
 751         case 8:
 752           /* No fastmap.  */
 753           break;
 754
 755         case 7:
 756           /* Fastmap with single-byte translation, match forward.  */
 757           while (BE (match_first < right_lim, 1)
 758                  && !fastmap[t[(unsigned char) string[match_first]]])
 759             ++match_first;
 760           goto forward_match_found_start_or_reached_end;
 761
 762         case 6:
 763           /* Fastmap without translation, match forward.  */
 764           while (BE (match_first < right_lim, 1)
 765                  && !fastmap[(unsigned char) string[match_first]])
 766             ++match_first;
 767
 768         forward_match_found_start_or_reached_end:
 769           if (BE (match_first == right_lim, 0))
 770             {
 771               ch = match_first >= length
 772                        ? 0 : (unsigned char) string[match_first];
 773               if (!fastmap[t ? t[ch] : ch])
 774                 goto free_return;
 775             }
 776           break;
 777
 778         case 4:
 779         case 5:
 780           /* Fastmap without multi-byte translation, match backwards.  */
 781           while (match_first >= left_lim)
 782             {
 783               ch = match_first >= length
 784                        ? 0 : (unsigned char) string[match_first];
 785               if (fastmap[t ? t[ch] : ch])
 786                 break;
 787               --match_first;
 788             }
 789           if (match_first < left_lim)
 790             goto free_return;
 791           break;
 792
 793         default:
 794           /* In this case, we can't determine easily the current byte,
 795              since it might be a component byte of a multibyte
 796              character.  Then we use the constructed buffer instead.  */
 797           for (;;)
 798             {
 799               /* If MATCH_FIRST is out of the valid range, reconstruct the
 800                  buffers.  */
 801               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 802               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 803                 {
 804                   err = re_string_reconstruct (&mctx.input, match_first,
 805                                                eflags);
 806                   if (BE (err != REG_NOERROR, 0))
 807                     goto free_return;
 808
 809                   offset = match_first - mctx.input.raw_mbs_idx;
 810                 }
 811               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 812                  Note that MATCH_FIRST must not be smaller than 0.  */
 813               ch = (match_first >= length
 814                     ? 0 : re_string_byte_at (&mctx.input, offset));
 815               if (fastmap[ch])
 816                 break;
 817               match_first += incr;
 818               if (match_first < left_lim || match_first > right_lim)
 819                 {
 820                   err = REG_NOMATCH;
 821                   goto free_return;
 822                 }
 823             }
 824           break;
 825         }
 826
 827       /* Reconstruct the buffers so that the matcher can assume that
 828          the matching starts from the beginning of the buffer.  */
 829       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 830       if (BE (err != REG_NOERROR, 0))
 831         goto free_return;
 832
 833 #ifdef RE_ENABLE_I18N
 834      /* Don't consider this char as a possible match start if it part,
 835         yet isn't the head, of a multibyte character.  */
 836       if (!sb && !re_string_first_byte (&mctx.input, 0))
 837         continue;
 838 #endif
 839
 840       /* It seems to be appropriate one, then use the matcher.  */
 841       /* We assume that the matching starts from 0.  */
 842       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 843       match_last = check_matching (&mctx, fl_longest_match,
 844                                    range >= 0 ? &match_first : NULL);
 845       if (match_last != -1)
 846         {
 847           if (BE (match_last == -2, 0))
 848             {
 849               err = REG_ESPACE;
 850               goto free_return;
 851             }
 852           else
 853             {
 854               mctx.match_last = match_last;
 855               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 856                 {
 857                   re_dfastate_t *pstate = mctx.state_log[match_last];
 858                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 859                                                              match_last);
 860                 }
 861               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 862                   || dfa->nbackref)
 863                 {
 864                   err = prune_impossible_nodes (&mctx);
 865                   if (err == REG_NOERROR)
 866                     break;
 867                   if (BE (err != REG_NOMATCH, 0))
 868                     goto free_return;
 869                   match_last = -1;
 870                 }
 871               else
 872                 break; /* We found a match.  */
 873             }
 874         }
 875
 876       match_ctx_clean (&mctx);
 877     }
 878
 879 #ifdef DEBUG
 880   assert (match_last != -1);
 881   assert (err == REG_NOERROR);
 882 #endif
 883
 884   /* Set pmatch[] if we need.  */
 885   if (nmatch > 0)
 886     {
 887       int reg_idx;
 888
 889       /* Initialize registers.  */
 890       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 891         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 892
 893       /* Set the points where matching start/end.  */
 894       pmatch[0].rm_so = 0;
 895       pmatch[0].rm_eo = mctx.match_last;
 896
 897       if (!preg->no_sub && nmatch > 1)
 898         {
 899           err = set_regs (preg, &mctx, nmatch, pmatch,
 900                           dfa->has_plural_match && dfa->nbackref > 0);
 901           if (BE (err != REG_NOERROR, 0))
 902             goto free_return;
 903         }
 904
 905       /* At last, add the offset to the each registers, since we slided
 906          the buffers so that we could assume that the matching starts
 907          from 0.  */
 908       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 909         if (pmatch[reg_idx].rm_so != -1)
 910           {
 911 #ifdef RE_ENABLE_I18N
 912             if (BE (mctx.input.offsets_needed != 0, 0))
 913               {
 914                 pmatch[reg_idx].rm_so =
 915                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 916                    ? mctx.input.valid_raw_len
 917                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 918                 pmatch[reg_idx].rm_eo =
 919                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 920                    ? mctx.input.valid_raw_len
 921                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 922               }
 923 #else
 924             assert (mctx.input.offsets_needed == 0);
 925 #endif
 926             pmatch[reg_idx].rm_so += match_first;
 927             pmatch[reg_idx].rm_eo += match_first;
 928           }
 929       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 930         {
 931           pmatch[nmatch + reg_idx].rm_so = -1;
 932           pmatch[nmatch + reg_idx].rm_eo = -1;
 933         }
 934
 935       if (dfa->subexp_map)
 936         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 937           if (dfa->subexp_map[reg_idx] != reg_idx)
 938             {
 939               pmatch[reg_idx + 1].rm_so
 940                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 941               pmatch[reg_idx + 1].rm_eo
 942                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 943             }
 944     }
 945
 946  free_return:
 947   re_free (mctx.state_log);
 948   if (dfa->nbackref)
 949     match_ctx_free (&mctx);
 950   re_string_destruct (&mctx.input);
 951   return err;
 952 }
 953
 954 static reg_errcode_t
 955 __attribute_warn_unused_result__
 956 prune_impossible_nodes (mctx)
 957      re_match_context_t *mctx;
 958 {
 959   const re_dfa_t *const dfa = mctx->dfa;
 960   int halt_node, match_last;
 961   reg_errcode_t ret;
 962   re_dfastate_t **sifted_states;
 963   re_dfastate_t **lim_states = NULL;
 964   re_sift_context_t sctx;
 965 #ifdef DEBUG
 966   assert (mctx->state_log != NULL);
 967 #endif
 968   match_last = mctx->match_last;
 969   halt_node = mctx->last_node;
 970
 971   /* Avoid overflow.  */
 972   if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
 973     return REG_ESPACE;
 974
 975   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 976   if (BE (sifted_states == NULL, 0))
 977     {
 978       ret = REG_ESPACE;
 979       goto free_return;
 980     }
 981   if (dfa->nbackref)
 982     {
 983       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 984       if (BE (lim_states == NULL, 0))
 985         {
 986           ret = REG_ESPACE;
 987           goto free_return;
 988         }
 989       while (1)
 990         {
 991           memset (lim_states, '\0',
 992                   sizeof (re_dfastate_t *) * (match_last + 1));
 993           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 994                          match_last);
 995           ret = sift_states_backward (mctx, &sctx);
 996           re_node_set_free (&sctx.limits);
 997           if (BE (ret != REG_NOERROR, 0))
 998               goto free_return;
 999           if (sifted_states[0] != NULL || lim_states[0] != NULL)
1000             break;
1001           do
1002             {
1003               --match_last;
1004               if (match_last < 0)
1005                 {
1006                   ret = REG_NOMATCH;
1007                   goto free_return;
1008                 }
1009             } while (mctx->state_log[match_last] == NULL
1010                      || !mctx->state_log[match_last]->halt);
1011           halt_node = check_halt_state_context (mctx,
1012                                                 mctx->state_log[match_last],
1013                                                 match_last);
1014         }
1015       ret = merge_state_array (dfa, sifted_states, lim_states,
1016                                match_last + 1);
1017       re_free (lim_states);
1018       lim_states = NULL;
1019       if (BE (ret != REG_NOERROR, 0))
1020         goto free_return;
1021     }
1022   else
1023     {
1024       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1025       ret = sift_states_backward (mctx, &sctx);
1026       re_node_set_free (&sctx.limits);
1027       if (BE (ret != REG_NOERROR, 0))
1028         goto free_return;
1029       if (sifted_states[0] == NULL)
1030         {
1031           ret = REG_NOMATCH;
1032           goto free_return;
1033         }
1034     }
1035   re_free (mctx->state_log);
1036   mctx->state_log = sifted_states;
1037   sifted_states = NULL;
1038   mctx->last_node = halt_node;
1039   mctx->match_last = match_last;
1040   ret = REG_NOERROR;
1041  free_return:
1042   re_free (sifted_states);
1043   re_free (lim_states);
1044   return ret;
1045 }
1046
1047 /* Acquire an initial state and return it.
1048    We must select appropriate initial state depending on the context,
1049    since initial states may have constraints like "\<", "^", etc..  */
1050
1051 static inline re_dfastate_t *
1052 __attribute ((always_inline)) internal_function
1053 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1054                             int idx)
1055 {
1056   const re_dfa_t *const dfa = mctx->dfa;
1057   if (dfa->init_state->has_constraint)
1058     {
1059       unsigned int context;
1060       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1061       if (IS_WORD_CONTEXT (context))
1062         return dfa->init_state_word;
1063       else if (IS_ORDINARY_CONTEXT (context))
1064         return dfa->init_state;
1065       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1066         return dfa->init_state_begbuf;
1067       else if (IS_NEWLINE_CONTEXT (context))
1068         return dfa->init_state_nl;
1069       else if (IS_BEGBUF_CONTEXT (context))
1070         {
1071           /* It is relatively rare case, then calculate on demand.  */
1072           return re_acquire_state_context (err, dfa,
1073                                            dfa->init_state->entrance_nodes,
1074                                            context);
1075         }
1076       else
1077         /* Must not happen?  */
1078         return dfa->init_state;
1079     }
1080   else
1081     return dfa->init_state;
1082 }
1083
1084 /* Check whether the regular expression match input string INPUT or not,
1085    and return the index where the matching end, return -1 if not match,
1086    or return -2 in case of an error.
1087    FL_LONGEST_MATCH means we want the POSIX longest matching.
1088    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1089    next place where we may want to try matching.
1090    Note that the matcher assume that the maching starts from the current
1091    index of the buffer.  */
1092
1093 static int
1094 internal_function __attribute_warn_unused_result__
1095 check_matching (re_match_context_t *mctx, int fl_longest_match,
1096                 int *p_match_first)
1097 {
1098   const re_dfa_t *const dfa = mctx->dfa;
1099   reg_errcode_t err;
1100   int match = 0;
1101   int match_last = -1;
1102   int cur_str_idx = re_string_cur_idx (&mctx->input);
1103   re_dfastate_t *cur_state;
1104   int at_init_state = p_match_first != NULL;
1105   int next_start_idx = cur_str_idx;
1106
1107   err = REG_NOERROR;
1108   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1109   /* An initial state must not be NULL (invalid).  */
1110   if (BE (cur_state == NULL, 0))
1111     {
1112       assert (err == REG_ESPACE);
1113       return -2;
1114     }
1115
1116   if (mctx->state_log != NULL)
1117     {
1118       mctx->state_log[cur_str_idx] = cur_state;
1119
1120       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1121          later.  E.g. Processing back references.  */
1122       if (BE (dfa->nbackref, 0))
1123         {
1124           at_init_state = 0;
1125           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1126           if (BE (err != REG_NOERROR, 0))
1127             return err;
1128
1129           if (cur_state->has_backref)
1130             {
1131               err = transit_state_bkref (mctx, &cur_state->nodes);
1132               if (BE (err != REG_NOERROR, 0))
1133                 return err;
1134             }
1135         }
1136     }
1137
1138   /* If the RE accepts NULL string.  */
1139   if (BE (cur_state->halt, 0))
1140     {
1141       if (!cur_state->has_constraint
1142           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1143         {
1144           if (!fl_longest_match)
1145             return cur_str_idx;
1146           else
1147             {
1148               match_last = cur_str_idx;
1149               match = 1;
1150             }
1151         }
1152     }
1153
1154   while (!re_string_eoi (&mctx->input))
1155     {
1156       re_dfastate_t *old_state = cur_state;
1157       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1158
1159       if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1160            && mctx->input.bufs_len < mctx->input.len)
1161           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1162               && mctx->input.valid_len < mctx->input.len))
1163         {
1164           err = extend_buffers (mctx);
1165           if (BE (err != REG_NOERROR, 0))
1166             {
1167               assert (err == REG_ESPACE);
1168               return -2;
1169             }
1170         }
1171
1172       cur_state = transit_state (&err, mctx, cur_state);
1173       if (mctx->state_log != NULL)
1174         cur_state = merge_state_with_log (&err, mctx, cur_state);
1175
1176       if (cur_state == NULL)
1177         {
1178           /* Reached the invalid state or an error.  Try to recover a valid
1179              state using the state log, if available and if we have not
1180              already found a valid (even if not the longest) match.  */
1181           if (BE (err != REG_NOERROR, 0))
1182             return -2;
1183
1184           if (mctx->state_log == NULL
1185               || (match && !fl_longest_match)
1186               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1187             break;
1188         }
1189
1190       if (BE (at_init_state, 0))
1191         {
1192           if (old_state == cur_state)
1193             next_start_idx = next_char_idx;
1194           else
1195             at_init_state = 0;
1196         }
1197
1198       if (cur_state->halt)
1199         {
1200           /* Reached a halt state.
1201              Check the halt state can satisfy the current context.  */
1202           if (!cur_state->has_constraint
1203               || check_halt_state_context (mctx, cur_state,
1204                                            re_string_cur_idx (&mctx->input)))
1205             {
1206               /* We found an appropriate halt state.  */
1207               match_last = re_string_cur_idx (&mctx->input);
1208               match = 1;
1209
1210               /* We found a match, do not modify match_first below.  */
1211               p_match_first = NULL;
1212               if (!fl_longest_match)
1213                 break;
1214             }
1215         }
1216     }
1217
1218   if (p_match_first)
1219     *p_match_first += next_start_idx;
1220
1221   return match_last;
1222 }
1223
1224 /* Check NODE match the current context.  */
1225
1226 static int
1227 internal_function
1228 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1229 {
1230   re_token_type_t type = dfa->nodes[node].type;
1231   unsigned int constraint = dfa->nodes[node].constraint;
1232   if (type != END_OF_RE)
1233     return 0;
1234   if (!constraint)
1235     return 1;
1236   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1237     return 0;
1238   return 1;
1239 }
1240
1241 /* Check the halt state STATE match the current context.
1242    Return 0 if not match, if the node, STATE has, is a halt node and
1243    match the context, return the node.  */
1244
1245 static int
1246 internal_function
1247 check_halt_state_context (const re_match_context_t *mctx,
1248                           const re_dfastate_t *state, int idx)
1249 {
1250   int i;
1251   unsigned int context;
1252 #ifdef DEBUG
1253   assert (state->halt);
1254 #endif
1255   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1256   for (i = 0; i < state->nodes.nelem; ++i)
1257     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1258       return state->nodes.elems[i];
1259   return 0;
1260 }
1261
1262 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1263    corresponding to the DFA).
1264    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1265    of errors.  */
1266
1267 static int
1268 internal_function
1269 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1270                    int *pidx, int node, re_node_set *eps_via_nodes,
1271                    struct re_fail_stack_t *fs)
1272 {
1273   const re_dfa_t *const dfa = mctx->dfa;
1274   int i, err;
1275   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1276     {
1277       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1278       re_node_set *edests = &dfa->edests[node];
1279       int dest_node;
1280       err = re_node_set_insert (eps_via_nodes, node);
1281       if (BE (err < 0, 0))
1282         return -2;
1283       /* Pick up a valid destination, or return -1 if none is found.  */
1284       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1285         {
1286           int candidate = edests->elems[i];
1287           if (!re_node_set_contains (cur_nodes, candidate))
1288             continue;
1289           if (dest_node == -1)
1290             dest_node = candidate;
1291
1292           else
1293             {
1294               /* In order to avoid infinite loop like "(a*)*", return the second
1295                  epsilon-transition if the first was already considered.  */
1296               if (re_node_set_contains (eps_via_nodes, dest_node))
1297                 return candidate;
1298
1299               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1300               else if (fs != NULL
1301                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1302                                            eps_via_nodes))
1303                 return -2;
1304
1305               /* We know we are going to exit.  */
1306               break;
1307             }
1308         }
1309       return dest_node;
1310     }
1311   else
1312     {
1313       int naccepted = 0;
1314       re_token_type_t type = dfa->nodes[node].type;
1315
1316 #ifdef RE_ENABLE_I18N
1317       if (dfa->nodes[node].accept_mb)
1318         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1319       else
1320 #endif /* RE_ENABLE_I18N */
1321       if (type == OP_BACK_REF)
1322         {
1323           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1324           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1325           if (fs != NULL)
1326             {
1327               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1328                 return -1;
1329               else if (naccepted)
1330                 {
1331                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1332                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1333                               naccepted) != 0)
1334                     return -1;
1335                 }
1336             }
1337
1338           if (naccepted == 0)
1339             {
1340               int dest_node;
1341               err = re_node_set_insert (eps_via_nodes, node);
1342               if (BE (err < 0, 0))
1343                 return -2;
1344               dest_node = dfa->edests[node].elems[0];
1345               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1346                                         dest_node))
1347                 return dest_node;
1348             }
1349         }
1350
1351       if (naccepted != 0
1352           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1353         {
1354           int dest_node = dfa->nexts[node];
1355           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1356           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1357                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1358                                                dest_node)))
1359             return -1;
1360           re_node_set_empty (eps_via_nodes);
1361           return dest_node;
1362         }
1363     }
1364   return -1;
1365 }
1366
1367 static reg_errcode_t
1368 internal_function __attribute_warn_unused_result__
1369 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1370                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1371 {
1372   reg_errcode_t err;
1373   int num = fs->num++;
1374   if (fs->num == fs->alloc)
1375     {
1376       struct re_fail_stack_ent_t *new_array;
1377       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1378                                        * fs->alloc * 2));
1379       if (new_array == NULL)
1380         return REG_ESPACE;
1381       fs->alloc *= 2;
1382       fs->stack = new_array;
1383     }
1384   fs->stack[num].idx = str_idx;
1385   fs->stack[num].node = dest_node;
1386   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1387   if (fs->stack[num].regs == NULL)
1388     return REG_ESPACE;
1389   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1390   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1391   return err;
1392 }
1393
1394 static int
1395 internal_function
1396 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1397                 regmatch_t *regs, re_node_set *eps_via_nodes)
1398 {
1399   int num = --fs->num;
1400   assert (num >= 0);
1401   *pidx = fs->stack[num].idx;
1402   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1403   re_node_set_free (eps_via_nodes);
1404   re_free (fs->stack[num].regs);
1405   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1406   return fs->stack[num].node;
1407 }
1408
1409 /* Set the positions where the subexpressions are starts/ends to registers
1410    PMATCH.
1411    Note: We assume that pmatch[0] is already set, and
1412    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1413
1414 static reg_errcode_t
1415 internal_function __attribute_warn_unused_result__
1416 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1417           regmatch_t *pmatch, int fl_backtrack)
1418 {
1419   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1420   int idx, cur_node;
1421   re_node_set eps_via_nodes;
1422   struct re_fail_stack_t *fs;
1423   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1424   regmatch_t *prev_idx_match;
1425   int prev_idx_match_malloced = 0;
1426
1427 #ifdef DEBUG
1428   assert (nmatch > 1);
1429   assert (mctx->state_log != NULL);
1430 #endif
1431   if (fl_backtrack)
1432     {
1433       fs = &fs_body;
1434       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1435       if (fs->stack == NULL)
1436         return REG_ESPACE;
1437     }
1438   else
1439     fs = NULL;
1440
1441   cur_node = dfa->init_node;
1442   re_node_set_init_empty (&eps_via_nodes);
1443
1444   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1445     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1446   else
1447     {
1448       prev_idx_match = re_malloc (regmatch_t, nmatch);
1449       if (prev_idx_match == NULL)
1450         {
1451           free_fail_stack_return (fs);
1452           return REG_ESPACE;
1453         }
1454       prev_idx_match_malloced = 1;
1455     }
1456   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1457
1458   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1459     {
1460       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1461
1462       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1463         {
1464           int reg_idx;
1465           if (fs)
1466             {
1467               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1468                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1469                   break;
1470               if (reg_idx == nmatch)
1471                 {
1472                   re_node_set_free (&eps_via_nodes);
1473                   if (prev_idx_match_malloced)
1474                     re_free (prev_idx_match);
1475                   return free_fail_stack_return (fs);
1476                 }
1477               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1478                                          &eps_via_nodes);
1479             }
1480           else
1481             {
1482               re_node_set_free (&eps_via_nodes);
1483               if (prev_idx_match_malloced)
1484                 re_free (prev_idx_match);
1485               return REG_NOERROR;
1486             }
1487         }
1488
1489       /* Proceed to next node.  */
1490       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1491                                     &eps_via_nodes, fs);
1492
1493       if (BE (cur_node < 0, 0))
1494         {
1495           if (BE (cur_node == -2, 0))
1496             {
1497               re_node_set_free (&eps_via_nodes);
1498               if (prev_idx_match_malloced)
1499                 re_free (prev_idx_match);
1500               free_fail_stack_return (fs);
1501               return REG_ESPACE;
1502             }
1503           if (fs)
1504             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1505                                        &eps_via_nodes);
1506           else
1507             {
1508               re_node_set_free (&eps_via_nodes);
1509               if (prev_idx_match_malloced)
1510                 re_free (prev_idx_match);
1511               return REG_NOMATCH;
1512             }
1513         }
1514     }
1515   re_node_set_free (&eps_via_nodes);
1516   if (prev_idx_match_malloced)
1517     re_free (prev_idx_match);
1518   return free_fail_stack_return (fs);
1519 }
1520
1521 static reg_errcode_t
1522 internal_function
1523 free_fail_stack_return (struct re_fail_stack_t *fs)
1524 {
1525   if (fs)
1526     {
1527       int fs_idx;
1528       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1529         {
1530           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1531           re_free (fs->stack[fs_idx].regs);
1532         }
1533       re_free (fs->stack);
1534     }
1535   return REG_NOERROR;
1536 }
1537
1538 static void
1539 internal_function
1540 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1541              regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1542 {
1543   int type = dfa->nodes[cur_node].type;
1544   if (type == OP_OPEN_SUBEXP)
1545     {
1546       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1547
1548       /* We are at the first node of this sub expression.  */
1549       if (reg_num < nmatch)
1550         {
1551           pmatch[reg_num].rm_so = cur_idx;
1552           pmatch[reg_num].rm_eo = -1;
1553         }
1554     }
1555   else if (type == OP_CLOSE_SUBEXP)
1556     {
1557       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1558       if (reg_num < nmatch)
1559         {
1560           /* We are at the last node of this sub expression.  */
1561           if (pmatch[reg_num].rm_so < cur_idx)
1562             {
1563               pmatch[reg_num].rm_eo = cur_idx;
1564               /* This is a non-empty match or we are not inside an optional
1565                  subexpression.  Accept this right away.  */
1566               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1567             }
1568           else
1569             {
1570               if (dfa->nodes[cur_node].opt_subexp
1571                   && prev_idx_match[reg_num].rm_so != -1)
1572                 /* We transited through an empty match for an optional
1573                    subexpression, like (a?)*, and this is not the subexp's
1574                    first match.  Copy back the old content of the registers
1575                    so that matches of an inner subexpression are undone as
1576                    well, like in ((a?))*.  */
1577                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1578               else
1579                 /* We completed a subexpression, but it may be part of
1580                    an optional one, so do not update PREV_IDX_MATCH.  */
1581                 pmatch[reg_num].rm_eo = cur_idx;
1582             }
1583         }
1584     }
1585 }
1586
1587 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1588    and sift the nodes in each states according to the following rules.
1589    Updated state_log will be wrote to STATE_LOG.
1590
1591    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1592      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1593         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1594         the LAST_NODE, we throw away the node `a'.
1595      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1596         string `s' and transit to `b':
1597         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1598            away the node `a'.
1599         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1600             thrown away, we throw away the node `a'.
1601      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1602         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1603            node `a'.
1604         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1605             we throw away the node `a'.  */
1606
1607 #define STATE_NODE_CONTAINS(state,node) \
1608   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1609
1610 static reg_errcode_t
1611 internal_function
1612 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1613 {
1614   reg_errcode_t err;
1615   int null_cnt = 0;
1616   int str_idx = sctx->last_str_idx;
1617   re_node_set cur_dest;
1618
1619 #ifdef DEBUG
1620   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1621 #endif
1622
1623   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1624      transit to the last_node and the last_node itself.  */
1625   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1626   if (BE (err != REG_NOERROR, 0))
1627     return err;
1628   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1629   if (BE (err != REG_NOERROR, 0))
1630     goto free_return;
1631
1632   /* Then check each states in the state_log.  */
1633   while (str_idx > 0)
1634     {
1635       /* Update counters.  */
1636       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1637       if (null_cnt > mctx->max_mb_elem_len)
1638         {
1639           memset (sctx->sifted_states, '\0',
1640                   sizeof (re_dfastate_t *) * str_idx);
1641           re_node_set_free (&cur_dest);
1642           return REG_NOERROR;
1643         }
1644       re_node_set_empty (&cur_dest);
1645       --str_idx;
1646
1647       if (mctx->state_log[str_idx])
1648         {
1649           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1650           if (BE (err != REG_NOERROR, 0))
1651             goto free_return;
1652         }
1653
1654       /* Add all the nodes which satisfy the following conditions:
1655          - It can epsilon transit to a node in CUR_DEST.
1656          - It is in CUR_SRC.
1657          And update state_log.  */
1658       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1659       if (BE (err != REG_NOERROR, 0))
1660         goto free_return;
1661     }
1662   err = REG_NOERROR;
1663  free_return:
1664   re_node_set_free (&cur_dest);
1665   return err;
1666 }
1667
1668 static reg_errcode_t
1669 internal_function __attribute_warn_unused_result__
1670 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1671                      int str_idx, re_node_set *cur_dest)
1672 {
1673   const re_dfa_t *const dfa = mctx->dfa;
1674   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1675   int i;
1676
1677   /* Then build the next sifted state.
1678      We build the next sifted state on `cur_dest', and update
1679      `sifted_states[str_idx]' with `cur_dest'.
1680      Note:
1681      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1682      `cur_src' points the node_set of the old `state_log[str_idx]'
1683      (with the epsilon nodes pre-filtered out).  */
1684   for (i = 0; i < cur_src->nelem; i++)
1685     {
1686       int prev_node = cur_src->elems[i];
1687       int naccepted = 0;
1688       int ret;
1689
1690 #ifdef DEBUG
1691       re_token_type_t type = dfa->nodes[prev_node].type;
1692       assert (!IS_EPSILON_NODE (type));
1693 #endif
1694 #ifdef RE_ENABLE_I18N
1695       /* If the node may accept `multi byte'.  */
1696       if (dfa->nodes[prev_node].accept_mb)
1697         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1698                                          str_idx, sctx->last_str_idx);
1699 #endif /* RE_ENABLE_I18N */
1700
1701       /* We don't check backreferences here.
1702          See update_cur_sifted_state().  */
1703       if (!naccepted
1704           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1705           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1706                                   dfa->nexts[prev_node]))
1707         naccepted = 1;
1708
1709       if (naccepted == 0)
1710         continue;
1711
1712       if (sctx->limits.nelem)
1713         {
1714           int to_idx = str_idx + naccepted;
1715           if (check_dst_limits (mctx, &sctx->limits,
1716                                 dfa->nexts[prev_node], to_idx,
1717                                 prev_node, str_idx))
1718             continue;
1719         }
1720       ret = re_node_set_insert (cur_dest, prev_node);
1721       if (BE (ret == -1, 0))
1722         return REG_ESPACE;
1723     }
1724
1725   return REG_NOERROR;
1726 }
1727
1728 /* Helper functions.  */
1729
1730 static reg_errcode_t
1731 internal_function
1732 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1733 {
1734   int top = mctx->state_log_top;
1735
1736   if ((next_state_log_idx >= mctx->input.bufs_len
1737        && mctx->input.bufs_len < mctx->input.len)
1738       || (next_state_log_idx >= mctx->input.valid_len
1739           && mctx->input.valid_len < mctx->input.len))
1740     {
1741       reg_errcode_t err;
1742       err = extend_buffers (mctx);
1743       if (BE (err != REG_NOERROR, 0))
1744         return err;
1745     }
1746
1747   if (top < next_state_log_idx)
1748     {
1749       memset (mctx->state_log + top + 1, '\0',
1750               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1751       mctx->state_log_top = next_state_log_idx;
1752     }
1753   return REG_NOERROR;
1754 }
1755
1756 static reg_errcode_t
1757 internal_function
1758 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1759                    re_dfastate_t **src, int num)
1760 {
1761   int st_idx;
1762   reg_errcode_t err;
1763   for (st_idx = 0; st_idx < num; ++st_idx)
1764     {
1765       if (dst[st_idx] == NULL)
1766         dst[st_idx] = src[st_idx];
1767       else if (src[st_idx] != NULL)
1768         {
1769           re_node_set merged_set;
1770           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1771                                         &src[st_idx]->nodes);
1772           if (BE (err != REG_NOERROR, 0))
1773             return err;
1774           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1775           re_node_set_free (&merged_set);
1776           if (BE (err != REG_NOERROR, 0))
1777             return err;
1778         }
1779     }
1780   return REG_NOERROR;
1781 }
1782
1783 static reg_errcode_t
1784 internal_function
1785 update_cur_sifted_state (const re_match_context_t *mctx,
1786                          re_sift_context_t *sctx, int str_idx,
1787                          re_node_set *dest_nodes)
1788 {
1789   const re_dfa_t *const dfa = mctx->dfa;
1790   reg_errcode_t err = REG_NOERROR;
1791   const re_node_set *candidates;
1792   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1793                 : &mctx->state_log[str_idx]->nodes);
1794
1795   if (dest_nodes->nelem == 0)
1796     sctx->sifted_states[str_idx] = NULL;
1797   else
1798     {
1799       if (candidates)
1800         {
1801           /* At first, add the nodes which can epsilon transit to a node in
1802              DEST_NODE.  */
1803           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1804           if (BE (err != REG_NOERROR, 0))
1805             return err;
1806
1807           /* Then, check the limitations in the current sift_context.  */
1808           if (sctx->limits.nelem)
1809             {
1810               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1811                                          mctx->bkref_ents, str_idx);
1812               if (BE (err != REG_NOERROR, 0))
1813                 return err;
1814             }
1815         }
1816
1817       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1818       if (BE (err != REG_NOERROR, 0))
1819         return err;
1820     }
1821
1822   if (candidates && mctx->state_log[str_idx]->has_backref)
1823     {
1824       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1825       if (BE (err != REG_NOERROR, 0))
1826         return err;
1827     }
1828   return REG_NOERROR;
1829 }
1830
1831 static reg_errcode_t
1832 internal_function __attribute_warn_unused_result__
1833 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1834                        const re_node_set *candidates)
1835 {
1836   reg_errcode_t err = REG_NOERROR;
1837   int i;
1838
1839   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1840   if (BE (err != REG_NOERROR, 0))
1841     return err;
1842
1843   if (!state->inveclosure.alloc)
1844     {
1845       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1846       if (BE (err != REG_NOERROR, 0))
1847         return REG_ESPACE;
1848       for (i = 0; i < dest_nodes->nelem; i++)
1849         {
1850           err = re_node_set_merge (&state->inveclosure,
1851                                    dfa->inveclosures + dest_nodes->elems[i]);
1852           if (BE (err != REG_NOERROR, 0))
1853             return REG_ESPACE;
1854         }
1855     }
1856   return re_node_set_add_intersect (dest_nodes, candidates,
1857                                     &state->inveclosure);
1858 }
1859
1860 static reg_errcode_t
1861 internal_function
1862 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1863                        const re_node_set *candidates)
1864 {
1865     int ecl_idx;
1866     reg_errcode_t err;
1867     re_node_set *inv_eclosure = dfa->inveclosures + node;
1868     re_node_set except_nodes;
1869     re_node_set_init_empty (&except_nodes);
1870     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1871       {
1872         int cur_node = inv_eclosure->elems[ecl_idx];
1873         if (cur_node == node)
1874           continue;
1875         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1876           {
1877             int edst1 = dfa->edests[cur_node].elems[0];
1878             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1879                          ? dfa->edests[cur_node].elems[1] : -1);
1880             if ((!re_node_set_contains (inv_eclosure, edst1)
1881                  && re_node_set_contains (dest_nodes, edst1))
1882                 || (edst2 > 0
1883                     && !re_node_set_contains (inv_eclosure, edst2)
1884                     && re_node_set_contains (dest_nodes, edst2)))
1885               {
1886                 err = re_node_set_add_intersect (&except_nodes, candidates,
1887                                                  dfa->inveclosures + cur_node);
1888                 if (BE (err != REG_NOERROR, 0))
1889                   {
1890                     re_node_set_free (&except_nodes);
1891                     return err;
1892                   }
1893               }
1894           }
1895       }
1896     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1897       {
1898         int cur_node = inv_eclosure->elems[ecl_idx];
1899         if (!re_node_set_contains (&except_nodes, cur_node))
1900           {
1901             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1902             re_node_set_remove_at (dest_nodes, idx);
1903           }
1904       }
1905     re_node_set_free (&except_nodes);
1906     return REG_NOERROR;
1907 }
1908
1909 static int
1910 internal_function
1911 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1912                   int dst_node, int dst_idx, int src_node, int src_idx)
1913 {
1914   const re_dfa_t *const dfa = mctx->dfa;
1915   int lim_idx, src_pos, dst_pos;
1916
1917   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1918   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1919   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1920     {
1921       int subexp_idx;
1922       struct re_backref_cache_entry *ent;
1923       ent = mctx->bkref_ents + limits->elems[lim_idx];
1924       subexp_idx = dfa->nodes[ent->node].opr.idx;
1925
1926       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1927                                            subexp_idx, dst_node, dst_idx,
1928                                            dst_bkref_idx);
1929       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1930                                            subexp_idx, src_node, src_idx,
1931                                            src_bkref_idx);
1932
1933       /* In case of:
1934          <src> <dst> ( <subexp> )
1935          ( <subexp> ) <src> <dst>
1936          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1937       if (src_pos == dst_pos)
1938         continue; /* This is unrelated limitation.  */
1939       else
1940         return 1;
1941     }
1942   return 0;
1943 }
1944
1945 static int
1946 internal_function
1947 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1948                              int subexp_idx, int from_node, int bkref_idx)
1949 {
1950   const re_dfa_t *const dfa = mctx->dfa;
1951   const re_node_set *eclosures = dfa->eclosures + from_node;
1952   int node_idx;
1953
1954   /* Else, we are on the boundary: examine the nodes on the epsilon
1955      closure.  */
1956   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1957     {
1958       int node = eclosures->elems[node_idx];
1959       switch (dfa->nodes[node].type)
1960         {
1961         case OP_BACK_REF:
1962           if (bkref_idx != -1)
1963             {
1964               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1965               do
1966                 {
1967                   int dst, cpos;
1968
1969                   if (ent->node != node)
1970                     continue;
1971
1972                   if (subexp_idx < BITSET_WORD_BITS
1973                       && !(ent->eps_reachable_subexps_map
1974                            & ((bitset_word_t) 1 << subexp_idx)))
1975                     continue;
1976
1977                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1978                      OP_CLOSE_SUBEXP cases below.  But, if the
1979                      destination node is the same node as the source
1980                      node, don't recurse because it would cause an
1981                      infinite loop: a regex that exhibits this behavior
1982                      is ()\1*\1*  */
1983                   dst = dfa->edests[node].elems[0];
1984                   if (dst == from_node)
1985                     {
1986                       if (boundaries & 1)
1987                         return -1;
1988                       else /* if (boundaries & 2) */
1989                         return 0;
1990                     }
1991
1992                   cpos =
1993                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1994                                                  dst, bkref_idx);
1995                   if (cpos == -1 /* && (boundaries & 1) */)
1996                     return -1;
1997                   if (cpos == 0 && (boundaries & 2))
1998                     return 0;
1999
2000                   if (subexp_idx < BITSET_WORD_BITS)
2001                     ent->eps_reachable_subexps_map
2002                       &= ~((bitset_word_t) 1 << subexp_idx);
2003                 }
2004               while (ent++->more);
2005             }
2006           break;
2007
2008         case OP_OPEN_SUBEXP:
2009           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2010             return -1;
2011           break;
2012
2013         case OP_CLOSE_SUBEXP:
2014           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2015             return 0;
2016           break;
2017
2018         default:
2019             break;
2020         }
2021     }
2022
2023   return (boundaries & 2) ? 1 : 0;
2024 }
2025
2026 static int
2027 internal_function
2028 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2029                            int subexp_idx, int from_node, int str_idx,
2030                            int bkref_idx)
2031 {
2032   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2033   int boundaries;
2034
2035   /* If we are outside the range of the subexpression, return -1 or 1.  */
2036   if (str_idx < lim->subexp_from)
2037     return -1;
2038
2039   if (lim->subexp_to < str_idx)
2040     return 1;
2041
2042   /* If we are within the subexpression, return 0.  */
2043   boundaries = (str_idx == lim->subexp_from);
2044   boundaries |= (str_idx == lim->subexp_to) << 1;
2045   if (boundaries == 0)
2046     return 0;
2047
2048   /* Else, examine epsilon closure.  */
2049   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2050                                       from_node, bkref_idx);
2051 }
2052
2053 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2054    which are against limitations from DEST_NODES. */
2055
2056 static reg_errcode_t
2057 internal_function
2058 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2059                      const re_node_set *candidates, re_node_set *limits,
2060                      struct re_backref_cache_entry *bkref_ents, int str_idx)
2061 {
2062   reg_errcode_t err;
2063   int node_idx, lim_idx;
2064
2065   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2066     {
2067       int subexp_idx;
2068       struct re_backref_cache_entry *ent;
2069       ent = bkref_ents + limits->elems[lim_idx];
2070
2071       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2072         continue; /* This is unrelated limitation.  */
2073
2074       subexp_idx = dfa->nodes[ent->node].opr.idx;
2075       if (ent->subexp_to == str_idx)
2076         {
2077           int ops_node = -1;
2078           int cls_node = -1;
2079           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2080             {
2081               int node = dest_nodes->elems[node_idx];
2082               re_token_type_t type = dfa->nodes[node].type;
2083               if (type == OP_OPEN_SUBEXP
2084                   && subexp_idx == dfa->nodes[node].opr.idx)
2085                 ops_node = node;
2086               else if (type == OP_CLOSE_SUBEXP
2087                        && subexp_idx == dfa->nodes[node].opr.idx)
2088                 cls_node = node;
2089             }
2090
2091           /* Check the limitation of the open subexpression.  */
2092           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2093           if (ops_node >= 0)
2094             {
2095               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2096                                            candidates);
2097               if (BE (err != REG_NOERROR, 0))
2098                 return err;
2099             }
2100
2101           /* Check the limitation of the close subexpression.  */
2102           if (cls_node >= 0)
2103             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2104               {
2105                 int node = dest_nodes->elems[node_idx];
2106                 if (!re_node_set_contains (dfa->inveclosures + node,
2107                                            cls_node)
2108                     && !re_node_set_contains (dfa->eclosures + node,
2109                                               cls_node))
2110                   {
2111                     /* It is against this limitation.
2112                        Remove it form the current sifted state.  */
2113                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2114                                                  candidates);
2115                     if (BE (err != REG_NOERROR, 0))
2116                       return err;
2117                     --node_idx;
2118                   }
2119               }
2120         }
2121       else /* (ent->subexp_to != str_idx)  */
2122         {
2123           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2124             {
2125               int node = dest_nodes->elems[node_idx];
2126               re_token_type_t type = dfa->nodes[node].type;
2127               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2128                 {
2129                   if (subexp_idx != dfa->nodes[node].opr.idx)
2130                     continue;
2131                   /* It is against this limitation.
2132                      Remove it form the current sifted state.  */
2133                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2134                                                candidates);
2135                   if (BE (err != REG_NOERROR, 0))
2136                     return err;
2137                 }
2138             }
2139         }
2140     }
2141   return REG_NOERROR;
2142 }
2143
2144 static reg_errcode_t
2145 internal_function __attribute_warn_unused_result__
2146 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2147                    int str_idx, const re_node_set *candidates)
2148 {
2149   const re_dfa_t *const dfa = mctx->dfa;
2150   reg_errcode_t err;
2151   int node_idx, node;
2152   re_sift_context_t local_sctx;
2153   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2154
2155   if (first_idx == -1)
2156     return REG_NOERROR;
2157
2158   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2159
2160   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2161     {
2162       int enabled_idx;
2163       re_token_type_t type;
2164       struct re_backref_cache_entry *entry;
2165       node = candidates->elems[node_idx];
2166       type = dfa->nodes[node].type;
2167       /* Avoid infinite loop for the REs like "()\1+".  */
2168       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2169         continue;
2170       if (type != OP_BACK_REF)
2171         continue;
2172
2173       entry = mctx->bkref_ents + first_idx;
2174       enabled_idx = first_idx;
2175       do
2176         {
2177           int subexp_len;
2178           int to_idx;
2179           int dst_node;
2180           int ret;
2181           re_dfastate_t *cur_state;
2182
2183           if (entry->node != node)
2184             continue;
2185           subexp_len = entry->subexp_to - entry->subexp_from;
2186           to_idx = str_idx + subexp_len;
2187           dst_node = (subexp_len ? dfa->nexts[node]
2188                       : dfa->edests[node].elems[0]);
2189
2190           if (to_idx > sctx->last_str_idx
2191               || sctx->sifted_states[to_idx] == NULL
2192               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2193               || check_dst_limits (mctx, &sctx->limits, node,
2194                                    str_idx, dst_node, to_idx))
2195             continue;
2196
2197           if (local_sctx.sifted_states == NULL)
2198             {
2199               local_sctx = *sctx;
2200               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2201               if (BE (err != REG_NOERROR, 0))
2202                 goto free_return;
2203             }
2204           local_sctx.last_node = node;
2205           local_sctx.last_str_idx = str_idx;
2206           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2207           if (BE (ret < 0, 0))
2208             {
2209               err = REG_ESPACE;
2210               goto free_return;
2211             }
2212           cur_state = local_sctx.sifted_states[str_idx];
2213           err = sift_states_backward (mctx, &local_sctx);
2214           if (BE (err != REG_NOERROR, 0))
2215             goto free_return;
2216           if (sctx->limited_states != NULL)
2217             {
2218               err = merge_state_array (dfa, sctx->limited_states,
2219                                        local_sctx.sifted_states,
2220                                        str_idx + 1);
2221               if (BE (err != REG_NOERROR, 0))
2222                 goto free_return;
2223             }
2224           local_sctx.sifted_states[str_idx] = cur_state;
2225           re_node_set_remove (&local_sctx.limits, enabled_idx);
2226
2227           /* mctx->bkref_ents may have changed, reload the pointer.  */
2228           entry = mctx->bkref_ents + enabled_idx;
2229         }
2230       while (enabled_idx++, entry++->more);
2231     }
2232   err = REG_NOERROR;
2233  free_return:
2234   if (local_sctx.sifted_states != NULL)
2235     {
2236       re_node_set_free (&local_sctx.limits);
2237     }
2238
2239   return err;
2240 }
2241
2242
2243 #ifdef RE_ENABLE_I18N
2244 static int
2245 internal_function
2246 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2247                      int node_idx, int str_idx, int max_str_idx)
2248 {
2249   const re_dfa_t *const dfa = mctx->dfa;
2250   int naccepted;
2251   /* Check the node can accept `multi byte'.  */
2252   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2253   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2254       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2255                             dfa->nexts[node_idx]))
2256     /* The node can't accept the `multi byte', or the
2257        destination was already thrown away, then the node
2258        could't accept the current input `multi byte'.   */
2259     naccepted = 0;
2260   /* Otherwise, it is sure that the node could accept
2261      `naccepted' bytes input.  */
2262   return naccepted;
2263 }
2264 #endif /* RE_ENABLE_I18N */
2265
2266 \f
2267 /* Functions for state transition.  */
2268
2269 /* Return the next state to which the current state STATE will transit by
2270    accepting the current input byte, and update STATE_LOG if necessary.
2271    If STATE can accept a multibyte char/collating element/back reference
2272    update the destination of STATE_LOG.  */
2273
2274 static re_dfastate_t *
2275 internal_function __attribute_warn_unused_result__
2276 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2277                re_dfastate_t *state)
2278 {
2279   re_dfastate_t **trtable;
2280   unsigned char ch;
2281
2282 #ifdef RE_ENABLE_I18N
2283   /* If the current state can accept multibyte.  */
2284   if (BE (state->accept_mb, 0))
2285     {
2286       *err = transit_state_mb (mctx, state);
2287       if (BE (*err != REG_NOERROR, 0))
2288         return NULL;
2289     }
2290 #endif /* RE_ENABLE_I18N */
2291
2292   /* Then decide the next state with the single byte.  */
2293 #if 0
2294   if (0)
2295     /* don't use transition table  */
2296     return transit_state_sb (err, mctx, state);
2297 #endif
2298
2299   /* Use transition table  */
2300   ch = re_string_fetch_byte (&mctx->input);
2301   for (;;)
2302     {
2303       trtable = state->trtable;
2304       if (BE (trtable != NULL, 1))
2305         return trtable[ch];
2306
2307       trtable = state->word_trtable;
2308       if (BE (trtable != NULL, 1))
2309         {
2310           unsigned int context;
2311           context
2312             = re_string_context_at (&mctx->input,
2313                                     re_string_cur_idx (&mctx->input) - 1,
2314                                     mctx->eflags);
2315           if (IS_WORD_CONTEXT (context))
2316             return trtable[ch + SBC_MAX];
2317           else
2318             return trtable[ch];
2319         }
2320
2321       if (!build_trtable (mctx->dfa, state))
2322         {
2323           *err = REG_ESPACE;
2324           return NULL;
2325         }
2326
2327       /* Retry, we now have a transition table.  */
2328     }
2329 }
2330
2331 /* Update the state_log if we need */
2332 re_dfastate_t *
2333 internal_function
2334 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2335                       re_dfastate_t *next_state)
2336 {
2337   const re_dfa_t *const dfa = mctx->dfa;
2338   int cur_idx = re_string_cur_idx (&mctx->input);
2339
2340   if (cur_idx > mctx->state_log_top)
2341     {
2342       mctx->state_log[cur_idx] = next_state;
2343       mctx->state_log_top = cur_idx;
2344     }
2345   else if (mctx->state_log[cur_idx] == 0)
2346     {
2347       mctx->state_log[cur_idx] = next_state;
2348     }
2349   else
2350     {
2351       re_dfastate_t *pstate;
2352       unsigned int context;
2353       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2354       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2355          the destination of a multibyte char/collating element/
2356          back reference.  Then the next state is the union set of
2357          these destinations and the results of the transition table.  */
2358       pstate = mctx->state_log[cur_idx];
2359       log_nodes = pstate->entrance_nodes;
2360       if (next_state != NULL)
2361         {
2362           table_nodes = next_state->entrance_nodes;
2363           *err = re_node_set_init_union (&next_nodes, table_nodes,
2364                                              log_nodes);
2365           if (BE (*err != REG_NOERROR, 0))
2366             return NULL;
2367         }
2368       else
2369         next_nodes = *log_nodes;
2370       /* Note: We already add the nodes of the initial state,
2371          then we don't need to add them here.  */
2372
2373       context = re_string_context_at (&mctx->input,
2374                                       re_string_cur_idx (&mctx->input) - 1,
2375                                       mctx->eflags);
2376       next_state = mctx->state_log[cur_idx]
2377         = re_acquire_state_context (err, dfa, &next_nodes, context);
2378       /* We don't need to check errors here, since the return value of
2379          this function is next_state and ERR is already set.  */
2380
2381       if (table_nodes != NULL)
2382         re_node_set_free (&next_nodes);
2383     }
2384
2385   if (BE (dfa->nbackref, 0) && next_state != NULL)
2386     {
2387       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2388          later.  We must check them here, since the back references in the
2389          next state might use them.  */
2390       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2391                                         cur_idx);
2392       if (BE (*err != REG_NOERROR, 0))
2393         return NULL;
2394
2395       /* If the next state has back references.  */
2396       if (next_state->has_backref)
2397         {
2398           *err = transit_state_bkref (mctx, &next_state->nodes);
2399           if (BE (*err != REG_NOERROR, 0))
2400             return NULL;
2401           next_state = mctx->state_log[cur_idx];
2402         }
2403     }
2404
2405   return next_state;
2406 }
2407
2408 /* Skip bytes in the input that correspond to part of a
2409    multi-byte match, then look in the log for a state
2410    from which to restart matching.  */
2411 re_dfastate_t *
2412 internal_function
2413 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2414 {
2415   re_dfastate_t *cur_state;
2416   do
2417     {
2418       int max = mctx->state_log_top;
2419       int cur_str_idx = re_string_cur_idx (&mctx->input);
2420
2421       do
2422         {
2423           if (++cur_str_idx > max)
2424             return NULL;
2425           re_string_skip_bytes (&mctx->input, 1);
2426         }
2427       while (mctx->state_log[cur_str_idx] == NULL);
2428
2429       cur_state = merge_state_with_log (err, mctx, NULL);
2430     }
2431   while (*err == REG_NOERROR && cur_state == NULL);
2432   return cur_state;
2433 }
2434
2435 /* Helper functions for transit_state.  */
2436
2437 /* From the node set CUR_NODES, pick up the nodes whose types are
2438    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2439    expression. And register them to use them later for evaluating the
2440    correspoding back references.  */
2441
2442 static reg_errcode_t
2443 internal_function
2444 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2445                            int str_idx)
2446 {
2447   const re_dfa_t *const dfa = mctx->dfa;
2448   int node_idx;
2449   reg_errcode_t err;
2450
2451   /* TODO: This isn't efficient.
2452            Because there might be more than one nodes whose types are
2453            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2454            nodes.
2455            E.g. RE: (a){2}  */
2456   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2457     {
2458       int node = cur_nodes->elems[node_idx];
2459       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2460           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2461           && (dfa->used_bkref_map
2462               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2463         {
2464           err = match_ctx_add_subtop (mctx, node, str_idx);
2465           if (BE (err != REG_NOERROR, 0))
2466             return err;
2467         }
2468     }
2469   return REG_NOERROR;
2470 }
2471
2472 #if 0
2473 /* Return the next state to which the current state STATE will transit by
2474    accepting the current input byte.  */
2475
2476 static re_dfastate_t *
2477 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2478                   re_dfastate_t *state)
2479 {
2480   const re_dfa_t *const dfa = mctx->dfa;
2481   re_node_set next_nodes;
2482   re_dfastate_t *next_state;
2483   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2484   unsigned int context;
2485
2486   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2487   if (BE (*err != REG_NOERROR, 0))
2488     return NULL;
2489   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2490     {
2491       int cur_node = state->nodes.elems[node_cnt];
2492       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2493         {
2494           *err = re_node_set_merge (&next_nodes,
2495                                     dfa->eclosures + dfa->nexts[cur_node]);
2496           if (BE (*err != REG_NOERROR, 0))
2497             {
2498               re_node_set_free (&next_nodes);
2499               return NULL;
2500             }
2501         }
2502     }
2503   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2504   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2505   /* We don't need to check errors here, since the return value of
2506      this function is next_state and ERR is already set.  */
2507
2508   re_node_set_free (&next_nodes);
2509   re_string_skip_bytes (&mctx->input, 1);
2510   return next_state;
2511 }
2512 #endif
2513
2514 #ifdef RE_ENABLE_I18N
2515 static reg_errcode_t
2516 internal_function
2517 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2518 {
2519   const re_dfa_t *const dfa = mctx->dfa;
2520   reg_errcode_t err;
2521   int i;
2522
2523   for (i = 0; i < pstate->nodes.nelem; ++i)
2524     {
2525       re_node_set dest_nodes, *new_nodes;
2526       int cur_node_idx = pstate->nodes.elems[i];
2527       int naccepted, dest_idx;
2528       unsigned int context;
2529       re_dfastate_t *dest_state;
2530
2531       if (!dfa->nodes[cur_node_idx].accept_mb)
2532         continue;
2533
2534       if (dfa->nodes[cur_node_idx].constraint)
2535         {
2536           context = re_string_context_at (&mctx->input,
2537                                           re_string_cur_idx (&mctx->input),
2538                                           mctx->eflags);
2539           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2540                                            context))
2541             continue;
2542         }
2543
2544       /* How many bytes the node can accept?  */
2545       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2546                                            re_string_cur_idx (&mctx->input));
2547       if (naccepted == 0)
2548         continue;
2549
2550       /* The node can accepts `naccepted' bytes.  */
2551       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2552       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2553                                : mctx->max_mb_elem_len);
2554       err = clean_state_log_if_needed (mctx, dest_idx);
2555       if (BE (err != REG_NOERROR, 0))
2556         return err;
2557 #ifdef DEBUG
2558       assert (dfa->nexts[cur_node_idx] != -1);
2559 #endif
2560       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2561
2562       dest_state = mctx->state_log[dest_idx];
2563       if (dest_state == NULL)
2564         dest_nodes = *new_nodes;
2565       else
2566         {
2567           err = re_node_set_init_union (&dest_nodes,
2568                                         dest_state->entrance_nodes, new_nodes);
2569           if (BE (err != REG_NOERROR, 0))
2570             return err;
2571         }
2572       context = re_string_context_at (&mctx->input, dest_idx - 1,
2573                                       mctx->eflags);
2574       mctx->state_log[dest_idx]
2575         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2576       if (dest_state != NULL)
2577         re_node_set_free (&dest_nodes);
2578       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2579         return err;
2580     }
2581   return REG_NOERROR;
2582 }
2583 #endif /* RE_ENABLE_I18N */
2584
2585 static reg_errcode_t
2586 internal_function
2587 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2588 {
2589   const re_dfa_t *const dfa = mctx->dfa;
2590   reg_errcode_t err;
2591   int i;
2592   int cur_str_idx = re_string_cur_idx (&mctx->input);
2593
2594   for (i = 0; i < nodes->nelem; ++i)
2595     {
2596       int dest_str_idx, prev_nelem, bkc_idx;
2597       int node_idx = nodes->elems[i];
2598       unsigned int context;
2599       const re_token_t *node = dfa->nodes + node_idx;
2600       re_node_set *new_dest_nodes;
2601
2602       /* Check whether `node' is a backreference or not.  */
2603       if (node->type != OP_BACK_REF)
2604         continue;
2605
2606       if (node->constraint)
2607         {
2608           context = re_string_context_at (&mctx->input, cur_str_idx,
2609                                           mctx->eflags);
2610           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2611             continue;
2612         }
2613
2614       /* `node' is a backreference.
2615          Check the substring which the substring matched.  */
2616       bkc_idx = mctx->nbkref_ents;
2617       err = get_subexp (mctx, node_idx, cur_str_idx);
2618       if (BE (err != REG_NOERROR, 0))
2619         goto free_return;
2620
2621       /* And add the epsilon closures (which is `new_dest_nodes') of
2622          the backreference to appropriate state_log.  */
2623 #ifdef DEBUG
2624       assert (dfa->nexts[node_idx] != -1);
2625 #endif
2626       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2627         {
2628           int subexp_len;
2629           re_dfastate_t *dest_state;
2630           struct re_backref_cache_entry *bkref_ent;
2631           bkref_ent = mctx->bkref_ents + bkc_idx;
2632           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2633             continue;
2634           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2635           new_dest_nodes = (subexp_len == 0
2636                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2637                             : dfa->eclosures + dfa->nexts[node_idx]);
2638           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2639                           - bkref_ent->subexp_from);
2640           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2641                                           mctx->eflags);
2642           dest_state = mctx->state_log[dest_str_idx];
2643           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2644                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2645           /* Add `new_dest_node' to state_log.  */
2646           if (dest_state == NULL)
2647             {
2648               mctx->state_log[dest_str_idx]
2649                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2650                                             context);
2651               if (BE (mctx->state_log[dest_str_idx] == NULL
2652                       && err != REG_NOERROR, 0))
2653                 goto free_return;
2654             }
2655           else
2656             {
2657               re_node_set dest_nodes;
2658               err = re_node_set_init_union (&dest_nodes,
2659                                             dest_state->entrance_nodes,
2660                                             new_dest_nodes);
2661               if (BE (err != REG_NOERROR, 0))
2662                 {
2663                   re_node_set_free (&dest_nodes);
2664                   goto free_return;
2665                 }
2666               mctx->state_log[dest_str_idx]
2667                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2668               re_node_set_free (&dest_nodes);
2669               if (BE (mctx->state_log[dest_str_idx] == NULL
2670                       && err != REG_NOERROR, 0))
2671                 goto free_return;
2672             }
2673           /* We need to check recursively if the backreference can epsilon
2674              transit.  */
2675           if (subexp_len == 0
2676               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2677             {
2678               err = check_subexp_matching_top (mctx, new_dest_nodes,
2679                                                cur_str_idx);
2680               if (BE (err != REG_NOERROR, 0))
2681                 goto free_return;
2682               err = transit_state_bkref (mctx, new_dest_nodes);
2683               if (BE (err != REG_NOERROR, 0))
2684                 goto free_return;
2685             }
2686         }
2687     }
2688   err = REG_NOERROR;
2689  free_return:
2690   return err;
2691 }
2692
2693 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2694    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2695    Note that we might collect inappropriate candidates here.
2696    However, the cost of checking them strictly here is too high, then we
2697    delay these checking for prune_impossible_nodes().  */
2698
2699 static reg_errcode_t
2700 internal_function __attribute_warn_unused_result__
2701 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2702 {
2703   const re_dfa_t *const dfa = mctx->dfa;
2704   int subexp_num, sub_top_idx;
2705   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2706   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2707   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2708   if (cache_idx != -1)
2709     {
2710       const struct re_backref_cache_entry *entry
2711         = mctx->bkref_ents + cache_idx;
2712       do
2713         if (entry->node == bkref_node)
2714           return REG_NOERROR; /* We already checked it.  */
2715       while (entry++->more);
2716     }
2717
2718   subexp_num = dfa->nodes[bkref_node].opr.idx;
2719
2720   /* For each sub expression  */
2721   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2722     {
2723       reg_errcode_t err;
2724       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2725       re_sub_match_last_t *sub_last;
2726       int sub_last_idx, sl_str, bkref_str_off;
2727
2728       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2729         continue; /* It isn't related.  */
2730
2731       sl_str = sub_top->str_idx;
2732       bkref_str_off = bkref_str_idx;
2733       /* At first, check the last node of sub expressions we already
2734          evaluated.  */
2735       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2736         {
2737           int sl_str_diff;
2738           sub_last = sub_top->lasts[sub_last_idx];
2739           sl_str_diff = sub_last->str_idx - sl_str;
2740           /* The matched string by the sub expression match with the substring
2741              at the back reference?  */
2742           if (sl_str_diff > 0)
2743             {
2744               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2745                 {
2746                   /* Not enough chars for a successful match.  */
2747                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2748                     break;
2749
2750                   err = clean_state_log_if_needed (mctx,
2751                                                    bkref_str_off
2752                                                    + sl_str_diff);
2753                   if (BE (err != REG_NOERROR, 0))
2754                     return err;
2755                   buf = (const char *) re_string_get_buffer (&mctx->input);
2756                 }
2757               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2758                 /* We don't need to search this sub expression any more.  */
2759                 break;
2760             }
2761           bkref_str_off += sl_str_diff;
2762           sl_str += sl_str_diff;
2763           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2764                                 bkref_str_idx);
2765
2766           /* Reload buf, since the preceding call might have reallocated
2767              the buffer.  */
2768           buf = (const char *) re_string_get_buffer (&mctx->input);
2769
2770           if (err == REG_NOMATCH)
2771             continue;
2772           if (BE (err != REG_NOERROR, 0))
2773             return err;
2774         }
2775
2776       if (sub_last_idx < sub_top->nlasts)
2777         continue;
2778       if (sub_last_idx > 0)
2779         ++sl_str;
2780       /* Then, search for the other last nodes of the sub expression.  */
2781       for (; sl_str <= bkref_str_idx; ++sl_str)
2782         {
2783           int cls_node, sl_str_off;
2784           const re_node_set *nodes;
2785           sl_str_off = sl_str - sub_top->str_idx;
2786           /* The matched string by the sub expression match with the substring
2787              at the back reference?  */
2788           if (sl_str_off > 0)
2789             {
2790               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2791                 {
2792                   /* If we are at the end of the input, we cannot match.  */
2793                   if (bkref_str_off >= mctx->input.len)
2794                     break;
2795
2796                   err = extend_buffers (mctx);
2797                   if (BE (err != REG_NOERROR, 0))
2798                     return err;
2799
2800                   buf = (const char *) re_string_get_buffer (&mctx->input);
2801                 }
2802               if (buf [bkref_str_off++] != buf[sl_str - 1])
2803                 break; /* We don't need to search this sub expression
2804                           any more.  */
2805             }
2806           if (mctx->state_log[sl_str] == NULL)
2807             continue;
2808           /* Does this state have a ')' of the sub expression?  */
2809           nodes = &mctx->state_log[sl_str]->nodes;
2810           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2811                                        OP_CLOSE_SUBEXP);
2812           if (cls_node == -1)
2813             continue; /* No.  */
2814           if (sub_top->path == NULL)
2815             {
2816               sub_top->path = calloc (sizeof (state_array_t),
2817                                       sl_str - sub_top->str_idx + 1);
2818               if (sub_top->path == NULL)
2819                 return REG_ESPACE;
2820             }
2821           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2822              in the current context?  */
2823           err = check_arrival (mctx, sub_top->path, sub_top->node,
2824                                sub_top->str_idx, cls_node, sl_str,
2825                                OP_CLOSE_SUBEXP);
2826           if (err == REG_NOMATCH)
2827               continue;
2828           if (BE (err != REG_NOERROR, 0))
2829               return err;
2830           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2831           if (BE (sub_last == NULL, 0))
2832             return REG_ESPACE;
2833           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2834                                 bkref_str_idx);
2835           if (err == REG_NOMATCH)
2836             continue;
2837         }
2838     }
2839   return REG_NOERROR;
2840 }
2841
2842 /* Helper functions for get_subexp().  */
2843
2844 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2845    If it can arrive, register the sub expression expressed with SUB_TOP
2846    and SUB_LAST.  */
2847
2848 static reg_errcode_t
2849 internal_function
2850 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2851                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2852 {
2853   reg_errcode_t err;
2854   int to_idx;
2855   /* Can the subexpression arrive the back reference?  */
2856   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2857                        sub_last->str_idx, bkref_node, bkref_str,
2858                        OP_OPEN_SUBEXP);
2859   if (err != REG_NOERROR)
2860     return err;
2861   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2862                              sub_last->str_idx);
2863   if (BE (err != REG_NOERROR, 0))
2864     return err;
2865   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2866   return clean_state_log_if_needed (mctx, to_idx);
2867 }
2868
2869 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2870    Search '(' if FL_OPEN, or search ')' otherwise.
2871    TODO: This function isn't efficient...
2872          Because there might be more than one nodes whose types are
2873          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2874          nodes.
2875          E.g. RE: (a){2}  */
2876
2877 static int
2878 internal_function
2879 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2880                   int subexp_idx, int type)
2881 {
2882   int cls_idx;
2883   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2884     {
2885       int cls_node = nodes->elems[cls_idx];
2886       const re_token_t *node = dfa->nodes + cls_node;
2887       if (node->type == type
2888           && node->opr.idx == subexp_idx)
2889         return cls_node;
2890     }
2891   return -1;
2892 }
2893
2894 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2895    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2896    heavily reused.
2897    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2898
2899 static reg_errcode_t
2900 internal_function __attribute_warn_unused_result__
2901 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2902                int top_str, int last_node, int last_str, int type)
2903 {
2904   const re_dfa_t *const dfa = mctx->dfa;
2905   reg_errcode_t err = REG_NOERROR;
2906   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2907   re_dfastate_t *cur_state = NULL;
2908   re_node_set *cur_nodes, next_nodes;
2909   re_dfastate_t **backup_state_log;
2910   unsigned int context;
2911
2912   subexp_num = dfa->nodes[top_node].opr.idx;
2913   /* Extend the buffer if we need.  */
2914   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2915     {
2916       re_dfastate_t **new_array;
2917       int old_alloc = path->alloc;
2918       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2919       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2920       if (BE (new_array == NULL, 0))
2921         {
2922           path->alloc = old_alloc;
2923           return REG_ESPACE;
2924         }
2925       path->array = new_array;
2926       memset (new_array + old_alloc, '\0',
2927               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2928     }
2929
2930   str_idx = path->next_idx ?: top_str;
2931
2932   /* Temporary modify MCTX.  */
2933   backup_state_log = mctx->state_log;
2934   backup_cur_idx = mctx->input.cur_idx;
2935   mctx->state_log = path->array;
2936   mctx->input.cur_idx = str_idx;
2937
2938   /* Setup initial node set.  */
2939   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2940   if (str_idx == top_str)
2941     {
2942       err = re_node_set_init_1 (&next_nodes, top_node);
2943       if (BE (err != REG_NOERROR, 0))
2944         return err;
2945       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2946       if (BE (err != REG_NOERROR, 0))
2947         {
2948           re_node_set_free (&next_nodes);
2949           return err;
2950         }
2951     }
2952   else
2953     {
2954       cur_state = mctx->state_log[str_idx];
2955       if (cur_state && cur_state->has_backref)
2956         {
2957           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2958           if (BE (err != REG_NOERROR, 0))
2959             return err;
2960         }
2961       else
2962         re_node_set_init_empty (&next_nodes);
2963     }
2964   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2965     {
2966       if (next_nodes.nelem)
2967         {
2968           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2969                                     subexp_num, type);
2970           if (BE (err != REG_NOERROR, 0))
2971             {
2972               re_node_set_free (&next_nodes);
2973               return err;
2974             }
2975         }
2976       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2977       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2978         {
2979           re_node_set_free (&next_nodes);
2980           return err;
2981         }
2982       mctx->state_log[str_idx] = cur_state;
2983     }
2984
2985   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2986     {
2987       re_node_set_empty (&next_nodes);
2988       if (mctx->state_log[str_idx + 1])
2989         {
2990           err = re_node_set_merge (&next_nodes,
2991                                    &mctx->state_log[str_idx + 1]->nodes);
2992           if (BE (err != REG_NOERROR, 0))
2993             {
2994               re_node_set_free (&next_nodes);
2995               return err;
2996             }
2997         }
2998       if (cur_state)
2999         {
3000           err = check_arrival_add_next_nodes (mctx, str_idx,
3001                                               &cur_state->non_eps_nodes,
3002                                               &next_nodes);
3003           if (BE (err != REG_NOERROR, 0))
3004             {
3005               re_node_set_free (&next_nodes);
3006               return err;
3007             }
3008         }
3009       ++str_idx;
3010       if (next_nodes.nelem)
3011         {
3012           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3013           if (BE (err != REG_NOERROR, 0))
3014             {
3015               re_node_set_free (&next_nodes);
3016               return err;
3017             }
3018           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3019                                     subexp_num, type);
3020           if (BE (err != REG_NOERROR, 0))
3021             {
3022               re_node_set_free (&next_nodes);
3023               return err;
3024             }
3025         }
3026       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3027       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3028       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3029         {
3030           re_node_set_free (&next_nodes);
3031           return err;
3032         }
3033       mctx->state_log[str_idx] = cur_state;
3034       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3035     }
3036   re_node_set_free (&next_nodes);
3037   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3038                : &mctx->state_log[last_str]->nodes);
3039   path->next_idx = str_idx;
3040
3041   /* Fix MCTX.  */
3042   mctx->state_log = backup_state_log;
3043   mctx->input.cur_idx = backup_cur_idx;
3044
3045   /* Then check the current node set has the node LAST_NODE.  */
3046   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3047     return REG_NOERROR;
3048
3049   return REG_NOMATCH;
3050 }
3051
3052 /* Helper functions for check_arrival.  */
3053
3054 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3055    to NEXT_NODES.
3056    TODO: This function is similar to the functions transit_state*(),
3057          however this function has many additional works.
3058          Can't we unify them?  */
3059
3060 static reg_errcode_t
3061 internal_function __attribute_warn_unused_result__
3062 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3063                               re_node_set *cur_nodes, re_node_set *next_nodes)
3064 {
3065   const re_dfa_t *const dfa = mctx->dfa;
3066   int result;
3067   int cur_idx;
3068   reg_errcode_t err = REG_NOERROR;
3069   re_node_set union_set;
3070   re_node_set_init_empty (&union_set);
3071   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3072     {
3073       int naccepted = 0;
3074       int cur_node = cur_nodes->elems[cur_idx];
3075 #ifdef DEBUG
3076       re_token_type_t type = dfa->nodes[cur_node].type;
3077       assert (!IS_EPSILON_NODE (type));
3078 #endif
3079 #ifdef RE_ENABLE_I18N
3080       /* If the node may accept `multi byte'.  */
3081       if (dfa->nodes[cur_node].accept_mb)
3082         {
3083           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3084                                                str_idx);
3085           if (naccepted > 1)
3086             {
3087               re_dfastate_t *dest_state;
3088               int next_node = dfa->nexts[cur_node];
3089               int next_idx = str_idx + naccepted;
3090               dest_state = mctx->state_log[next_idx];
3091               re_node_set_empty (&union_set);
3092               if (dest_state)
3093                 {
3094                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3095                   if (BE (err != REG_NOERROR, 0))
3096                     {
3097                       re_node_set_free (&union_set);
3098                       return err;
3099                     }
3100                 }
3101               result = re_node_set_insert (&union_set, next_node);
3102               if (BE (result < 0, 0))
3103                 {
3104                   re_node_set_free (&union_set);
3105                   return REG_ESPACE;
3106                 }
3107               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3108                                                             &union_set);
3109               if (BE (mctx->state_log[next_idx] == NULL
3110                       && err != REG_NOERROR, 0))
3111                 {
3112                   re_node_set_free (&union_set);
3113                   return err;
3114                 }
3115             }
3116         }
3117 #endif /* RE_ENABLE_I18N */
3118       if (naccepted
3119           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3120         {
3121           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3122           if (BE (result < 0, 0))
3123             {
3124               re_node_set_free (&union_set);
3125               return REG_ESPACE;
3126             }
3127         }
3128     }
3129   re_node_set_free (&union_set);
3130   return REG_NOERROR;
3131 }
3132
3133 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3134    CUR_NODES, however exclude the nodes which are:
3135     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3136     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3137 */
3138
3139 static reg_errcode_t
3140 internal_function
3141 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3142                           int ex_subexp, int type)
3143 {
3144   reg_errcode_t err;
3145   int idx, outside_node;
3146   re_node_set new_nodes;
3147 #ifdef DEBUG
3148   assert (cur_nodes->nelem);
3149 #endif
3150   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3151   if (BE (err != REG_NOERROR, 0))
3152     return err;
3153   /* Create a new node set NEW_NODES with the nodes which are epsilon
3154      closures of the node in CUR_NODES.  */
3155
3156   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3157     {
3158       int cur_node = cur_nodes->elems[idx];
3159       const re_node_set *eclosure = dfa->eclosures + cur_node;
3160       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3161       if (outside_node == -1)
3162         {
3163           /* There are no problematic nodes, just merge them.  */
3164           err = re_node_set_merge (&new_nodes, eclosure);
3165           if (BE (err != REG_NOERROR, 0))
3166             {
3167               re_node_set_free (&new_nodes);
3168               return err;
3169             }
3170         }
3171       else
3172         {
3173           /* There are problematic nodes, re-calculate incrementally.  */
3174           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3175                                               ex_subexp, type);
3176           if (BE (err != REG_NOERROR, 0))
3177             {
3178               re_node_set_free (&new_nodes);
3179               return err;
3180             }
3181         }
3182     }
3183   re_node_set_free (cur_nodes);
3184   *cur_nodes = new_nodes;
3185   return REG_NOERROR;
3186 }
3187
3188 /* Helper function for check_arrival_expand_ecl.
3189    Check incrementally the epsilon closure of TARGET, and if it isn't
3190    problematic append it to DST_NODES.  */
3191
3192 static reg_errcode_t
3193 internal_function __attribute_warn_unused_result__
3194 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3195                               int target, int ex_subexp, int type)
3196 {
3197   int cur_node;
3198   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3199     {
3200       int err;
3201
3202       if (dfa->nodes[cur_node].type == type
3203           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3204         {
3205           if (type == OP_CLOSE_SUBEXP)
3206             {
3207               err = re_node_set_insert (dst_nodes, cur_node);
3208               if (BE (err == -1, 0))
3209                 return REG_ESPACE;
3210             }
3211           break;
3212         }
3213       err = re_node_set_insert (dst_nodes, cur_node);
3214       if (BE (err == -1, 0))
3215         return REG_ESPACE;
3216       if (dfa->edests[cur_node].nelem == 0)
3217         break;
3218       if (dfa->edests[cur_node].nelem == 2)
3219         {
3220           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3221                                               dfa->edests[cur_node].elems[1],
3222                                               ex_subexp, type);
3223           if (BE (err != REG_NOERROR, 0))
3224             return err;
3225         }
3226       cur_node = dfa->edests[cur_node].elems[0];
3227     }
3228   return REG_NOERROR;
3229 }
3230
3231
3232 /* For all the back references in the current state, calculate the
3233    destination of the back references by the appropriate entry
3234    in MCTX->BKREF_ENTS.  */
3235
3236 static reg_errcode_t
3237 internal_function __attribute_warn_unused_result__
3238 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3239                     int cur_str, int subexp_num, int type)
3240 {
3241   const re_dfa_t *const dfa = mctx->dfa;
3242   reg_errcode_t err;
3243   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3244   struct re_backref_cache_entry *ent;
3245
3246   if (cache_idx_start == -1)
3247     return REG_NOERROR;
3248
3249  restart:
3250   ent = mctx->bkref_ents + cache_idx_start;
3251   do
3252     {
3253       int to_idx, next_node;
3254
3255       /* Is this entry ENT is appropriate?  */
3256       if (!re_node_set_contains (cur_nodes, ent->node))
3257         continue; /* No.  */
3258
3259       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3260       /* Calculate the destination of the back reference, and append it
3261          to MCTX->STATE_LOG.  */
3262       if (to_idx == cur_str)
3263         {
3264           /* The backreference did epsilon transit, we must re-check all the
3265              node in the current state.  */
3266           re_node_set new_dests;
3267           reg_errcode_t err2, err3;
3268           next_node = dfa->edests[ent->node].elems[0];
3269           if (re_node_set_contains (cur_nodes, next_node))
3270             continue;
3271           err = re_node_set_init_1 (&new_dests, next_node);
3272           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3273           err3 = re_node_set_merge (cur_nodes, &new_dests);
3274           re_node_set_free (&new_dests);
3275           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3276                   || err3 != REG_NOERROR, 0))
3277             {
3278               err = (err != REG_NOERROR ? err
3279                      : (err2 != REG_NOERROR ? err2 : err3));
3280               return err;
3281             }
3282           /* TODO: It is still inefficient...  */
3283           goto restart;
3284         }
3285       else
3286         {
3287           re_node_set union_set;
3288           next_node = dfa->nexts[ent->node];
3289           if (mctx->state_log[to_idx])
3290             {
3291               int ret;
3292               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3293                                         next_node))
3294                 continue;
3295               err = re_node_set_init_copy (&union_set,
3296                                            &mctx->state_log[to_idx]->nodes);
3297               ret = re_node_set_insert (&union_set, next_node);
3298               if (BE (err != REG_NOERROR || ret < 0, 0))
3299                 {
3300                   re_node_set_free (&union_set);
3301                   err = err != REG_NOERROR ? err : REG_ESPACE;
3302                   return err;
3303                 }
3304             }
3305           else
3306             {
3307               err = re_node_set_init_1 (&union_set, next_node);
3308               if (BE (err != REG_NOERROR, 0))
3309                 return err;
3310             }
3311           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3312           re_node_set_free (&union_set);
3313           if (BE (mctx->state_log[to_idx] == NULL
3314                   && err != REG_NOERROR, 0))
3315             return err;
3316         }
3317     }
3318   while (ent++->more);
3319   return REG_NOERROR;
3320 }
3321
3322 /* Build transition table for the state.
3323    Return 1 if succeeded, otherwise return NULL.  */
3324
3325 static int
3326 internal_function
3327 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3328 {
3329   reg_errcode_t err;
3330   int i, j, ch, need_word_trtable = 0;
3331   bitset_word_t elem, mask;
3332   bool dests_node_malloced = false;
3333   bool dest_states_malloced = false;
3334   int ndests; /* Number of the destination states from `state'.  */
3335   re_dfastate_t **trtable;
3336   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3337   re_node_set follows, *dests_node;
3338   bitset_t *dests_ch;
3339   bitset_t acceptable;
3340
3341   struct dests_alloc
3342   {
3343     re_node_set dests_node[SBC_MAX];
3344     bitset_t dests_ch[SBC_MAX];
3345   } *dests_alloc;
3346
3347   /* We build DFA states which corresponds to the destination nodes
3348      from `state'.  `dests_node[i]' represents the nodes which i-th
3349      destination state contains, and `dests_ch[i]' represents the
3350      characters which i-th destination state accepts.  */
3351   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3352     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3353   else
3354     {
3355       dests_alloc = re_malloc (struct dests_alloc, 1);
3356       if (BE (dests_alloc == NULL, 0))
3357         return 0;
3358       dests_node_malloced = true;
3359     }
3360   dests_node = dests_alloc->dests_node;
3361   dests_ch = dests_alloc->dests_ch;
3362
3363   /* Initialize transiton table.  */
3364   state->word_trtable = state->trtable = NULL;
3365
3366   /* At first, group all nodes belonging to `state' into several
3367      destinations.  */
3368   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3369   if (BE (ndests <= 0, 0))
3370     {
3371       if (dests_node_malloced)
3372         free (dests_alloc);
3373       /* Return 0 in case of an error, 1 otherwise.  */
3374       if (ndests == 0)
3375         {
3376           state->trtable = (re_dfastate_t **)
3377             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3378           if (BE (state->trtable == NULL, 0))
3379             return 0;
3380           return 1;
3381         }
3382       return 0;
3383     }
3384
3385   err = re_node_set_alloc (&follows, ndests + 1);
3386   if (BE (err != REG_NOERROR, 0))
3387     goto out_free;
3388
3389   /* Avoid arithmetic overflow in size calculation.  */
3390   if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3391             / (3 * sizeof (re_dfastate_t *)))
3392            < ndests),
3393           0))
3394     goto out_free;
3395
3396   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3397                          + ndests * 3 * sizeof (re_dfastate_t *)))
3398     dest_states = (re_dfastate_t **)
3399       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3400   else
3401     {
3402       dest_states = (re_dfastate_t **)
3403         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3404       if (BE (dest_states == NULL, 0))
3405         {
3406 out_free:
3407           if (dest_states_malloced)
3408             free (dest_states);
3409           re_node_set_free (&follows);
3410           for (i = 0; i < ndests; ++i)
3411             re_node_set_free (dests_node + i);
3412           if (dests_node_malloced)
3413             free (dests_alloc);
3414           return 0;
3415         }
3416       dest_states_malloced = true;
3417     }
3418   dest_states_word = dest_states + ndests;
3419   dest_states_nl = dest_states_word + ndests;
3420   bitset_empty (acceptable);
3421
3422   /* Then build the states for all destinations.  */
3423   for (i = 0; i < ndests; ++i)
3424     {
3425       int next_node;
3426       re_node_set_empty (&follows);
3427       /* Merge the follows of this destination states.  */
3428       for (j = 0; j < dests_node[i].nelem; ++j)
3429         {
3430           next_node = dfa->nexts[dests_node[i].elems[j]];
3431           if (next_node != -1)
3432             {
3433               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3434               if (BE (err != REG_NOERROR, 0))
3435                 goto out_free;
3436             }
3437         }
3438       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3439       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3440         goto out_free;
3441       /* If the new state has context constraint,
3442          build appropriate states for these contexts.  */
3443       if (dest_states[i]->has_constraint)
3444         {
3445           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3446                                                           CONTEXT_WORD);
3447           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3448             goto out_free;
3449
3450           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3451             need_word_trtable = 1;
3452
3453           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3454                                                         CONTEXT_NEWLINE);
3455           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3456             goto out_free;
3457         }
3458       else
3459         {
3460           dest_states_word[i] = dest_states[i];
3461           dest_states_nl[i] = dest_states[i];
3462         }
3463       bitset_merge (acceptable, dests_ch[i]);
3464     }
3465
3466   if (!BE (need_word_trtable, 0))
3467     {
3468       /* We don't care about whether the following character is a word
3469          character, or we are in a single-byte character set so we can
3470          discern by looking at the character code: allocate a
3471          256-entry transition table.  */
3472       trtable = state->trtable =
3473         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3474       if (BE (trtable == NULL, 0))
3475         goto out_free;
3476
3477       /* For all characters ch...:  */
3478       for (i = 0; i < BITSET_WORDS; ++i)
3479         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3480              elem;
3481              mask <<= 1, elem >>= 1, ++ch)
3482           if (BE (elem & 1, 0))
3483             {
3484               /* There must be exactly one destination which accepts
3485                  character ch.  See group_nodes_into_DFAstates.  */
3486               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3487                 ;
3488
3489               /* j-th destination accepts the word character ch.  */
3490               if (dfa->word_char[i] & mask)
3491                 trtable[ch] = dest_states_word[j];
3492               else
3493                 trtable[ch] = dest_states[j];
3494             }
3495     }
3496   else
3497     {
3498       /* We care about whether the following character is a word
3499          character, and we are in a multi-byte character set: discern
3500          by looking at the character code: build two 256-entry
3501          transition tables, one starting at trtable[0] and one
3502          starting at trtable[SBC_MAX].  */
3503       trtable = state->word_trtable =
3504         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3505       if (BE (trtable == NULL, 0))
3506         goto out_free;
3507
3508       /* For all characters ch...:  */
3509       for (i = 0; i < BITSET_WORDS; ++i)
3510         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3511              elem;
3512              mask <<= 1, elem >>= 1, ++ch)
3513           if (BE (elem & 1, 0))
3514             {
3515               /* There must be exactly one destination which accepts
3516                  character ch.  See group_nodes_into_DFAstates.  */
3517               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3518                 ;
3519
3520               /* j-th destination accepts the word character ch.  */
3521               trtable[ch] = dest_states[j];
3522               trtable[ch + SBC_MAX] = dest_states_word[j];
3523             }
3524     }
3525
3526   /* new line */
3527   if (bitset_contain (acceptable, NEWLINE_CHAR))
3528     {
3529       /* The current state accepts newline character.  */
3530       for (j = 0; j < ndests; ++j)
3531         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3532           {
3533             /* k-th destination accepts newline character.  */
3534             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3535             if (need_word_trtable)
3536               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3537             /* There must be only one destination which accepts
3538                newline.  See group_nodes_into_DFAstates.  */
3539             break;
3540           }
3541     }
3542
3543   if (dest_states_malloced)
3544     free (dest_states);
3545
3546   re_node_set_free (&follows);
3547   for (i = 0; i < ndests; ++i)
3548     re_node_set_free (dests_node + i);
3549
3550   if (dests_node_malloced)
3551     free (dests_alloc);
3552
3553   return 1;
3554 }
3555
3556 /* Group all nodes belonging to STATE into several destinations.
3557    Then for all destinations, set the nodes belonging to the destination
3558    to DESTS_NODE[i] and set the characters accepted by the destination
3559    to DEST_CH[i].  This function return the number of destinations.  */
3560
3561 static int
3562 internal_function
3563 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3564                             re_node_set *dests_node, bitset_t *dests_ch)
3565 {
3566   reg_errcode_t err;
3567   int result;
3568   int i, j, k;
3569   int ndests; /* Number of the destinations from `state'.  */
3570   bitset_t accepts; /* Characters a node can accept.  */
3571   const re_node_set *cur_nodes = &state->nodes;
3572   bitset_empty (accepts);
3573   ndests = 0;
3574
3575   /* For all the nodes belonging to `state',  */
3576   for (i = 0; i < cur_nodes->nelem; ++i)
3577     {
3578       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3579       re_token_type_t type = node->type;
3580       unsigned int constraint = node->constraint;
3581
3582       /* Enumerate all single byte character this node can accept.  */
3583       if (type == CHARACTER)
3584         bitset_set (accepts, node->opr.c);
3585       else if (type == SIMPLE_BRACKET)
3586         {
3587           bitset_merge (accepts, node->opr.sbcset);
3588         }
3589       else if (type == OP_PERIOD)
3590         {
3591 #ifdef RE_ENABLE_I18N
3592           if (dfa->mb_cur_max > 1)
3593             bitset_merge (accepts, dfa->sb_char);
3594           else
3595 #endif
3596             bitset_set_all (accepts);
3597           if (!(dfa->syntax & RE_DOT_NEWLINE))
3598             bitset_clear (accepts, '\n');
3599           if (dfa->syntax & RE_DOT_NOT_NULL)
3600             bitset_clear (accepts, '\0');
3601         }
3602 #ifdef RE_ENABLE_I18N
3603       else if (type == OP_UTF8_PERIOD)
3604         {
3605           memset (accepts, '\xff', sizeof (bitset_t) / 2);
3606           if (!(dfa->syntax & RE_DOT_NEWLINE))
3607             bitset_clear (accepts, '\n');
3608           if (dfa->syntax & RE_DOT_NOT_NULL)
3609             bitset_clear (accepts, '\0');
3610         }
3611 #endif
3612       else
3613         continue;
3614
3615       /* Check the `accepts' and sift the characters which are not
3616          match it the context.  */
3617       if (constraint)
3618         {
3619           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3620             {
3621               bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3622               bitset_empty (accepts);
3623               if (accepts_newline)
3624                 bitset_set (accepts, NEWLINE_CHAR);
3625               else
3626                 continue;
3627             }
3628           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3629             {
3630               bitset_empty (accepts);
3631               continue;
3632             }
3633
3634           if (constraint & NEXT_WORD_CONSTRAINT)
3635             {
3636               bitset_word_t any_set = 0;
3637               if (type == CHARACTER && !node->word_char)
3638                 {
3639                   bitset_empty (accepts);
3640                   continue;
3641                 }
3642 #ifdef RE_ENABLE_I18N
3643               if (dfa->mb_cur_max > 1)
3644                 for (j = 0; j < BITSET_WORDS; ++j)
3645                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3646               else
3647 #endif
3648                 for (j = 0; j < BITSET_WORDS; ++j)
3649                   any_set |= (accepts[j] &= dfa->word_char[j]);
3650               if (!any_set)
3651                 continue;
3652             }
3653           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3654             {
3655               bitset_word_t any_set = 0;
3656               if (type == CHARACTER && node->word_char)
3657                 {
3658                   bitset_empty (accepts);
3659                   continue;
3660                 }
3661 #ifdef RE_ENABLE_I18N
3662               if (dfa->mb_cur_max > 1)
3663                 for (j = 0; j < BITSET_WORDS; ++j)
3664                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3665               else
3666 #endif
3667                 for (j = 0; j < BITSET_WORDS; ++j)
3668                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3669               if (!any_set)
3670                 continue;
3671             }
3672         }
3673
3674       /* Then divide `accepts' into DFA states, or create a new
3675          state.  Above, we make sure that accepts is not empty.  */
3676       for (j = 0; j < ndests; ++j)
3677         {
3678           bitset_t intersec; /* Intersection sets, see below.  */
3679           bitset_t remains;
3680           /* Flags, see below.  */
3681           bitset_word_t has_intersec, not_subset, not_consumed;
3682
3683           /* Optimization, skip if this state doesn't accept the character.  */
3684           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3685             continue;
3686
3687           /* Enumerate the intersection set of this state and `accepts'.  */
3688           has_intersec = 0;
3689           for (k = 0; k < BITSET_WORDS; ++k)
3690             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3691           /* And skip if the intersection set is empty.  */
3692           if (!has_intersec)
3693             continue;
3694
3695           /* Then check if this state is a subset of `accepts'.  */
3696           not_subset = not_consumed = 0;
3697           for (k = 0; k < BITSET_WORDS; ++k)
3698             {
3699               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3700               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3701             }
3702
3703           /* If this state isn't a subset of `accepts', create a
3704              new group state, which has the `remains'. */
3705           if (not_subset)
3706             {
3707               bitset_copy (dests_ch[ndests], remains);
3708               bitset_copy (dests_ch[j], intersec);
3709               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3710               if (BE (err != REG_NOERROR, 0))
3711                 goto error_return;
3712               ++ndests;
3713             }
3714
3715           /* Put the position in the current group. */
3716           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3717           if (BE (result < 0, 0))
3718             goto error_return;
3719
3720           /* If all characters are consumed, go to next node. */
3721           if (!not_consumed)
3722             break;
3723         }
3724       /* Some characters remain, create a new group. */
3725       if (j == ndests)
3726         {
3727           bitset_copy (dests_ch[ndests], accepts);
3728           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3729           if (BE (err != REG_NOERROR, 0))
3730             goto error_return;
3731           ++ndests;
3732           bitset_empty (accepts);
3733         }
3734     }
3735   return ndests;
3736  error_return:
3737   for (j = 0; j < ndests; ++j)
3738     re_node_set_free (dests_node + j);
3739   return -1;
3740 }
3741
3742 #ifdef RE_ENABLE_I18N
3743 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3744    Return the number of the bytes the node accepts.
3745    STR_IDX is the current index of the input string.
3746
3747    This function handles the nodes which can accept one character, or
3748    one collating element like '.', '[a-z]', opposite to the other nodes
3749    can only accept one byte.  */
3750
3751 static int
3752 internal_function
3753 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3754                          const re_string_t *input, int str_idx)
3755 {
3756   const re_token_t *node = dfa->nodes + node_idx;
3757   int char_len, elem_len;
3758   int i;
3759
3760   if (BE (node->type == OP_UTF8_PERIOD, 0))
3761     {
3762       unsigned char c = re_string_byte_at (input, str_idx), d;
3763       if (BE (c < 0xc2, 1))
3764         return 0;
3765
3766       if (str_idx + 2 > input->len)
3767         return 0;
3768
3769       d = re_string_byte_at (input, str_idx + 1);
3770       if (c < 0xe0)
3771         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3772       else if (c < 0xf0)
3773         {
3774           char_len = 3;
3775           if (c == 0xe0 && d < 0xa0)
3776             return 0;
3777         }
3778       else if (c < 0xf8)
3779         {
3780           char_len = 4;
3781           if (c == 0xf0 && d < 0x90)
3782             return 0;
3783         }
3784       else if (c < 0xfc)
3785         {
3786           char_len = 5;
3787           if (c == 0xf8 && d < 0x88)
3788             return 0;
3789         }
3790       else if (c < 0xfe)
3791         {
3792           char_len = 6;
3793           if (c == 0xfc && d < 0x84)
3794             return 0;
3795         }
3796       else
3797         return 0;
3798
3799       if (str_idx + char_len > input->len)
3800         return 0;
3801
3802       for (i = 1; i < char_len; ++i)
3803         {
3804           d = re_string_byte_at (input, str_idx + i);
3805           if (d < 0x80 || d > 0xbf)
3806             return 0;
3807         }
3808       return char_len;
3809     }
3810
3811   char_len = re_string_char_size_at (input, str_idx);
3812   if (node->type == OP_PERIOD)
3813     {
3814       if (char_len <= 1)
3815         return 0;
3816       /* FIXME: I don't think this if is needed, as both '\n'
3817          and '\0' are char_len == 1.  */
3818       /* '.' accepts any one character except the following two cases.  */
3819       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3820            re_string_byte_at (input, str_idx) == '\n') ||
3821           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3822            re_string_byte_at (input, str_idx) == '\0'))
3823         return 0;
3824       return char_len;
3825     }
3826
3827   elem_len = re_string_elem_size_at (input, str_idx);
3828   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3829     return 0;
3830
3831   if (node->type == COMPLEX_BRACKET)
3832     {
3833       const re_charset_t *cset = node->opr.mbcset;
3834 # ifdef _LIBC
3835       const unsigned char *pin
3836         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3837       int j;
3838       uint32_t nrules;
3839 # endif /* _LIBC */
3840       int match_len = 0;
3841       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3842                     ? re_string_wchar_at (input, str_idx) : 0);
3843
3844       /* match with multibyte character?  */
3845       for (i = 0; i < cset->nmbchars; ++i)
3846         if (wc == cset->mbchars[i])
3847           {
3848             match_len = char_len;
3849             goto check_node_accept_bytes_match;
3850           }
3851       /* match with character_class?  */
3852       for (i = 0; i < cset->nchar_classes; ++i)
3853         {
3854           wctype_t wt = cset->char_classes[i];
3855           if (__iswctype (wc, wt))
3856             {
3857               match_len = char_len;
3858               goto check_node_accept_bytes_match;
3859             }
3860         }
3861
3862 # ifdef _LIBC
3863       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3864       if (nrules != 0)
3865         {
3866           unsigned int in_collseq = 0;
3867           const int32_t *table, *indirect;
3868           const unsigned char *weights, *extra;
3869           const char *collseqwc;
3870           /* This #include defines a local function!  */
3871 #  include <locale/weight.h>
3872
3873           /* match with collating_symbol?  */
3874           if (cset->ncoll_syms)
3875             extra = (const unsigned char *)
3876               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3877           for (i = 0; i < cset->ncoll_syms; ++i)
3878             {
3879               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3880               /* Compare the length of input collating element and
3881                  the length of current collating element.  */
3882               if (*coll_sym != elem_len)
3883                 continue;
3884               /* Compare each bytes.  */
3885               for (j = 0; j < *coll_sym; j++)
3886                 if (pin[j] != coll_sym[1 + j])
3887                   break;
3888               if (j == *coll_sym)
3889                 {
3890                   /* Match if every bytes is equal.  */
3891                   match_len = j;
3892                   goto check_node_accept_bytes_match;
3893                 }
3894             }
3895
3896           if (cset->nranges)
3897             {
3898               if (elem_len <= char_len)
3899                 {
3900                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3901                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3902                 }
3903               else
3904                 in_collseq = find_collation_sequence_value (pin, elem_len);
3905             }
3906           /* match with range expression?  */
3907           for (i = 0; i < cset->nranges; ++i)
3908             if (cset->range_starts[i] <= in_collseq
3909                 && in_collseq <= cset->range_ends[i])
3910               {
3911                 match_len = elem_len;
3912                 goto check_node_accept_bytes_match;
3913               }
3914
3915           /* match with equivalence_class?  */
3916           if (cset->nequiv_classes)
3917             {
3918               const unsigned char *cp = pin;
3919               table = (const int32_t *)
3920                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3921               weights = (const unsigned char *)
3922                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3923               extra = (const unsigned char *)
3924                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3925               indirect = (const int32_t *)
3926                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3927               int32_t idx = findidx (&cp, elem_len);
3928               if (idx > 0)
3929                 for (i = 0; i < cset->nequiv_classes; ++i)
3930                   {
3931                     int32_t equiv_class_idx = cset->equiv_classes[i];
3932                     size_t weight_len = weights[idx & 0xffffff];
3933                     if (weight_len == weights[equiv_class_idx & 0xffffff]
3934                         && (idx >> 24) == (equiv_class_idx >> 24))
3935                       {
3936                         int cnt = 0;
3937
3938                         idx &= 0xffffff;
3939                         equiv_class_idx &= 0xffffff;
3940
3941                         while (cnt <= weight_len
3942                                && (weights[equiv_class_idx + 1 + cnt]
3943                                    == weights[idx + 1 + cnt]))
3944                           ++cnt;
3945                         if (cnt > weight_len)
3946                           {
3947                             match_len = elem_len;
3948                             goto check_node_accept_bytes_match;
3949                           }
3950                       }
3951                   }
3952             }
3953         }
3954       else
3955 # endif /* _LIBC */
3956         {
3957           /* match with range expression?  */
3958 #if __GNUC__ >= 2
3959           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3960 #else
3961           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3962           cmp_buf[2] = wc;
3963 #endif
3964           for (i = 0; i < cset->nranges; ++i)
3965             {
3966               cmp_buf[0] = cset->range_starts[i];
3967               cmp_buf[4] = cset->range_ends[i];
3968               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3969                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3970                 {
3971                   match_len = char_len;
3972                   goto check_node_accept_bytes_match;
3973                 }
3974             }
3975         }
3976     check_node_accept_bytes_match:
3977       if (!cset->non_match)
3978         return match_len;
3979       else
3980         {
3981           if (match_len > 0)
3982             return 0;
3983           else
3984             return (elem_len > char_len) ? elem_len : char_len;
3985         }
3986     }
3987   return 0;
3988 }
3989
3990 # ifdef _LIBC
3991 static unsigned int
3992 internal_function
3993 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3994 {
3995   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3996   if (nrules == 0)
3997     {
3998       if (mbs_len == 1)
3999         {
4000           /* No valid character.  Match it as a single byte character.  */
4001           const unsigned char *collseq = (const unsigned char *)
4002             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4003           return collseq[mbs[0]];
4004         }
4005       return UINT_MAX;
4006     }
4007   else
4008     {
4009       int32_t idx;
4010       const unsigned char *extra = (const unsigned char *)
4011         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4012       int32_t extrasize = (const unsigned char *)
4013         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4014
4015       for (idx = 0; idx < extrasize;)
4016         {
4017           int mbs_cnt, found = 0;
4018           int32_t elem_mbs_len;
4019           /* Skip the name of collating element name.  */
4020           idx = idx + extra[idx] + 1;
4021           elem_mbs_len = extra[idx++];
4022           if (mbs_len == elem_mbs_len)
4023             {
4024               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4025                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4026                   break;
4027               if (mbs_cnt == elem_mbs_len)
4028                 /* Found the entry.  */
4029                 found = 1;
4030             }
4031           /* Skip the byte sequence of the collating element.  */
4032           idx += elem_mbs_len;
4033           /* Adjust for the alignment.  */
4034           idx = (idx + 3) & ~3;
4035           /* Skip the collation sequence value.  */
4036           idx += sizeof (uint32_t);
4037           /* Skip the wide char sequence of the collating element.  */
4038           idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4039           /* If we found the entry, return the sequence value.  */
4040           if (found)
4041             return *(uint32_t *) (extra + idx);
4042           /* Skip the collation sequence value.  */
4043           idx += sizeof (uint32_t);
4044         }
4045       return UINT_MAX;
4046     }
4047 }
4048 # endif /* _LIBC */
4049 #endif /* RE_ENABLE_I18N */
4050
4051 /* Check whether the node accepts the byte which is IDX-th
4052    byte of the INPUT.  */
4053
4054 static int
4055 internal_function
4056 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4057                    int idx)
4058 {
4059   unsigned char ch;
4060   ch = re_string_byte_at (&mctx->input, idx);
4061   switch (node->type)
4062     {
4063     case CHARACTER:
4064       if (node->opr.c != ch)
4065         return 0;
4066       break;
4067
4068     case SIMPLE_BRACKET:
4069       if (!bitset_contain (node->opr.sbcset, ch))
4070         return 0;
4071       break;
4072
4073 #ifdef RE_ENABLE_I18N
4074     case OP_UTF8_PERIOD:
4075       if (ch >= 0x80)
4076         return 0;
4077       /* FALLTHROUGH */
4078 #endif
4079     case OP_PERIOD:
4080       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4081           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4082         return 0;
4083       break;
4084
4085     default:
4086       return 0;
4087     }
4088
4089   if (node->constraint)
4090     {
4091       /* The node has constraints.  Check whether the current context
4092          satisfies the constraints.  */
4093       unsigned int context = re_string_context_at (&mctx->input, idx,
4094                                                    mctx->eflags);
4095       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4096         return 0;
4097     }
4098
4099   return 1;
4100 }
4101
4102 /* Extend the buffers, if the buffers have run out.  */
4103
4104 static reg_errcode_t
4105 internal_function __attribute_warn_unused_result__
4106 extend_buffers (re_match_context_t *mctx)
4107 {
4108   reg_errcode_t ret;
4109   re_string_t *pstr = &mctx->input;
4110
4111   /* Avoid overflow.  */
4112   if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4113     return REG_ESPACE;
4114
4115   /* Double the lengthes of the buffers.  */
4116   ret = re_string_realloc_buffers (pstr, MIN (pstr->len, pstr->bufs_len * 2));
4117   if (BE (ret != REG_NOERROR, 0))
4118     return ret;
4119
4120   if (mctx->state_log != NULL)
4121     {
4122       /* And double the length of state_log.  */
4123       /* XXX We have no indication of the size of this buffer.  If this
4124          allocation fail we have no indication that the state_log array
4125          does not have the right size.  */
4126       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4127                                               pstr->bufs_len + 1);
4128       if (BE (new_array == NULL, 0))
4129         return REG_ESPACE;
4130       mctx->state_log = new_array;
4131     }
4132
4133   /* Then reconstruct the buffers.  */
4134   if (pstr->icase)
4135     {
4136 #ifdef RE_ENABLE_I18N
4137       if (pstr->mb_cur_max > 1)
4138         {
4139           ret = build_wcs_upper_buffer (pstr);
4140           if (BE (ret != REG_NOERROR, 0))
4141             return ret;
4142         }
4143       else
4144 #endif /* RE_ENABLE_I18N  */
4145         build_upper_buffer (pstr);
4146     }
4147   else
4148     {
4149 #ifdef RE_ENABLE_I18N
4150       if (pstr->mb_cur_max > 1)
4151         build_wcs_buffer (pstr);
4152       else
4153 #endif /* RE_ENABLE_I18N  */
4154         {
4155           if (pstr->trans != NULL)
4156             re_string_translate_buffer (pstr);
4157         }
4158     }
4159   return REG_NOERROR;
4160 }
4161
4162 \f
4163 /* Functions for matching context.  */
4164
4165 /* Initialize MCTX.  */
4166
4167 static reg_errcode_t
4168 internal_function __attribute_warn_unused_result__
4169 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4170 {
4171   mctx->eflags = eflags;
4172   mctx->match_last = -1;
4173   if (n > 0)
4174     {
4175       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4176       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4177       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4178         return REG_ESPACE;
4179     }
4180   /* Already zero-ed by the caller.
4181      else
4182        mctx->bkref_ents = NULL;
4183      mctx->nbkref_ents = 0;
4184      mctx->nsub_tops = 0;  */
4185   mctx->abkref_ents = n;
4186   mctx->max_mb_elem_len = 1;
4187   mctx->asub_tops = n;
4188   return REG_NOERROR;
4189 }
4190
4191 /* Clean the entries which depend on the current input in MCTX.
4192    This function must be invoked when the matcher changes the start index
4193    of the input, or changes the input string.  */
4194
4195 static void
4196 internal_function
4197 match_ctx_clean (re_match_context_t *mctx)
4198 {
4199   int st_idx;
4200   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4201     {
4202       int sl_idx;
4203       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4204       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4205         {
4206           re_sub_match_last_t *last = top->lasts[sl_idx];
4207           re_free (last->path.array);
4208           re_free (last);
4209         }
4210       re_free (top->lasts);
4211       if (top->path)
4212         {
4213           re_free (top->path->array);
4214           re_free (top->path);
4215         }
4216       free (top);
4217     }
4218
4219   mctx->nsub_tops = 0;
4220   mctx->nbkref_ents = 0;
4221 }
4222
4223 /* Free all the memory associated with MCTX.  */
4224
4225 static void
4226 internal_function
4227 match_ctx_free (re_match_context_t *mctx)
4228 {
4229   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4230   match_ctx_clean (mctx);
4231   re_free (mctx->sub_tops);
4232   re_free (mctx->bkref_ents);
4233 }
4234
4235 /* Add a new backreference entry to MCTX.
4236    Note that we assume that caller never call this function with duplicate
4237    entry, and call with STR_IDX which isn't smaller than any existing entry.
4238 */
4239
4240 static reg_errcode_t
4241 internal_function __attribute_warn_unused_result__
4242 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4243                      int to)
4244 {
4245   if (mctx->nbkref_ents >= mctx->abkref_ents)
4246     {
4247       struct re_backref_cache_entry* new_entry;
4248       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4249                               mctx->abkref_ents * 2);
4250       if (BE (new_entry == NULL, 0))
4251         {
4252           re_free (mctx->bkref_ents);
4253           return REG_ESPACE;
4254         }
4255       mctx->bkref_ents = new_entry;
4256       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4257               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4258       mctx->abkref_ents *= 2;
4259     }
4260   if (mctx->nbkref_ents > 0
4261       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4262     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4263
4264   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4265   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4266   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4267   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4268
4269   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4270      If bit N is clear, means that this entry won't epsilon-transition to
4271      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4272      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4273      such node.
4274
4275      A backreference does not epsilon-transition unless it is empty, so set
4276      to all zeros if FROM != TO.  */
4277   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4278     = (from == to ? ~0 : 0);
4279
4280   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4281   if (mctx->max_mb_elem_len < to - from)
4282     mctx->max_mb_elem_len = to - from;
4283   return REG_NOERROR;
4284 }
4285
4286 /* Search for the first entry which has the same str_idx, or -1 if none is
4287    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4288
4289 static int
4290 internal_function
4291 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4292 {
4293   int left, right, mid, last;
4294   last = right = mctx->nbkref_ents;
4295   for (left = 0; left < right;)
4296     {
4297       mid = (left + right) / 2;
4298       if (mctx->bkref_ents[mid].str_idx < str_idx)
4299         left = mid + 1;
4300       else
4301         right = mid;
4302     }
4303   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4304     return left;
4305   else
4306     return -1;
4307 }
4308
4309 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4310    at STR_IDX.  */
4311
4312 static reg_errcode_t
4313 internal_function __attribute_warn_unused_result__
4314 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4315 {
4316 #ifdef DEBUG
4317   assert (mctx->sub_tops != NULL);
4318   assert (mctx->asub_tops > 0);
4319 #endif
4320   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4321     {
4322       int new_asub_tops = mctx->asub_tops * 2;
4323       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4324                                                    re_sub_match_top_t *,
4325                                                    new_asub_tops);
4326       if (BE (new_array == NULL, 0))
4327         return REG_ESPACE;
4328       mctx->sub_tops = new_array;
4329       mctx->asub_tops = new_asub_tops;
4330     }
4331   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4332   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4333     return REG_ESPACE;
4334   mctx->sub_tops[mctx->nsub_tops]->node = node;
4335   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4336   return REG_NOERROR;
4337 }
4338
4339 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4340    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4341
4342 static re_sub_match_last_t *
4343 internal_function
4344 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4345 {
4346   re_sub_match_last_t *new_entry;
4347   if (BE (subtop->nlasts == subtop->alasts, 0))
4348     {
4349       int new_alasts = 2 * subtop->alasts + 1;
4350       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4351                                                     re_sub_match_last_t *,
4352                                                     new_alasts);
4353       if (BE (new_array == NULL, 0))
4354         return NULL;
4355       subtop->lasts = new_array;
4356       subtop->alasts = new_alasts;
4357     }
4358   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4359   if (BE (new_entry != NULL, 1))
4360     {
4361       subtop->lasts[subtop->nlasts] = new_entry;
4362       new_entry->node = node;
4363       new_entry->str_idx = str_idx;
4364       ++subtop->nlasts;
4365     }
4366   return new_entry;
4367 }
4368
4369 static void
4370 internal_function
4371 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4372                re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4373 {
4374   sctx->sifted_states = sifted_sts;
4375   sctx->limited_states = limited_sts;
4376   sctx->last_node = last_node;
4377   sctx->last_str_idx = last_str_idx;
4378   re_node_set_init_empty (&sctx->limits);
4379 }