posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                           int str_idx, int from, int to)
  27      internal_function;
  28 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
  29      internal_function;
  30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                            int str_idx) internal_function;
  32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                    int node, int str_idx)
  34      internal_function;
  35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                            re_dfastate_t **limited_sts, int last_node,
  37                            int last_str_idx)
  38      internal_function;
  39 static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                          const char *string, int length,
  41                                          int start, int range, int stop,
  42                                          size_t nmatch, regmatch_t pmatch[],
  43                                          int eflags) internal_function;
  44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  45                              const char *string1, int length1,
  46                              const char *string2, int length2,
  47                              int start, int range, struct re_registers *regs,
  48                              int stop, int ret_len) internal_function;
  49 static int re_search_stub (struct re_pattern_buffer *bufp,
  50                            const char *string, int length, int start,
  51                            int range, int stop, struct re_registers *regs,
  52                            int ret_len) internal_function;
  53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  54                               int nregs, int regs_allocated) internal_function;
  55 static inline re_dfastate_t *acquire_init_state_context
  56      (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
  57      __attribute ((always_inline)) internal_function;
  58 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  59      internal_function;
  60 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  61                            int *p_match_first)
  62      internal_function;
  63 static int check_halt_node_context (const re_dfa_t *dfa, int node,
  64                                     unsigned int context) internal_function;
  65 static int check_halt_state_context (const re_match_context_t *mctx,
  66                                      const re_dfastate_t *state, int idx)
  67      internal_function;
  68 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
  69                          regmatch_t *prev_idx_match, int cur_node,
  70                          int cur_idx, int nmatch) internal_function;
  71 static int proceed_next_node (const re_match_context_t *mctx,
  72                               int nregs, regmatch_t *regs,
  73                               int *pidx, int node, re_node_set *eps_via_nodes,
  74                               struct re_fail_stack_t *fs) internal_function;
  75 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  76                                       int str_idx, int dest_node, int nregs,
  77                                       regmatch_t *regs,
  78                                       re_node_set *eps_via_nodes) internal_function;
  79 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
  80                            regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
  81 static reg_errcode_t set_regs (const regex_t *preg,
  82                                const re_match_context_t *mctx,
  83                                size_t nmatch, regmatch_t *pmatch,
  84                                int fl_backtrack) internal_function;
  85 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
  86
  87 #ifdef RE_ENABLE_I18N
  88 static int sift_states_iter_mb (const re_match_context_t *mctx,
  89                                 re_sift_context_t *sctx,
  90                                 int node_idx, int str_idx, int max_str_idx) internal_function;
  91 #endif /* RE_ENABLE_I18N */
  92 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
  93                                            re_sift_context_t *sctx) internal_function;
  94 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
  95                                           re_sift_context_t *sctx, int str_idx,
  96                                           re_node_set *cur_dest) internal_function;
  97 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
  98                                               re_sift_context_t *sctx,
  99                                               int str_idx,
 100                                               re_node_set *dest_nodes) internal_function;
 101 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
 102                                             re_node_set *dest_nodes,
 103                                             const re_node_set *candidates) internal_function;
 104 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
 105                                             re_node_set *dest_nodes,
 106                                             const re_node_set *and_nodes) internal_function;
 107 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
 108                              int dst_node, int dst_idx, int src_node,
 109                              int src_idx) internal_function;
 110 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
 111                                         int boundaries, int subexp_idx,
 112                                         int from_node, int bkref_idx) internal_function;
 113 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
 114                                       int limit, int subexp_idx,
 115                                       int node, int str_idx,
 116                                       int bkref_idx) internal_function;
 117 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
 118                                           re_node_set *dest_nodes,
 119                                           const re_node_set *candidates,
 120                                           re_node_set *limits,
 121                                           struct re_backref_cache_entry *bkref_ents,
 122                                           int str_idx) internal_function;
 123 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
 124                                         re_sift_context_t *sctx,
 125                                         int str_idx, const re_node_set *candidates) internal_function;
 126 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
 127                                                 int next_state_log_idx) internal_function;
 128 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
 129                                         re_dfastate_t **src, int num) internal_function;
 130 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 131                                          re_match_context_t *mctx) internal_function;
 132 static re_dfastate_t *transit_state (reg_errcode_t *err,
 133                                      re_match_context_t *mctx,
 134                                      re_dfastate_t *state) internal_function;
 135 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 136                                             re_match_context_t *mctx,
 137                                             re_dfastate_t *next_state) internal_function;
 138 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 139                                                 re_node_set *cur_nodes,
 140                                                 int str_idx) internal_function;
 141 #if 0
 142 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 143                                         re_match_context_t *mctx,
 144                                         re_dfastate_t *pstate) internal_function;
 145 #endif
 146 #ifdef RE_ENABLE_I18N
 147 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 148                                        re_dfastate_t *pstate) internal_function;
 149 #endif /* RE_ENABLE_I18N */
 150 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 151                                           const re_node_set *nodes) internal_function;
 152 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 153                                  int bkref_node, int bkref_str_idx) internal_function;
 154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 155                                      const re_sub_match_top_t *sub_top,
 156                                      re_sub_match_last_t *sub_last,
 157                                      int bkref_node, int bkref_str) internal_function;
 158 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 159                              int subexp_idx, int type) internal_function;
 160 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 161                                     state_array_t *path, int top_node,
 162                                     int top_str, int last_node, int last_str,
 163                                     int type) internal_function;
 164 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 165                                                    int str_idx,
 166                                                    re_node_set *cur_nodes,
 167                                                    re_node_set *next_nodes) internal_function;
 168 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
 169                                                re_node_set *cur_nodes,
 170                                                int ex_subexp, int type) internal_function;
 171 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
 172                                                    re_node_set *dst_nodes,
 173                                                    int target, int ex_subexp,
 174                                                    int type) internal_function;
 175 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 176                                          re_node_set *cur_nodes, int cur_str,
 177                                          int subexp_num, int type) internal_function;
 178 static int build_trtable (re_dfa_t *dfa,
 179                           re_dfastate_t *state) internal_function;
 180 #ifdef RE_ENABLE_I18N
 181 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
 182                                     const re_string_t *input, int idx) internal_function;
 183 # ifdef _LIBC
 184 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 185                                                    size_t name_len) internal_function;
 186 # endif /* _LIBC */
 187 #endif /* RE_ENABLE_I18N */
 188 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
 189                                        const re_dfastate_t *state,
 190                                        re_node_set *states_node,
 191                                        bitset *states_ch) internal_function;
 192 static int check_node_accept (const re_match_context_t *mctx,
 193                               const re_token_t *node, int idx) internal_function;
 194 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
 195 \f
 196 /* Entry point for POSIX code.  */
 197
 198 /* regexec searches for a given pattern, specified by PREG, in the
 199    string STRING.
 200
 201    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 202    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 203    least NMATCH elements, and we set them to the offsets of the
 204    corresponding matched substrings.
 205
 206    EFLAGS specifies `execution flags' which affect matching: if
 207    REG_NOTBOL is set, then ^ does not match at the beginning of the
 208    string; if REG_NOTEOL is set, then $ does not match at the end.
 209
 210    We return 0 if we find a match and REG_NOMATCH if not.  */
 211
 212 int
 213 regexec (preg, string, nmatch, pmatch, eflags)
 214     const regex_t *__restrict preg;
 215     const char *__restrict string;
 216     size_t nmatch;
 217     regmatch_t pmatch[];
 218     int eflags;
 219 {
 220   reg_errcode_t err;
 221   int start, length;
 222   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 223
 224   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 225     return REG_BADPAT;
 226
 227   if (eflags & REG_STARTEND)
 228     {
 229       start = pmatch[0].rm_so;
 230       length = pmatch[0].rm_eo;
 231     }
 232   else
 233     {
 234       start = 0;
 235       length = strlen (string);
 236     }
 237
 238   __libc_lock_lock (dfa->lock);
 239   if (preg->no_sub)
 240     err = re_search_internal (preg, string, length, start, length - start,
 241                               length, 0, NULL, eflags);
 242   else
 243     err = re_search_internal (preg, string, length, start, length - start,
 244                               length, nmatch, pmatch, eflags);
 245   __libc_lock_unlock (dfa->lock);
 246   return err != REG_NOERROR;
 247 }
 248
 249 #ifdef _LIBC
 250 # include <shlib-compat.h>
 251 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 252
 253 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 254 __typeof__ (__regexec) __compat_regexec;
 255
 256 int
 257 attribute_compat_text_section
 258 __compat_regexec (const regex_t *__restrict preg,
 259                   const char *__restrict string, size_t nmatch,
 260                   regmatch_t pmatch[], int eflags)
 261 {
 262   return regexec (preg, string, nmatch, pmatch,
 263                   eflags & (REG_NOTBOL | REG_NOTEOL));
 264 }
 265 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 266 # endif
 267 #endif
 268
 269 /* Entry points for GNU code.  */
 270
 271 /* re_match, re_search, re_match_2, re_search_2
 272
 273    The former two functions operate on STRING with length LENGTH,
 274    while the later two operate on concatenation of STRING1 and STRING2
 275    with lengths LENGTH1 and LENGTH2, respectively.
 276
 277    re_match() matches the compiled pattern in BUFP against the string,
 278    starting at index START.
 279
 280    re_search() first tries matching at index START, then it tries to match
 281    starting from index START + 1, and so on.  The last start position tried
 282    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 283    way as re_match().)
 284
 285    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 286    the first STOP characters of the concatenation of the strings should be
 287    concerned.
 288
 289    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 290    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 291    computed relative to the concatenation, not relative to the individual
 292    strings.)
 293
 294    On success, re_match* functions return the length of the match, re_search*
 295    return the position of the start of the match.  Return value -1 means no
 296    match was found and -2 indicates an internal error.  */
 297
 298 int
 299 re_match (bufp, string, length, start, regs)
 300     struct re_pattern_buffer *bufp;
 301     const char *string;
 302     int length, start;
 303     struct re_registers *regs;
 304 {
 305   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 306 }
 307 #ifdef _LIBC
 308 weak_alias (__re_match, re_match)
 309 #endif
 310
 311 int
 312 re_search (bufp, string, length, start, range, regs)
 313     struct re_pattern_buffer *bufp;
 314     const char *string;
 315     int length, start, range;
 316     struct re_registers *regs;
 317 {
 318   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 319 }
 320 #ifdef _LIBC
 321 weak_alias (__re_search, re_search)
 322 #endif
 323
 324 int
 325 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 326     struct re_pattern_buffer *bufp;
 327     const char *string1, *string2;
 328     int length1, length2, start, stop;
 329     struct re_registers *regs;
 330 {
 331   return re_search_2_stub (bufp, string1, length1, string2, length2,
 332                            start, 0, regs, stop, 1);
 333 }
 334 #ifdef _LIBC
 335 weak_alias (__re_match_2, re_match_2)
 336 #endif
 337
 338 int
 339 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 340     struct re_pattern_buffer *bufp;
 341     const char *string1, *string2;
 342     int length1, length2, start, range, stop;
 343     struct re_registers *regs;
 344 {
 345   return re_search_2_stub (bufp, string1, length1, string2, length2,
 346                            start, range, regs, stop, 0);
 347 }
 348 #ifdef _LIBC
 349 weak_alias (__re_search_2, re_search_2)
 350 #endif
 351
 352 static int
 353 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 354                   stop, ret_len)
 355     struct re_pattern_buffer *bufp;
 356     const char *string1, *string2;
 357     int length1, length2, start, range, stop, ret_len;
 358     struct re_registers *regs;
 359 {
 360   const char *str;
 361   int rval;
 362   int len = length1 + length2;
 363   int free_str = 0;
 364
 365   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 366     return -2;
 367
 368   /* Concatenate the strings.  */
 369   if (length2 > 0)
 370     if (length1 > 0)
 371       {
 372         char *s = re_malloc (char, len);
 373
 374         if (BE (s == NULL, 0))
 375           return -2;
 376         memcpy (s, string1, length1);
 377         memcpy (s + length1, string2, length2);
 378         str = s;
 379         free_str = 1;
 380       }
 381     else
 382       str = string2;
 383   else
 384     str = string1;
 385
 386   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 387                          ret_len);
 388   if (free_str)
 389     re_free ((char *) str);
 390   return rval;
 391 }
 392
 393 /* The parameters have the same meaning as those of re_search.
 394    Additional parameters:
 395    If RET_LEN is nonzero the length of the match is returned (re_match style);
 396    otherwise the position of the match is returned.  */
 397
 398 static int
 399 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 400     struct re_pattern_buffer *bufp;
 401     const char *string;
 402     int length, start, range, stop, ret_len;
 403     struct re_registers *regs;
 404 {
 405   reg_errcode_t result;
 406   regmatch_t *pmatch;
 407   int nregs, rval;
 408   int eflags = 0;
 409   re_dfa_t *dfa = (re_dfa_t *)bufp->buffer;
 410
 411   /* Check for out-of-range.  */
 412   if (BE (start < 0 || start > length, 0))
 413     return -1;
 414   if (BE (start + range > length, 0))
 415     range = length - start;
 416   else if (BE (start + range < 0, 0))
 417     range = -start;
 418
 419   __libc_lock_lock (dfa->lock);
 420
 421   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 422   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 423
 424   /* Compile fastmap if we haven't yet.  */
 425   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 426     re_compile_fastmap (bufp);
 427
 428   if (BE (bufp->no_sub, 0))
 429     regs = NULL;
 430
 431   /* We need at least 1 register.  */
 432   if (regs == NULL)
 433     nregs = 1;
 434   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 435                regs->num_regs < bufp->re_nsub + 1, 0))
 436     {
 437       nregs = regs->num_regs;
 438       if (BE (nregs < 1, 0))
 439         {
 440           /* Nothing can be copied to regs.  */
 441           regs = NULL;
 442           nregs = 1;
 443         }
 444     }
 445   else
 446     nregs = bufp->re_nsub + 1;
 447   pmatch = re_malloc (regmatch_t, nregs);
 448   if (BE (pmatch == NULL, 0))
 449     {
 450       rval = -2;
 451       goto out;
 452     }
 453
 454   result = re_search_internal (bufp, string, length, start, range, stop,
 455                                nregs, pmatch, eflags);
 456
 457   rval = 0;
 458
 459   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 460   if (result != REG_NOERROR)
 461     rval = -1;
 462   else if (regs != NULL)
 463     {
 464       /* If caller wants register contents data back, copy them.  */
 465       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 466                                            bufp->regs_allocated);
 467       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 468         rval = -2;
 469     }
 470
 471   if (BE (rval == 0, 1))
 472     {
 473       if (ret_len)
 474         {
 475           assert (pmatch[0].rm_so == start);
 476           rval = pmatch[0].rm_eo - start;
 477         }
 478       else
 479         rval = pmatch[0].rm_so;
 480     }
 481   re_free (pmatch);
 482  out:
 483   __libc_lock_unlock (dfa->lock);
 484   return rval;
 485 }
 486
 487 static unsigned
 488 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 489     struct re_registers *regs;
 490     regmatch_t *pmatch;
 491     int nregs, regs_allocated;
 492 {
 493   int rval = REGS_REALLOCATE;
 494   int i;
 495   int need_regs = nregs + 1;
 496   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 497      uses.  */
 498
 499   /* Have the register data arrays been allocated?  */
 500   if (regs_allocated == REGS_UNALLOCATED)
 501     { /* No.  So allocate them with malloc.  */
 502       regs->start = re_malloc (regoff_t, need_regs);
 503       regs->end = re_malloc (regoff_t, need_regs);
 504       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
 505         return REGS_UNALLOCATED;
 506       regs->num_regs = need_regs;
 507     }
 508   else if (regs_allocated == REGS_REALLOCATE)
 509     { /* Yes.  If we need more elements than were already
 510          allocated, reallocate them.  If we need fewer, just
 511          leave it alone.  */
 512       if (BE (need_regs > regs->num_regs, 0))
 513         {
 514           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 515           regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
 516           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 517             return REGS_UNALLOCATED;
 518           regs->start = new_start;
 519           regs->end = new_end;
 520           regs->num_regs = need_regs;
 521         }
 522     }
 523   else
 524     {
 525       assert (regs_allocated == REGS_FIXED);
 526       /* This function may not be called with REGS_FIXED and nregs too big.  */
 527       assert (regs->num_regs >= nregs);
 528       rval = REGS_FIXED;
 529     }
 530
 531   /* Copy the regs.  */
 532   for (i = 0; i < nregs; ++i)
 533     {
 534       regs->start[i] = pmatch[i].rm_so;
 535       regs->end[i] = pmatch[i].rm_eo;
 536     }
 537   for ( ; i < regs->num_regs; ++i)
 538     regs->start[i] = regs->end[i] = -1;
 539
 540   return rval;
 541 }
 542
 543 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 544    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 545    this memory for recording register information.  STARTS and ENDS
 546    must be allocated using the malloc library routine, and must each
 547    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 548
 549    If NUM_REGS == 0, then subsequent matches should allocate their own
 550    register data.
 551
 552    Unless this function is called, the first search or match using
 553    PATTERN_BUFFER will allocate its own register data, without
 554    freeing the old data.  */
 555
 556 void
 557 re_set_registers (bufp, regs, num_regs, starts, ends)
 558     struct re_pattern_buffer *bufp;
 559     struct re_registers *regs;
 560     unsigned num_regs;
 561     regoff_t *starts, *ends;
 562 {
 563   if (num_regs)
 564     {
 565       bufp->regs_allocated = REGS_REALLOCATE;
 566       regs->num_regs = num_regs;
 567       regs->start = starts;
 568       regs->end = ends;
 569     }
 570   else
 571     {
 572       bufp->regs_allocated = REGS_UNALLOCATED;
 573       regs->num_regs = 0;
 574       regs->start = regs->end = (regoff_t *) 0;
 575     }
 576 }
 577 #ifdef _LIBC
 578 weak_alias (__re_set_registers, re_set_registers)
 579 #endif
 580 \f
 581 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 582    them unless specifically requested.  */
 583
 584 #if defined _REGEX_RE_COMP || defined _LIBC
 585 int
 586 # ifdef _LIBC
 587 weak_function
 588 # endif
 589 re_exec (s)
 590      const char *s;
 591 {
 592   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 593 }
 594 #endif /* _REGEX_RE_COMP */
 595 \f
 596 /* Internal entry point.  */
 597
 598 /* Searches for a compiled pattern PREG in the string STRING, whose
 599    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 600    mingings with regexec.  START, and RANGE have the same meanings
 601    with re_search.
 602    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 603    otherwise return the error code.
 604    Note: We assume front end functions already check ranges.
 605    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 606
 607 static reg_errcode_t
 608 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 609                     eflags)
 610     const regex_t *preg;
 611     const char *string;
 612     int length, start, range, stop, eflags;
 613     size_t nmatch;
 614     regmatch_t pmatch[];
 615 {
 616   reg_errcode_t err;
 617   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 618   int left_lim, right_lim, incr;
 619   int fl_longest_match, match_first, match_kind, match_last = -1;
 620   int extra_nmatch;
 621   int sb, ch;
 622 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 623   re_match_context_t mctx = { .dfa = dfa };
 624 #else
 625   re_match_context_t mctx;
 626 #endif
 627   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 628                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 629   unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
 630
 631 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 632   memset (&mctx, '\0', sizeof (re_match_context_t));
 633   mctx.dfa = dfa;
 634 #endif
 635
 636   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 637   nmatch -= extra_nmatch;
 638
 639   /* Check if the DFA haven't been compiled.  */
 640   if (BE (preg->used == 0 || dfa->init_state == NULL
 641           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 642           || dfa->init_state_begbuf == NULL, 0))
 643     return REG_NOMATCH;
 644
 645 #ifdef DEBUG
 646   /* We assume front-end functions already check them.  */
 647   assert (start + range >= 0 && start + range <= length);
 648 #endif
 649
 650   /* If initial states with non-begbuf contexts have no elements,
 651      the regex must be anchored.  If preg->newline_anchor is set,
 652      we'll never use init_state_nl, so do not check it.  */
 653   if (dfa->init_state->nodes.nelem == 0
 654       && dfa->init_state_word->nodes.nelem == 0
 655       && (dfa->init_state_nl->nodes.nelem == 0
 656           || !preg->newline_anchor))
 657     {
 658       if (start != 0 && start + range != 0)
 659         return REG_NOMATCH;
 660       start = range = 0;
 661     }
 662
 663   /* We must check the longest matching, if nmatch > 0.  */
 664   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 665
 666   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 667                             preg->translate, preg->syntax & RE_ICASE, dfa);
 668   if (BE (err != REG_NOERROR, 0))
 669     goto free_return;
 670   mctx.input.stop = stop;
 671   mctx.input.raw_stop = stop;
 672   mctx.input.newline_anchor = preg->newline_anchor;
 673
 674   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 675   if (BE (err != REG_NOERROR, 0))
 676     goto free_return;
 677
 678   /* We will log all the DFA states through which the dfa pass,
 679      if nmatch > 1, or this dfa has "multibyte node", which is a
 680      back-reference or a node which can accept multibyte character or
 681      multi character collating element.  */
 682   if (nmatch > 1 || dfa->has_mb_node)
 683     {
 684       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 685       if (BE (mctx.state_log == NULL, 0))
 686         {
 687           err = REG_ESPACE;
 688           goto free_return;
 689         }
 690     }
 691   else
 692     mctx.state_log = NULL;
 693
 694   match_first = start;
 695   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 696                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 697
 698   /* Check incrementally whether of not the input string match.  */
 699   incr = (range < 0) ? -1 : 1;
 700   left_lim = (range < 0) ? start + range : start;
 701   right_lim = (range < 0) ? start : start + range;
 702   sb = dfa->mb_cur_max == 1;
 703   match_kind =
 704     (fastmap
 705      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 706         | (range >= 0 ? 2 : 0)
 707         | (t != NULL ? 1 : 0))
 708      : 8);
 709
 710   for (;; match_first += incr)
 711     {
 712       err = REG_NOMATCH;
 713       if (match_first < left_lim || right_lim < match_first)
 714         goto free_return;
 715
 716       /* Advance as rapidly as possible through the string, until we
 717          find a plausible place to start matching.  This may be done
 718          with varying efficiency, so there are various possibilities:
 719          only the most common of them are specialized, in order to
 720          save on code size.  We use a switch statement for speed.  */
 721       switch (match_kind)
 722         {
 723         case 8:
 724           /* No fastmap.  */
 725           break;
 726
 727         case 7:
 728           /* Fastmap with single-byte translation, match forward.  */
 729           while (BE (match_first < right_lim, 1)
 730                  && !fastmap[t[(unsigned char) string[match_first]]])
 731             ++match_first;
 732           goto forward_match_found_start_or_reached_end;
 733
 734         case 6:
 735           /* Fastmap without translation, match forward.  */
 736           while (BE (match_first < right_lim, 1)
 737                  && !fastmap[(unsigned char) string[match_first]])
 738             ++match_first;
 739
 740         forward_match_found_start_or_reached_end:
 741           if (BE (match_first == right_lim, 0))
 742             {
 743               ch = match_first >= length
 744                        ? 0 : (unsigned char) string[match_first];
 745               if (!fastmap[t ? t[ch] : ch])
 746                 goto free_return;
 747             }
 748           break;
 749
 750         case 4:
 751         case 5:
 752           /* Fastmap without multi-byte translation, match backwards.  */
 753           while (match_first >= left_lim)
 754             {
 755               ch = match_first >= length
 756                        ? 0 : (unsigned char) string[match_first];
 757               if (fastmap[t ? t[ch] : ch])
 758                 break;
 759               --match_first;
 760             }
 761           if (match_first < left_lim)
 762             goto free_return;
 763           break;
 764
 765         default:
 766           /* In this case, we can't determine easily the current byte,
 767              since it might be a component byte of a multibyte
 768              character.  Then we use the constructed buffer instead.  */
 769           for (;;)
 770             {
 771               /* If MATCH_FIRST is out of the valid range, reconstruct the
 772                  buffers.  */
 773               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 774               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 775                 {
 776                   err = re_string_reconstruct (&mctx.input, match_first,
 777                                                eflags);
 778                   if (BE (err != REG_NOERROR, 0))
 779                     goto free_return;
 780
 781                   offset = match_first - mctx.input.raw_mbs_idx;
 782                 }
 783               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 784                  Note that MATCH_FIRST must not be smaller than 0.  */
 785               ch = (match_first >= length
 786                     ? 0 : re_string_byte_at (&mctx.input, offset));
 787               if (fastmap[ch])
 788                 break;
 789               match_first += incr;
 790               if (match_first < left_lim || match_first > right_lim)
 791                 {
 792                   err = REG_NOMATCH;
 793                   goto free_return;
 794                 }
 795             }
 796           break;
 797         }
 798
 799       /* Reconstruct the buffers so that the matcher can assume that
 800          the matching starts from the beginning of the buffer.  */
 801       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 802       if (BE (err != REG_NOERROR, 0))
 803         goto free_return;
 804
 805 #ifdef RE_ENABLE_I18N
 806      /* Don't consider this char as a possible match start if it part,
 807         yet isn't the head, of a multibyte character.  */
 808       if (!sb && !re_string_first_byte (&mctx.input, 0))
 809         continue;
 810 #endif
 811
 812       /* It seems to be appropriate one, then use the matcher.  */
 813       /* We assume that the matching starts from 0.  */
 814       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 815       match_last = check_matching (&mctx, fl_longest_match,
 816                                    range >= 0 ? &match_first : NULL);
 817       if (match_last != -1)
 818         {
 819           if (BE (match_last == -2, 0))
 820             {
 821               err = REG_ESPACE;
 822               goto free_return;
 823             }
 824           else
 825             {
 826               mctx.match_last = match_last;
 827               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 828                 {
 829                   re_dfastate_t *pstate = mctx.state_log[match_last];
 830                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 831                                                              match_last);
 832                 }
 833               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 834                   || dfa->nbackref)
 835                 {
 836                   err = prune_impossible_nodes (&mctx);
 837                   if (err == REG_NOERROR)
 838                     break;
 839                   if (BE (err != REG_NOMATCH, 0))
 840                     goto free_return;
 841                   match_last = -1;
 842                 }
 843               else
 844                 break; /* We found a match.  */
 845             }
 846         }
 847
 848       match_ctx_clean (&mctx);
 849     }
 850
 851 #ifdef DEBUG
 852   assert (match_last != -1);
 853   assert (err == REG_NOERROR);
 854 #endif
 855
 856   /* Set pmatch[] if we need.  */
 857   if (nmatch > 0)
 858     {
 859       int reg_idx;
 860
 861       /* Initialize registers.  */
 862       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 863         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 864
 865       /* Set the points where matching start/end.  */
 866       pmatch[0].rm_so = 0;
 867       pmatch[0].rm_eo = mctx.match_last;
 868
 869       if (!preg->no_sub && nmatch > 1)
 870         {
 871           err = set_regs (preg, &mctx, nmatch, pmatch,
 872                           dfa->has_plural_match && dfa->nbackref > 0);
 873           if (BE (err != REG_NOERROR, 0))
 874             goto free_return;
 875         }
 876
 877       /* At last, add the offset to the each registers, since we slided
 878          the buffers so that we could assume that the matching starts
 879          from 0.  */
 880       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 881         if (pmatch[reg_idx].rm_so != -1)
 882           {
 883 #ifdef RE_ENABLE_I18N
 884             if (BE (mctx.input.offsets_needed != 0, 0))
 885               {
 886                 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
 887                   pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
 888                 else
 889                   pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
 890                 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
 891                   pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
 892                 else
 893                   pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
 894               }
 895 #else
 896             assert (mctx.input.offsets_needed == 0);
 897 #endif
 898             pmatch[reg_idx].rm_so += match_first;
 899             pmatch[reg_idx].rm_eo += match_first;
 900           }
 901       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 902         {
 903           pmatch[nmatch + reg_idx].rm_so = -1;
 904           pmatch[nmatch + reg_idx].rm_eo = -1;
 905         }
 906
 907       if (dfa->subexp_map)
 908         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 909           if (dfa->subexp_map[reg_idx] != reg_idx)
 910             {
 911               pmatch[reg_idx + 1].rm_so
 912                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 913               pmatch[reg_idx + 1].rm_eo
 914                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 915             }
 916     }
 917
 918  free_return:
 919   re_free (mctx.state_log);
 920   if (dfa->nbackref)
 921     match_ctx_free (&mctx);
 922   re_string_destruct (&mctx.input);
 923   return err;
 924 }
 925
 926 static reg_errcode_t
 927 prune_impossible_nodes (mctx)
 928      re_match_context_t *mctx;
 929 {
 930   re_dfa_t *const dfa = mctx->dfa;
 931   int halt_node, match_last;
 932   reg_errcode_t ret;
 933   re_dfastate_t **sifted_states;
 934   re_dfastate_t **lim_states = NULL;
 935   re_sift_context_t sctx;
 936 #ifdef DEBUG
 937   assert (mctx->state_log != NULL);
 938 #endif
 939   match_last = mctx->match_last;
 940   halt_node = mctx->last_node;
 941   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 942   if (BE (sifted_states == NULL, 0))
 943     {
 944       ret = REG_ESPACE;
 945       goto free_return;
 946     }
 947   if (dfa->nbackref)
 948     {
 949       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 950       if (BE (lim_states == NULL, 0))
 951         {
 952           ret = REG_ESPACE;
 953           goto free_return;
 954         }
 955       while (1)
 956         {
 957           memset (lim_states, '\0',
 958                   sizeof (re_dfastate_t *) * (match_last + 1));
 959           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 960                          match_last);
 961           ret = sift_states_backward (mctx, &sctx);
 962           re_node_set_free (&sctx.limits);
 963           if (BE (ret != REG_NOERROR, 0))
 964               goto free_return;
 965           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 966             break;
 967           do
 968             {
 969               --match_last;
 970               if (match_last < 0)
 971                 {
 972                   ret = REG_NOMATCH;
 973                   goto free_return;
 974                 }
 975             } while (mctx->state_log[match_last] == NULL
 976                      || !mctx->state_log[match_last]->halt);
 977           halt_node = check_halt_state_context (mctx,
 978                                                 mctx->state_log[match_last],
 979                                                 match_last);
 980         }
 981       ret = merge_state_array (dfa, sifted_states, lim_states,
 982                                match_last + 1);
 983       re_free (lim_states);
 984       lim_states = NULL;
 985       if (BE (ret != REG_NOERROR, 0))
 986         goto free_return;
 987     }
 988   else
 989     {
 990       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 991       ret = sift_states_backward (mctx, &sctx);
 992       re_node_set_free (&sctx.limits);
 993       if (BE (ret != REG_NOERROR, 0))
 994         goto free_return;
 995     }
 996   re_free (mctx->state_log);
 997   mctx->state_log = sifted_states;
 998   sifted_states = NULL;
 999   mctx->last_node = halt_node;
1000   mctx->match_last = match_last;
1001   ret = REG_NOERROR;
1002  free_return:
1003   re_free (sifted_states);
1004   re_free (lim_states);
1005   return ret;
1006 }
1007
1008 /* Acquire an initial state and return it.
1009    We must select appropriate initial state depending on the context,
1010    since initial states may have constraints like "\<", "^", etc..  */
1011
1012 static inline re_dfastate_t *
1013 acquire_init_state_context (err, mctx, idx)
1014      reg_errcode_t *err;
1015      const re_match_context_t *mctx;
1016      int idx;
1017 {
1018   re_dfa_t *const dfa = mctx->dfa;
1019   if (dfa->init_state->has_constraint)
1020     {
1021       unsigned int context;
1022       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1023       if (IS_WORD_CONTEXT (context))
1024         return dfa->init_state_word;
1025       else if (IS_ORDINARY_CONTEXT (context))
1026         return dfa->init_state;
1027       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1028         return dfa->init_state_begbuf;
1029       else if (IS_NEWLINE_CONTEXT (context))
1030         return dfa->init_state_nl;
1031       else if (IS_BEGBUF_CONTEXT (context))
1032         {
1033           /* It is relatively rare case, then calculate on demand.  */
1034           return re_acquire_state_context (err, dfa,
1035                                            dfa->init_state->entrance_nodes,
1036                                            context);
1037         }
1038       else
1039         /* Must not happen?  */
1040         return dfa->init_state;
1041     }
1042   else
1043     return dfa->init_state;
1044 }
1045
1046 /* Check whether the regular expression match input string INPUT or not,
1047    and return the index where the matching end, return -1 if not match,
1048    or return -2 in case of an error.
1049    FL_LONGEST_MATCH means we want the POSIX longest matching.
1050    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1051    next place where we may want to try matching.
1052    Note that the matcher assume that the maching starts from the current
1053    index of the buffer.  */
1054
1055 static int
1056 check_matching (mctx, fl_longest_match, p_match_first)
1057     re_match_context_t *mctx;
1058     int fl_longest_match;
1059     int *p_match_first;
1060 {
1061   re_dfa_t *const dfa = mctx->dfa;
1062   reg_errcode_t err;
1063   int match = 0;
1064   int match_last = -1;
1065   int cur_str_idx = re_string_cur_idx (&mctx->input);
1066   re_dfastate_t *cur_state;
1067   int at_init_state = p_match_first != NULL;
1068   int next_start_idx = cur_str_idx;
1069
1070   err = REG_NOERROR;
1071   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1072   /* An initial state must not be NULL (invalid).  */
1073   if (BE (cur_state == NULL, 0))
1074     {
1075       assert (err == REG_ESPACE);
1076       return -2;
1077     }
1078
1079   if (mctx->state_log != NULL)
1080     {
1081       mctx->state_log[cur_str_idx] = cur_state;
1082
1083       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1084          later.  E.g. Processing back references.  */
1085       if (BE (dfa->nbackref, 0))
1086         {
1087           at_init_state = 0;
1088           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1089           if (BE (err != REG_NOERROR, 0))
1090             return err;
1091
1092           if (cur_state->has_backref)
1093             {
1094               err = transit_state_bkref (mctx, &cur_state->nodes);
1095               if (BE (err != REG_NOERROR, 0))
1096                 return err;
1097             }
1098         }
1099     }
1100
1101   /* If the RE accepts NULL string.  */
1102   if (BE (cur_state->halt, 0))
1103     {
1104       if (!cur_state->has_constraint
1105           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1106         {
1107           if (!fl_longest_match)
1108             return cur_str_idx;
1109           else
1110             {
1111               match_last = cur_str_idx;
1112               match = 1;
1113             }
1114         }
1115     }
1116
1117   while (!re_string_eoi (&mctx->input))
1118     {
1119       re_dfastate_t *old_state = cur_state;
1120       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1121
1122       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1123           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1124               && mctx->input.valid_len < mctx->input.len))
1125         {
1126           err = extend_buffers (mctx);
1127           if (BE (err != REG_NOERROR, 0))
1128             {
1129               assert (err == REG_ESPACE);
1130               return -2;
1131             }
1132         }
1133
1134       cur_state = transit_state (&err, mctx, cur_state);
1135       if (mctx->state_log != NULL)
1136         cur_state = merge_state_with_log (&err, mctx, cur_state);
1137
1138       if (cur_state == NULL)
1139         {
1140           /* Reached the invalid state or an error.  Try to recover a valid
1141              state using the state log, if available and if we have not
1142              already found a valid (even if not the longest) match.  */
1143           if (BE (err != REG_NOERROR, 0))
1144             return -2;
1145
1146           if (mctx->state_log == NULL
1147               || (match && !fl_longest_match)
1148               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1149             break;
1150         }
1151
1152       if (BE (at_init_state, 0))
1153         {
1154           if (old_state == cur_state)
1155             next_start_idx = next_char_idx;
1156           else
1157             at_init_state = 0;
1158         }
1159
1160       if (cur_state->halt)
1161         {
1162           /* Reached a halt state.
1163              Check the halt state can satisfy the current context.  */
1164           if (!cur_state->has_constraint
1165               || check_halt_state_context (mctx, cur_state,
1166                                            re_string_cur_idx (&mctx->input)))
1167             {
1168               /* We found an appropriate halt state.  */
1169               match_last = re_string_cur_idx (&mctx->input);
1170               match = 1;
1171
1172               /* We found a match, do not modify match_first below.  */
1173               p_match_first = NULL;
1174               if (!fl_longest_match)
1175                 break;
1176             }
1177         }
1178     }
1179
1180   if (p_match_first)
1181     *p_match_first += next_start_idx;
1182
1183   return match_last;
1184 }
1185
1186 /* Check NODE match the current context.  */
1187
1188 static int check_halt_node_context (dfa, node, context)
1189     const re_dfa_t *dfa;
1190     int node;
1191     unsigned int context;
1192 {
1193   re_token_type_t type = dfa->nodes[node].type;
1194   unsigned int constraint = dfa->nodes[node].constraint;
1195   if (type != END_OF_RE)
1196     return 0;
1197   if (!constraint)
1198     return 1;
1199   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1200     return 0;
1201   return 1;
1202 }
1203
1204 /* Check the halt state STATE match the current context.
1205    Return 0 if not match, if the node, STATE has, is a halt node and
1206    match the context, return the node.  */
1207
1208 static int
1209 check_halt_state_context (mctx, state, idx)
1210     const re_match_context_t *mctx;
1211     const re_dfastate_t *state;
1212     int idx;
1213 {
1214   int i;
1215   unsigned int context;
1216 #ifdef DEBUG
1217   assert (state->halt);
1218 #endif
1219   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1220   for (i = 0; i < state->nodes.nelem; ++i)
1221     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1222       return state->nodes.elems[i];
1223   return 0;
1224 }
1225
1226 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1227    corresponding to the DFA).
1228    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1229    of errors.  */
1230
1231 static int
1232 proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1233     const re_match_context_t *mctx;
1234     regmatch_t *regs;
1235     int nregs, *pidx, node;
1236     re_node_set *eps_via_nodes;
1237     struct re_fail_stack_t *fs;
1238 {
1239   re_dfa_t *const dfa = mctx->dfa;
1240   int i, err, dest_node;
1241   dest_node = -1;
1242   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1243     {
1244       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1245       re_node_set *edests = &dfa->edests[node];
1246       int dest_node;
1247       err = re_node_set_insert (eps_via_nodes, node);
1248       if (BE (err < 0, 0))
1249         return -2;
1250       /* Pick up a valid destination, or return -1 if none is found.  */
1251       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1252         {
1253           int candidate = edests->elems[i];
1254           if (!re_node_set_contains (cur_nodes, candidate))
1255             continue;
1256           if (dest_node == -1)
1257             dest_node = candidate;
1258
1259           else
1260             {
1261               /* In order to avoid infinite loop like "(a*)*", return the second
1262                  epsilon-transition if the first was already considered.  */
1263               if (re_node_set_contains (eps_via_nodes, dest_node))
1264                 return candidate;
1265
1266               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1267               else if (fs != NULL
1268                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1269                                            eps_via_nodes))
1270                 return -2;
1271
1272               /* We know we are going to exit.  */
1273               break;
1274             }
1275         }
1276       return dest_node;
1277     }
1278   else
1279     {
1280       int naccepted = 0;
1281       re_token_type_t type = dfa->nodes[node].type;
1282
1283 #ifdef RE_ENABLE_I18N
1284       if (dfa->nodes[node].accept_mb)
1285         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1286       else
1287 #endif /* RE_ENABLE_I18N */
1288       if (type == OP_BACK_REF)
1289         {
1290           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1291           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1292           if (fs != NULL)
1293             {
1294               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1295                 return -1;
1296               else if (naccepted)
1297                 {
1298                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1299                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1300                               naccepted) != 0)
1301                     return -1;
1302                 }
1303             }
1304
1305           if (naccepted == 0)
1306             {
1307               err = re_node_set_insert (eps_via_nodes, node);
1308               if (BE (err < 0, 0))
1309                 return -2;
1310               dest_node = dfa->edests[node].elems[0];
1311               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1312                                         dest_node))
1313                 return dest_node;
1314             }
1315         }
1316
1317       if (naccepted != 0
1318           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1319         {
1320           dest_node = dfa->nexts[node];
1321           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1322           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1323                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1324                                                dest_node)))
1325             return -1;
1326           re_node_set_empty (eps_via_nodes);
1327           return dest_node;
1328         }
1329     }
1330   return -1;
1331 }
1332
1333 static reg_errcode_t
1334 push_fail_stack (fs, str_idx, dest_node, nregs, regs, eps_via_nodes)
1335      struct re_fail_stack_t *fs;
1336      int str_idx, dest_node, nregs;
1337      regmatch_t *regs;
1338      re_node_set *eps_via_nodes;
1339 {
1340   reg_errcode_t err;
1341   int num = fs->num++;
1342   if (fs->num == fs->alloc)
1343     {
1344       struct re_fail_stack_ent_t *new_array;
1345       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1346                                        * fs->alloc * 2));
1347       if (new_array == NULL)
1348         return REG_ESPACE;
1349       fs->alloc *= 2;
1350       fs->stack = new_array;
1351     }
1352   fs->stack[num].idx = str_idx;
1353   fs->stack[num].node = dest_node;
1354   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1355   if (fs->stack[num].regs == NULL)
1356     return REG_ESPACE;
1357   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1358   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1359   return err;
1360 }
1361
1362 static int
1363 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1364      struct re_fail_stack_t *fs;
1365      int *pidx, nregs;
1366      regmatch_t *regs;
1367      re_node_set *eps_via_nodes;
1368 {
1369   int num = --fs->num;
1370   assert (num >= 0);
1371   *pidx = fs->stack[num].idx;
1372   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1373   re_node_set_free (eps_via_nodes);
1374   re_free (fs->stack[num].regs);
1375   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1376   return fs->stack[num].node;
1377 }
1378
1379 /* Set the positions where the subexpressions are starts/ends to registers
1380    PMATCH.
1381    Note: We assume that pmatch[0] is already set, and
1382    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1383
1384 static reg_errcode_t
1385 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1386      const regex_t *preg;
1387      const re_match_context_t *mctx;
1388      size_t nmatch;
1389      regmatch_t *pmatch;
1390      int fl_backtrack;
1391 {
1392   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1393   int idx, cur_node;
1394   re_node_set eps_via_nodes;
1395   struct re_fail_stack_t *fs;
1396   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1397   regmatch_t *prev_idx_match;
1398
1399 #ifdef DEBUG
1400   assert (nmatch > 1);
1401   assert (mctx->state_log != NULL);
1402 #endif
1403   if (fl_backtrack)
1404     {
1405       fs = &fs_body;
1406       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1407       if (fs->stack == NULL)
1408         return REG_ESPACE;
1409     }
1410   else
1411     fs = NULL;
1412
1413   cur_node = dfa->init_node;
1414   re_node_set_init_empty (&eps_via_nodes);
1415
1416   prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * nmatch);
1417   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1418
1419   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1420     {
1421       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1422
1423       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1424         {
1425           int reg_idx;
1426           if (fs)
1427             {
1428               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1429                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1430                   break;
1431               if (reg_idx == nmatch)
1432                 {
1433                   re_node_set_free (&eps_via_nodes);
1434                   return free_fail_stack_return (fs);
1435                 }
1436               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1437                                          &eps_via_nodes);
1438             }
1439           else
1440             {
1441               re_node_set_free (&eps_via_nodes);
1442               return REG_NOERROR;
1443             }
1444         }
1445
1446       /* Proceed to next node.  */
1447       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1448                                     &eps_via_nodes, fs);
1449
1450       if (BE (cur_node < 0, 0))
1451         {
1452           if (BE (cur_node == -2, 0))
1453             {
1454               re_node_set_free (&eps_via_nodes);
1455               free_fail_stack_return (fs);
1456               return REG_ESPACE;
1457             }
1458           if (fs)
1459             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1460                                        &eps_via_nodes);
1461           else
1462             {
1463               re_node_set_free (&eps_via_nodes);
1464               return REG_NOMATCH;
1465             }
1466         }
1467     }
1468   re_node_set_free (&eps_via_nodes);
1469   return free_fail_stack_return (fs);
1470 }
1471
1472 static reg_errcode_t
1473 free_fail_stack_return (fs)
1474      struct re_fail_stack_t *fs;
1475 {
1476   if (fs)
1477     {
1478       int fs_idx;
1479       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1480         {
1481           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1482           re_free (fs->stack[fs_idx].regs);
1483         }
1484       re_free (fs->stack);
1485     }
1486   return REG_NOERROR;
1487 }
1488
1489 static void
1490 update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1491      re_dfa_t *dfa;
1492      regmatch_t *pmatch, *prev_idx_match;
1493      int cur_node, cur_idx, nmatch;
1494 {
1495   int type = dfa->nodes[cur_node].type;
1496   if (type == OP_OPEN_SUBEXP)
1497     {
1498       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1499
1500       /* We are at the first node of this sub expression.  */
1501       if (reg_num < nmatch)
1502         {
1503           pmatch[reg_num].rm_so = cur_idx;
1504           pmatch[reg_num].rm_eo = -1;
1505         }
1506     }
1507   else if (type == OP_CLOSE_SUBEXP)
1508     {
1509       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1510       if (reg_num < nmatch)
1511         {
1512           /* We are at the last node of this sub expression.  */
1513           if (pmatch[reg_num].rm_so < cur_idx)
1514             {
1515               pmatch[reg_num].rm_eo = cur_idx;
1516               /* This is a non-empty match or we are not inside an optional
1517                  subexpression.  Accept this right away.  */
1518               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1519             }
1520           else
1521             {
1522               if (dfa->nodes[cur_node].opt_subexp
1523                   && prev_idx_match[reg_num].rm_so != -1)
1524                 /* We transited through an empty match for an optional
1525                    subexpression, like (a?)*, and this is not the subexp's
1526                    first match.  Copy back the old content of the registers
1527                    so that matches of an inner subexpression are undone as
1528                    well, like in ((a?))*.  */
1529                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1530               else
1531                 /* We completed a subexpression, but it may be part of
1532                    an optional one, so do not update PREV_IDX_MATCH.  */
1533                 pmatch[reg_num].rm_eo = cur_idx;
1534             }
1535         }
1536     }
1537 }
1538
1539 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1540    and sift the nodes in each states according to the following rules.
1541    Updated state_log will be wrote to STATE_LOG.
1542
1543    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1544      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1545         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1546         the LAST_NODE, we throw away the node `a'.
1547      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1548         string `s' and transit to `b':
1549         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1550            away the node `a'.
1551         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1552             thrown away, we throw away the node `a'.
1553      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1554         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1555            node `a'.
1556         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1557             we throw away the node `a'.  */
1558
1559 #define STATE_NODE_CONTAINS(state,node) \
1560   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1561
1562 static reg_errcode_t
1563 sift_states_backward (mctx, sctx)
1564      re_match_context_t *mctx;
1565      re_sift_context_t *sctx;
1566 {
1567   reg_errcode_t err;
1568   int null_cnt = 0;
1569   int str_idx = sctx->last_str_idx;
1570   re_node_set cur_dest;
1571
1572 #ifdef DEBUG
1573   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1574 #endif
1575
1576   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1577      transit to the last_node and the last_node itself.  */
1578   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1579   if (BE (err != REG_NOERROR, 0))
1580     return err;
1581   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1582   if (BE (err != REG_NOERROR, 0))
1583     goto free_return;
1584
1585   /* Then check each states in the state_log.  */
1586   while (str_idx > 0)
1587     {
1588       /* Update counters.  */
1589       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1590       if (null_cnt > mctx->max_mb_elem_len)
1591         {
1592           memset (sctx->sifted_states, '\0',
1593                   sizeof (re_dfastate_t *) * str_idx);
1594           re_node_set_free (&cur_dest);
1595           return REG_NOERROR;
1596         }
1597       re_node_set_empty (&cur_dest);
1598       --str_idx;
1599
1600       if (mctx->state_log[str_idx])
1601         {
1602           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1603           if (BE (err != REG_NOERROR, 0))
1604             goto free_return;
1605         }
1606
1607       /* Add all the nodes which satisfy the following conditions:
1608          - It can epsilon transit to a node in CUR_DEST.
1609          - It is in CUR_SRC.
1610          And update state_log.  */
1611       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1612       if (BE (err != REG_NOERROR, 0))
1613         goto free_return;
1614     }
1615   err = REG_NOERROR;
1616  free_return:
1617   re_node_set_free (&cur_dest);
1618   return err;
1619 }
1620
1621 static reg_errcode_t
1622 build_sifted_states (mctx, sctx, str_idx, cur_dest)
1623      re_match_context_t *mctx;
1624      re_sift_context_t *sctx;
1625      int str_idx;
1626      re_node_set *cur_dest;
1627 {
1628   re_dfa_t *const dfa = mctx->dfa;
1629   re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1630   int i;
1631
1632   /* Then build the next sifted state.
1633      We build the next sifted state on `cur_dest', and update
1634      `sifted_states[str_idx]' with `cur_dest'.
1635      Note:
1636      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1637      `cur_src' points the node_set of the old `state_log[str_idx]'
1638      (with the epsilon nodes pre-filtered out).  */
1639   for (i = 0; i < cur_src->nelem; i++)
1640     {
1641       int prev_node = cur_src->elems[i];
1642       int naccepted = 0;
1643       int ret;
1644
1645 #ifdef DEBUG
1646       re_token_type_t type = dfa->nodes[prev_node].type;
1647       assert (!IS_EPSILON_NODE (type));
1648 #endif
1649 #ifdef RE_ENABLE_I18N
1650       /* If the node may accept `multi byte'.  */
1651       if (dfa->nodes[prev_node].accept_mb)
1652         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1653                                          str_idx, sctx->last_str_idx);
1654 #endif /* RE_ENABLE_I18N */
1655
1656       /* We don't check backreferences here.
1657          See update_cur_sifted_state().  */
1658       if (!naccepted
1659           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1660           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1661                                   dfa->nexts[prev_node]))
1662         naccepted = 1;
1663
1664       if (naccepted == 0)
1665         continue;
1666
1667       if (sctx->limits.nelem)
1668         {
1669           int to_idx = str_idx + naccepted;
1670           if (check_dst_limits (mctx, &sctx->limits,
1671                                 dfa->nexts[prev_node], to_idx,
1672                                 prev_node, str_idx))
1673             continue;
1674         }
1675       ret = re_node_set_insert (cur_dest, prev_node);
1676       if (BE (ret == -1, 0))
1677         return REG_ESPACE;
1678     }
1679
1680   return REG_NOERROR;
1681 }
1682
1683 /* Helper functions.  */
1684
1685 static reg_errcode_t
1686 clean_state_log_if_needed (mctx, next_state_log_idx)
1687     re_match_context_t *mctx;
1688     int next_state_log_idx;
1689 {
1690   int top = mctx->state_log_top;
1691
1692   if (next_state_log_idx >= mctx->input.bufs_len
1693       || (next_state_log_idx >= mctx->input.valid_len
1694           && mctx->input.valid_len < mctx->input.len))
1695     {
1696       reg_errcode_t err;
1697       err = extend_buffers (mctx);
1698       if (BE (err != REG_NOERROR, 0))
1699         return err;
1700     }
1701
1702   if (top < next_state_log_idx)
1703     {
1704       memset (mctx->state_log + top + 1, '\0',
1705               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1706       mctx->state_log_top = next_state_log_idx;
1707     }
1708   return REG_NOERROR;
1709 }
1710
1711 static reg_errcode_t
1712 merge_state_array (dfa, dst, src, num)
1713      re_dfa_t *dfa;
1714      re_dfastate_t **dst;
1715      re_dfastate_t **src;
1716      int num;
1717 {
1718   int st_idx;
1719   reg_errcode_t err;
1720   for (st_idx = 0; st_idx < num; ++st_idx)
1721     {
1722       if (dst[st_idx] == NULL)
1723         dst[st_idx] = src[st_idx];
1724       else if (src[st_idx] != NULL)
1725         {
1726           re_node_set merged_set;
1727           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1728                                         &src[st_idx]->nodes);
1729           if (BE (err != REG_NOERROR, 0))
1730             return err;
1731           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1732           re_node_set_free (&merged_set);
1733           if (BE (err != REG_NOERROR, 0))
1734             return err;
1735         }
1736     }
1737   return REG_NOERROR;
1738 }
1739
1740 static reg_errcode_t
1741 update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1742      re_match_context_t *mctx;
1743      re_sift_context_t *sctx;
1744      int str_idx;
1745      re_node_set *dest_nodes;
1746 {
1747   re_dfa_t *const dfa = mctx->dfa;
1748   reg_errcode_t err;
1749   const re_node_set *candidates;
1750   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1751                 : &mctx->state_log[str_idx]->nodes);
1752
1753   if (dest_nodes->nelem == 0)
1754     sctx->sifted_states[str_idx] = NULL;
1755   else
1756     {
1757       if (candidates)
1758         {
1759           /* At first, add the nodes which can epsilon transit to a node in
1760              DEST_NODE.  */
1761           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1762           if (BE (err != REG_NOERROR, 0))
1763             return err;
1764
1765           /* Then, check the limitations in the current sift_context.  */
1766           if (sctx->limits.nelem)
1767             {
1768               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1769                                          mctx->bkref_ents, str_idx);
1770               if (BE (err != REG_NOERROR, 0))
1771                 return err;
1772             }
1773         }
1774
1775       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1776       if (BE (err != REG_NOERROR, 0))
1777         return err;
1778     }
1779
1780   if (candidates && mctx->state_log[str_idx]->has_backref)
1781     {
1782       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1783       if (BE (err != REG_NOERROR, 0))
1784         return err;
1785     }
1786   return REG_NOERROR;
1787 }
1788
1789 static reg_errcode_t
1790 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1791      re_dfa_t *dfa;
1792      re_node_set *dest_nodes;
1793      const re_node_set *candidates;
1794 {
1795   reg_errcode_t err = REG_NOERROR;
1796   int i;
1797
1798   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1799   if (BE (err != REG_NOERROR, 0))
1800     return err;
1801
1802   if (!state->inveclosure.alloc)
1803     {
1804       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1805       if (BE (err != REG_NOERROR, 0))
1806         return REG_ESPACE;
1807       for (i = 0; i < dest_nodes->nelem; i++)
1808         re_node_set_merge (&state->inveclosure,
1809                            dfa->inveclosures + dest_nodes->elems[i]);
1810     }
1811   return re_node_set_add_intersect (dest_nodes, candidates,
1812                                     &state->inveclosure);
1813 }
1814
1815 static reg_errcode_t
1816 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1817      re_dfa_t *dfa;
1818      int node;
1819      re_node_set *dest_nodes;
1820      const re_node_set *candidates;
1821 {
1822     int ecl_idx;
1823     reg_errcode_t err;
1824     re_node_set *inv_eclosure = dfa->inveclosures + node;
1825     re_node_set except_nodes;
1826     re_node_set_init_empty (&except_nodes);
1827     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1828       {
1829         int cur_node = inv_eclosure->elems[ecl_idx];
1830         if (cur_node == node)
1831           continue;
1832         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1833           {
1834             int edst1 = dfa->edests[cur_node].elems[0];
1835             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1836                          ? dfa->edests[cur_node].elems[1] : -1);
1837             if ((!re_node_set_contains (inv_eclosure, edst1)
1838                  && re_node_set_contains (dest_nodes, edst1))
1839                 || (edst2 > 0
1840                     && !re_node_set_contains (inv_eclosure, edst2)
1841                     && re_node_set_contains (dest_nodes, edst2)))
1842               {
1843                 err = re_node_set_add_intersect (&except_nodes, candidates,
1844                                                  dfa->inveclosures + cur_node);
1845                 if (BE (err != REG_NOERROR, 0))
1846                   {
1847                     re_node_set_free (&except_nodes);
1848                     return err;
1849                   }
1850               }
1851           }
1852       }
1853     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1854       {
1855         int cur_node = inv_eclosure->elems[ecl_idx];
1856         if (!re_node_set_contains (&except_nodes, cur_node))
1857           {
1858             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1859             re_node_set_remove_at (dest_nodes, idx);
1860           }
1861       }
1862     re_node_set_free (&except_nodes);
1863     return REG_NOERROR;
1864 }
1865
1866 static int
1867 check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1868      re_match_context_t *mctx;
1869      re_node_set *limits;
1870      int dst_node, dst_idx, src_node, src_idx;
1871 {
1872   re_dfa_t *const dfa = mctx->dfa;
1873   int lim_idx, src_pos, dst_pos;
1874
1875   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1876   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1877   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1878     {
1879       int subexp_idx;
1880       struct re_backref_cache_entry *ent;
1881       ent = mctx->bkref_ents + limits->elems[lim_idx];
1882       subexp_idx = dfa->nodes[ent->node].opr.idx;
1883
1884       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1885                                            subexp_idx, dst_node, dst_idx,
1886                                            dst_bkref_idx);
1887       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1888                                            subexp_idx, src_node, src_idx,
1889                                            src_bkref_idx);
1890
1891       /* In case of:
1892          <src> <dst> ( <subexp> )
1893          ( <subexp> ) <src> <dst>
1894          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1895       if (src_pos == dst_pos)
1896         continue; /* This is unrelated limitation.  */
1897       else
1898         return 1;
1899     }
1900   return 0;
1901 }
1902
1903 static int
1904 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
1905      re_match_context_t *mctx;
1906      int boundaries, subexp_idx, from_node, bkref_idx;
1907 {
1908   re_dfa_t *const dfa = mctx->dfa;
1909   re_node_set *eclosures = dfa->eclosures + from_node;
1910   int node_idx;
1911
1912   /* Else, we are on the boundary: examine the nodes on the epsilon
1913      closure.  */
1914   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1915     {
1916       int node = eclosures->elems[node_idx];
1917       switch (dfa->nodes[node].type)
1918         {
1919         case OP_BACK_REF:
1920           if (bkref_idx != -1)
1921             {
1922               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1923               do
1924                 {
1925                   int dst, cpos;
1926
1927                   if (ent->node != node)
1928                     continue;
1929
1930                   if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1931                       && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
1932                     continue;
1933
1934                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1935                      OP_CLOSE_SUBEXP cases below.  But, if the
1936                      destination node is the same node as the source
1937                      node, don't recurse because it would cause an
1938                      infinite loop: a regex that exhibits this behavior
1939                      is ()\1*\1*  */
1940                   dst = dfa->edests[node].elems[0];
1941                   if (dst == from_node)
1942                     {
1943                       if (boundaries & 1)
1944                         return -1;
1945                       else /* if (boundaries & 2) */
1946                         return 0;
1947                     }
1948
1949                   cpos =
1950                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1951                                                  dst, bkref_idx);
1952                   if (cpos == -1 /* && (boundaries & 1) */)
1953                     return -1;
1954                   if (cpos == 0 && (boundaries & 2))
1955                     return 0;
1956
1957                   ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
1958                 }
1959               while (ent++->more);
1960             }
1961           break;
1962
1963         case OP_OPEN_SUBEXP:
1964           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1965             return -1;
1966           break;
1967
1968         case OP_CLOSE_SUBEXP:
1969           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1970             return 0;
1971           break;
1972
1973         default:
1974             break;
1975         }
1976     }
1977
1978   return (boundaries & 2) ? 1 : 0;
1979 }
1980
1981 static int
1982 check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
1983      re_match_context_t *mctx;
1984      int limit, subexp_idx, from_node, str_idx, bkref_idx;
1985 {
1986   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1987   int boundaries;
1988
1989   /* If we are outside the range of the subexpression, return -1 or 1.  */
1990   if (str_idx < lim->subexp_from)
1991     return -1;
1992
1993   if (lim->subexp_to < str_idx)
1994     return 1;
1995
1996   /* If we are within the subexpression, return 0.  */
1997   boundaries = (str_idx == lim->subexp_from);
1998   boundaries |= (str_idx == lim->subexp_to) << 1;
1999   if (boundaries == 0)
2000     return 0;
2001
2002   /* Else, examine epsilon closure.  */
2003   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2004                                       from_node, bkref_idx);
2005 }
2006
2007 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2008    which are against limitations from DEST_NODES. */
2009
2010 static reg_errcode_t
2011 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
2012      re_dfa_t *dfa;
2013      re_node_set *dest_nodes;
2014      const re_node_set *candidates;
2015      re_node_set *limits;
2016      struct re_backref_cache_entry *bkref_ents;
2017      int str_idx;
2018 {
2019   reg_errcode_t err;
2020   int node_idx, lim_idx;
2021
2022   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2023     {
2024       int subexp_idx;
2025       struct re_backref_cache_entry *ent;
2026       ent = bkref_ents + limits->elems[lim_idx];
2027
2028       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2029         continue; /* This is unrelated limitation.  */
2030
2031       subexp_idx = dfa->nodes[ent->node].opr.idx;
2032       if (ent->subexp_to == str_idx)
2033         {
2034           int ops_node = -1;
2035           int cls_node = -1;
2036           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2037             {
2038               int node = dest_nodes->elems[node_idx];
2039               re_token_type_t type = dfa->nodes[node].type;
2040               if (type == OP_OPEN_SUBEXP
2041                   && subexp_idx == dfa->nodes[node].opr.idx)
2042                 ops_node = node;
2043               else if (type == OP_CLOSE_SUBEXP
2044                        && subexp_idx == dfa->nodes[node].opr.idx)
2045                 cls_node = node;
2046             }
2047
2048           /* Check the limitation of the open subexpression.  */
2049           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2050           if (ops_node >= 0)
2051             {
2052               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2053                                            candidates);
2054               if (BE (err != REG_NOERROR, 0))
2055                 return err;
2056             }
2057
2058           /* Check the limitation of the close subexpression.  */
2059           if (cls_node >= 0)
2060             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2061               {
2062                 int node = dest_nodes->elems[node_idx];
2063                 if (!re_node_set_contains (dfa->inveclosures + node,
2064                                            cls_node)
2065                     && !re_node_set_contains (dfa->eclosures + node,
2066                                               cls_node))
2067                   {
2068                     /* It is against this limitation.
2069                        Remove it form the current sifted state.  */
2070                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2071                                                  candidates);
2072                     if (BE (err != REG_NOERROR, 0))
2073                       return err;
2074                     --node_idx;
2075                   }
2076               }
2077         }
2078       else /* (ent->subexp_to != str_idx)  */
2079         {
2080           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2081             {
2082               int node = dest_nodes->elems[node_idx];
2083               re_token_type_t type = dfa->nodes[node].type;
2084               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2085                 {
2086                   if (subexp_idx != dfa->nodes[node].opr.idx)
2087                     continue;
2088                   /* It is against this limitation.
2089                      Remove it form the current sifted state.  */
2090                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2091                                                candidates);
2092                   if (BE (err != REG_NOERROR, 0))
2093                     return err;
2094                 }
2095             }
2096         }
2097     }
2098   return REG_NOERROR;
2099 }
2100
2101 static reg_errcode_t
2102 sift_states_bkref (mctx, sctx, str_idx, candidates)
2103      re_match_context_t *mctx;
2104      re_sift_context_t *sctx;
2105      int str_idx;
2106      const re_node_set *candidates;
2107 {
2108   re_dfa_t *const dfa = mctx->dfa;
2109   reg_errcode_t err;
2110   int node_idx, node;
2111   re_sift_context_t local_sctx;
2112   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2113
2114   if (first_idx == -1)
2115     return REG_NOERROR;
2116
2117   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2118
2119   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2120     {
2121       int enabled_idx;
2122       re_token_type_t type;
2123       struct re_backref_cache_entry *entry;
2124       node = candidates->elems[node_idx];
2125       type = dfa->nodes[node].type;
2126       /* Avoid infinite loop for the REs like "()\1+".  */
2127       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2128         continue;
2129       if (type != OP_BACK_REF)
2130         continue;
2131
2132       entry = mctx->bkref_ents + first_idx;
2133       enabled_idx = first_idx;
2134       do
2135         {
2136           int subexp_len, to_idx, dst_node;
2137           re_dfastate_t *cur_state;
2138
2139           if (entry->node != node)
2140             continue;
2141           subexp_len = entry->subexp_to - entry->subexp_from;
2142           to_idx = str_idx + subexp_len;
2143           dst_node = (subexp_len ? dfa->nexts[node]
2144                       : dfa->edests[node].elems[0]);
2145
2146           if (to_idx > sctx->last_str_idx
2147               || sctx->sifted_states[to_idx] == NULL
2148               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2149               || check_dst_limits (mctx, &sctx->limits, node,
2150                                    str_idx, dst_node, to_idx))
2151             continue;
2152
2153           if (local_sctx.sifted_states == NULL)
2154             {
2155               local_sctx = *sctx;
2156               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2157               if (BE (err != REG_NOERROR, 0))
2158                 goto free_return;
2159             }
2160           local_sctx.last_node = node;
2161           local_sctx.last_str_idx = str_idx;
2162           err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2163           if (BE (err < 0, 0))
2164             {
2165               err = REG_ESPACE;
2166               goto free_return;
2167             }
2168           cur_state = local_sctx.sifted_states[str_idx];
2169           err = sift_states_backward (mctx, &local_sctx);
2170           if (BE (err != REG_NOERROR, 0))
2171             goto free_return;
2172           if (sctx->limited_states != NULL)
2173             {
2174               err = merge_state_array (dfa, sctx->limited_states,
2175                                        local_sctx.sifted_states,
2176                                        str_idx + 1);
2177               if (BE (err != REG_NOERROR, 0))
2178                 goto free_return;
2179             }
2180           local_sctx.sifted_states[str_idx] = cur_state;
2181           re_node_set_remove (&local_sctx.limits, enabled_idx);
2182
2183           /* mctx->bkref_ents may have changed, reload the pointer.  */
2184           entry = mctx->bkref_ents + enabled_idx;
2185         }
2186       while (enabled_idx++, entry++->more);
2187     }
2188   err = REG_NOERROR;
2189  free_return:
2190   if (local_sctx.sifted_states != NULL)
2191     {
2192       re_node_set_free (&local_sctx.limits);
2193     }
2194
2195   return err;
2196 }
2197
2198
2199 #ifdef RE_ENABLE_I18N
2200 static int
2201 sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2202     const re_match_context_t *mctx;
2203     re_sift_context_t *sctx;
2204     int node_idx, str_idx, max_str_idx;
2205 {
2206   re_dfa_t *const dfa = mctx->dfa;
2207   int naccepted;
2208   /* Check the node can accept `multi byte'.  */
2209   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2210   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2211       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2212                             dfa->nexts[node_idx]))
2213     /* The node can't accept the `multi byte', or the
2214        destination was already thrown away, then the node
2215        could't accept the current input `multi byte'.   */
2216     naccepted = 0;
2217   /* Otherwise, it is sure that the node could accept
2218      `naccepted' bytes input.  */
2219   return naccepted;
2220 }
2221 #endif /* RE_ENABLE_I18N */
2222
2223 \f
2224 /* Functions for state transition.  */
2225
2226 /* Return the next state to which the current state STATE will transit by
2227    accepting the current input byte, and update STATE_LOG if necessary.
2228    If STATE can accept a multibyte char/collating element/back reference
2229    update the destination of STATE_LOG.  */
2230
2231 static re_dfastate_t *
2232 transit_state (err, mctx, state)
2233      reg_errcode_t *err;
2234      re_match_context_t *mctx;
2235      re_dfastate_t *state;
2236 {
2237   re_dfastate_t **trtable;
2238   unsigned char ch;
2239
2240 #ifdef RE_ENABLE_I18N
2241   /* If the current state can accept multibyte.  */
2242   if (BE (state->accept_mb, 0))
2243     {
2244       *err = transit_state_mb (mctx, state);
2245       if (BE (*err != REG_NOERROR, 0))
2246         return NULL;
2247     }
2248 #endif /* RE_ENABLE_I18N */
2249
2250   /* Then decide the next state with the single byte.  */
2251 #if 0
2252   if (0)
2253     /* don't use transition table  */
2254     return transit_state_sb (err, mctx, state);
2255 #endif
2256
2257   /* Use transition table  */
2258   ch = re_string_fetch_byte (&mctx->input);
2259   for (;;)
2260     {
2261       trtable = state->trtable;
2262       if (BE (trtable != NULL, 1))
2263         return trtable[ch];
2264
2265       trtable = state->word_trtable;
2266       if (BE (trtable != NULL, 1))
2267         {
2268           unsigned int context;
2269           context
2270             = re_string_context_at (&mctx->input,
2271                                     re_string_cur_idx (&mctx->input) - 1,
2272                                     mctx->eflags);
2273           if (IS_WORD_CONTEXT (context))
2274             return trtable[ch + SBC_MAX];
2275           else
2276             return trtable[ch];
2277         }
2278
2279       if (!build_trtable (mctx->dfa, state))
2280         {
2281           *err = REG_ESPACE;
2282           return NULL;
2283         }
2284
2285       /* Retry, we now have a transition table.  */
2286     }
2287 }
2288
2289 /* Update the state_log if we need */
2290 re_dfastate_t *
2291 merge_state_with_log (err, mctx, next_state)
2292      reg_errcode_t *err;
2293      re_match_context_t *mctx;
2294      re_dfastate_t *next_state;
2295 {
2296   re_dfa_t *const dfa = mctx->dfa;
2297   int cur_idx = re_string_cur_idx (&mctx->input);
2298
2299   if (cur_idx > mctx->state_log_top)
2300     {
2301       mctx->state_log[cur_idx] = next_state;
2302       mctx->state_log_top = cur_idx;
2303     }
2304   else if (mctx->state_log[cur_idx] == 0)
2305     {
2306       mctx->state_log[cur_idx] = next_state;
2307     }
2308   else
2309     {
2310       re_dfastate_t *pstate;
2311       unsigned int context;
2312       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2313       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2314          the destination of a multibyte char/collating element/
2315          back reference.  Then the next state is the union set of
2316          these destinations and the results of the transition table.  */
2317       pstate = mctx->state_log[cur_idx];
2318       log_nodes = pstate->entrance_nodes;
2319       if (next_state != NULL)
2320         {
2321           table_nodes = next_state->entrance_nodes;
2322           *err = re_node_set_init_union (&next_nodes, table_nodes,
2323                                              log_nodes);
2324           if (BE (*err != REG_NOERROR, 0))
2325             return NULL;
2326         }
2327       else
2328         next_nodes = *log_nodes;
2329       /* Note: We already add the nodes of the initial state,
2330          then we don't need to add them here.  */
2331
2332       context = re_string_context_at (&mctx->input,
2333                                       re_string_cur_idx (&mctx->input) - 1,
2334                                       mctx->eflags);
2335       next_state = mctx->state_log[cur_idx]
2336         = re_acquire_state_context (err, dfa, &next_nodes, context);
2337       /* We don't need to check errors here, since the return value of
2338          this function is next_state and ERR is already set.  */
2339
2340       if (table_nodes != NULL)
2341         re_node_set_free (&next_nodes);
2342     }
2343
2344   if (BE (dfa->nbackref, 0) && next_state != NULL)
2345     {
2346       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2347          later.  We must check them here, since the back references in the
2348          next state might use them.  */
2349       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2350                                         cur_idx);
2351       if (BE (*err != REG_NOERROR, 0))
2352         return NULL;
2353
2354       /* If the next state has back references.  */
2355       if (next_state->has_backref)
2356         {
2357           *err = transit_state_bkref (mctx, &next_state->nodes);
2358           if (BE (*err != REG_NOERROR, 0))
2359             return NULL;
2360           next_state = mctx->state_log[cur_idx];
2361         }
2362     }
2363
2364   return next_state;
2365 }
2366
2367 /* Skip bytes in the input that correspond to part of a
2368    multi-byte match, then look in the log for a state
2369    from which to restart matching.  */
2370 re_dfastate_t *
2371 find_recover_state (err, mctx)
2372      reg_errcode_t *err;
2373      re_match_context_t *mctx;
2374 {
2375   re_dfastate_t *cur_state = NULL;
2376   do
2377     {
2378       int max = mctx->state_log_top;
2379       int cur_str_idx = re_string_cur_idx (&mctx->input);
2380
2381       do
2382         {
2383           if (++cur_str_idx > max)
2384             return NULL;
2385           re_string_skip_bytes (&mctx->input, 1);
2386         }
2387       while (mctx->state_log[cur_str_idx] == NULL);
2388
2389       cur_state = merge_state_with_log (err, mctx, NULL);
2390     }
2391   while (err == REG_NOERROR && cur_state == NULL);
2392   return cur_state;
2393 }
2394
2395 /* Helper functions for transit_state.  */
2396
2397 /* From the node set CUR_NODES, pick up the nodes whose types are
2398    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2399    expression. And register them to use them later for evaluating the
2400    correspoding back references.  */
2401
2402 static reg_errcode_t
2403 check_subexp_matching_top (mctx, cur_nodes, str_idx)
2404      re_match_context_t *mctx;
2405      re_node_set *cur_nodes;
2406      int str_idx;
2407 {
2408   re_dfa_t *const dfa = mctx->dfa;
2409   int node_idx;
2410   reg_errcode_t err;
2411
2412   /* TODO: This isn't efficient.
2413            Because there might be more than one nodes whose types are
2414            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2415            nodes.
2416            E.g. RE: (a){2}  */
2417   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2418     {
2419       int node = cur_nodes->elems[node_idx];
2420       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2421           && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2422           && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2423         {
2424           err = match_ctx_add_subtop (mctx, node, str_idx);
2425           if (BE (err != REG_NOERROR, 0))
2426             return err;
2427         }
2428     }
2429   return REG_NOERROR;
2430 }
2431
2432 #if 0
2433 /* Return the next state to which the current state STATE will transit by
2434    accepting the current input byte.  */
2435
2436 static re_dfastate_t *
2437 transit_state_sb (err, mctx, state)
2438      reg_errcode_t *err;
2439      re_match_context_t *mctx;
2440      re_dfastate_t *state;
2441 {
2442   re_dfa_t *const dfa = mctx->dfa;
2443   re_node_set next_nodes;
2444   re_dfastate_t *next_state;
2445   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2446   unsigned int context;
2447
2448   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2449   if (BE (*err != REG_NOERROR, 0))
2450     return NULL;
2451   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2452     {
2453       int cur_node = state->nodes.elems[node_cnt];
2454       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2455         {
2456           *err = re_node_set_merge (&next_nodes,
2457                                     dfa->eclosures + dfa->nexts[cur_node]);
2458           if (BE (*err != REG_NOERROR, 0))
2459             {
2460               re_node_set_free (&next_nodes);
2461               return NULL;
2462             }
2463         }
2464     }
2465   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2466   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2467   /* We don't need to check errors here, since the return value of
2468      this function is next_state and ERR is already set.  */
2469
2470   re_node_set_free (&next_nodes);
2471   re_string_skip_bytes (&mctx->input, 1);
2472   return next_state;
2473 }
2474 #endif
2475
2476 #ifdef RE_ENABLE_I18N
2477 static reg_errcode_t
2478 transit_state_mb (mctx, pstate)
2479     re_match_context_t *mctx;
2480     re_dfastate_t *pstate;
2481 {
2482   re_dfa_t *const dfa = mctx->dfa;
2483   reg_errcode_t err;
2484   int i;
2485
2486   for (i = 0; i < pstate->nodes.nelem; ++i)
2487     {
2488       re_node_set dest_nodes, *new_nodes;
2489       int cur_node_idx = pstate->nodes.elems[i];
2490       int naccepted, dest_idx;
2491       unsigned int context;
2492       re_dfastate_t *dest_state;
2493
2494       if (!dfa->nodes[cur_node_idx].accept_mb)
2495         continue;
2496
2497       if (dfa->nodes[cur_node_idx].constraint)
2498         {
2499           context = re_string_context_at (&mctx->input,
2500                                           re_string_cur_idx (&mctx->input),
2501                                           mctx->eflags);
2502           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2503                                            context))
2504             continue;
2505         }
2506
2507       /* How many bytes the node can accept?  */
2508       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2509                                            re_string_cur_idx (&mctx->input));
2510       if (naccepted == 0)
2511         continue;
2512
2513       /* The node can accepts `naccepted' bytes.  */
2514       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2515       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2516                                : mctx->max_mb_elem_len);
2517       err = clean_state_log_if_needed (mctx, dest_idx);
2518       if (BE (err != REG_NOERROR, 0))
2519         return err;
2520 #ifdef DEBUG
2521       assert (dfa->nexts[cur_node_idx] != -1);
2522 #endif
2523       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2524
2525       dest_state = mctx->state_log[dest_idx];
2526       if (dest_state == NULL)
2527         dest_nodes = *new_nodes;
2528       else
2529         {
2530           err = re_node_set_init_union (&dest_nodes,
2531                                         dest_state->entrance_nodes, new_nodes);
2532           if (BE (err != REG_NOERROR, 0))
2533             return err;
2534         }
2535       context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2536       mctx->state_log[dest_idx]
2537         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2538       if (dest_state != NULL)
2539         re_node_set_free (&dest_nodes);
2540       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2541         return err;
2542     }
2543   return REG_NOERROR;
2544 }
2545 #endif /* RE_ENABLE_I18N */
2546
2547 static reg_errcode_t
2548 transit_state_bkref (mctx, nodes)
2549     re_match_context_t *mctx;
2550     const re_node_set *nodes;
2551 {
2552   re_dfa_t *const dfa = mctx->dfa;
2553   reg_errcode_t err;
2554   int i;
2555   int cur_str_idx = re_string_cur_idx (&mctx->input);
2556
2557   for (i = 0; i < nodes->nelem; ++i)
2558     {
2559       int dest_str_idx, prev_nelem, bkc_idx;
2560       int node_idx = nodes->elems[i];
2561       unsigned int context;
2562       const re_token_t *node = dfa->nodes + node_idx;
2563       re_node_set *new_dest_nodes;
2564
2565       /* Check whether `node' is a backreference or not.  */
2566       if (node->type != OP_BACK_REF)
2567         continue;
2568
2569       if (node->constraint)
2570         {
2571           context = re_string_context_at (&mctx->input, cur_str_idx,
2572                                           mctx->eflags);
2573           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2574             continue;
2575         }
2576
2577       /* `node' is a backreference.
2578          Check the substring which the substring matched.  */
2579       bkc_idx = mctx->nbkref_ents;
2580       err = get_subexp (mctx, node_idx, cur_str_idx);
2581       if (BE (err != REG_NOERROR, 0))
2582         goto free_return;
2583
2584       /* And add the epsilon closures (which is `new_dest_nodes') of
2585          the backreference to appropriate state_log.  */
2586 #ifdef DEBUG
2587       assert (dfa->nexts[node_idx] != -1);
2588 #endif
2589       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2590         {
2591           int subexp_len;
2592           re_dfastate_t *dest_state;
2593           struct re_backref_cache_entry *bkref_ent;
2594           bkref_ent = mctx->bkref_ents + bkc_idx;
2595           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2596             continue;
2597           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2598           new_dest_nodes = (subexp_len == 0
2599                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2600                             : dfa->eclosures + dfa->nexts[node_idx]);
2601           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2602                           - bkref_ent->subexp_from);
2603           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2604                                           mctx->eflags);
2605           dest_state = mctx->state_log[dest_str_idx];
2606           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2607                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2608           /* Add `new_dest_node' to state_log.  */
2609           if (dest_state == NULL)
2610             {
2611               mctx->state_log[dest_str_idx]
2612                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2613                                             context);
2614               if (BE (mctx->state_log[dest_str_idx] == NULL
2615                       && err != REG_NOERROR, 0))
2616                 goto free_return;
2617             }
2618           else
2619             {
2620               re_node_set dest_nodes;
2621               err = re_node_set_init_union (&dest_nodes,
2622                                             dest_state->entrance_nodes,
2623                                             new_dest_nodes);
2624               if (BE (err != REG_NOERROR, 0))
2625                 {
2626                   re_node_set_free (&dest_nodes);
2627                   goto free_return;
2628                 }
2629               mctx->state_log[dest_str_idx]
2630                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2631               re_node_set_free (&dest_nodes);
2632               if (BE (mctx->state_log[dest_str_idx] == NULL
2633                       && err != REG_NOERROR, 0))
2634                 goto free_return;
2635             }
2636           /* We need to check recursively if the backreference can epsilon
2637              transit.  */
2638           if (subexp_len == 0
2639               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2640             {
2641               err = check_subexp_matching_top (mctx, new_dest_nodes,
2642                                                cur_str_idx);
2643               if (BE (err != REG_NOERROR, 0))
2644                 goto free_return;
2645               err = transit_state_bkref (mctx, new_dest_nodes);
2646               if (BE (err != REG_NOERROR, 0))
2647                 goto free_return;
2648             }
2649         }
2650     }
2651   err = REG_NOERROR;
2652  free_return:
2653   return err;
2654 }
2655
2656 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2657    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2658    Note that we might collect inappropriate candidates here.
2659    However, the cost of checking them strictly here is too high, then we
2660    delay these checking for prune_impossible_nodes().  */
2661
2662 static reg_errcode_t
2663 get_subexp (mctx, bkref_node, bkref_str_idx)
2664      re_match_context_t *mctx;
2665      int bkref_node, bkref_str_idx;
2666 {
2667   re_dfa_t *const dfa = mctx->dfa;
2668   int subexp_num, sub_top_idx;
2669   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2670   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2671   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2672   if (cache_idx != -1)
2673     {
2674       const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2675       do
2676         if (entry->node == bkref_node)
2677           return REG_NOERROR; /* We already checked it.  */
2678       while (entry++->more);
2679     }
2680
2681   subexp_num = dfa->nodes[bkref_node].opr.idx;
2682
2683   /* For each sub expression  */
2684   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2685     {
2686       reg_errcode_t err;
2687       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2688       re_sub_match_last_t *sub_last;
2689       int sub_last_idx, sl_str, bkref_str_off;
2690
2691       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2692         continue; /* It isn't related.  */
2693
2694       sl_str = sub_top->str_idx;
2695       bkref_str_off = bkref_str_idx;
2696       /* At first, check the last node of sub expressions we already
2697          evaluated.  */
2698       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2699         {
2700           int sl_str_diff;
2701           sub_last = sub_top->lasts[sub_last_idx];
2702           sl_str_diff = sub_last->str_idx - sl_str;
2703           /* The matched string by the sub expression match with the substring
2704              at the back reference?  */
2705           if (sl_str_diff > 0)
2706             {
2707               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2708                 {
2709                   /* Not enough chars for a successful match.  */
2710                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2711                     break;
2712
2713                   err = clean_state_log_if_needed (mctx,
2714                                                    bkref_str_off
2715                                                    + sl_str_diff);
2716                   if (BE (err != REG_NOERROR, 0))
2717                     return err;
2718                   buf = (const char *) re_string_get_buffer (&mctx->input);
2719                 }
2720               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2721                 break; /* We don't need to search this sub expression any more.  */
2722             }
2723           bkref_str_off += sl_str_diff;
2724           sl_str += sl_str_diff;
2725           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2726                                 bkref_str_idx);
2727
2728           /* Reload buf, since the preceding call might have reallocated
2729              the buffer.  */
2730           buf = (const char *) re_string_get_buffer (&mctx->input);
2731
2732           if (err == REG_NOMATCH)
2733             continue;
2734           if (BE (err != REG_NOERROR, 0))
2735             return err;
2736         }
2737
2738       if (sub_last_idx < sub_top->nlasts)
2739         continue;
2740       if (sub_last_idx > 0)
2741         ++sl_str;
2742       /* Then, search for the other last nodes of the sub expression.  */
2743       for (; sl_str <= bkref_str_idx; ++sl_str)
2744         {
2745           int cls_node, sl_str_off;
2746           const re_node_set *nodes;
2747           sl_str_off = sl_str - sub_top->str_idx;
2748           /* The matched string by the sub expression match with the substring
2749              at the back reference?  */
2750           if (sl_str_off > 0)
2751             {
2752               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2753                 {
2754                   /* If we are at the end of the input, we cannot match.  */
2755                   if (bkref_str_off >= mctx->input.len)
2756                     break;
2757
2758                   err = extend_buffers (mctx);
2759                   if (BE (err != REG_NOERROR, 0))
2760                     return err;
2761
2762                   buf = (const char *) re_string_get_buffer (&mctx->input);
2763                 }
2764               if (buf [bkref_str_off++] != buf[sl_str - 1])
2765                 break; /* We don't need to search this sub expression
2766                           any more.  */
2767             }
2768           if (mctx->state_log[sl_str] == NULL)
2769             continue;
2770           /* Does this state have a ')' of the sub expression?  */
2771           nodes = &mctx->state_log[sl_str]->nodes;
2772           cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2773           if (cls_node == -1)
2774             continue; /* No.  */
2775           if (sub_top->path == NULL)
2776             {
2777               sub_top->path = calloc (sizeof (state_array_t),
2778                                       sl_str - sub_top->str_idx + 1);
2779               if (sub_top->path == NULL)
2780                 return REG_ESPACE;
2781             }
2782           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2783              in the current context?  */
2784           err = check_arrival (mctx, sub_top->path, sub_top->node,
2785                                sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2786           if (err == REG_NOMATCH)
2787               continue;
2788           if (BE (err != REG_NOERROR, 0))
2789               return err;
2790           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2791           if (BE (sub_last == NULL, 0))
2792             return REG_ESPACE;
2793           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2794                                 bkref_str_idx);
2795           if (err == REG_NOMATCH)
2796             continue;
2797         }
2798     }
2799   return REG_NOERROR;
2800 }
2801
2802 /* Helper functions for get_subexp().  */
2803
2804 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2805    If it can arrive, register the sub expression expressed with SUB_TOP
2806    and SUB_LAST.  */
2807
2808 static reg_errcode_t
2809 get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2810      re_match_context_t *mctx;
2811      const re_sub_match_top_t *sub_top;
2812      re_sub_match_last_t *sub_last;
2813      int bkref_node, bkref_str;
2814 {
2815   reg_errcode_t err;
2816   int to_idx;
2817   /* Can the subexpression arrive the back reference?  */
2818   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2819                        sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2820   if (err != REG_NOERROR)
2821     return err;
2822   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2823                              sub_last->str_idx);
2824   if (BE (err != REG_NOERROR, 0))
2825     return err;
2826   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2827   return clean_state_log_if_needed (mctx, to_idx);
2828 }
2829
2830 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2831    Search '(' if FL_OPEN, or search ')' otherwise.
2832    TODO: This function isn't efficient...
2833          Because there might be more than one nodes whose types are
2834          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2835          nodes.
2836          E.g. RE: (a){2}  */
2837
2838 static int
2839 find_subexp_node (dfa, nodes, subexp_idx, type)
2840      const re_dfa_t *dfa;
2841      const re_node_set *nodes;
2842      int subexp_idx, type;
2843 {
2844   int cls_idx;
2845   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2846     {
2847       int cls_node = nodes->elems[cls_idx];
2848       const re_token_t *node = dfa->nodes + cls_node;
2849       if (node->type == type
2850           && node->opr.idx == subexp_idx)
2851         return cls_node;
2852     }
2853   return -1;
2854 }
2855
2856 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2857    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2858    heavily reused.
2859    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2860
2861 static reg_errcode_t
2862 check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2863                type)
2864      re_match_context_t *mctx;
2865      state_array_t *path;
2866      int top_node, top_str, last_node, last_str, type;
2867 {
2868   re_dfa_t *const dfa = mctx->dfa;
2869   reg_errcode_t err;
2870   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2871   re_dfastate_t *cur_state = NULL;
2872   re_node_set *cur_nodes, next_nodes;
2873   re_dfastate_t **backup_state_log;
2874   unsigned int context;
2875
2876   subexp_num = dfa->nodes[top_node].opr.idx;
2877   /* Extend the buffer if we need.  */
2878   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2879     {
2880       re_dfastate_t **new_array;
2881       int old_alloc = path->alloc;
2882       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2883       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2884       if (new_array == NULL)
2885         {
2886           path->alloc = old_alloc;
2887           return REG_ESPACE;
2888         }
2889       path->array = new_array;
2890       memset (new_array + old_alloc, '\0',
2891               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2892     }
2893
2894   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2895
2896   /* Temporary modify MCTX.  */
2897   backup_state_log = mctx->state_log;
2898   backup_cur_idx = mctx->input.cur_idx;
2899   mctx->state_log = path->array;
2900   mctx->input.cur_idx = str_idx;
2901
2902   /* Setup initial node set.  */
2903   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2904   if (str_idx == top_str)
2905     {
2906       err = re_node_set_init_1 (&next_nodes, top_node);
2907       if (BE (err != REG_NOERROR, 0))
2908         return err;
2909       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2910       if (BE (err != REG_NOERROR, 0))
2911         {
2912           re_node_set_free (&next_nodes);
2913           return err;
2914         }
2915     }
2916   else
2917     {
2918       cur_state = mctx->state_log[str_idx];
2919       if (cur_state && cur_state->has_backref)
2920         {
2921           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2922           if (BE ( err != REG_NOERROR, 0))
2923             return err;
2924         }
2925       else
2926         re_node_set_init_empty (&next_nodes);
2927     }
2928   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2929     {
2930       if (next_nodes.nelem)
2931         {
2932           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2933                                     subexp_num, type);
2934           if (BE ( err != REG_NOERROR, 0))
2935             {
2936               re_node_set_free (&next_nodes);
2937               return err;
2938             }
2939         }
2940       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2941       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2942         {
2943           re_node_set_free (&next_nodes);
2944           return err;
2945         }
2946       mctx->state_log[str_idx] = cur_state;
2947     }
2948
2949   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2950     {
2951       re_node_set_empty (&next_nodes);
2952       if (mctx->state_log[str_idx + 1])
2953         {
2954           err = re_node_set_merge (&next_nodes,
2955                                    &mctx->state_log[str_idx + 1]->nodes);
2956           if (BE (err != REG_NOERROR, 0))
2957             {
2958               re_node_set_free (&next_nodes);
2959               return err;
2960             }
2961         }
2962       if (cur_state)
2963         {
2964           err = check_arrival_add_next_nodes (mctx, str_idx,
2965                                               &cur_state->non_eps_nodes, &next_nodes);
2966           if (BE (err != REG_NOERROR, 0))
2967             {
2968               re_node_set_free (&next_nodes);
2969               return err;
2970             }
2971         }
2972       ++str_idx;
2973       if (next_nodes.nelem)
2974         {
2975           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2976           if (BE (err != REG_NOERROR, 0))
2977             {
2978               re_node_set_free (&next_nodes);
2979               return err;
2980             }
2981           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2982                                     subexp_num, type);
2983           if (BE ( err != REG_NOERROR, 0))
2984             {
2985               re_node_set_free (&next_nodes);
2986               return err;
2987             }
2988         }
2989       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2990       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2991       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2992         {
2993           re_node_set_free (&next_nodes);
2994           return err;
2995         }
2996       mctx->state_log[str_idx] = cur_state;
2997       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2998     }
2999   re_node_set_free (&next_nodes);
3000   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3001                : &mctx->state_log[last_str]->nodes);
3002   path->next_idx = str_idx;
3003
3004   /* Fix MCTX.  */
3005   mctx->state_log = backup_state_log;
3006   mctx->input.cur_idx = backup_cur_idx;
3007
3008   /* Then check the current node set has the node LAST_NODE.  */
3009   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3010     return REG_NOERROR;
3011
3012   return REG_NOMATCH;
3013 }
3014
3015 /* Helper functions for check_arrival.  */
3016
3017 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3018    to NEXT_NODES.
3019    TODO: This function is similar to the functions transit_state*(),
3020          however this function has many additional works.
3021          Can't we unify them?  */
3022
3023 static reg_errcode_t
3024 check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
3025      re_match_context_t *mctx;
3026      int str_idx;
3027      re_node_set *cur_nodes, *next_nodes;
3028 {
3029   re_dfa_t *const dfa = mctx->dfa;
3030   int result;
3031   int cur_idx;
3032   reg_errcode_t err;
3033   re_node_set union_set;
3034   re_node_set_init_empty (&union_set);
3035   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3036     {
3037       int naccepted = 0;
3038       int cur_node = cur_nodes->elems[cur_idx];
3039 #ifdef DEBUG
3040       re_token_type_t type = dfa->nodes[cur_node].type;
3041       assert (!IS_EPSILON_NODE (type));
3042 #endif
3043 #ifdef RE_ENABLE_I18N
3044       /* If the node may accept `multi byte'.  */
3045       if (dfa->nodes[cur_node].accept_mb)
3046         {
3047           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3048                                                str_idx);
3049           if (naccepted > 1)
3050             {
3051               re_dfastate_t *dest_state;
3052               int next_node = dfa->nexts[cur_node];
3053               int next_idx = str_idx + naccepted;
3054               dest_state = mctx->state_log[next_idx];
3055               re_node_set_empty (&union_set);
3056               if (dest_state)
3057                 {
3058                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3059                   if (BE (err != REG_NOERROR, 0))
3060                     {
3061                       re_node_set_free (&union_set);
3062                       return err;
3063                     }
3064                 }
3065               result = re_node_set_insert (&union_set, next_node);
3066               if (BE (result < 0, 0))
3067                 {
3068                   re_node_set_free (&union_set);
3069                   return REG_ESPACE;
3070                 }
3071               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3072                                                             &union_set);
3073               if (BE (mctx->state_log[next_idx] == NULL
3074                       && err != REG_NOERROR, 0))
3075                 {
3076                   re_node_set_free (&union_set);
3077                   return err;
3078                 }
3079             }
3080         }
3081 #endif /* RE_ENABLE_I18N */
3082       if (naccepted
3083           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3084         {
3085           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3086           if (BE (result < 0, 0))
3087             {
3088               re_node_set_free (&union_set);
3089               return REG_ESPACE;
3090             }
3091         }
3092     }
3093   re_node_set_free (&union_set);
3094   return REG_NOERROR;
3095 }
3096
3097 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3098    CUR_NODES, however exclude the nodes which are:
3099     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3100     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3101 */
3102
3103 static reg_errcode_t
3104 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3105      re_dfa_t *dfa;
3106      re_node_set *cur_nodes;
3107      int ex_subexp, type;
3108 {
3109   reg_errcode_t err;
3110   int idx, outside_node;
3111   re_node_set new_nodes;
3112 #ifdef DEBUG
3113   assert (cur_nodes->nelem);
3114 #endif
3115   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3116   if (BE (err != REG_NOERROR, 0))
3117     return err;
3118   /* Create a new node set NEW_NODES with the nodes which are epsilon
3119      closures of the node in CUR_NODES.  */
3120
3121   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3122     {
3123       int cur_node = cur_nodes->elems[idx];
3124       re_node_set *eclosure = dfa->eclosures + cur_node;
3125       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3126       if (outside_node == -1)
3127         {
3128           /* There are no problematic nodes, just merge them.  */
3129           err = re_node_set_merge (&new_nodes, eclosure);
3130           if (BE (err != REG_NOERROR, 0))
3131             {
3132               re_node_set_free (&new_nodes);
3133               return err;
3134             }
3135         }
3136       else
3137         {
3138           /* There are problematic nodes, re-calculate incrementally.  */
3139           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3140                                               ex_subexp, type);
3141           if (BE (err != REG_NOERROR, 0))
3142             {
3143               re_node_set_free (&new_nodes);
3144               return err;
3145             }
3146         }
3147     }
3148   re_node_set_free (cur_nodes);
3149   *cur_nodes = new_nodes;
3150   return REG_NOERROR;
3151 }
3152
3153 /* Helper function for check_arrival_expand_ecl.
3154    Check incrementally the epsilon closure of TARGET, and if it isn't
3155    problematic append it to DST_NODES.  */
3156
3157 static reg_errcode_t
3158 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3159      re_dfa_t *dfa;
3160      int target, ex_subexp, type;
3161      re_node_set *dst_nodes;
3162 {
3163   int cur_node;
3164   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3165     {
3166       int err;
3167
3168       if (dfa->nodes[cur_node].type == type
3169           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3170         {
3171           if (type == OP_CLOSE_SUBEXP)
3172             {
3173               err = re_node_set_insert (dst_nodes, cur_node);
3174               if (BE (err == -1, 0))
3175                 return REG_ESPACE;
3176             }
3177           break;
3178         }
3179       err = re_node_set_insert (dst_nodes, cur_node);
3180       if (BE (err == -1, 0))
3181         return REG_ESPACE;
3182       if (dfa->edests[cur_node].nelem == 0)
3183         break;
3184       if (dfa->edests[cur_node].nelem == 2)
3185         {
3186           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3187                                               dfa->edests[cur_node].elems[1],
3188                                               ex_subexp, type);
3189           if (BE (err != REG_NOERROR, 0))
3190             return err;
3191         }
3192       cur_node = dfa->edests[cur_node].elems[0];
3193     }
3194   return REG_NOERROR;
3195 }
3196
3197
3198 /* For all the back references in the current state, calculate the
3199    destination of the back references by the appropriate entry
3200    in MCTX->BKREF_ENTS.  */
3201
3202 static reg_errcode_t
3203 expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
3204                     type)
3205      re_match_context_t *mctx;
3206      int cur_str, subexp_num, type;
3207      re_node_set *cur_nodes;
3208 {
3209   re_dfa_t *const dfa = mctx->dfa;
3210   reg_errcode_t err;
3211   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3212   struct re_backref_cache_entry *ent;
3213
3214   if (cache_idx_start == -1)
3215     return REG_NOERROR;
3216
3217  restart:
3218   ent = mctx->bkref_ents + cache_idx_start;
3219   do
3220     {
3221       int to_idx, next_node;
3222
3223       /* Is this entry ENT is appropriate?  */
3224       if (!re_node_set_contains (cur_nodes, ent->node))
3225         continue; /* No.  */
3226
3227       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3228       /* Calculate the destination of the back reference, and append it
3229          to MCTX->STATE_LOG.  */
3230       if (to_idx == cur_str)
3231         {
3232           /* The backreference did epsilon transit, we must re-check all the
3233              node in the current state.  */
3234           re_node_set new_dests;
3235           reg_errcode_t err2, err3;
3236           next_node = dfa->edests[ent->node].elems[0];
3237           if (re_node_set_contains (cur_nodes, next_node))
3238             continue;
3239           err = re_node_set_init_1 (&new_dests, next_node);
3240           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3241           err3 = re_node_set_merge (cur_nodes, &new_dests);
3242           re_node_set_free (&new_dests);
3243           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3244                   || err3 != REG_NOERROR, 0))
3245             {
3246               err = (err != REG_NOERROR ? err
3247                      : (err2 != REG_NOERROR ? err2 : err3));
3248               return err;
3249             }
3250           /* TODO: It is still inefficient...  */
3251           goto restart;
3252         }
3253       else
3254         {
3255           re_node_set union_set;
3256           next_node = dfa->nexts[ent->node];
3257           if (mctx->state_log[to_idx])
3258             {
3259               int ret;
3260               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3261                                         next_node))
3262                 continue;
3263               err = re_node_set_init_copy (&union_set,
3264                                            &mctx->state_log[to_idx]->nodes);
3265               ret = re_node_set_insert (&union_set, next_node);
3266               if (BE (err != REG_NOERROR || ret < 0, 0))
3267                 {
3268                   re_node_set_free (&union_set);
3269                   err = err != REG_NOERROR ? err : REG_ESPACE;
3270                   return err;
3271                 }
3272             }
3273           else
3274             {
3275               err = re_node_set_init_1 (&union_set, next_node);
3276               if (BE (err != REG_NOERROR, 0))
3277                 return err;
3278             }
3279           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3280           re_node_set_free (&union_set);
3281           if (BE (mctx->state_log[to_idx] == NULL
3282                   && err != REG_NOERROR, 0))
3283             return err;
3284         }
3285     }
3286   while (ent++->more);
3287   return REG_NOERROR;
3288 }
3289
3290 /* Build transition table for the state.
3291    Return 1 if succeeded, otherwise return NULL.  */
3292
3293 static int
3294 build_trtable (dfa, state)
3295     re_dfa_t *dfa;
3296     re_dfastate_t *state;
3297 {
3298   reg_errcode_t err;
3299   int i, j, ch, need_word_trtable = 0;
3300   unsigned int elem, mask;
3301   int dests_node_malloced = 0, dest_states_malloced = 0;
3302   int ndests; /* Number of the destination states from `state'.  */
3303   re_dfastate_t **trtable;
3304   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3305   re_node_set follows, *dests_node;
3306   bitset *dests_ch;
3307   bitset acceptable;
3308
3309   /* We build DFA states which corresponds to the destination nodes
3310      from `state'.  `dests_node[i]' represents the nodes which i-th
3311      destination state contains, and `dests_ch[i]' represents the
3312      characters which i-th destination state accepts.  */
3313 #ifdef _LIBC
3314   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3315     dests_node = (re_node_set *)
3316       alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3317   else
3318 #endif
3319     {
3320       dests_node = (re_node_set *)
3321         malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3322       if (BE (dests_node == NULL, 0))
3323         return 0;
3324       dests_node_malloced = 1;
3325     }
3326   dests_ch = (bitset *) (dests_node + SBC_MAX);
3327
3328   /* Initialize transiton table.  */
3329   state->word_trtable = state->trtable = NULL;
3330
3331   /* At first, group all nodes belonging to `state' into several
3332      destinations.  */
3333   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3334   if (BE (ndests <= 0, 0))
3335     {
3336       if (dests_node_malloced)
3337         free (dests_node);
3338       /* Return 0 in case of an error, 1 otherwise.  */
3339       if (ndests == 0)
3340         {
3341           state->trtable = (re_dfastate_t **)
3342             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3343           return 1;
3344         }
3345       return 0;
3346     }
3347
3348   err = re_node_set_alloc (&follows, ndests + 1);
3349   if (BE (err != REG_NOERROR, 0))
3350     goto out_free;
3351
3352 #ifdef _LIBC
3353   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3354                          + ndests * 3 * sizeof (re_dfastate_t *)))
3355     dest_states = (re_dfastate_t **)
3356       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3357   else
3358 #endif
3359     {
3360       dest_states = (re_dfastate_t **)
3361         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3362       if (BE (dest_states == NULL, 0))
3363         {
3364 out_free:
3365           if (dest_states_malloced)
3366             free (dest_states);
3367           re_node_set_free (&follows);
3368           for (i = 0; i < ndests; ++i)
3369             re_node_set_free (dests_node + i);
3370           if (dests_node_malloced)
3371             free (dests_node);
3372           return 0;
3373         }
3374       dest_states_malloced = 1;
3375     }
3376   dest_states_word = dest_states + ndests;
3377   dest_states_nl = dest_states_word + ndests;
3378   bitset_empty (acceptable);
3379
3380   /* Then build the states for all destinations.  */
3381   for (i = 0; i < ndests; ++i)
3382     {
3383       int next_node;
3384       re_node_set_empty (&follows);
3385       /* Merge the follows of this destination states.  */
3386       for (j = 0; j < dests_node[i].nelem; ++j)
3387         {
3388           next_node = dfa->nexts[dests_node[i].elems[j]];
3389           if (next_node != -1)
3390             {
3391               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3392               if (BE (err != REG_NOERROR, 0))
3393                 goto out_free;
3394             }
3395         }
3396       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3397       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3398         goto out_free;
3399       /* If the new state has context constraint,
3400          build appropriate states for these contexts.  */
3401       if (dest_states[i]->has_constraint)
3402         {
3403           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3404                                                           CONTEXT_WORD);
3405           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3406             goto out_free;
3407
3408           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3409             need_word_trtable = 1;
3410
3411           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3412                                                         CONTEXT_NEWLINE);
3413           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3414             goto out_free;
3415         }
3416       else
3417         {
3418           dest_states_word[i] = dest_states[i];
3419           dest_states_nl[i] = dest_states[i];
3420         }
3421       bitset_merge (acceptable, dests_ch[i]);
3422     }
3423
3424   if (!BE (need_word_trtable, 0))
3425     {
3426       /* We don't care about whether the following character is a word
3427          character, or we are in a single-byte character set so we can
3428          discern by looking at the character code: allocate a
3429          256-entry transition table.  */
3430       trtable = state->trtable =
3431         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3432       if (BE (trtable == NULL, 0))
3433         goto out_free;
3434
3435       /* For all characters ch...:  */
3436       for (i = 0; i < BITSET_UINTS; ++i)
3437         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3438              elem;
3439              mask <<= 1, elem >>= 1, ++ch)
3440           if (BE (elem & 1, 0))
3441             {
3442               /* There must be exactly one destination which accepts
3443                  character ch.  See group_nodes_into_DFAstates.  */
3444               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3445                 ;
3446
3447               /* j-th destination accepts the word character ch.  */
3448               if (dfa->word_char[i] & mask)
3449                 trtable[ch] = dest_states_word[j];
3450               else
3451                 trtable[ch] = dest_states[j];
3452             }
3453     }
3454   else
3455     {
3456       /* We care about whether the following character is a word
3457          character, and we are in a multi-byte character set: discern
3458          by looking at the character code: build two 256-entry
3459          transition tables, one starting at trtable[0] and one
3460          starting at trtable[SBC_MAX].  */
3461       trtable = state->word_trtable =
3462         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3463       if (BE (trtable == NULL, 0))
3464         goto out_free;
3465
3466       /* For all characters ch...:  */
3467       for (i = 0; i < BITSET_UINTS; ++i)
3468         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3469              elem;
3470              mask <<= 1, elem >>= 1, ++ch)
3471           if (BE (elem & 1, 0))
3472             {
3473               /* There must be exactly one destination which accepts
3474                  character ch.  See group_nodes_into_DFAstates.  */
3475               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3476                 ;
3477
3478               /* j-th destination accepts the word character ch.  */
3479               trtable[ch] = dest_states[j];
3480               trtable[ch + SBC_MAX] = dest_states_word[j];
3481             }
3482     }
3483
3484   /* new line */
3485   if (bitset_contain (acceptable, NEWLINE_CHAR))
3486     {
3487       /* The current state accepts newline character.  */
3488       for (j = 0; j < ndests; ++j)
3489         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3490           {
3491             /* k-th destination accepts newline character.  */
3492             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3493             if (need_word_trtable)
3494               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3495             /* There must be only one destination which accepts
3496                newline.  See group_nodes_into_DFAstates.  */
3497             break;
3498           }
3499     }
3500
3501   if (dest_states_malloced)
3502     free (dest_states);
3503
3504   re_node_set_free (&follows);
3505   for (i = 0; i < ndests; ++i)
3506     re_node_set_free (dests_node + i);
3507
3508   if (dests_node_malloced)
3509     free (dests_node);
3510
3511   return 1;
3512 }
3513
3514 /* Group all nodes belonging to STATE into several destinations.
3515    Then for all destinations, set the nodes belonging to the destination
3516    to DESTS_NODE[i] and set the characters accepted by the destination
3517    to DEST_CH[i].  This function return the number of destinations.  */
3518
3519 static int
3520 group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3521     re_dfa_t *dfa;
3522     const re_dfastate_t *state;
3523     re_node_set *dests_node;
3524     bitset *dests_ch;
3525 {
3526   reg_errcode_t err;
3527   int result;
3528   int i, j, k;
3529   int ndests; /* Number of the destinations from `state'.  */
3530   bitset accepts; /* Characters a node can accept.  */
3531   const re_node_set *cur_nodes = &state->nodes;
3532   bitset_empty (accepts);
3533   ndests = 0;
3534
3535   /* For all the nodes belonging to `state',  */
3536   for (i = 0; i < cur_nodes->nelem; ++i)
3537     {
3538       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3539       re_token_type_t type = node->type;
3540       unsigned int constraint = node->constraint;
3541
3542       /* Enumerate all single byte character this node can accept.  */
3543       if (type == CHARACTER)
3544         bitset_set (accepts, node->opr.c);
3545       else if (type == SIMPLE_BRACKET)
3546         {
3547           bitset_merge (accepts, node->opr.sbcset);
3548         }
3549       else if (type == OP_PERIOD)
3550         {
3551 #ifdef RE_ENABLE_I18N
3552           if (dfa->mb_cur_max > 1)
3553             bitset_merge (accepts, dfa->sb_char);
3554           else
3555 #endif
3556             bitset_set_all (accepts);
3557           if (!(dfa->syntax & RE_DOT_NEWLINE))
3558             bitset_clear (accepts, '\n');
3559           if (dfa->syntax & RE_DOT_NOT_NULL)
3560             bitset_clear (accepts, '\0');
3561         }
3562 #ifdef RE_ENABLE_I18N
3563       else if (type == OP_UTF8_PERIOD)
3564         {
3565           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3566           if (!(dfa->syntax & RE_DOT_NEWLINE))
3567             bitset_clear (accepts, '\n');
3568           if (dfa->syntax & RE_DOT_NOT_NULL)
3569             bitset_clear (accepts, '\0');
3570         }
3571 #endif
3572       else
3573         continue;
3574
3575       /* Check the `accepts' and sift the characters which are not
3576          match it the context.  */
3577       if (constraint)
3578         {
3579           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3580             {
3581               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3582               bitset_empty (accepts);
3583               if (accepts_newline)
3584                 bitset_set (accepts, NEWLINE_CHAR);
3585               else
3586                 continue;
3587             }
3588           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3589             {
3590               bitset_empty (accepts);
3591               continue;
3592             }
3593
3594           if (constraint & NEXT_WORD_CONSTRAINT)
3595             {
3596               unsigned int any_set = 0;
3597               if (type == CHARACTER && !node->word_char)
3598                 {
3599                   bitset_empty (accepts);
3600                   continue;
3601                 }
3602 #ifdef RE_ENABLE_I18N
3603               if (dfa->mb_cur_max > 1)
3604                 for (j = 0; j < BITSET_UINTS; ++j)
3605                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3606               else
3607 #endif
3608                 for (j = 0; j < BITSET_UINTS; ++j)
3609                   any_set |= (accepts[j] &= dfa->word_char[j]);
3610               if (!any_set)
3611                 continue;
3612             }
3613           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3614             {
3615               unsigned int any_set = 0;
3616               if (type == CHARACTER && node->word_char)
3617                 {
3618                   bitset_empty (accepts);
3619                   continue;
3620                 }
3621 #ifdef RE_ENABLE_I18N
3622               if (dfa->mb_cur_max > 1)
3623                 for (j = 0; j < BITSET_UINTS; ++j)
3624                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3625               else
3626 #endif
3627                 for (j = 0; j < BITSET_UINTS; ++j)
3628                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3629               if (!any_set)
3630                 continue;
3631             }
3632         }
3633
3634       /* Then divide `accepts' into DFA states, or create a new
3635          state.  Above, we make sure that accepts is not empty.  */
3636       for (j = 0; j < ndests; ++j)
3637         {
3638           bitset intersec; /* Intersection sets, see below.  */
3639           bitset remains;
3640           /* Flags, see below.  */
3641           int has_intersec, not_subset, not_consumed;
3642
3643           /* Optimization, skip if this state doesn't accept the character.  */
3644           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3645             continue;
3646
3647           /* Enumerate the intersection set of this state and `accepts'.  */
3648           has_intersec = 0;
3649           for (k = 0; k < BITSET_UINTS; ++k)
3650             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3651           /* And skip if the intersection set is empty.  */
3652           if (!has_intersec)
3653             continue;
3654
3655           /* Then check if this state is a subset of `accepts'.  */
3656           not_subset = not_consumed = 0;
3657           for (k = 0; k < BITSET_UINTS; ++k)
3658             {
3659               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3660               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3661             }
3662
3663           /* If this state isn't a subset of `accepts', create a
3664              new group state, which has the `remains'. */
3665           if (not_subset)
3666             {
3667               bitset_copy (dests_ch[ndests], remains);
3668               bitset_copy (dests_ch[j], intersec);
3669               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3670               if (BE (err != REG_NOERROR, 0))
3671                 goto error_return;
3672               ++ndests;
3673             }
3674
3675           /* Put the position in the current group. */
3676           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3677           if (BE (result < 0, 0))
3678             goto error_return;
3679
3680           /* If all characters are consumed, go to next node. */
3681           if (!not_consumed)
3682             break;
3683         }
3684       /* Some characters remain, create a new group. */
3685       if (j == ndests)
3686         {
3687           bitset_copy (dests_ch[ndests], accepts);
3688           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3689           if (BE (err != REG_NOERROR, 0))
3690             goto error_return;
3691           ++ndests;
3692           bitset_empty (accepts);
3693         }
3694     }
3695   return ndests;
3696  error_return:
3697   for (j = 0; j < ndests; ++j)
3698     re_node_set_free (dests_node + j);
3699   return -1;
3700 }
3701
3702 #ifdef RE_ENABLE_I18N
3703 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3704    Return the number of the bytes the node accepts.
3705    STR_IDX is the current index of the input string.
3706
3707    This function handles the nodes which can accept one character, or
3708    one collating element like '.', '[a-z]', opposite to the other nodes
3709    can only accept one byte.  */
3710
3711 static int
3712 check_node_accept_bytes (dfa, node_idx, input, str_idx)
3713     re_dfa_t *dfa;
3714     int node_idx, str_idx;
3715     const re_string_t *input;
3716 {
3717   const re_token_t *node = dfa->nodes + node_idx;
3718   int char_len, elem_len;
3719   int i;
3720
3721   if (BE (node->type == OP_UTF8_PERIOD, 0))
3722     {
3723       unsigned char c = re_string_byte_at (input, str_idx), d;
3724       if (BE (c < 0xc2, 1))
3725         return 0;
3726
3727       if (str_idx + 2 > input->len)
3728         return 0;
3729
3730       d = re_string_byte_at (input, str_idx + 1);
3731       if (c < 0xe0)
3732         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3733       else if (c < 0xf0)
3734         {
3735           char_len = 3;
3736           if (c == 0xe0 && d < 0xa0)
3737             return 0;
3738         }
3739       else if (c < 0xf8)
3740         {
3741           char_len = 4;
3742           if (c == 0xf0 && d < 0x90)
3743             return 0;
3744         }
3745       else if (c < 0xfc)
3746         {
3747           char_len = 5;
3748           if (c == 0xf8 && d < 0x88)
3749             return 0;
3750         }
3751       else if (c < 0xfe)
3752         {
3753           char_len = 6;
3754           if (c == 0xfc && d < 0x84)
3755             return 0;
3756         }
3757       else
3758         return 0;
3759
3760       if (str_idx + char_len > input->len)
3761         return 0;
3762
3763       for (i = 1; i < char_len; ++i)
3764         {
3765           d = re_string_byte_at (input, str_idx + i);
3766           if (d < 0x80 || d > 0xbf)
3767             return 0;
3768         }
3769       return char_len;
3770     }
3771
3772   char_len = re_string_char_size_at (input, str_idx);
3773   if (node->type == OP_PERIOD)
3774     {
3775       if (char_len <= 1)
3776         return 0;
3777       /* FIXME: I don't think this if is needed, as both '\n'
3778          and '\0' are char_len == 1.  */
3779       /* '.' accepts any one character except the following two cases.  */
3780       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3781            re_string_byte_at (input, str_idx) == '\n') ||
3782           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3783            re_string_byte_at (input, str_idx) == '\0'))
3784         return 0;
3785       return char_len;
3786     }
3787
3788   elem_len = re_string_elem_size_at (input, str_idx);
3789   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3790     return 0;
3791
3792   if (node->type == COMPLEX_BRACKET)
3793     {
3794       const re_charset_t *cset = node->opr.mbcset;
3795 # ifdef _LIBC
3796       const unsigned char *pin
3797         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3798       int j;
3799       uint32_t nrules;
3800 # endif /* _LIBC */
3801       int match_len = 0;
3802       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3803                     ? re_string_wchar_at (input, str_idx) : 0);
3804
3805       /* match with multibyte character?  */
3806       for (i = 0; i < cset->nmbchars; ++i)
3807         if (wc == cset->mbchars[i])
3808           {
3809             match_len = char_len;
3810             goto check_node_accept_bytes_match;
3811           }
3812       /* match with character_class?  */
3813       for (i = 0; i < cset->nchar_classes; ++i)
3814         {
3815           wctype_t wt = cset->char_classes[i];
3816           if (__iswctype (wc, wt))
3817             {
3818               match_len = char_len;
3819               goto check_node_accept_bytes_match;
3820             }
3821         }
3822
3823 # ifdef _LIBC
3824       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3825       if (nrules != 0)
3826         {
3827           unsigned int in_collseq = 0;
3828           const int32_t *table, *indirect;
3829           const unsigned char *weights, *extra;
3830           const char *collseqwc;
3831           int32_t idx;
3832           /* This #include defines a local function!  */
3833 #  include <locale/weight.h>
3834
3835           /* match with collating_symbol?  */
3836           if (cset->ncoll_syms)
3837             extra = (const unsigned char *)
3838               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3839           for (i = 0; i < cset->ncoll_syms; ++i)
3840             {
3841               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3842               /* Compare the length of input collating element and
3843                  the length of current collating element.  */
3844               if (*coll_sym != elem_len)
3845                 continue;
3846               /* Compare each bytes.  */
3847               for (j = 0; j < *coll_sym; j++)
3848                 if (pin[j] != coll_sym[1 + j])
3849                   break;
3850               if (j == *coll_sym)
3851                 {
3852                   /* Match if every bytes is equal.  */
3853                   match_len = j;
3854                   goto check_node_accept_bytes_match;
3855                 }
3856             }
3857
3858           if (cset->nranges)
3859             {
3860               if (elem_len <= char_len)
3861                 {
3862                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3863                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3864                 }
3865               else
3866                 in_collseq = find_collation_sequence_value (pin, elem_len);
3867             }
3868           /* match with range expression?  */
3869           for (i = 0; i < cset->nranges; ++i)
3870             if (cset->range_starts[i] <= in_collseq
3871                 && in_collseq <= cset->range_ends[i])
3872               {
3873                 match_len = elem_len;
3874                 goto check_node_accept_bytes_match;
3875               }
3876
3877           /* match with equivalence_class?  */
3878           if (cset->nequiv_classes)
3879             {
3880               const unsigned char *cp = pin;
3881               table = (const int32_t *)
3882                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3883               weights = (const unsigned char *)
3884                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3885               extra = (const unsigned char *)
3886                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3887               indirect = (const int32_t *)
3888                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3889               idx = findidx (&cp);
3890               if (idx > 0)
3891                 for (i = 0; i < cset->nequiv_classes; ++i)
3892                   {
3893                     int32_t equiv_class_idx = cset->equiv_classes[i];
3894                     size_t weight_len = weights[idx];
3895                     if (weight_len == weights[equiv_class_idx])
3896                       {
3897                         int cnt = 0;
3898                         while (cnt <= weight_len
3899                                && (weights[equiv_class_idx + 1 + cnt]
3900                                    == weights[idx + 1 + cnt]))
3901                           ++cnt;
3902                         if (cnt > weight_len)
3903                           {
3904                             match_len = elem_len;
3905                             goto check_node_accept_bytes_match;
3906                           }
3907                       }
3908                   }
3909             }
3910         }
3911       else
3912 # endif /* _LIBC */
3913         {
3914           /* match with range expression?  */
3915 #if __GNUC__ >= 2
3916           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3917 #else
3918           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3919           cmp_buf[2] = wc;
3920 #endif
3921           for (i = 0; i < cset->nranges; ++i)
3922             {
3923               cmp_buf[0] = cset->range_starts[i];
3924               cmp_buf[4] = cset->range_ends[i];
3925               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3926                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3927                 {
3928                   match_len = char_len;
3929                   goto check_node_accept_bytes_match;
3930                 }
3931             }
3932         }
3933     check_node_accept_bytes_match:
3934       if (!cset->non_match)
3935         return match_len;
3936       else
3937         {
3938           if (match_len > 0)
3939             return 0;
3940           else
3941             return (elem_len > char_len) ? elem_len : char_len;
3942         }
3943     }
3944   return 0;
3945 }
3946
3947 # ifdef _LIBC
3948 static unsigned int
3949 find_collation_sequence_value (mbs, mbs_len)
3950     const unsigned char *mbs;
3951     size_t mbs_len;
3952 {
3953   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3954   if (nrules == 0)
3955     {
3956       if (mbs_len == 1)
3957         {
3958           /* No valid character.  Match it as a single byte character.  */
3959           const unsigned char *collseq = (const unsigned char *)
3960             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3961           return collseq[mbs[0]];
3962         }
3963       return UINT_MAX;
3964     }
3965   else
3966     {
3967       int32_t idx;
3968       const unsigned char *extra = (const unsigned char *)
3969         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3970       int32_t extrasize = (const unsigned char *)
3971         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3972
3973       for (idx = 0; idx < extrasize;)
3974         {
3975           int mbs_cnt, found = 0;
3976           int32_t elem_mbs_len;
3977           /* Skip the name of collating element name.  */
3978           idx = idx + extra[idx] + 1;
3979           elem_mbs_len = extra[idx++];
3980           if (mbs_len == elem_mbs_len)
3981             {
3982               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3983                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3984                   break;
3985               if (mbs_cnt == elem_mbs_len)
3986                 /* Found the entry.  */
3987                 found = 1;
3988             }
3989           /* Skip the byte sequence of the collating element.  */
3990           idx += elem_mbs_len;
3991           /* Adjust for the alignment.  */
3992           idx = (idx + 3) & ~3;
3993           /* Skip the collation sequence value.  */
3994           idx += sizeof (uint32_t);
3995           /* Skip the wide char sequence of the collating element.  */
3996           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3997           /* If we found the entry, return the sequence value.  */
3998           if (found)
3999             return *(uint32_t *) (extra + idx);
4000           /* Skip the collation sequence value.  */
4001           idx += sizeof (uint32_t);
4002         }
4003       return UINT_MAX;
4004     }
4005 }
4006 # endif /* _LIBC */
4007 #endif /* RE_ENABLE_I18N */
4008
4009 /* Check whether the node accepts the byte which is IDX-th
4010    byte of the INPUT.  */
4011
4012 static int
4013 check_node_accept (mctx, node, idx)
4014     const re_match_context_t *mctx;
4015     const re_token_t *node;
4016     int idx;
4017 {
4018   unsigned char ch;
4019   ch = re_string_byte_at (&mctx->input, idx);
4020   switch (node->type)
4021     {
4022     case CHARACTER:
4023       if (node->opr.c != ch)
4024         return 0;
4025       break;
4026
4027     case SIMPLE_BRACKET:
4028       if (!bitset_contain (node->opr.sbcset, ch))
4029         return 0;
4030       break;
4031
4032 #ifdef RE_ENABLE_I18N
4033     case OP_UTF8_PERIOD:
4034       if (ch >= 0x80)
4035         return 0;
4036       /* FALLTHROUGH */
4037 #endif
4038     case OP_PERIOD:
4039       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4040           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4041         return 0;
4042       break;
4043
4044     default:
4045       return 0;
4046     }
4047
4048   if (node->constraint)
4049     {
4050       /* The node has constraints.  Check whether the current context
4051          satisfies the constraints.  */
4052       unsigned int context = re_string_context_at (&mctx->input, idx,
4053                                                    mctx->eflags);
4054       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4055         return 0;
4056     }
4057
4058   return 1;
4059 }
4060
4061 /* Extend the buffers, if the buffers have run out.  */
4062
4063 static reg_errcode_t
4064 extend_buffers (mctx)
4065      re_match_context_t *mctx;
4066 {
4067   reg_errcode_t ret;
4068   re_string_t *pstr = &mctx->input;
4069
4070   /* Double the lengthes of the buffers.  */
4071   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4072   if (BE (ret != REG_NOERROR, 0))
4073     return ret;
4074
4075   if (mctx->state_log != NULL)
4076     {
4077       /* And double the length of state_log.  */
4078       /* XXX We have no indication of the size of this buffer.  If this
4079          allocation fail we have no indication that the state_log array
4080          does not have the right size.  */
4081       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4082                                               pstr->bufs_len + 1);
4083       if (BE (new_array == NULL, 0))
4084         return REG_ESPACE;
4085       mctx->state_log = new_array;
4086     }
4087
4088   /* Then reconstruct the buffers.  */
4089   if (pstr->icase)
4090     {
4091 #ifdef RE_ENABLE_I18N
4092       if (pstr->mb_cur_max > 1)
4093         {
4094           ret = build_wcs_upper_buffer (pstr);
4095           if (BE (ret != REG_NOERROR, 0))
4096             return ret;
4097         }
4098       else
4099 #endif /* RE_ENABLE_I18N  */
4100         build_upper_buffer (pstr);
4101     }
4102   else
4103     {
4104 #ifdef RE_ENABLE_I18N
4105       if (pstr->mb_cur_max > 1)
4106         build_wcs_buffer (pstr);
4107       else
4108 #endif /* RE_ENABLE_I18N  */
4109         {
4110           if (pstr->trans != NULL)
4111             re_string_translate_buffer (pstr);
4112         }
4113     }
4114   return REG_NOERROR;
4115 }
4116
4117 \f
4118 /* Functions for matching context.  */
4119
4120 /* Initialize MCTX.  */
4121
4122 static reg_errcode_t
4123 match_ctx_init (mctx, eflags, n)
4124     re_match_context_t *mctx;
4125     int eflags, n;
4126 {
4127   mctx->eflags = eflags;
4128   mctx->match_last = -1;
4129   if (n > 0)
4130     {
4131       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4132       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4133       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4134         return REG_ESPACE;
4135     }
4136   /* Already zero-ed by the caller.
4137      else
4138        mctx->bkref_ents = NULL;
4139      mctx->nbkref_ents = 0;
4140      mctx->nsub_tops = 0;  */
4141   mctx->abkref_ents = n;
4142   mctx->max_mb_elem_len = 1;
4143   mctx->asub_tops = n;
4144   return REG_NOERROR;
4145 }
4146
4147 /* Clean the entries which depend on the current input in MCTX.
4148    This function must be invoked when the matcher changes the start index
4149    of the input, or changes the input string.  */
4150
4151 static void
4152 match_ctx_clean (mctx)
4153     re_match_context_t *mctx;
4154 {
4155   int st_idx;
4156   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4157     {
4158       int sl_idx;
4159       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4160       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4161         {
4162           re_sub_match_last_t *last = top->lasts[sl_idx];
4163           re_free (last->path.array);
4164           re_free (last);
4165         }
4166       re_free (top->lasts);
4167       if (top->path)
4168         {
4169           re_free (top->path->array);
4170           re_free (top->path);
4171         }
4172       free (top);
4173     }
4174
4175   mctx->nsub_tops = 0;
4176   mctx->nbkref_ents = 0;
4177 }
4178
4179 /* Free all the memory associated with MCTX.  */
4180
4181 static void
4182 match_ctx_free (mctx)
4183     re_match_context_t *mctx;
4184 {
4185   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4186   match_ctx_clean (mctx);
4187   re_free (mctx->sub_tops);
4188   re_free (mctx->bkref_ents);
4189 }
4190
4191 /* Add a new backreference entry to MCTX.
4192    Note that we assume that caller never call this function with duplicate
4193    entry, and call with STR_IDX which isn't smaller than any existing entry.
4194 */
4195
4196 static reg_errcode_t
4197 match_ctx_add_entry (mctx, node, str_idx, from, to)
4198      re_match_context_t *mctx;
4199      int node, str_idx, from, to;
4200 {
4201   if (mctx->nbkref_ents >= mctx->abkref_ents)
4202     {
4203       struct re_backref_cache_entry* new_entry;
4204       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4205                               mctx->abkref_ents * 2);
4206       if (BE (new_entry == NULL, 0))
4207         {
4208           re_free (mctx->bkref_ents);
4209           return REG_ESPACE;
4210         }
4211       mctx->bkref_ents = new_entry;
4212       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4213               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4214       mctx->abkref_ents *= 2;
4215     }
4216   if (mctx->nbkref_ents > 0
4217       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4218     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4219
4220   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4221   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4222   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4223   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4224
4225   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4226      If bit N is clear, means that this entry won't epsilon-transition to
4227      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4228      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4229      such node.
4230
4231      A backreference does not epsilon-transition unless it is empty, so set
4232      to all zeros if FROM != TO.  */
4233   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4234     = (from == to ? ~0 : 0);
4235
4236   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4237   if (mctx->max_mb_elem_len < to - from)
4238     mctx->max_mb_elem_len = to - from;
4239   return REG_NOERROR;
4240 }
4241
4242 /* Search for the first entry which has the same str_idx, or -1 if none is
4243    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4244
4245 static int
4246 search_cur_bkref_entry (mctx, str_idx)
4247      re_match_context_t *mctx;
4248      int str_idx;
4249 {
4250   int left, right, mid, last;
4251   last = right = mctx->nbkref_ents;
4252   for (left = 0; left < right;)
4253     {
4254       mid = (left + right) / 2;
4255       if (mctx->bkref_ents[mid].str_idx < str_idx)
4256         left = mid + 1;
4257       else
4258         right = mid;
4259     }
4260   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4261     return left;
4262   else
4263     return -1;
4264 }
4265
4266 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4267    at STR_IDX.  */
4268
4269 static reg_errcode_t
4270 match_ctx_add_subtop (mctx, node, str_idx)
4271      re_match_context_t *mctx;
4272      int node, str_idx;
4273 {
4274 #ifdef DEBUG
4275   assert (mctx->sub_tops != NULL);
4276   assert (mctx->asub_tops > 0);
4277 #endif
4278   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4279     {
4280       int new_asub_tops = mctx->asub_tops * 2;
4281       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4282                                                    re_sub_match_top_t *,
4283                                                    new_asub_tops);
4284       if (BE (new_array == NULL, 0))
4285         return REG_ESPACE;
4286       mctx->sub_tops = new_array;
4287       mctx->asub_tops = new_asub_tops;
4288     }
4289   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4290   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4291     return REG_ESPACE;
4292   mctx->sub_tops[mctx->nsub_tops]->node = node;
4293   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4294   return REG_NOERROR;
4295 }
4296
4297 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4298    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4299
4300 static re_sub_match_last_t *
4301 match_ctx_add_sublast (subtop, node, str_idx)
4302      re_sub_match_top_t *subtop;
4303      int node, str_idx;
4304 {
4305   re_sub_match_last_t *new_entry;
4306   if (BE (subtop->nlasts == subtop->alasts, 0))
4307     {
4308       int new_alasts = 2 * subtop->alasts + 1;
4309       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4310                                                     re_sub_match_last_t *,
4311                                                     new_alasts);
4312       if (BE (new_array == NULL, 0))
4313         return NULL;
4314       subtop->lasts = new_array;
4315       subtop->alasts = new_alasts;
4316     }
4317   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4318   if (BE (new_entry != NULL, 1))
4319     {
4320       subtop->lasts[subtop->nlasts] = new_entry;
4321       new_entry->node = node;
4322       new_entry->str_idx = str_idx;
4323       ++subtop->nlasts;
4324     }
4325   return new_entry;
4326 }
4327
4328 static void
4329 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx)
4330     re_sift_context_t *sctx;
4331     re_dfastate_t **sifted_sts, **limited_sts;
4332     int last_node, last_str_idx;
4333 {
4334   sctx->sifted_states = sifted_sts;
4335   sctx->limited_states = limited_sts;
4336   sctx->last_node = last_node;
4337   sctx->last_str_idx = last_str_idx;
4338   re_node_set_init_empty (&sctx->limits);
4339 }