regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  21                                      int n) internal_function;
  22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  23 static void match_ctx_free (re_match_context_t *cache) internal_function;
  24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  25                                           int str_idx, int from, int to)
  26      internal_function;
  27 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  28      internal_function;
  29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  30                                            int str_idx) internal_function;
  31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  32                                                    int node, int str_idx)
  33      internal_function;
  34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  35                            re_dfastate_t **limited_sts, int last_node,
  36                            int last_str_idx)
  37      internal_function;
  38 static reg_errcode_t re_search_internal (const regex_t *preg,
  39                                          const char *string, int length,
  40                                          int start, int range, int stop,
  41                                          size_t nmatch, regmatch_t pmatch[],
  42                                          int eflags) internal_function;
  43 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  44                              const char *string1, int length1,
  45                              const char *string2, int length2,
  46                              int start, int range, struct re_registers *regs,
  47                              int stop, int ret_len) internal_function;
  48 static int re_search_stub (struct re_pattern_buffer *bufp,
  49                            const char *string, int length, int start,
  50                            int range, int stop, struct re_registers *regs,
  51                            int ret_len) internal_function;
  52 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  53                               int nregs, int regs_allocated) internal_function;
  54 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  55      internal_function;
  56 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  57                            int *p_match_first) internal_function;
  58 static int check_halt_state_context (const re_match_context_t *mctx,
  59                                      const re_dfastate_t *state, int idx)
  60      internal_function;
  61 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  62                          regmatch_t *prev_idx_match, int cur_node,
  63                          int cur_idx, int nmatch) internal_function;
  64 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  65                                       int str_idx, int dest_node, int nregs,
  66                                       regmatch_t *regs,
  67                                       re_node_set *eps_via_nodes)
  68      internal_function;
  69 static reg_errcode_t set_regs (const regex_t *preg,
  70                                const re_match_context_t *mctx,
  71                                size_t nmatch, regmatch_t *pmatch,
  72                                int fl_backtrack) internal_function;
  73 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  74      internal_function;
  75
  76 #ifdef RE_ENABLE_I18N
  77 static int sift_states_iter_mb (const re_match_context_t *mctx,
  78                                 re_sift_context_t *sctx,
  79                                 int node_idx, int str_idx, int max_str_idx)
  80      internal_function;
  81 #endif /* RE_ENABLE_I18N */
  82 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  83                                            re_sift_context_t *sctx)
  84      internal_function;
  85 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  86                                           re_sift_context_t *sctx, int str_idx,
  87                                           re_node_set *cur_dest)
  88      internal_function;
  89 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  90                                               re_sift_context_t *sctx,
  91                                               int str_idx,
  92                                               re_node_set *dest_nodes)
  93      internal_function;
  94 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  95                                             re_node_set *dest_nodes,
  96                                             const re_node_set *candidates)
  97      internal_function;
  98 static int check_dst_limits (const re_match_context_t *mctx,
  99                              re_node_set *limits,
 100                              int dst_node, int dst_idx, int src_node,
 101                              int src_idx) internal_function;
 102 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 103                                         int boundaries, int subexp_idx,
 104                                         int from_node, int bkref_idx)
 105      internal_function;
 106 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 107                                       int limit, int subexp_idx,
 108                                       int node, int str_idx,
 109                                       int bkref_idx) internal_function;
 110 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 111                                           re_node_set *dest_nodes,
 112                                           const re_node_set *candidates,
 113                                           re_node_set *limits,
 114                                           struct re_backref_cache_entry *bkref_ents,
 115                                           int str_idx) internal_function;
 116 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 117                                         re_sift_context_t *sctx,
 118                                         int str_idx, const re_node_set *candidates)
 119      internal_function;
 120 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 121                                         re_dfastate_t **dst,
 122                                         re_dfastate_t **src, int num)
 123      internal_function;
 124 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 125                                          re_match_context_t *mctx) internal_function;
 126 static re_dfastate_t *transit_state (reg_errcode_t *err,
 127                                      re_match_context_t *mctx,
 128                                      re_dfastate_t *state) internal_function;
 129 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 130                                             re_match_context_t *mctx,
 131                                             re_dfastate_t *next_state)
 132      internal_function;
 133 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 134                                                 re_node_set *cur_nodes,
 135                                                 int str_idx) internal_function;
 136 #if 0
 137 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 138                                         re_match_context_t *mctx,
 139                                         re_dfastate_t *pstate)
 140      internal_function;
 141 #endif
 142 #ifdef RE_ENABLE_I18N
 143 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 144                                        re_dfastate_t *pstate)
 145      internal_function;
 146 #endif /* RE_ENABLE_I18N */
 147 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 148                                           const re_node_set *nodes)
 149      internal_function;
 150 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 151                                  int bkref_node, int bkref_str_idx)
 152      internal_function;
 153 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 154                                      const re_sub_match_top_t *sub_top,
 155                                      re_sub_match_last_t *sub_last,
 156                                      int bkref_node, int bkref_str)
 157      internal_function;
 158 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 159                              int subexp_idx, int type) internal_function;
 160 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 161                                     state_array_t *path, int top_node,
 162                                     int top_str, int last_node, int last_str,
 163                                     int type) internal_function;
 164 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 165                                                    int str_idx,
 166                                                    re_node_set *cur_nodes,
 167                                                    re_node_set *next_nodes)
 168      internal_function;
 169 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 170                                                re_node_set *cur_nodes,
 171                                                int ex_subexp, int type)
 172      internal_function;
 173 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 174                                                    re_node_set *dst_nodes,
 175                                                    int target, int ex_subexp,
 176                                                    int type) internal_function;
 177 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 178                                          re_node_set *cur_nodes, int cur_str,
 179                                          int subexp_num, int type)
 180      internal_function;
 181 static int build_trtable (const re_dfa_t *dfa,
 182                           re_dfastate_t *state) internal_function;
 183 #ifdef RE_ENABLE_I18N
 184 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 185                                     const re_string_t *input, int idx)
 186      internal_function;
 187 # ifdef _LIBC
 188 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 189                                                    size_t name_len)
 190      internal_function;
 191 # endif /* _LIBC */
 192 #endif /* RE_ENABLE_I18N */
 193 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 194                                        const re_dfastate_t *state,
 195                                        re_node_set *states_node,
 196                                        bitset_t *states_ch) internal_function;
 197 static int check_node_accept (const re_match_context_t *mctx,
 198                               const re_token_t *node, int idx)
 199      internal_function;
 200 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len)
 201      internal_function;
 202
 203 #ifdef GAWK
 204 #undef MIN      /* safety */
 205 static int
 206 MIN(size_t a, size_t b)
 207 {
 208         return (a < b ? a : b);
 209 }
 210 #endif
 211 \f
 212 /* Entry point for POSIX code.  */
 213
 214 /* regexec searches for a given pattern, specified by PREG, in the
 215    string STRING.
 216
 217    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 218    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 219    least NMATCH elements, and we set them to the offsets of the
 220    corresponding matched substrings.
 221
 222    EFLAGS specifies `execution flags' which affect matching: if
 223    REG_NOTBOL is set, then ^ does not match at the beginning of the
 224    string; if REG_NOTEOL is set, then $ does not match at the end.
 225
 226    We return 0 if we find a match and REG_NOMATCH if not.  */
 227
 228 int
 229 regexec (preg, string, nmatch, pmatch, eflags)
 230     const regex_t *__restrict preg;
 231     const char *__restrict string;
 232     size_t nmatch;
 233     regmatch_t pmatch[];
 234     int eflags;
 235 {
 236   reg_errcode_t err;
 237   int start, length;
 238
 239   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 240     return REG_BADPAT;
 241
 242   if (eflags & REG_STARTEND)
 243     {
 244       start = pmatch[0].rm_so;
 245       length = pmatch[0].rm_eo;
 246     }
 247   else
 248     {
 249       start = 0;
 250       length = strlen (string);
 251     }
 252
 253   __libc_lock_lock (dfa->lock);
 254   if (preg->no_sub)
 255     err = re_search_internal (preg, string, length, start, length - start,
 256                               length, 0, NULL, eflags);
 257   else
 258     err = re_search_internal (preg, string, length, start, length - start,
 259                               length, nmatch, pmatch, eflags);
 260   __libc_lock_unlock (dfa->lock);
 261   return err != REG_NOERROR;
 262 }
 263
 264 #ifdef _LIBC
 265 # include <shlib-compat.h>
 266 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 267
 268 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 269 __typeof__ (__regexec) __compat_regexec;
 270
 271 int
 272 attribute_compat_text_section
 273 __compat_regexec (const regex_t *__restrict preg,
 274                   const char *__restrict string, size_t nmatch,
 275                   regmatch_t pmatch[], int eflags)
 276 {
 277   return regexec (preg, string, nmatch, pmatch,
 278                   eflags & (REG_NOTBOL | REG_NOTEOL));
 279 }
 280 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 281 # endif
 282 #endif
 283
 284 /* Entry points for GNU code.  */
 285
 286 /* re_match, re_search, re_match_2, re_search_2
 287
 288    The former two functions operate on STRING with length LENGTH,
 289    while the later two operate on concatenation of STRING1 and STRING2
 290    with lengths LENGTH1 and LENGTH2, respectively.
 291
 292    re_match() matches the compiled pattern in BUFP against the string,
 293    starting at index START.
 294
 295    re_search() first tries matching at index START, then it tries to match
 296    starting from index START + 1, and so on.  The last start position tried
 297    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 298    way as re_match().)
 299
 300    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 301    the first STOP characters of the concatenation of the strings should be
 302    concerned.
 303
 304    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 305    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 306    computed relative to the concatenation, not relative to the individual
 307    strings.)
 308
 309    On success, re_match* functions return the length of the match, re_search*
 310    return the position of the start of the match.  Return value -1 means no
 311    match was found and -2 indicates an internal error.  */
 312
 313 int
 314 re_match (bufp, string, length, start, regs)
 315     struct re_pattern_buffer *bufp;
 316     const char *string;
 317     int length, start;
 318     struct re_registers *regs;
 319 {
 320   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 321 }
 322 #ifdef _LIBC
 323 weak_alias (__re_match, re_match)
 324 #endif
 325
 326 int
 327 re_search (bufp, string, length, start, range, regs)
 328     struct re_pattern_buffer *bufp;
 329     const char *string;
 330     int length, start, range;
 331     struct re_registers *regs;
 332 {
 333   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 334 }
 335 #ifdef _LIBC
 336 weak_alias (__re_search, re_search)
 337 #endif
 338
 339 int
 340 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 341     struct re_pattern_buffer *bufp;
 342     const char *string1, *string2;
 343     int length1, length2, start, stop;
 344     struct re_registers *regs;
 345 {
 346   return re_search_2_stub (bufp, string1, length1, string2, length2,
 347                            start, 0, regs, stop, 1);
 348 }
 349 #ifdef _LIBC
 350 weak_alias (__re_match_2, re_match_2)
 351 #endif
 352
 353 int
 354 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 355     struct re_pattern_buffer *bufp;
 356     const char *string1, *string2;
 357     int length1, length2, start, range, stop;
 358     struct re_registers *regs;
 359 {
 360   return re_search_2_stub (bufp, string1, length1, string2, length2,
 361                            start, range, regs, stop, 0);
 362 }
 363 #ifdef _LIBC
 364 weak_alias (__re_search_2, re_search_2)
 365 #endif
 366
 367 static int
 368 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 369                   stop, ret_len)
 370     struct re_pattern_buffer *bufp;
 371     const char *string1, *string2;
 372     int length1, length2, start, range, stop, ret_len;
 373     struct re_registers *regs;
 374 {
 375   const char *str;
 376   int rval;
 377   int len = length1 + length2;
 378   char *s = NULL;
 379
 380   if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
 381     return -2;
 382
 383   /* Concatenate the strings.  */
 384   if (length2 > 0)
 385     if (length1 > 0)
 386       {
 387         s = re_malloc (char, len);
 388
 389         if (BE (s == NULL, 0))
 390           return -2;
 391 #ifdef _LIBC
 392         memcpy (__mempcpy (s, string1, length1), string2, length2);
 393 #else
 394         memcpy (s, string1, length1);
 395         memcpy (s + length1, string2, length2);
 396 #endif
 397         str = s;
 398       }
 399     else
 400       str = string2;
 401   else
 402     str = string1;
 403
 404   rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
 405   re_free (s);
 406   return rval;
 407 }
 408
 409 /* The parameters have the same meaning as those of re_search.
 410    Additional parameters:
 411    If RET_LEN is nonzero the length of the match is returned (re_match style);
 412    otherwise the position of the match is returned.  */
 413
 414 static int
 415 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 416     struct re_pattern_buffer *bufp;
 417     const char *string;
 418     int length, start, range, stop, ret_len;
 419     struct re_registers *regs;
 420 {
 421   reg_errcode_t result;
 422   regmatch_t *pmatch;
 423   int nregs, rval;
 424   int eflags = 0;
 425
 426   /* Check for out-of-range.  */
 427   if (BE (start < 0 || start > length, 0))
 428     return -1;
 429   if (BE (start + range > length, 0))
 430     range = length - start;
 431   else if (BE (start + range < 0, 0))
 432     range = -start;
 433
 434   __libc_lock_lock (dfa->lock);
 435
 436   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 437   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 438
 439   /* Compile fastmap if we haven't yet.  */
 440   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 441     re_compile_fastmap (bufp);
 442
 443   if (BE (bufp->no_sub, 0))
 444     regs = NULL;
 445
 446   /* We need at least 1 register.  */
 447   if (regs == NULL)
 448     nregs = 1;
 449   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 450                regs->num_regs < bufp->re_nsub + 1, 0))
 451     {
 452       nregs = regs->num_regs;
 453       if (BE (nregs < 1, 0))
 454         {
 455           /* Nothing can be copied to regs.  */
 456           regs = NULL;
 457           nregs = 1;
 458         }
 459     }
 460   else
 461     nregs = bufp->re_nsub + 1;
 462   pmatch = re_malloc (regmatch_t, nregs);
 463   if (BE (pmatch == NULL, 0))
 464     {
 465       rval = -2;
 466       goto out;
 467     }
 468
 469   result = re_search_internal (bufp, string, length, start, range, stop,
 470                                nregs, pmatch, eflags);
 471
 472   rval = 0;
 473
 474   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 475   if (result != REG_NOERROR)
 476     rval = -1;
 477   else if (regs != NULL)
 478     {
 479       /* If caller wants register contents data back, copy them.  */
 480       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 481                                            bufp->regs_allocated);
 482       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 483         rval = -2;
 484     }
 485
 486   if (BE (rval == 0, 1))
 487     {
 488       if (ret_len)
 489         {
 490           assert (pmatch[0].rm_so == start);
 491           rval = pmatch[0].rm_eo - start;
 492         }
 493       else
 494         rval = pmatch[0].rm_so;
 495     }
 496   re_free (pmatch);
 497  out:
 498   __libc_lock_unlock (dfa->lock);
 499   return rval;
 500 }
 501
 502 static unsigned
 503 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 504     struct re_registers *regs;
 505     regmatch_t *pmatch;
 506     int nregs, regs_allocated;
 507 {
 508   int rval = REGS_REALLOCATE;
 509   int i;
 510   int need_regs = nregs + 1;
 511   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 512      uses.  */
 513
 514   /* Have the register data arrays been allocated?  */
 515   if (regs_allocated == REGS_UNALLOCATED)
 516     { /* No.  So allocate them with malloc.  */
 517       regs->start = re_malloc (regoff_t, need_regs);
 518       if (BE (regs->start == NULL, 0))
 519         return REGS_UNALLOCATED;
 520       regs->end = re_malloc (regoff_t, need_regs);
 521       if (BE (regs->end == NULL, 0))
 522         {
 523           re_free (regs->start);
 524           return REGS_UNALLOCATED;
 525         }
 526       regs->num_regs = need_regs;
 527     }
 528   else if (regs_allocated == REGS_REALLOCATE)
 529     { /* Yes.  If we need more elements than were already
 530          allocated, reallocate them.  If we need fewer, just
 531          leave it alone.  */
 532       if (BE (need_regs > regs->num_regs, 0))
 533         {
 534           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 535           regoff_t *new_end;
 536           if (BE (new_start == NULL, 0))
 537             return REGS_UNALLOCATED;
 538           new_end = re_realloc (regs->end, regoff_t, need_regs);
 539           if (BE (new_end == NULL, 0))
 540             {
 541               re_free (new_start);
 542               return REGS_UNALLOCATED;
 543             }
 544           regs->start = new_start;
 545           regs->end = new_end;
 546           regs->num_regs = need_regs;
 547         }
 548     }
 549   else
 550     {
 551       assert (regs_allocated == REGS_FIXED);
 552       /* This function may not be called with REGS_FIXED and nregs too big.  */
 553       assert (regs->num_regs >= nregs);
 554       rval = REGS_FIXED;
 555     }
 556
 557   /* Copy the regs.  */
 558   for (i = 0; i < nregs; ++i)
 559     {
 560       regs->start[i] = pmatch[i].rm_so;
 561       regs->end[i] = pmatch[i].rm_eo;
 562     }
 563   for ( ; i < regs->num_regs; ++i)
 564     regs->start[i] = regs->end[i] = -1;
 565
 566   return rval;
 567 }
 568
 569 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 570    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 571    this memory for recording register information.  STARTS and ENDS
 572    must be allocated using the malloc library routine, and must each
 573    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 574
 575    If NUM_REGS == 0, then subsequent matches should allocate their own
 576    register data.
 577
 578    Unless this function is called, the first search or match using
 579    PATTERN_BUFFER will allocate its own register data, without
 580    freeing the old data.  */
 581
 582 void
 583 re_set_registers (bufp, regs, num_regs, starts, ends)
 584     struct re_pattern_buffer *bufp;
 585     struct re_registers *regs;
 586     unsigned num_regs;
 587     regoff_t *starts, *ends;
 588 {
 589   if (num_regs)
 590     {
 591       bufp->regs_allocated = REGS_REALLOCATE;
 592       regs->num_regs = num_regs;
 593       regs->start = starts;
 594       regs->end = ends;
 595     }
 596   else
 597     {
 598       bufp->regs_allocated = REGS_UNALLOCATED;
 599       regs->num_regs = 0;
 600       regs->start = regs->end = (regoff_t *) 0;
 601     }
 602 }
 603 #ifdef _LIBC
 604 weak_alias (__re_set_registers, re_set_registers)
 605 #endif
 606 \f
 607 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 608    them unless specifically requested.  */
 609
 610 #if defined _REGEX_RE_COMP || defined _LIBC
 611 int
 612 # ifdef _LIBC
 613 weak_function
 614 # endif
 615 re_exec (s)
 616      const char *s;
 617 {
 618   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 619 }
 620 #endif /* _REGEX_RE_COMP */
 621 \f
 622 /* Internal entry point.  */
 623
 624 /* Searches for a compiled pattern PREG in the string STRING, whose
 625    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 626    mingings with regexec.  START, and RANGE have the same meanings
 627    with re_search.
 628    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 629    otherwise return the error code.
 630    Note: We assume front end functions already check ranges.
 631    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 632
 633 static reg_errcode_t
 634 __attribute_warn_unused_result__
 635 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 636                     eflags)
 637     const regex_t *preg;
 638     const char *string;
 639     int length, start, range, stop, eflags;
 640     size_t nmatch;
 641     regmatch_t pmatch[];
 642 {
 643   reg_errcode_t err;
 644   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 645   int left_lim, right_lim, incr;
 646   int fl_longest_match, match_first, match_kind, match_last = -1;
 647   int extra_nmatch;
 648   int sb, ch;
 649 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 650   re_match_context_t mctx = { .dfa = dfa };
 651 #else
 652   re_match_context_t mctx;
 653 #endif
 654   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 655                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 656   RE_TRANSLATE_TYPE t = preg->translate;
 657
 658 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 659   memset (&mctx, '\0', sizeof (re_match_context_t));
 660   mctx.dfa = dfa;
 661 #endif
 662
 663   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 664   nmatch -= extra_nmatch;
 665
 666   /* Check if the DFA haven't been compiled.  */
 667   if (BE (preg->used == 0 || dfa == NULL || dfa->init_state == NULL
 668           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 669           || dfa->init_state_begbuf == NULL, 0))
 670     return REG_NOMATCH;
 671
 672 #ifdef DEBUG
 673   /* We assume front-end functions already check them.  */
 674   assert (start + range >= 0 && start + range <= length);
 675 #endif
 676
 677   /* If initial states with non-begbuf contexts have no elements,
 678      the regex must be anchored.  If preg->newline_anchor is set,
 679      we'll never use init_state_nl, so do not check it.  */
 680   if (dfa->init_state->nodes.nelem == 0
 681       && dfa->init_state_word->nodes.nelem == 0
 682       && (dfa->init_state_nl->nodes.nelem == 0
 683           || !preg->newline_anchor))
 684     {
 685       if (start != 0 && start + range != 0)
 686         return REG_NOMATCH;
 687       start = range = 0;
 688     }
 689
 690   /* We must check the longest matching, if nmatch > 0.  */
 691   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 692
 693   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 694                             preg->translate, preg->syntax & RE_ICASE, dfa);
 695   if (BE (err != REG_NOERROR, 0))
 696     goto free_return;
 697   mctx.input.stop = stop;
 698   mctx.input.raw_stop = stop;
 699   mctx.input.newline_anchor = preg->newline_anchor;
 700
 701   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 702   if (BE (err != REG_NOERROR, 0))
 703     goto free_return;
 704
 705   /* We will log all the DFA states through which the dfa pass,
 706      if nmatch > 1, or this dfa has "multibyte node", which is a
 707      back-reference or a node which can accept multibyte character or
 708      multi character collating element.  */
 709   if (nmatch > 1 || dfa->has_mb_node)
 710     {
 711       /* Avoid overflow.  */
 712       if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
 713         {
 714           err = REG_ESPACE;
 715           goto free_return;
 716         }
 717
 718       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 719       if (BE (mctx.state_log == NULL, 0))
 720         {
 721           err = REG_ESPACE;
 722           goto free_return;
 723         }
 724     }
 725   else
 726     mctx.state_log = NULL;
 727
 728   match_first = start;
 729   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 730                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 731
 732   /* Check incrementally whether of not the input string match.  */
 733   incr = (range < 0) ? -1 : 1;
 734   left_lim = (range < 0) ? start + range : start;
 735   right_lim = (range < 0) ? start : start + range;
 736   sb = dfa->mb_cur_max == 1;
 737   match_kind =
 738     (fastmap
 739      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 740         | (range >= 0 ? 2 : 0)
 741         | (t != NULL ? 1 : 0))
 742      : 8);
 743
 744   for (;; match_first += incr)
 745     {
 746       err = REG_NOMATCH;
 747       if (match_first < left_lim || right_lim < match_first)
 748         goto free_return;
 749
 750       /* Advance as rapidly as possible through the string, until we
 751          find a plausible place to start matching.  This may be done
 752          with varying efficiency, so there are various possibilities:
 753          only the most common of them are specialized, in order to
 754          save on code size.  We use a switch statement for speed.  */
 755       switch (match_kind)
 756         {
 757         case 8:
 758           /* No fastmap.  */
 759           break;
 760
 761         case 7:
 762           /* Fastmap with single-byte translation, match forward.  */
 763           while (BE (match_first < right_lim, 1)
 764                  && !fastmap[t[(unsigned char) string[match_first]]])
 765             ++match_first;
 766           goto forward_match_found_start_or_reached_end;
 767
 768         case 6:
 769           /* Fastmap without translation, match forward.  */
 770           while (BE (match_first < right_lim, 1)
 771                  && !fastmap[(unsigned char) string[match_first]])
 772             ++match_first;
 773
 774         forward_match_found_start_or_reached_end:
 775           if (BE (match_first == right_lim, 0))
 776             {
 777               ch = match_first >= length
 778                        ? 0 : (unsigned char) string[match_first];
 779               if (!fastmap[t ? t[ch] : ch])
 780                 goto free_return;
 781             }
 782           break;
 783
 784         case 4:
 785         case 5:
 786           /* Fastmap without multi-byte translation, match backwards.  */
 787           while (match_first >= left_lim)
 788             {
 789               ch = match_first >= length
 790                        ? 0 : (unsigned char) string[match_first];
 791               if (fastmap[t ? t[ch] : ch])
 792                 break;
 793               --match_first;
 794             }
 795           if (match_first < left_lim)
 796             goto free_return;
 797           break;
 798
 799         default:
 800           /* In this case, we can't determine easily the current byte,
 801              since it might be a component byte of a multibyte
 802              character.  Then we use the constructed buffer instead.  */
 803           for (;;)
 804             {
 805               /* If MATCH_FIRST is out of the valid range, reconstruct the
 806                  buffers.  */
 807               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 808               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 809                 {
 810                   err = re_string_reconstruct (&mctx.input, match_first,
 811                                                eflags);
 812                   if (BE (err != REG_NOERROR, 0))
 813                     goto free_return;
 814
 815                   offset = match_first - mctx.input.raw_mbs_idx;
 816                 }
 817               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 818                  Note that MATCH_FIRST must not be smaller than 0.  */
 819               ch = (match_first >= length
 820                     ? 0 : re_string_byte_at (&mctx.input, offset));
 821               if (fastmap[ch])
 822                 break;
 823               match_first += incr;
 824               if (match_first < left_lim || match_first > right_lim)
 825                 {
 826                   err = REG_NOMATCH;
 827                   goto free_return;
 828                 }
 829             }
 830           break;
 831         }
 832
 833       /* Reconstruct the buffers so that the matcher can assume that
 834          the matching starts from the beginning of the buffer.  */
 835       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 836       if (BE (err != REG_NOERROR, 0))
 837         goto free_return;
 838
 839 #ifdef RE_ENABLE_I18N
 840      /* Don't consider this char as a possible match start if it part,
 841         yet isn't the head, of a multibyte character.  */
 842       if (!sb && !re_string_first_byte (&mctx.input, 0))
 843         continue;
 844 #endif
 845
 846       /* It seems to be appropriate one, then use the matcher.  */
 847       /* We assume that the matching starts from 0.  */
 848       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 849       match_last = check_matching (&mctx, fl_longest_match,
 850                                    range >= 0 ? &match_first : NULL);
 851       if (match_last != -1)
 852         {
 853           if (BE (match_last == -2, 0))
 854             {
 855               err = REG_ESPACE;
 856               goto free_return;
 857             }
 858           else
 859             {
 860               mctx.match_last = match_last;
 861               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 862                 {
 863                   re_dfastate_t *pstate = mctx.state_log[match_last];
 864                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 865                                                              match_last);
 866                 }
 867               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 868                   || dfa->nbackref)
 869                 {
 870                   err = prune_impossible_nodes (&mctx);
 871                   if (err == REG_NOERROR)
 872                     break;
 873                   if (BE (err != REG_NOMATCH, 0))
 874                     goto free_return;
 875                   match_last = -1;
 876                 }
 877               else
 878                 break; /* We found a match.  */
 879             }
 880         }
 881
 882       match_ctx_clean (&mctx);
 883     }
 884
 885 #ifdef DEBUG
 886   assert (match_last != -1);
 887   assert (err == REG_NOERROR);
 888 #endif
 889
 890   /* Set pmatch[] if we need.  */
 891   if (nmatch > 0)
 892     {
 893       int reg_idx;
 894
 895       /* Initialize registers.  */
 896       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 897         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 898
 899       /* Set the points where matching start/end.  */
 900       pmatch[0].rm_so = 0;
 901       pmatch[0].rm_eo = mctx.match_last;
 902
 903       if (!preg->no_sub && nmatch > 1)
 904         {
 905           err = set_regs (preg, &mctx, nmatch, pmatch,
 906                           dfa->has_plural_match && dfa->nbackref > 0);
 907           if (BE (err != REG_NOERROR, 0))
 908             goto free_return;
 909         }
 910
 911       /* At last, add the offset to the each registers, since we slided
 912          the buffers so that we could assume that the matching starts
 913          from 0.  */
 914       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 915         if (pmatch[reg_idx].rm_so != -1)
 916           {
 917 #ifdef RE_ENABLE_I18N
 918             if (BE (mctx.input.offsets_needed != 0, 0))
 919               {
 920                 pmatch[reg_idx].rm_so =
 921                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 922                    ? mctx.input.valid_raw_len
 923                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 924                 pmatch[reg_idx].rm_eo =
 925                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 926                    ? mctx.input.valid_raw_len
 927                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 928               }
 929 #else
 930             assert (mctx.input.offsets_needed == 0);
 931 #endif
 932             pmatch[reg_idx].rm_so += match_first;
 933             pmatch[reg_idx].rm_eo += match_first;
 934           }
 935       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 936         {
 937           pmatch[nmatch + reg_idx].rm_so = -1;
 938           pmatch[nmatch + reg_idx].rm_eo = -1;
 939         }
 940
 941       if (dfa->subexp_map)
 942         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 943           if (dfa->subexp_map[reg_idx] != reg_idx)
 944             {
 945               pmatch[reg_idx + 1].rm_so
 946                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 947               pmatch[reg_idx + 1].rm_eo
 948                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 949             }
 950     }
 951
 952  free_return:
 953   re_free (mctx.state_log);
 954   if (dfa->nbackref)
 955     match_ctx_free (&mctx);
 956   re_string_destruct (&mctx.input);
 957   return err;
 958 }
 959
 960 static reg_errcode_t
 961 __attribute_warn_unused_result__
 962 prune_impossible_nodes (mctx)
 963      re_match_context_t *mctx;
 964 {
 965   const re_dfa_t *const dfa = mctx->dfa;
 966   int halt_node, match_last;
 967   reg_errcode_t ret;
 968   re_dfastate_t **sifted_states;
 969   re_dfastate_t **lim_states = NULL;
 970   re_sift_context_t sctx;
 971 #ifdef DEBUG
 972   assert (mctx->state_log != NULL);
 973 #endif
 974   match_last = mctx->match_last;
 975   halt_node = mctx->last_node;
 976
 977   /* Avoid overflow.  */
 978   if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
 979     return REG_ESPACE;
 980
 981   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 982   if (BE (sifted_states == NULL, 0))
 983     {
 984       ret = REG_ESPACE;
 985       goto free_return;
 986     }
 987   if (dfa->nbackref)
 988     {
 989       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 990       if (BE (lim_states == NULL, 0))
 991         {
 992           ret = REG_ESPACE;
 993           goto free_return;
 994         }
 995       while (1)
 996         {
 997           memset (lim_states, '\0',
 998                   sizeof (re_dfastate_t *) * (match_last + 1));
 999           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
1000                          match_last);
1001           ret = sift_states_backward (mctx, &sctx);
1002           re_node_set_free (&sctx.limits);
1003           if (BE (ret != REG_NOERROR, 0))
1004               goto free_return;
1005           if (sifted_states[0] != NULL || lim_states[0] != NULL)
1006             break;
1007           do
1008             {
1009               --match_last;
1010               if (match_last < 0)
1011                 {
1012                   ret = REG_NOMATCH;
1013                   goto free_return;
1014                 }
1015             } while (mctx->state_log[match_last] == NULL
1016                      || !mctx->state_log[match_last]->halt);
1017           halt_node = check_halt_state_context (mctx,
1018                                                 mctx->state_log[match_last],
1019                                                 match_last);
1020         }
1021       ret = merge_state_array (dfa, sifted_states, lim_states,
1022                                match_last + 1);
1023       re_free (lim_states);
1024       lim_states = NULL;
1025       if (BE (ret != REG_NOERROR, 0))
1026         goto free_return;
1027     }
1028   else
1029     {
1030       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1031       ret = sift_states_backward (mctx, &sctx);
1032       re_node_set_free (&sctx.limits);
1033       if (BE (ret != REG_NOERROR, 0))
1034         goto free_return;
1035       if (sifted_states[0] == NULL)
1036         {
1037           ret = REG_NOMATCH;
1038           goto free_return;
1039         }
1040     }
1041   re_free (mctx->state_log);
1042   mctx->state_log = sifted_states;
1043   sifted_states = NULL;
1044   mctx->last_node = halt_node;
1045   mctx->match_last = match_last;
1046   ret = REG_NOERROR;
1047  free_return:
1048   re_free (sifted_states);
1049   re_free (lim_states);
1050   return ret;
1051 }
1052
1053 /* Acquire an initial state and return it.
1054    We must select appropriate initial state depending on the context,
1055    since initial states may have constraints like "\<", "^", etc..  */
1056
1057 static inline re_dfastate_t *
1058 __attribute ((always_inline)) internal_function
1059 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1060                             int idx)
1061 {
1062   const re_dfa_t *const dfa = mctx->dfa;
1063   if (dfa->init_state->has_constraint)
1064     {
1065       unsigned int context;
1066       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1067       if (IS_WORD_CONTEXT (context))
1068         return dfa->init_state_word;
1069       else if (IS_ORDINARY_CONTEXT (context))
1070         return dfa->init_state;
1071       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1072         return dfa->init_state_begbuf;
1073       else if (IS_NEWLINE_CONTEXT (context))
1074         return dfa->init_state_nl;
1075       else if (IS_BEGBUF_CONTEXT (context))
1076         {
1077           /* It is relatively rare case, then calculate on demand.  */
1078           return re_acquire_state_context (err, dfa,
1079                                            dfa->init_state->entrance_nodes,
1080                                            context);
1081         }
1082       else
1083         /* Must not happen?  */
1084         return dfa->init_state;
1085     }
1086   else
1087     return dfa->init_state;
1088 }
1089
1090 /* Check whether the regular expression match input string INPUT or not,
1091    and return the index where the matching end, return -1 if not match,
1092    or return -2 in case of an error.
1093    FL_LONGEST_MATCH means we want the POSIX longest matching.
1094    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1095    next place where we may want to try matching.
1096    Note that the matcher assume that the maching starts from the current
1097    index of the buffer.  */
1098
1099 static int
1100 internal_function __attribute_warn_unused_result__
1101 check_matching (re_match_context_t *mctx, int fl_longest_match,
1102                 int *p_match_first)
1103 {
1104   const re_dfa_t *const dfa = mctx->dfa;
1105   reg_errcode_t err;
1106   int match = 0;
1107   int match_last = -1;
1108   int cur_str_idx = re_string_cur_idx (&mctx->input);
1109   re_dfastate_t *cur_state;
1110   int at_init_state = p_match_first != NULL;
1111   int next_start_idx = cur_str_idx;
1112
1113   err = REG_NOERROR;
1114   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1115   /* An initial state must not be NULL (invalid).  */
1116   if (BE (cur_state == NULL, 0))
1117     {
1118       assert (err == REG_ESPACE);
1119       return -2;
1120     }
1121
1122   if (mctx->state_log != NULL)
1123     {
1124       mctx->state_log[cur_str_idx] = cur_state;
1125
1126       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1127          later.  E.g. Processing back references.  */
1128       if (BE (dfa->nbackref, 0))
1129         {
1130           at_init_state = 0;
1131           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1132           if (BE (err != REG_NOERROR, 0))
1133             return err;
1134
1135           if (cur_state->has_backref)
1136             {
1137               err = transit_state_bkref (mctx, &cur_state->nodes);
1138               if (BE (err != REG_NOERROR, 0))
1139                 return err;
1140             }
1141         }
1142     }
1143
1144   /* If the RE accepts NULL string.  */
1145   if (BE (cur_state->halt, 0))
1146     {
1147       if (!cur_state->has_constraint
1148           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1149         {
1150           if (!fl_longest_match)
1151             return cur_str_idx;
1152           else
1153             {
1154               match_last = cur_str_idx;
1155               match = 1;
1156             }
1157         }
1158     }
1159
1160   while (!re_string_eoi (&mctx->input))
1161     {
1162       re_dfastate_t *old_state = cur_state;
1163       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1164
1165       if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1166            && mctx->input.bufs_len < mctx->input.len)
1167           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1168               && mctx->input.valid_len < mctx->input.len))
1169         {
1170           err = extend_buffers (mctx, next_char_idx + 1);
1171           if (BE (err != REG_NOERROR, 0))
1172             {
1173               assert (err == REG_ESPACE);
1174               return -2;
1175             }
1176         }
1177
1178       cur_state = transit_state (&err, mctx, cur_state);
1179       if (mctx->state_log != NULL)
1180         cur_state = merge_state_with_log (&err, mctx, cur_state);
1181
1182       if (cur_state == NULL)
1183         {
1184           /* Reached the invalid state or an error.  Try to recover a valid
1185              state using the state log, if available and if we have not
1186              already found a valid (even if not the longest) match.  */
1187           if (BE (err != REG_NOERROR, 0))
1188             return -2;
1189
1190           if (mctx->state_log == NULL
1191               || (match && !fl_longest_match)
1192               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1193             break;
1194         }
1195
1196       if (BE (at_init_state, 0))
1197         {
1198           if (old_state == cur_state)
1199             next_start_idx = next_char_idx;
1200           else
1201             at_init_state = 0;
1202         }
1203
1204       if (cur_state->halt)
1205         {
1206           /* Reached a halt state.
1207              Check the halt state can satisfy the current context.  */
1208           if (!cur_state->has_constraint
1209               || check_halt_state_context (mctx, cur_state,
1210                                            re_string_cur_idx (&mctx->input)))
1211             {
1212               /* We found an appropriate halt state.  */
1213               match_last = re_string_cur_idx (&mctx->input);
1214               match = 1;
1215
1216               /* We found a match, do not modify match_first below.  */
1217               p_match_first = NULL;
1218               if (!fl_longest_match)
1219                 break;
1220             }
1221         }
1222     }
1223
1224   if (p_match_first)
1225     *p_match_first += next_start_idx;
1226
1227   return match_last;
1228 }
1229
1230 /* Check NODE match the current context.  */
1231
1232 static int
1233 internal_function
1234 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1235 {
1236   re_token_type_t type = dfa->nodes[node].type;
1237   unsigned int constraint = dfa->nodes[node].constraint;
1238   if (type != END_OF_RE)
1239     return 0;
1240   if (!constraint)
1241     return 1;
1242   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1243     return 0;
1244   return 1;
1245 }
1246
1247 /* Check the halt state STATE match the current context.
1248    Return 0 if not match, if the node, STATE has, is a halt node and
1249    match the context, return the node.  */
1250
1251 static int
1252 internal_function
1253 check_halt_state_context (const re_match_context_t *mctx,
1254                           const re_dfastate_t *state, int idx)
1255 {
1256   int i;
1257   unsigned int context;
1258 #ifdef DEBUG
1259   assert (state->halt);
1260 #endif
1261   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1262   for (i = 0; i < state->nodes.nelem; ++i)
1263     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1264       return state->nodes.elems[i];
1265   return 0;
1266 }
1267
1268 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1269    corresponding to the DFA).
1270    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1271    of errors.  */
1272
1273 static int
1274 internal_function
1275 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1276                    int *pidx, int node, re_node_set *eps_via_nodes,
1277                    struct re_fail_stack_t *fs)
1278 {
1279   const re_dfa_t *const dfa = mctx->dfa;
1280   int i, err;
1281   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1282     {
1283       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1284       re_node_set *edests = &dfa->edests[node];
1285       int dest_node;
1286       err = re_node_set_insert (eps_via_nodes, node);
1287       if (BE (err < 0, 0))
1288         return -2;
1289       /* Pick up a valid destination, or return -1 if none is found.  */
1290       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1291         {
1292           int candidate = edests->elems[i];
1293           if (!re_node_set_contains (cur_nodes, candidate))
1294             continue;
1295           if (dest_node == -1)
1296             dest_node = candidate;
1297
1298           else
1299             {
1300               /* In order to avoid infinite loop like "(a*)*", return the second
1301                  epsilon-transition if the first was already considered.  */
1302               if (re_node_set_contains (eps_via_nodes, dest_node))
1303                 return candidate;
1304
1305               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1306               else if (fs != NULL
1307                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1308                                            eps_via_nodes))
1309                 return -2;
1310
1311               /* We know we are going to exit.  */
1312               break;
1313             }
1314         }
1315       return dest_node;
1316     }
1317   else
1318     {
1319       int naccepted = 0;
1320       re_token_type_t type = dfa->nodes[node].type;
1321
1322 #ifdef RE_ENABLE_I18N
1323       if (dfa->nodes[node].accept_mb)
1324         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1325       else
1326 #endif /* RE_ENABLE_I18N */
1327       if (type == OP_BACK_REF)
1328         {
1329           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1330           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1331           if (fs != NULL)
1332             {
1333               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1334                 return -1;
1335               else if (naccepted)
1336                 {
1337                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1338                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1339                               naccepted) != 0)
1340                     return -1;
1341                 }
1342             }
1343
1344           if (naccepted == 0)
1345             {
1346               int dest_node;
1347               err = re_node_set_insert (eps_via_nodes, node);
1348               if (BE (err < 0, 0))
1349                 return -2;
1350               dest_node = dfa->edests[node].elems[0];
1351               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1352                                         dest_node))
1353                 return dest_node;
1354             }
1355         }
1356
1357       if (naccepted != 0
1358           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1359         {
1360           int dest_node = dfa->nexts[node];
1361           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1362           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1363                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1364                                                dest_node)))
1365             return -1;
1366           re_node_set_empty (eps_via_nodes);
1367           return dest_node;
1368         }
1369     }
1370   return -1;
1371 }
1372
1373 static reg_errcode_t
1374 internal_function __attribute_warn_unused_result__
1375 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1376                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1377 {
1378   reg_errcode_t err;
1379   int num = fs->num++;
1380   if (fs->num == fs->alloc)
1381     {
1382       struct re_fail_stack_ent_t *new_array;
1383       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1384                                        * fs->alloc * 2));
1385       if (new_array == NULL)
1386         return REG_ESPACE;
1387       fs->alloc *= 2;
1388       fs->stack = new_array;
1389     }
1390   fs->stack[num].idx = str_idx;
1391   fs->stack[num].node = dest_node;
1392   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1393   if (fs->stack[num].regs == NULL)
1394     return REG_ESPACE;
1395   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1396   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1397   return err;
1398 }
1399
1400 static int
1401 internal_function
1402 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1403                 regmatch_t *regs, re_node_set *eps_via_nodes)
1404 {
1405   int num = --fs->num;
1406   assert (num >= 0);
1407   *pidx = fs->stack[num].idx;
1408   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1409   re_node_set_free (eps_via_nodes);
1410   re_free (fs->stack[num].regs);
1411   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1412   return fs->stack[num].node;
1413 }
1414
1415 /* Set the positions where the subexpressions are starts/ends to registers
1416    PMATCH.
1417    Note: We assume that pmatch[0] is already set, and
1418    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1419
1420 static reg_errcode_t
1421 internal_function __attribute_warn_unused_result__
1422 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1423           regmatch_t *pmatch, int fl_backtrack)
1424 {
1425   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1426   int idx, cur_node;
1427   re_node_set eps_via_nodes;
1428   struct re_fail_stack_t *fs;
1429   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1430   regmatch_t *prev_idx_match;
1431   int prev_idx_match_malloced = 0;
1432
1433 #ifdef DEBUG
1434   assert (nmatch > 1);
1435   assert (mctx->state_log != NULL);
1436 #endif
1437   if (fl_backtrack)
1438     {
1439       fs = &fs_body;
1440       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1441       if (fs->stack == NULL)
1442         return REG_ESPACE;
1443     }
1444   else
1445     fs = NULL;
1446
1447   cur_node = dfa->init_node;
1448   re_node_set_init_empty (&eps_via_nodes);
1449
1450 #ifdef HAVE_ALLOCA
1451   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1452     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1453   else
1454 #endif
1455     {
1456       prev_idx_match = re_malloc (regmatch_t, nmatch);
1457       if (prev_idx_match == NULL)
1458         {
1459           free_fail_stack_return (fs);
1460           return REG_ESPACE;
1461         }
1462       prev_idx_match_malloced = 1;
1463     }
1464   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1465
1466   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1467     {
1468       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1469
1470       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1471         {
1472           int reg_idx;
1473           if (fs)
1474             {
1475               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1476                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1477                   break;
1478               if (reg_idx == nmatch)
1479                 {
1480                   re_node_set_free (&eps_via_nodes);
1481                   if (prev_idx_match_malloced)
1482                     re_free (prev_idx_match);
1483                   return free_fail_stack_return (fs);
1484                 }
1485               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1486                                          &eps_via_nodes);
1487             }
1488           else
1489             {
1490               re_node_set_free (&eps_via_nodes);
1491               if (prev_idx_match_malloced)
1492                 re_free (prev_idx_match);
1493               return REG_NOERROR;
1494             }
1495         }
1496
1497       /* Proceed to next node.  */
1498       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1499                                     &eps_via_nodes, fs);
1500
1501       if (BE (cur_node < 0, 0))
1502         {
1503           if (BE (cur_node == -2, 0))
1504             {
1505               re_node_set_free (&eps_via_nodes);
1506               if (prev_idx_match_malloced)
1507                 re_free (prev_idx_match);
1508               free_fail_stack_return (fs);
1509               return REG_ESPACE;
1510             }
1511           if (fs)
1512             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1513                                        &eps_via_nodes);
1514           else
1515             {
1516               re_node_set_free (&eps_via_nodes);
1517               if (prev_idx_match_malloced)
1518                 re_free (prev_idx_match);
1519               return REG_NOMATCH;
1520             }
1521         }
1522     }
1523   re_node_set_free (&eps_via_nodes);
1524   if (prev_idx_match_malloced)
1525     re_free (prev_idx_match);
1526   return free_fail_stack_return (fs);
1527 }
1528
1529 static reg_errcode_t
1530 internal_function
1531 free_fail_stack_return (struct re_fail_stack_t *fs)
1532 {
1533   if (fs)
1534     {
1535       int fs_idx;
1536       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1537         {
1538           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1539           re_free (fs->stack[fs_idx].regs);
1540         }
1541       re_free (fs->stack);
1542     }
1543   return REG_NOERROR;
1544 }
1545
1546 static void
1547 internal_function
1548 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1549              regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1550 {
1551   int type = dfa->nodes[cur_node].type;
1552   if (type == OP_OPEN_SUBEXP)
1553     {
1554       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1555
1556       /* We are at the first node of this sub expression.  */
1557       if (reg_num < nmatch)
1558         {
1559           pmatch[reg_num].rm_so = cur_idx;
1560           pmatch[reg_num].rm_eo = -1;
1561         }
1562     }
1563   else if (type == OP_CLOSE_SUBEXP)
1564     {
1565       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1566       if (reg_num < nmatch)
1567         {
1568           /* We are at the last node of this sub expression.  */
1569           if (pmatch[reg_num].rm_so < cur_idx)
1570             {
1571               pmatch[reg_num].rm_eo = cur_idx;
1572               /* This is a non-empty match or we are not inside an optional
1573                  subexpression.  Accept this right away.  */
1574               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1575             }
1576           else
1577             {
1578               if (dfa->nodes[cur_node].opt_subexp
1579                   && prev_idx_match[reg_num].rm_so != -1)
1580                 /* We transited through an empty match for an optional
1581                    subexpression, like (a?)*, and this is not the subexp's
1582                    first match.  Copy back the old content of the registers
1583                    so that matches of an inner subexpression are undone as
1584                    well, like in ((a?))*.  */
1585                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1586               else
1587                 /* We completed a subexpression, but it may be part of
1588                    an optional one, so do not update PREV_IDX_MATCH.  */
1589                 pmatch[reg_num].rm_eo = cur_idx;
1590             }
1591         }
1592     }
1593 }
1594
1595 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1596    and sift the nodes in each states according to the following rules.
1597    Updated state_log will be wrote to STATE_LOG.
1598
1599    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1600      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1601         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1602         the LAST_NODE, we throw away the node `a'.
1603      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1604         string `s' and transit to `b':
1605         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1606            away the node `a'.
1607         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1608             thrown away, we throw away the node `a'.
1609      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1610         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1611            node `a'.
1612         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1613             we throw away the node `a'.  */
1614
1615 #define STATE_NODE_CONTAINS(state,node) \
1616   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1617
1618 static reg_errcode_t
1619 internal_function
1620 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1621 {
1622   reg_errcode_t err;
1623   int null_cnt = 0;
1624   int str_idx = sctx->last_str_idx;
1625   re_node_set cur_dest;
1626
1627 #ifdef DEBUG
1628   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1629 #endif
1630
1631   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1632      transit to the last_node and the last_node itself.  */
1633   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1634   if (BE (err != REG_NOERROR, 0))
1635     return err;
1636   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1637   if (BE (err != REG_NOERROR, 0))
1638     goto free_return;
1639
1640   /* Then check each states in the state_log.  */
1641   while (str_idx > 0)
1642     {
1643       /* Update counters.  */
1644       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1645       if (null_cnt > mctx->max_mb_elem_len)
1646         {
1647           memset (sctx->sifted_states, '\0',
1648                   sizeof (re_dfastate_t *) * str_idx);
1649           re_node_set_free (&cur_dest);
1650           return REG_NOERROR;
1651         }
1652       re_node_set_empty (&cur_dest);
1653       --str_idx;
1654
1655       if (mctx->state_log[str_idx])
1656         {
1657           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1658           if (BE (err != REG_NOERROR, 0))
1659             goto free_return;
1660         }
1661
1662       /* Add all the nodes which satisfy the following conditions:
1663          - It can epsilon transit to a node in CUR_DEST.
1664          - It is in CUR_SRC.
1665          And update state_log.  */
1666       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1667       if (BE (err != REG_NOERROR, 0))
1668         goto free_return;
1669     }
1670   err = REG_NOERROR;
1671  free_return:
1672   re_node_set_free (&cur_dest);
1673   return err;
1674 }
1675
1676 static reg_errcode_t
1677 internal_function __attribute_warn_unused_result__
1678 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1679                      int str_idx, re_node_set *cur_dest)
1680 {
1681   const re_dfa_t *const dfa = mctx->dfa;
1682   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1683   int i;
1684
1685   /* Then build the next sifted state.
1686      We build the next sifted state on `cur_dest', and update
1687      `sifted_states[str_idx]' with `cur_dest'.
1688      Note:
1689      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1690      `cur_src' points the node_set of the old `state_log[str_idx]'
1691      (with the epsilon nodes pre-filtered out).  */
1692   for (i = 0; i < cur_src->nelem; i++)
1693     {
1694       int prev_node = cur_src->elems[i];
1695       int naccepted = 0;
1696       int ret;
1697
1698 #ifdef DEBUG
1699       re_token_type_t type = dfa->nodes[prev_node].type;
1700       assert (!IS_EPSILON_NODE (type));
1701 #endif
1702 #ifdef RE_ENABLE_I18N
1703       /* If the node may accept `multi byte'.  */
1704       if (dfa->nodes[prev_node].accept_mb)
1705         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1706                                          str_idx, sctx->last_str_idx);
1707 #endif /* RE_ENABLE_I18N */
1708
1709       /* We don't check backreferences here.
1710          See update_cur_sifted_state().  */
1711       if (!naccepted
1712           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1713           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1714                                   dfa->nexts[prev_node]))
1715         naccepted = 1;
1716
1717       if (naccepted == 0)
1718         continue;
1719
1720       if (sctx->limits.nelem)
1721         {
1722           int to_idx = str_idx + naccepted;
1723           if (check_dst_limits (mctx, &sctx->limits,
1724                                 dfa->nexts[prev_node], to_idx,
1725                                 prev_node, str_idx))
1726             continue;
1727         }
1728       ret = re_node_set_insert (cur_dest, prev_node);
1729       if (BE (ret == -1, 0))
1730         return REG_ESPACE;
1731     }
1732
1733   return REG_NOERROR;
1734 }
1735
1736 /* Helper functions.  */
1737
1738 static reg_errcode_t
1739 internal_function
1740 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1741 {
1742   int top = mctx->state_log_top;
1743
1744   if ((next_state_log_idx >= mctx->input.bufs_len
1745        && mctx->input.bufs_len < mctx->input.len)
1746       || (next_state_log_idx >= mctx->input.valid_len
1747           && mctx->input.valid_len < mctx->input.len))
1748     {
1749       reg_errcode_t err;
1750       err = extend_buffers (mctx, next_state_log_idx + 1);
1751       if (BE (err != REG_NOERROR, 0))
1752         return err;
1753     }
1754
1755   if (top < next_state_log_idx)
1756     {
1757       memset (mctx->state_log + top + 1, '\0',
1758               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1759       mctx->state_log_top = next_state_log_idx;
1760     }
1761   return REG_NOERROR;
1762 }
1763
1764 static reg_errcode_t
1765 internal_function
1766 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1767                    re_dfastate_t **src, int num)
1768 {
1769   int st_idx;
1770   reg_errcode_t err;
1771   for (st_idx = 0; st_idx < num; ++st_idx)
1772     {
1773       if (dst[st_idx] == NULL)
1774         dst[st_idx] = src[st_idx];
1775       else if (src[st_idx] != NULL)
1776         {
1777           re_node_set merged_set;
1778           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1779                                         &src[st_idx]->nodes);
1780           if (BE (err != REG_NOERROR, 0))
1781             return err;
1782           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1783           re_node_set_free (&merged_set);
1784           if (BE (err != REG_NOERROR, 0))
1785             return err;
1786         }
1787     }
1788   return REG_NOERROR;
1789 }
1790
1791 static reg_errcode_t
1792 internal_function
1793 update_cur_sifted_state (const re_match_context_t *mctx,
1794                          re_sift_context_t *sctx, int str_idx,
1795                          re_node_set *dest_nodes)
1796 {
1797   const re_dfa_t *const dfa = mctx->dfa;
1798   reg_errcode_t err = REG_NOERROR;
1799   const re_node_set *candidates;
1800   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1801                 : &mctx->state_log[str_idx]->nodes);
1802
1803   if (dest_nodes->nelem == 0)
1804     sctx->sifted_states[str_idx] = NULL;
1805   else
1806     {
1807       if (candidates)
1808         {
1809           /* At first, add the nodes which can epsilon transit to a node in
1810              DEST_NODE.  */
1811           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1812           if (BE (err != REG_NOERROR, 0))
1813             return err;
1814
1815           /* Then, check the limitations in the current sift_context.  */
1816           if (sctx->limits.nelem)
1817             {
1818               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1819                                          mctx->bkref_ents, str_idx);
1820               if (BE (err != REG_NOERROR, 0))
1821                 return err;
1822             }
1823         }
1824
1825       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1826       if (BE (err != REG_NOERROR, 0))
1827         return err;
1828     }
1829
1830   if (candidates && mctx->state_log[str_idx]->has_backref)
1831     {
1832       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1833       if (BE (err != REG_NOERROR, 0))
1834         return err;
1835     }
1836   return REG_NOERROR;
1837 }
1838
1839 static reg_errcode_t
1840 internal_function __attribute_warn_unused_result__
1841 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1842                        const re_node_set *candidates)
1843 {
1844   reg_errcode_t err = REG_NOERROR;
1845   int i;
1846
1847   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1848   if (BE (err != REG_NOERROR, 0))
1849     return err;
1850
1851   if (!state->inveclosure.alloc)
1852     {
1853       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1854       if (BE (err != REG_NOERROR, 0))
1855         return REG_ESPACE;
1856       for (i = 0; i < dest_nodes->nelem; i++)
1857         {
1858           err = re_node_set_merge (&state->inveclosure,
1859                                    dfa->inveclosures + dest_nodes->elems[i]);
1860           if (BE (err != REG_NOERROR, 0))
1861             return REG_ESPACE;
1862         }
1863     }
1864   return re_node_set_add_intersect (dest_nodes, candidates,
1865                                     &state->inveclosure);
1866 }
1867
1868 static reg_errcode_t
1869 internal_function
1870 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1871                        const re_node_set *candidates)
1872 {
1873     int ecl_idx;
1874     reg_errcode_t err;
1875     re_node_set *inv_eclosure = dfa->inveclosures + node;
1876     re_node_set except_nodes;
1877     re_node_set_init_empty (&except_nodes);
1878     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1879       {
1880         int cur_node = inv_eclosure->elems[ecl_idx];
1881         if (cur_node == node)
1882           continue;
1883         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1884           {
1885             int edst1 = dfa->edests[cur_node].elems[0];
1886             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1887                          ? dfa->edests[cur_node].elems[1] : -1);
1888             if ((!re_node_set_contains (inv_eclosure, edst1)
1889                  && re_node_set_contains (dest_nodes, edst1))
1890                 || (edst2 > 0
1891                     && !re_node_set_contains (inv_eclosure, edst2)
1892                     && re_node_set_contains (dest_nodes, edst2)))
1893               {
1894                 err = re_node_set_add_intersect (&except_nodes, candidates,
1895                                                  dfa->inveclosures + cur_node);
1896                 if (BE (err != REG_NOERROR, 0))
1897                   {
1898                     re_node_set_free (&except_nodes);
1899                     return err;
1900                   }
1901               }
1902           }
1903       }
1904     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1905       {
1906         int cur_node = inv_eclosure->elems[ecl_idx];
1907         if (!re_node_set_contains (&except_nodes, cur_node))
1908           {
1909             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1910             re_node_set_remove_at (dest_nodes, idx);
1911           }
1912       }
1913     re_node_set_free (&except_nodes);
1914     return REG_NOERROR;
1915 }
1916
1917 static int
1918 internal_function
1919 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1920                   int dst_node, int dst_idx, int src_node, int src_idx)
1921 {
1922   const re_dfa_t *const dfa = mctx->dfa;
1923   int lim_idx, src_pos, dst_pos;
1924
1925   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1926   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1927   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1928     {
1929       int subexp_idx;
1930       struct re_backref_cache_entry *ent;
1931       ent = mctx->bkref_ents + limits->elems[lim_idx];
1932       subexp_idx = dfa->nodes[ent->node].opr.idx;
1933
1934       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1935                                            subexp_idx, dst_node, dst_idx,
1936                                            dst_bkref_idx);
1937       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1938                                            subexp_idx, src_node, src_idx,
1939                                            src_bkref_idx);
1940
1941       /* In case of:
1942          <src> <dst> ( <subexp> )
1943          ( <subexp> ) <src> <dst>
1944          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1945       if (src_pos == dst_pos)
1946         continue; /* This is unrelated limitation.  */
1947       else
1948         return 1;
1949     }
1950   return 0;
1951 }
1952
1953 static int
1954 internal_function
1955 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1956                              int subexp_idx, int from_node, int bkref_idx)
1957 {
1958   const re_dfa_t *const dfa = mctx->dfa;
1959   const re_node_set *eclosures = dfa->eclosures + from_node;
1960   int node_idx;
1961
1962   /* Else, we are on the boundary: examine the nodes on the epsilon
1963      closure.  */
1964   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1965     {
1966       int node = eclosures->elems[node_idx];
1967       switch (dfa->nodes[node].type)
1968         {
1969         case OP_BACK_REF:
1970           if (bkref_idx != -1)
1971             {
1972               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1973               do
1974                 {
1975                   int dst, cpos;
1976
1977                   if (ent->node != node)
1978                     continue;
1979
1980                   if (subexp_idx < BITSET_WORD_BITS
1981                       && !(ent->eps_reachable_subexps_map
1982                            & ((bitset_word_t) 1 << subexp_idx)))
1983                     continue;
1984
1985                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1986                      OP_CLOSE_SUBEXP cases below.  But, if the
1987                      destination node is the same node as the source
1988                      node, don't recurse because it would cause an
1989                      infinite loop: a regex that exhibits this behavior
1990                      is ()\1*\1*  */
1991                   dst = dfa->edests[node].elems[0];
1992                   if (dst == from_node)
1993                     {
1994                       if (boundaries & 1)
1995                         return -1;
1996                       else /* if (boundaries & 2) */
1997                         return 0;
1998                     }
1999
2000                   cpos =
2001                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2002                                                  dst, bkref_idx);
2003                   if (cpos == -1 /* && (boundaries & 1) */)
2004                     return -1;
2005                   if (cpos == 0 && (boundaries & 2))
2006                     return 0;
2007
2008                   if (subexp_idx < BITSET_WORD_BITS)
2009                     ent->eps_reachable_subexps_map
2010                       &= ~((bitset_word_t) 1 << subexp_idx);
2011                 }
2012               while (ent++->more);
2013             }
2014           break;
2015
2016         case OP_OPEN_SUBEXP:
2017           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2018             return -1;
2019           break;
2020
2021         case OP_CLOSE_SUBEXP:
2022           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2023             return 0;
2024           break;
2025
2026         default:
2027             break;
2028         }
2029     }
2030
2031   return (boundaries & 2) ? 1 : 0;
2032 }
2033
2034 static int
2035 internal_function
2036 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2037                            int subexp_idx, int from_node, int str_idx,
2038                            int bkref_idx)
2039 {
2040   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2041   int boundaries;
2042
2043   /* If we are outside the range of the subexpression, return -1 or 1.  */
2044   if (str_idx < lim->subexp_from)
2045     return -1;
2046
2047   if (lim->subexp_to < str_idx)
2048     return 1;
2049
2050   /* If we are within the subexpression, return 0.  */
2051   boundaries = (str_idx == lim->subexp_from);
2052   boundaries |= (str_idx == lim->subexp_to) << 1;
2053   if (boundaries == 0)
2054     return 0;
2055
2056   /* Else, examine epsilon closure.  */
2057   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2058                                       from_node, bkref_idx);
2059 }
2060
2061 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2062    which are against limitations from DEST_NODES. */
2063
2064 static reg_errcode_t
2065 internal_function
2066 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2067                      const re_node_set *candidates, re_node_set *limits,
2068                      struct re_backref_cache_entry *bkref_ents, int str_idx)
2069 {
2070   reg_errcode_t err;
2071   int node_idx, lim_idx;
2072
2073   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2074     {
2075       int subexp_idx;
2076       struct re_backref_cache_entry *ent;
2077       ent = bkref_ents + limits->elems[lim_idx];
2078
2079       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2080         continue; /* This is unrelated limitation.  */
2081
2082       subexp_idx = dfa->nodes[ent->node].opr.idx;
2083       if (ent->subexp_to == str_idx)
2084         {
2085           int ops_node = -1;
2086           int cls_node = -1;
2087           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2088             {
2089               int node = dest_nodes->elems[node_idx];
2090               re_token_type_t type = dfa->nodes[node].type;
2091               if (type == OP_OPEN_SUBEXP
2092                   && subexp_idx == dfa->nodes[node].opr.idx)
2093                 ops_node = node;
2094               else if (type == OP_CLOSE_SUBEXP
2095                        && subexp_idx == dfa->nodes[node].opr.idx)
2096                 cls_node = node;
2097             }
2098
2099           /* Check the limitation of the open subexpression.  */
2100           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2101           if (ops_node >= 0)
2102             {
2103               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2104                                            candidates);
2105               if (BE (err != REG_NOERROR, 0))
2106                 return err;
2107             }
2108
2109           /* Check the limitation of the close subexpression.  */
2110           if (cls_node >= 0)
2111             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2112               {
2113                 int node = dest_nodes->elems[node_idx];
2114                 if (!re_node_set_contains (dfa->inveclosures + node,
2115                                            cls_node)
2116                     && !re_node_set_contains (dfa->eclosures + node,
2117                                               cls_node))
2118                   {
2119                     /* It is against this limitation.
2120                        Remove it form the current sifted state.  */
2121                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2122                                                  candidates);
2123                     if (BE (err != REG_NOERROR, 0))
2124                       return err;
2125                     --node_idx;
2126                   }
2127               }
2128         }
2129       else /* (ent->subexp_to != str_idx)  */
2130         {
2131           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2132             {
2133               int node = dest_nodes->elems[node_idx];
2134               re_token_type_t type = dfa->nodes[node].type;
2135               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2136                 {
2137                   if (subexp_idx != dfa->nodes[node].opr.idx)
2138                     continue;
2139                   /* It is against this limitation.
2140                      Remove it form the current sifted state.  */
2141                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2142                                                candidates);
2143                   if (BE (err != REG_NOERROR, 0))
2144                     return err;
2145                 }
2146             }
2147         }
2148     }
2149   return REG_NOERROR;
2150 }
2151
2152 static reg_errcode_t
2153 internal_function __attribute_warn_unused_result__
2154 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2155                    int str_idx, const re_node_set *candidates)
2156 {
2157   const re_dfa_t *const dfa = mctx->dfa;
2158   reg_errcode_t err;
2159   int node_idx, node;
2160   re_sift_context_t local_sctx;
2161   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2162
2163   if (first_idx == -1)
2164     return REG_NOERROR;
2165
2166   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2167
2168   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2169     {
2170       int enabled_idx;
2171       re_token_type_t type;
2172       struct re_backref_cache_entry *entry;
2173       node = candidates->elems[node_idx];
2174       type = dfa->nodes[node].type;
2175       /* Avoid infinite loop for the REs like "()\1+".  */
2176       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2177         continue;
2178       if (type != OP_BACK_REF)
2179         continue;
2180
2181       entry = mctx->bkref_ents + first_idx;
2182       enabled_idx = first_idx;
2183       do
2184         {
2185           int subexp_len;
2186           int to_idx;
2187           int dst_node;
2188           int ret;
2189           re_dfastate_t *cur_state;
2190
2191           if (entry->node != node)
2192             continue;
2193           subexp_len = entry->subexp_to - entry->subexp_from;
2194           to_idx = str_idx + subexp_len;
2195           dst_node = (subexp_len ? dfa->nexts[node]
2196                       : dfa->edests[node].elems[0]);
2197
2198           if (to_idx > sctx->last_str_idx
2199               || sctx->sifted_states[to_idx] == NULL
2200               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2201               || check_dst_limits (mctx, &sctx->limits, node,
2202                                    str_idx, dst_node, to_idx))
2203             continue;
2204
2205           if (local_sctx.sifted_states == NULL)
2206             {
2207               local_sctx = *sctx;
2208               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2209               if (BE (err != REG_NOERROR, 0))
2210                 goto free_return;
2211             }
2212           local_sctx.last_node = node;
2213           local_sctx.last_str_idx = str_idx;
2214           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2215           if (BE (ret < 0, 0))
2216             {
2217               err = REG_ESPACE;
2218               goto free_return;
2219             }
2220           cur_state = local_sctx.sifted_states[str_idx];
2221           err = sift_states_backward (mctx, &local_sctx);
2222           if (BE (err != REG_NOERROR, 0))
2223             goto free_return;
2224           if (sctx->limited_states != NULL)
2225             {
2226               err = merge_state_array (dfa, sctx->limited_states,
2227                                        local_sctx.sifted_states,
2228                                        str_idx + 1);
2229               if (BE (err != REG_NOERROR, 0))
2230                 goto free_return;
2231             }
2232           local_sctx.sifted_states[str_idx] = cur_state;
2233           re_node_set_remove (&local_sctx.limits, enabled_idx);
2234
2235           /* mctx->bkref_ents may have changed, reload the pointer.  */
2236           entry = mctx->bkref_ents + enabled_idx;
2237         }
2238       while (enabled_idx++, entry++->more);
2239     }
2240   err = REG_NOERROR;
2241  free_return:
2242   if (local_sctx.sifted_states != NULL)
2243     {
2244       re_node_set_free (&local_sctx.limits);
2245     }
2246
2247   return err;
2248 }
2249
2250
2251 #ifdef RE_ENABLE_I18N
2252 static int
2253 internal_function
2254 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2255                      int node_idx, int str_idx, int max_str_idx)
2256 {
2257   const re_dfa_t *const dfa = mctx->dfa;
2258   int naccepted;
2259   /* Check the node can accept `multi byte'.  */
2260   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2261   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2262       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2263                             dfa->nexts[node_idx]))
2264     /* The node can't accept the `multi byte', or the
2265        destination was already thrown away, then the node
2266        could't accept the current input `multi byte'.   */
2267     naccepted = 0;
2268   /* Otherwise, it is sure that the node could accept
2269      `naccepted' bytes input.  */
2270   return naccepted;
2271 }
2272 #endif /* RE_ENABLE_I18N */
2273
2274 \f
2275 /* Functions for state transition.  */
2276
2277 /* Return the next state to which the current state STATE will transit by
2278    accepting the current input byte, and update STATE_LOG if necessary.
2279    If STATE can accept a multibyte char/collating element/back reference
2280    update the destination of STATE_LOG.  */
2281
2282 static re_dfastate_t *
2283 internal_function __attribute_warn_unused_result__
2284 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2285                re_dfastate_t *state)
2286 {
2287   re_dfastate_t **trtable;
2288   unsigned char ch;
2289
2290 #ifdef RE_ENABLE_I18N
2291   /* If the current state can accept multibyte.  */
2292   if (BE (state->accept_mb, 0))
2293     {
2294       *err = transit_state_mb (mctx, state);
2295       if (BE (*err != REG_NOERROR, 0))
2296         return NULL;
2297     }
2298 #endif /* RE_ENABLE_I18N */
2299
2300   /* Then decide the next state with the single byte.  */
2301 #if 0
2302   if (0)
2303     /* don't use transition table  */
2304     return transit_state_sb (err, mctx, state);
2305 #endif
2306
2307   /* Use transition table  */
2308   ch = re_string_fetch_byte (&mctx->input);
2309   for (;;)
2310     {
2311       trtable = state->trtable;
2312       if (BE (trtable != NULL, 1))
2313         return trtable[ch];
2314
2315       trtable = state->word_trtable;
2316       if (BE (trtable != NULL, 1))
2317         {
2318           unsigned int context;
2319           context
2320             = re_string_context_at (&mctx->input,
2321                                     re_string_cur_idx (&mctx->input) - 1,
2322                                     mctx->eflags);
2323           if (IS_WORD_CONTEXT (context))
2324             return trtable[ch + SBC_MAX];
2325           else
2326             return trtable[ch];
2327         }
2328
2329       if (!build_trtable (mctx->dfa, state))
2330         {
2331           *err = REG_ESPACE;
2332           return NULL;
2333         }
2334
2335       /* Retry, we now have a transition table.  */
2336     }
2337 }
2338
2339 /* Update the state_log if we need */
2340 re_dfastate_t *
2341 internal_function
2342 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2343                       re_dfastate_t *next_state)
2344 {
2345   const re_dfa_t *const dfa = mctx->dfa;
2346   int cur_idx = re_string_cur_idx (&mctx->input);
2347
2348   if (cur_idx > mctx->state_log_top)
2349     {
2350       mctx->state_log[cur_idx] = next_state;
2351       mctx->state_log_top = cur_idx;
2352     }
2353   else if (mctx->state_log[cur_idx] == 0)
2354     {
2355       mctx->state_log[cur_idx] = next_state;
2356     }
2357   else
2358     {
2359       re_dfastate_t *pstate;
2360       unsigned int context;
2361       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2362       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2363          the destination of a multibyte char/collating element/
2364          back reference.  Then the next state is the union set of
2365          these destinations and the results of the transition table.  */
2366       pstate = mctx->state_log[cur_idx];
2367       log_nodes = pstate->entrance_nodes;
2368       if (next_state != NULL)
2369         {
2370           table_nodes = next_state->entrance_nodes;
2371           *err = re_node_set_init_union (&next_nodes, table_nodes,
2372                                              log_nodes);
2373           if (BE (*err != REG_NOERROR, 0))
2374             return NULL;
2375         }
2376       else
2377         next_nodes = *log_nodes;
2378       /* Note: We already add the nodes of the initial state,
2379          then we don't need to add them here.  */
2380
2381       context = re_string_context_at (&mctx->input,
2382                                       re_string_cur_idx (&mctx->input) - 1,
2383                                       mctx->eflags);
2384       next_state = mctx->state_log[cur_idx]
2385         = re_acquire_state_context (err, dfa, &next_nodes, context);
2386       /* We don't need to check errors here, since the return value of
2387          this function is next_state and ERR is already set.  */
2388
2389       if (table_nodes != NULL)
2390         re_node_set_free (&next_nodes);
2391     }
2392
2393   if (BE (dfa->nbackref, 0) && next_state != NULL)
2394     {
2395       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2396          later.  We must check them here, since the back references in the
2397          next state might use them.  */
2398       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2399                                         cur_idx);
2400       if (BE (*err != REG_NOERROR, 0))
2401         return NULL;
2402
2403       /* If the next state has back references.  */
2404       if (next_state->has_backref)
2405         {
2406           *err = transit_state_bkref (mctx, &next_state->nodes);
2407           if (BE (*err != REG_NOERROR, 0))
2408             return NULL;
2409           next_state = mctx->state_log[cur_idx];
2410         }
2411     }
2412
2413   return next_state;
2414 }
2415
2416 /* Skip bytes in the input that correspond to part of a
2417    multi-byte match, then look in the log for a state
2418    from which to restart matching.  */
2419 re_dfastate_t *
2420 internal_function
2421 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2422 {
2423   re_dfastate_t *cur_state;
2424   do
2425     {
2426       int max = mctx->state_log_top;
2427       int cur_str_idx = re_string_cur_idx (&mctx->input);
2428
2429       do
2430         {
2431           if (++cur_str_idx > max)
2432             return NULL;
2433           re_string_skip_bytes (&mctx->input, 1);
2434         }
2435       while (mctx->state_log[cur_str_idx] == NULL);
2436
2437       cur_state = merge_state_with_log (err, mctx, NULL);
2438     }
2439   while (*err == REG_NOERROR && cur_state == NULL);
2440   return cur_state;
2441 }
2442
2443 /* Helper functions for transit_state.  */
2444
2445 /* From the node set CUR_NODES, pick up the nodes whose types are
2446    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2447    expression. And register them to use them later for evaluating the
2448    correspoding back references.  */
2449
2450 static reg_errcode_t
2451 internal_function
2452 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2453                            int str_idx)
2454 {
2455   const re_dfa_t *const dfa = mctx->dfa;
2456   int node_idx;
2457   reg_errcode_t err;
2458
2459   /* TODO: This isn't efficient.
2460            Because there might be more than one nodes whose types are
2461            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2462            nodes.
2463            E.g. RE: (a){2}  */
2464   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2465     {
2466       int node = cur_nodes->elems[node_idx];
2467       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2468           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2469           && (dfa->used_bkref_map
2470               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2471         {
2472           err = match_ctx_add_subtop (mctx, node, str_idx);
2473           if (BE (err != REG_NOERROR, 0))
2474             return err;
2475         }
2476     }
2477   return REG_NOERROR;
2478 }
2479
2480 #if 0
2481 /* Return the next state to which the current state STATE will transit by
2482    accepting the current input byte.  */
2483
2484 static re_dfastate_t *
2485 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2486                   re_dfastate_t *state)
2487 {
2488   const re_dfa_t *const dfa = mctx->dfa;
2489   re_node_set next_nodes;
2490   re_dfastate_t *next_state;
2491   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2492   unsigned int context;
2493
2494   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2495   if (BE (*err != REG_NOERROR, 0))
2496     return NULL;
2497   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2498     {
2499       int cur_node = state->nodes.elems[node_cnt];
2500       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2501         {
2502           *err = re_node_set_merge (&next_nodes,
2503                                     dfa->eclosures + dfa->nexts[cur_node]);
2504           if (BE (*err != REG_NOERROR, 0))
2505             {
2506               re_node_set_free (&next_nodes);
2507               return NULL;
2508             }
2509         }
2510     }
2511   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2512   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2513   /* We don't need to check errors here, since the return value of
2514      this function is next_state and ERR is already set.  */
2515
2516   re_node_set_free (&next_nodes);
2517   re_string_skip_bytes (&mctx->input, 1);
2518   return next_state;
2519 }
2520 #endif
2521
2522 #ifdef RE_ENABLE_I18N
2523 static reg_errcode_t
2524 internal_function
2525 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2526 {
2527   const re_dfa_t *const dfa = mctx->dfa;
2528   reg_errcode_t err;
2529   int i;
2530
2531   for (i = 0; i < pstate->nodes.nelem; ++i)
2532     {
2533       re_node_set dest_nodes, *new_nodes;
2534       int cur_node_idx = pstate->nodes.elems[i];
2535       int naccepted, dest_idx;
2536       unsigned int context;
2537       re_dfastate_t *dest_state;
2538
2539       if (!dfa->nodes[cur_node_idx].accept_mb)
2540         continue;
2541
2542       if (dfa->nodes[cur_node_idx].constraint)
2543         {
2544           context = re_string_context_at (&mctx->input,
2545                                           re_string_cur_idx (&mctx->input),
2546                                           mctx->eflags);
2547           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2548                                            context))
2549             continue;
2550         }
2551
2552       /* How many bytes the node can accept?  */
2553       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2554                                            re_string_cur_idx (&mctx->input));
2555       if (naccepted == 0)
2556         continue;
2557
2558       /* The node can accepts `naccepted' bytes.  */
2559       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2560       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2561                                : mctx->max_mb_elem_len);
2562       err = clean_state_log_if_needed (mctx, dest_idx);
2563       if (BE (err != REG_NOERROR, 0))
2564         return err;
2565 #ifdef DEBUG
2566       assert (dfa->nexts[cur_node_idx] != -1);
2567 #endif
2568       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2569
2570       dest_state = mctx->state_log[dest_idx];
2571       if (dest_state == NULL)
2572         dest_nodes = *new_nodes;
2573       else
2574         {
2575           err = re_node_set_init_union (&dest_nodes,
2576                                         dest_state->entrance_nodes, new_nodes);
2577           if (BE (err != REG_NOERROR, 0))
2578             return err;
2579         }
2580       context = re_string_context_at (&mctx->input, dest_idx - 1,
2581                                       mctx->eflags);
2582       mctx->state_log[dest_idx]
2583         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2584       if (dest_state != NULL)
2585         re_node_set_free (&dest_nodes);
2586       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2587         return err;
2588     }
2589   return REG_NOERROR;
2590 }
2591 #endif /* RE_ENABLE_I18N */
2592
2593 static reg_errcode_t
2594 internal_function
2595 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2596 {
2597   const re_dfa_t *const dfa = mctx->dfa;
2598   reg_errcode_t err;
2599   int i;
2600   int cur_str_idx = re_string_cur_idx (&mctx->input);
2601
2602   for (i = 0; i < nodes->nelem; ++i)
2603     {
2604       int dest_str_idx, prev_nelem, bkc_idx;
2605       int node_idx = nodes->elems[i];
2606       unsigned int context;
2607       const re_token_t *node = dfa->nodes + node_idx;
2608       re_node_set *new_dest_nodes;
2609
2610       /* Check whether `node' is a backreference or not.  */
2611       if (node->type != OP_BACK_REF)
2612         continue;
2613
2614       if (node->constraint)
2615         {
2616           context = re_string_context_at (&mctx->input, cur_str_idx,
2617                                           mctx->eflags);
2618           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2619             continue;
2620         }
2621
2622       /* `node' is a backreference.
2623          Check the substring which the substring matched.  */
2624       bkc_idx = mctx->nbkref_ents;
2625       err = get_subexp (mctx, node_idx, cur_str_idx);
2626       if (BE (err != REG_NOERROR, 0))
2627         goto free_return;
2628
2629       /* And add the epsilon closures (which is `new_dest_nodes') of
2630          the backreference to appropriate state_log.  */
2631 #ifdef DEBUG
2632       assert (dfa->nexts[node_idx] != -1);
2633 #endif
2634       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2635         {
2636           int subexp_len;
2637           re_dfastate_t *dest_state;
2638           struct re_backref_cache_entry *bkref_ent;
2639           bkref_ent = mctx->bkref_ents + bkc_idx;
2640           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2641             continue;
2642           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2643           new_dest_nodes = (subexp_len == 0
2644                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2645                             : dfa->eclosures + dfa->nexts[node_idx]);
2646           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2647                           - bkref_ent->subexp_from);
2648           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2649                                           mctx->eflags);
2650           dest_state = mctx->state_log[dest_str_idx];
2651           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2652                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2653           /* Add `new_dest_node' to state_log.  */
2654           if (dest_state == NULL)
2655             {
2656               mctx->state_log[dest_str_idx]
2657                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2658                                             context);
2659               if (BE (mctx->state_log[dest_str_idx] == NULL
2660                       && err != REG_NOERROR, 0))
2661                 goto free_return;
2662             }
2663           else
2664             {
2665               re_node_set dest_nodes;
2666               err = re_node_set_init_union (&dest_nodes,
2667                                             dest_state->entrance_nodes,
2668                                             new_dest_nodes);
2669               if (BE (err != REG_NOERROR, 0))
2670                 {
2671                   re_node_set_free (&dest_nodes);
2672                   goto free_return;
2673                 }
2674               mctx->state_log[dest_str_idx]
2675                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2676               re_node_set_free (&dest_nodes);
2677               if (BE (mctx->state_log[dest_str_idx] == NULL
2678                       && err != REG_NOERROR, 0))
2679                 goto free_return;
2680             }
2681           /* We need to check recursively if the backreference can epsilon
2682              transit.  */
2683           if (subexp_len == 0
2684               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2685             {
2686               err = check_subexp_matching_top (mctx, new_dest_nodes,
2687                                                cur_str_idx);
2688               if (BE (err != REG_NOERROR, 0))
2689                 goto free_return;
2690               err = transit_state_bkref (mctx, new_dest_nodes);
2691               if (BE (err != REG_NOERROR, 0))
2692                 goto free_return;
2693             }
2694         }
2695     }
2696   err = REG_NOERROR;
2697  free_return:
2698   return err;
2699 }
2700
2701 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2702    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2703    Note that we might collect inappropriate candidates here.
2704    However, the cost of checking them strictly here is too high, then we
2705    delay these checking for prune_impossible_nodes().  */
2706
2707 static reg_errcode_t
2708 internal_function __attribute_warn_unused_result__
2709 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2710 {
2711   const re_dfa_t *const dfa = mctx->dfa;
2712   int subexp_num, sub_top_idx;
2713   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2714   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2715   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2716   if (cache_idx != -1)
2717     {
2718       const struct re_backref_cache_entry *entry
2719         = mctx->bkref_ents + cache_idx;
2720       do
2721         if (entry->node == bkref_node)
2722           return REG_NOERROR; /* We already checked it.  */
2723       while (entry++->more);
2724     }
2725
2726   subexp_num = dfa->nodes[bkref_node].opr.idx;
2727
2728   /* For each sub expression  */
2729   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2730     {
2731       reg_errcode_t err;
2732       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2733       re_sub_match_last_t *sub_last;
2734       int sub_last_idx, sl_str, bkref_str_off;
2735
2736       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2737         continue; /* It isn't related.  */
2738
2739       sl_str = sub_top->str_idx;
2740       bkref_str_off = bkref_str_idx;
2741       /* At first, check the last node of sub expressions we already
2742          evaluated.  */
2743       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2744         {
2745           int sl_str_diff;
2746           sub_last = sub_top->lasts[sub_last_idx];
2747           sl_str_diff = sub_last->str_idx - sl_str;
2748           /* The matched string by the sub expression match with the substring
2749              at the back reference?  */
2750           if (sl_str_diff > 0)
2751             {
2752               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2753                 {
2754                   /* Not enough chars for a successful match.  */
2755                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2756                     break;
2757
2758                   err = clean_state_log_if_needed (mctx,
2759                                                    bkref_str_off
2760                                                    + sl_str_diff);
2761                   if (BE (err != REG_NOERROR, 0))
2762                     return err;
2763                   buf = (const char *) re_string_get_buffer (&mctx->input);
2764                 }
2765               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2766                 /* We don't need to search this sub expression any more.  */
2767                 break;
2768             }
2769           bkref_str_off += sl_str_diff;
2770           sl_str += sl_str_diff;
2771           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2772                                 bkref_str_idx);
2773
2774           /* Reload buf, since the preceding call might have reallocated
2775              the buffer.  */
2776           buf = (const char *) re_string_get_buffer (&mctx->input);
2777
2778           if (err == REG_NOMATCH)
2779             continue;
2780           if (BE (err != REG_NOERROR, 0))
2781             return err;
2782         }
2783
2784       if (sub_last_idx < sub_top->nlasts)
2785         continue;
2786       if (sub_last_idx > 0)
2787         ++sl_str;
2788       /* Then, search for the other last nodes of the sub expression.  */
2789       for (; sl_str <= bkref_str_idx; ++sl_str)
2790         {
2791           int cls_node, sl_str_off;
2792           const re_node_set *nodes;
2793           sl_str_off = sl_str - sub_top->str_idx;
2794           /* The matched string by the sub expression match with the substring
2795              at the back reference?  */
2796           if (sl_str_off > 0)
2797             {
2798               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2799                 {
2800                   /* If we are at the end of the input, we cannot match.  */
2801                   if (bkref_str_off >= mctx->input.len)
2802                     break;
2803
2804                   err = extend_buffers (mctx, bkref_str_off + 1);
2805                   if (BE (err != REG_NOERROR, 0))
2806                     return err;
2807
2808                   buf = (const char *) re_string_get_buffer (&mctx->input);
2809                 }
2810               if (buf [bkref_str_off++] != buf[sl_str - 1])
2811                 break; /* We don't need to search this sub expression
2812                           any more.  */
2813             }
2814           if (mctx->state_log[sl_str] == NULL)
2815             continue;
2816           /* Does this state have a ')' of the sub expression?  */
2817           nodes = &mctx->state_log[sl_str]->nodes;
2818           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2819                                        OP_CLOSE_SUBEXP);
2820           if (cls_node == -1)
2821             continue; /* No.  */
2822           if (sub_top->path == NULL)
2823             {
2824               sub_top->path = calloc (sizeof (state_array_t),
2825                                       sl_str - sub_top->str_idx + 1);
2826               if (sub_top->path == NULL)
2827                 return REG_ESPACE;
2828             }
2829           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2830              in the current context?  */
2831           err = check_arrival (mctx, sub_top->path, sub_top->node,
2832                                sub_top->str_idx, cls_node, sl_str,
2833                                OP_CLOSE_SUBEXP);
2834           if (err == REG_NOMATCH)
2835               continue;
2836           if (BE (err != REG_NOERROR, 0))
2837               return err;
2838           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2839           if (BE (sub_last == NULL, 0))
2840             return REG_ESPACE;
2841           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2842                                 bkref_str_idx);
2843           if (err == REG_NOMATCH)
2844             continue;
2845         }
2846     }
2847   return REG_NOERROR;
2848 }
2849
2850 /* Helper functions for get_subexp().  */
2851
2852 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2853    If it can arrive, register the sub expression expressed with SUB_TOP
2854    and SUB_LAST.  */
2855
2856 static reg_errcode_t
2857 internal_function
2858 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2859                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2860 {
2861   reg_errcode_t err;
2862   int to_idx;
2863   /* Can the subexpression arrive the back reference?  */
2864   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2865                        sub_last->str_idx, bkref_node, bkref_str,
2866                        OP_OPEN_SUBEXP);
2867   if (err != REG_NOERROR)
2868     return err;
2869   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2870                              sub_last->str_idx);
2871   if (BE (err != REG_NOERROR, 0))
2872     return err;
2873   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2874   return clean_state_log_if_needed (mctx, to_idx);
2875 }
2876
2877 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2878    Search '(' if FL_OPEN, or search ')' otherwise.
2879    TODO: This function isn't efficient...
2880          Because there might be more than one nodes whose types are
2881          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2882          nodes.
2883          E.g. RE: (a){2}  */
2884
2885 static int
2886 internal_function
2887 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2888                   int subexp_idx, int type)
2889 {
2890   int cls_idx;
2891   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2892     {
2893       int cls_node = nodes->elems[cls_idx];
2894       const re_token_t *node = dfa->nodes + cls_node;
2895       if (node->type == type
2896           && node->opr.idx == subexp_idx)
2897         return cls_node;
2898     }
2899   return -1;
2900 }
2901
2902 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2903    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2904    heavily reused.
2905    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2906
2907 static reg_errcode_t
2908 internal_function __attribute_warn_unused_result__
2909 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2910                int top_str, int last_node, int last_str, int type)
2911 {
2912   const re_dfa_t *const dfa = mctx->dfa;
2913   reg_errcode_t err = REG_NOERROR;
2914   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2915   re_dfastate_t *cur_state = NULL;
2916   re_node_set *cur_nodes, next_nodes;
2917   re_dfastate_t **backup_state_log;
2918   unsigned int context;
2919
2920   subexp_num = dfa->nodes[top_node].opr.idx;
2921   /* Extend the buffer if we need.  */
2922   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2923     {
2924       re_dfastate_t **new_array;
2925       int old_alloc = path->alloc;
2926       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2927       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2928       if (BE (new_array == NULL, 0))
2929         {
2930           path->alloc = old_alloc;
2931           return REG_ESPACE;
2932         }
2933       path->array = new_array;
2934       memset (new_array + old_alloc, '\0',
2935               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2936     }
2937
2938   str_idx = path->next_idx ? path->next_idx : top_str;
2939
2940   /* Temporary modify MCTX.  */
2941   backup_state_log = mctx->state_log;
2942   backup_cur_idx = mctx->input.cur_idx;
2943   mctx->state_log = path->array;
2944   mctx->input.cur_idx = str_idx;
2945
2946   /* Setup initial node set.  */
2947   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2948   if (str_idx == top_str)
2949     {
2950       err = re_node_set_init_1 (&next_nodes, top_node);
2951       if (BE (err != REG_NOERROR, 0))
2952         return err;
2953       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2954       if (BE (err != REG_NOERROR, 0))
2955         {
2956           re_node_set_free (&next_nodes);
2957           return err;
2958         }
2959     }
2960   else
2961     {
2962       cur_state = mctx->state_log[str_idx];
2963       if (cur_state && cur_state->has_backref)
2964         {
2965           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2966           if (BE (err != REG_NOERROR, 0))
2967             return err;
2968         }
2969       else
2970         re_node_set_init_empty (&next_nodes);
2971     }
2972   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2973     {
2974       if (next_nodes.nelem)
2975         {
2976           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2977                                     subexp_num, type);
2978           if (BE (err != REG_NOERROR, 0))
2979             {
2980               re_node_set_free (&next_nodes);
2981               return err;
2982             }
2983         }
2984       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2985       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2986         {
2987           re_node_set_free (&next_nodes);
2988           return err;
2989         }
2990       mctx->state_log[str_idx] = cur_state;
2991     }
2992
2993   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2994     {
2995       re_node_set_empty (&next_nodes);
2996       if (mctx->state_log[str_idx + 1])
2997         {
2998           err = re_node_set_merge (&next_nodes,
2999                                    &mctx->state_log[str_idx + 1]->nodes);
3000           if (BE (err != REG_NOERROR, 0))
3001             {
3002               re_node_set_free (&next_nodes);
3003               return err;
3004             }
3005         }
3006       if (cur_state)
3007         {
3008           err = check_arrival_add_next_nodes (mctx, str_idx,
3009                                               &cur_state->non_eps_nodes,
3010                                               &next_nodes);
3011           if (BE (err != REG_NOERROR, 0))
3012             {
3013               re_node_set_free (&next_nodes);
3014               return err;
3015             }
3016         }
3017       ++str_idx;
3018       if (next_nodes.nelem)
3019         {
3020           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3021           if (BE (err != REG_NOERROR, 0))
3022             {
3023               re_node_set_free (&next_nodes);
3024               return err;
3025             }
3026           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3027                                     subexp_num, type);
3028           if (BE (err != REG_NOERROR, 0))
3029             {
3030               re_node_set_free (&next_nodes);
3031               return err;
3032             }
3033         }
3034       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3035       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3036       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3037         {
3038           re_node_set_free (&next_nodes);
3039           return err;
3040         }
3041       mctx->state_log[str_idx] = cur_state;
3042       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3043     }
3044   re_node_set_free (&next_nodes);
3045   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3046                : &mctx->state_log[last_str]->nodes);
3047   path->next_idx = str_idx;
3048
3049   /* Fix MCTX.  */
3050   mctx->state_log = backup_state_log;
3051   mctx->input.cur_idx = backup_cur_idx;
3052
3053   /* Then check the current node set has the node LAST_NODE.  */
3054   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3055     return REG_NOERROR;
3056
3057   return REG_NOMATCH;
3058 }
3059
3060 /* Helper functions for check_arrival.  */
3061
3062 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3063    to NEXT_NODES.
3064    TODO: This function is similar to the functions transit_state*(),
3065          however this function has many additional works.
3066          Can't we unify them?  */
3067
3068 static reg_errcode_t
3069 internal_function __attribute_warn_unused_result__
3070 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3071                               re_node_set *cur_nodes, re_node_set *next_nodes)
3072 {
3073   const re_dfa_t *const dfa = mctx->dfa;
3074   int result;
3075   int cur_idx;
3076 #ifdef RE_ENABLE_I18N
3077   reg_errcode_t err = REG_NOERROR;
3078 #endif
3079   re_node_set union_set;
3080   re_node_set_init_empty (&union_set);
3081   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3082     {
3083       int naccepted = 0;
3084       int cur_node = cur_nodes->elems[cur_idx];
3085 #ifdef DEBUG
3086       re_token_type_t type = dfa->nodes[cur_node].type;
3087       assert (!IS_EPSILON_NODE (type));
3088 #endif
3089 #ifdef RE_ENABLE_I18N
3090       /* If the node may accept `multi byte'.  */
3091       if (dfa->nodes[cur_node].accept_mb)
3092         {
3093           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3094                                                str_idx);
3095           if (naccepted > 1)
3096             {
3097               re_dfastate_t *dest_state;
3098               int next_node = dfa->nexts[cur_node];
3099               int next_idx = str_idx + naccepted;
3100               dest_state = mctx->state_log[next_idx];
3101               re_node_set_empty (&union_set);
3102               if (dest_state)
3103                 {
3104                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3105                   if (BE (err != REG_NOERROR, 0))
3106                     {
3107                       re_node_set_free (&union_set);
3108                       return err;
3109                     }
3110                 }
3111               result = re_node_set_insert (&union_set, next_node);
3112               if (BE (result < 0, 0))
3113                 {
3114                   re_node_set_free (&union_set);
3115                   return REG_ESPACE;
3116                 }
3117               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3118                                                             &union_set);
3119               if (BE (mctx->state_log[next_idx] == NULL
3120                       && err != REG_NOERROR, 0))
3121                 {
3122                   re_node_set_free (&union_set);
3123                   return err;
3124                 }
3125             }
3126         }
3127 #endif /* RE_ENABLE_I18N */
3128       if (naccepted
3129           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3130         {
3131           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3132           if (BE (result < 0, 0))
3133             {
3134               re_node_set_free (&union_set);
3135               return REG_ESPACE;
3136             }
3137         }
3138     }
3139   re_node_set_free (&union_set);
3140   return REG_NOERROR;
3141 }
3142
3143 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3144    CUR_NODES, however exclude the nodes which are:
3145     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3146     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3147 */
3148
3149 static reg_errcode_t
3150 internal_function
3151 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3152                           int ex_subexp, int type)
3153 {
3154   reg_errcode_t err;
3155   int idx, outside_node;
3156   re_node_set new_nodes;
3157 #ifdef DEBUG
3158   assert (cur_nodes->nelem);
3159 #endif
3160   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3161   if (BE (err != REG_NOERROR, 0))
3162     return err;
3163   /* Create a new node set NEW_NODES with the nodes which are epsilon
3164      closures of the node in CUR_NODES.  */
3165
3166   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3167     {
3168       int cur_node = cur_nodes->elems[idx];
3169       const re_node_set *eclosure = dfa->eclosures + cur_node;
3170       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3171       if (outside_node == -1)
3172         {
3173           /* There are no problematic nodes, just merge them.  */
3174           err = re_node_set_merge (&new_nodes, eclosure);
3175           if (BE (err != REG_NOERROR, 0))
3176             {
3177               re_node_set_free (&new_nodes);
3178               return err;
3179             }
3180         }
3181       else
3182         {
3183           /* There are problematic nodes, re-calculate incrementally.  */
3184           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3185                                               ex_subexp, type);
3186           if (BE (err != REG_NOERROR, 0))
3187             {
3188               re_node_set_free (&new_nodes);
3189               return err;
3190             }
3191         }
3192     }
3193   re_node_set_free (cur_nodes);
3194   *cur_nodes = new_nodes;
3195   return REG_NOERROR;
3196 }
3197
3198 /* Helper function for check_arrival_expand_ecl.
3199    Check incrementally the epsilon closure of TARGET, and if it isn't
3200    problematic append it to DST_NODES.  */
3201
3202 static reg_errcode_t
3203 internal_function __attribute_warn_unused_result__
3204 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3205                               int target, int ex_subexp, int type)
3206 {
3207   int cur_node;
3208   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3209     {
3210       int err;
3211
3212       if (dfa->nodes[cur_node].type == type
3213           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3214         {
3215           if (type == OP_CLOSE_SUBEXP)
3216             {
3217               err = re_node_set_insert (dst_nodes, cur_node);
3218               if (BE (err == -1, 0))
3219                 return REG_ESPACE;
3220             }
3221           break;
3222         }
3223       err = re_node_set_insert (dst_nodes, cur_node);
3224       if (BE (err == -1, 0))
3225         return REG_ESPACE;
3226       if (dfa->edests[cur_node].nelem == 0)
3227         break;
3228       if (dfa->edests[cur_node].nelem == 2)
3229         {
3230           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3231                                               dfa->edests[cur_node].elems[1],
3232                                               ex_subexp, type);
3233           if (BE (err != REG_NOERROR, 0))
3234             return err;
3235         }
3236       cur_node = dfa->edests[cur_node].elems[0];
3237     }
3238   return REG_NOERROR;
3239 }
3240
3241
3242 /* For all the back references in the current state, calculate the
3243    destination of the back references by the appropriate entry
3244    in MCTX->BKREF_ENTS.  */
3245
3246 static reg_errcode_t
3247 internal_function __attribute_warn_unused_result__
3248 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3249                     int cur_str, int subexp_num, int type)
3250 {
3251   const re_dfa_t *const dfa = mctx->dfa;
3252   reg_errcode_t err;
3253   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3254   struct re_backref_cache_entry *ent;
3255
3256   if (cache_idx_start == -1)
3257     return REG_NOERROR;
3258
3259  restart:
3260   ent = mctx->bkref_ents + cache_idx_start;
3261   do
3262     {
3263       int to_idx, next_node;
3264
3265       /* Is this entry ENT is appropriate?  */
3266       if (!re_node_set_contains (cur_nodes, ent->node))
3267         continue; /* No.  */
3268
3269       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3270       /* Calculate the destination of the back reference, and append it
3271          to MCTX->STATE_LOG.  */
3272       if (to_idx == cur_str)
3273         {
3274           /* The backreference did epsilon transit, we must re-check all the
3275              node in the current state.  */
3276           re_node_set new_dests;
3277           reg_errcode_t err2, err3;
3278           next_node = dfa->edests[ent->node].elems[0];
3279           if (re_node_set_contains (cur_nodes, next_node))
3280             continue;
3281           err = re_node_set_init_1 (&new_dests, next_node);
3282           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3283           err3 = re_node_set_merge (cur_nodes, &new_dests);
3284           re_node_set_free (&new_dests);
3285           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3286                   || err3 != REG_NOERROR, 0))
3287             {
3288               err = (err != REG_NOERROR ? err
3289                      : (err2 != REG_NOERROR ? err2 : err3));
3290               return err;
3291             }
3292           /* TODO: It is still inefficient...  */
3293           goto restart;
3294         }
3295       else
3296         {
3297           re_node_set union_set;
3298           next_node = dfa->nexts[ent->node];
3299           if (mctx->state_log[to_idx])
3300             {
3301               int ret;
3302               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3303                                         next_node))
3304                 continue;
3305               err = re_node_set_init_copy (&union_set,
3306                                            &mctx->state_log[to_idx]->nodes);
3307               ret = re_node_set_insert (&union_set, next_node);
3308               if (BE (err != REG_NOERROR || ret < 0, 0))
3309                 {
3310                   re_node_set_free (&union_set);
3311                   err = err != REG_NOERROR ? err : REG_ESPACE;
3312                   return err;
3313                 }
3314             }
3315           else
3316             {
3317               err = re_node_set_init_1 (&union_set, next_node);
3318               if (BE (err != REG_NOERROR, 0))
3319                 return err;
3320             }
3321           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3322           re_node_set_free (&union_set);
3323           if (BE (mctx->state_log[to_idx] == NULL
3324                   && err != REG_NOERROR, 0))
3325             return err;
3326         }
3327     }
3328   while (ent++->more);
3329   return REG_NOERROR;
3330 }
3331
3332 /* Build transition table for the state.
3333    Return 1 if succeeded, otherwise return NULL.  */
3334
3335 static int
3336 internal_function
3337 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3338 {
3339   reg_errcode_t err;
3340   int i, j, ch, need_word_trtable = 0;
3341   bitset_word_t elem, mask;
3342   bool dests_node_malloced = false;
3343   bool dest_states_malloced = false;
3344   int ndests; /* Number of the destination states from `state'.  */
3345   re_dfastate_t **trtable;
3346   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3347   re_node_set follows, *dests_node;
3348   bitset_t *dests_ch;
3349   bitset_t acceptable;
3350
3351   struct dests_alloc
3352   {
3353     re_node_set dests_node[SBC_MAX];
3354     bitset_t dests_ch[SBC_MAX];
3355   } *dests_alloc;
3356
3357   /* We build DFA states which corresponds to the destination nodes
3358      from `state'.  `dests_node[i]' represents the nodes which i-th
3359      destination state contains, and `dests_ch[i]' represents the
3360      characters which i-th destination state accepts.  */
3361 #ifdef HAVE_ALLOCA
3362   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3363     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3364   else
3365 #endif
3366     {
3367       dests_alloc = re_malloc (struct dests_alloc, 1);
3368       if (BE (dests_alloc == NULL, 0))
3369         return 0;
3370       dests_node_malloced = true;
3371     }
3372   dests_node = dests_alloc->dests_node;
3373   dests_ch = dests_alloc->dests_ch;
3374
3375   /* Initialize transiton table.  */
3376   state->word_trtable = state->trtable = NULL;
3377
3378   /* At first, group all nodes belonging to `state' into several
3379      destinations.  */
3380   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3381   if (BE (ndests <= 0, 0))
3382     {
3383       if (dests_node_malloced)
3384         free (dests_alloc);
3385       /* Return 0 in case of an error, 1 otherwise.  */
3386       if (ndests == 0)
3387         {
3388           state->trtable = (re_dfastate_t **)
3389             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3390           if (BE (state->trtable == NULL, 0))
3391             return 0;
3392           return 1;
3393         }
3394       return 0;
3395     }
3396
3397   err = re_node_set_alloc (&follows, ndests + 1);
3398   if (BE (err != REG_NOERROR, 0))
3399     goto out_free;
3400
3401   /* Avoid arithmetic overflow in size calculation.  */
3402   if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3403             / (3 * sizeof (re_dfastate_t *)))
3404            < ndests),
3405           0))
3406     goto out_free;
3407
3408 #ifdef HAVE_ALLOCA
3409   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3410                          + ndests * 3 * sizeof (re_dfastate_t *)))
3411     dest_states = (re_dfastate_t **)
3412       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3413   else
3414 #endif
3415     {
3416       dest_states = (re_dfastate_t **)
3417         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3418       if (BE (dest_states == NULL, 0))
3419         {
3420 out_free:
3421           if (dest_states_malloced)
3422             free (dest_states);
3423           re_node_set_free (&follows);
3424           for (i = 0; i < ndests; ++i)
3425             re_node_set_free (dests_node + i);
3426           if (dests_node_malloced)
3427             free (dests_alloc);
3428           return 0;
3429         }
3430       dest_states_malloced = true;
3431     }
3432   dest_states_word = dest_states + ndests;
3433   dest_states_nl = dest_states_word + ndests;
3434   bitset_empty (acceptable);
3435
3436   /* Then build the states for all destinations.  */
3437   for (i = 0; i < ndests; ++i)
3438     {
3439       int next_node;
3440       re_node_set_empty (&follows);
3441       /* Merge the follows of this destination states.  */
3442       for (j = 0; j < dests_node[i].nelem; ++j)
3443         {
3444           next_node = dfa->nexts[dests_node[i].elems[j]];
3445           if (next_node != -1)
3446             {
3447               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3448               if (BE (err != REG_NOERROR, 0))
3449                 goto out_free;
3450             }
3451         }
3452       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3453       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3454         goto out_free;
3455       /* If the new state has context constraint,
3456          build appropriate states for these contexts.  */
3457       if (dest_states[i]->has_constraint)
3458         {
3459           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3460                                                           CONTEXT_WORD);
3461           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3462             goto out_free;
3463
3464           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3465             need_word_trtable = 1;
3466
3467           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3468                                                         CONTEXT_NEWLINE);
3469           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3470             goto out_free;
3471         }
3472       else
3473         {
3474           dest_states_word[i] = dest_states[i];
3475           dest_states_nl[i] = dest_states[i];
3476         }
3477       bitset_merge (acceptable, dests_ch[i]);
3478     }
3479
3480   if (!BE (need_word_trtable, 0))
3481     {
3482       /* We don't care about whether the following character is a word
3483          character, or we are in a single-byte character set so we can
3484          discern by looking at the character code: allocate a
3485          256-entry transition table.  */
3486       trtable = state->trtable =
3487         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3488       if (BE (trtable == NULL, 0))
3489         goto out_free;
3490
3491       /* For all characters ch...:  */
3492       for (i = 0; i < BITSET_WORDS; ++i)
3493         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3494              elem;
3495              mask <<= 1, elem >>= 1, ++ch)
3496           if (BE (elem & 1, 0))
3497             {
3498               /* There must be exactly one destination which accepts
3499                  character ch.  See group_nodes_into_DFAstates.  */
3500               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3501                 ;
3502
3503               /* j-th destination accepts the word character ch.  */
3504               if (dfa->word_char[i] & mask)
3505                 trtable[ch] = dest_states_word[j];
3506               else
3507                 trtable[ch] = dest_states[j];
3508             }
3509     }
3510   else
3511     {
3512       /* We care about whether the following character is a word
3513          character, and we are in a multi-byte character set: discern
3514          by looking at the character code: build two 256-entry
3515          transition tables, one starting at trtable[0] and one
3516          starting at trtable[SBC_MAX].  */
3517       trtable = state->word_trtable =
3518         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3519       if (BE (trtable == NULL, 0))
3520         goto out_free;
3521
3522       /* For all characters ch...:  */
3523       for (i = 0; i < BITSET_WORDS; ++i)
3524         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3525              elem;
3526              mask <<= 1, elem >>= 1, ++ch)
3527           if (BE (elem & 1, 0))
3528             {
3529               /* There must be exactly one destination which accepts
3530                  character ch.  See group_nodes_into_DFAstates.  */
3531               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3532                 ;
3533
3534               /* j-th destination accepts the word character ch.  */
3535               trtable[ch] = dest_states[j];
3536               trtable[ch + SBC_MAX] = dest_states_word[j];
3537             }
3538     }
3539
3540   /* new line */
3541   if (bitset_contain (acceptable, NEWLINE_CHAR))
3542     {
3543       /* The current state accepts newline character.  */
3544       for (j = 0; j < ndests; ++j)
3545         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3546           {
3547             /* k-th destination accepts newline character.  */
3548             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3549             if (need_word_trtable)
3550               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3551             /* There must be only one destination which accepts
3552                newline.  See group_nodes_into_DFAstates.  */
3553             break;
3554           }
3555     }
3556
3557   if (dest_states_malloced)
3558     free (dest_states);
3559
3560   re_node_set_free (&follows);
3561   for (i = 0; i < ndests; ++i)
3562     re_node_set_free (dests_node + i);
3563
3564   if (dests_node_malloced)
3565     free (dests_alloc);
3566
3567   return 1;
3568 }
3569
3570 /* Group all nodes belonging to STATE into several destinations.
3571    Then for all destinations, set the nodes belonging to the destination
3572    to DESTS_NODE[i] and set the characters accepted by the destination
3573    to DEST_CH[i].  This function return the number of destinations.  */
3574
3575 static int
3576 internal_function
3577 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3578                             re_node_set *dests_node, bitset_t *dests_ch)
3579 {
3580   reg_errcode_t err;
3581   int result;
3582   int i, j, k;
3583   int ndests; /* Number of the destinations from `state'.  */
3584   bitset_t accepts; /* Characters a node can accept.  */
3585   const re_node_set *cur_nodes = &state->nodes;
3586   bitset_empty (accepts);
3587   ndests = 0;
3588
3589   /* For all the nodes belonging to `state',  */
3590   for (i = 0; i < cur_nodes->nelem; ++i)
3591     {
3592       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3593       re_token_type_t type = node->type;
3594       unsigned int constraint = node->constraint;
3595
3596       /* Enumerate all single byte character this node can accept.  */
3597       if (type == CHARACTER)
3598         bitset_set (accepts, node->opr.c);
3599       else if (type == SIMPLE_BRACKET)
3600         {
3601           bitset_merge (accepts, node->opr.sbcset);
3602         }
3603       else if (type == OP_PERIOD)
3604         {
3605 #ifdef RE_ENABLE_I18N
3606           if (dfa->mb_cur_max > 1)
3607             bitset_merge (accepts, dfa->sb_char);
3608           else
3609 #endif
3610             bitset_set_all (accepts);
3611           if (!(dfa->syntax & RE_DOT_NEWLINE))
3612             bitset_clear (accepts, '\n');
3613           if (dfa->syntax & RE_DOT_NOT_NULL)
3614             bitset_clear (accepts, '\0');
3615         }
3616 #ifdef RE_ENABLE_I18N
3617       else if (type == OP_UTF8_PERIOD)
3618         {
3619           memset (accepts, '\xff', sizeof (bitset_t) / 2);
3620           if (!(dfa->syntax & RE_DOT_NEWLINE))
3621             bitset_clear (accepts, '\n');
3622           if (dfa->syntax & RE_DOT_NOT_NULL)
3623             bitset_clear (accepts, '\0');
3624         }
3625 #endif
3626       else
3627         continue;
3628
3629       /* Check the `accepts' and sift the characters which are not
3630          match it the context.  */
3631       if (constraint)
3632         {
3633           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3634             {
3635               bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3636               bitset_empty (accepts);
3637               if (accepts_newline)
3638                 bitset_set (accepts, NEWLINE_CHAR);
3639               else
3640                 continue;
3641             }
3642           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3643             {
3644               bitset_empty (accepts);
3645               continue;
3646             }
3647
3648           if (constraint & NEXT_WORD_CONSTRAINT)
3649             {
3650               bitset_word_t any_set = 0;
3651               if (type == CHARACTER && !node->word_char)
3652                 {
3653                   bitset_empty (accepts);
3654                   continue;
3655                 }
3656 #ifdef RE_ENABLE_I18N
3657               if (dfa->mb_cur_max > 1)
3658                 for (j = 0; j < BITSET_WORDS; ++j)
3659                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3660               else
3661 #endif
3662                 for (j = 0; j < BITSET_WORDS; ++j)
3663                   any_set |= (accepts[j] &= dfa->word_char[j]);
3664               if (!any_set)
3665                 continue;
3666             }
3667           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3668             {
3669               bitset_word_t any_set = 0;
3670               if (type == CHARACTER && node->word_char)
3671                 {
3672                   bitset_empty (accepts);
3673                   continue;
3674                 }
3675 #ifdef RE_ENABLE_I18N
3676               if (dfa->mb_cur_max > 1)
3677                 for (j = 0; j < BITSET_WORDS; ++j)
3678                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3679               else
3680 #endif
3681                 for (j = 0; j < BITSET_WORDS; ++j)
3682                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3683               if (!any_set)
3684                 continue;
3685             }
3686         }
3687
3688       /* Then divide `accepts' into DFA states, or create a new
3689          state.  Above, we make sure that accepts is not empty.  */
3690       for (j = 0; j < ndests; ++j)
3691         {
3692           bitset_t intersec; /* Intersection sets, see below.  */
3693           bitset_t remains;
3694           /* Flags, see below.  */
3695           bitset_word_t has_intersec, not_subset, not_consumed;
3696
3697           /* Optimization, skip if this state doesn't accept the character.  */
3698           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3699             continue;
3700
3701           /* Enumerate the intersection set of this state and `accepts'.  */
3702           has_intersec = 0;
3703           for (k = 0; k < BITSET_WORDS; ++k)
3704             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3705           /* And skip if the intersection set is empty.  */
3706           if (!has_intersec)
3707             continue;
3708
3709           /* Then check if this state is a subset of `accepts'.  */
3710           not_subset = not_consumed = 0;
3711           for (k = 0; k < BITSET_WORDS; ++k)
3712             {
3713               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3714               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3715             }
3716
3717           /* If this state isn't a subset of `accepts', create a
3718              new group state, which has the `remains'. */
3719           if (not_subset)
3720             {
3721               bitset_copy (dests_ch[ndests], remains);
3722               bitset_copy (dests_ch[j], intersec);
3723               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3724               if (BE (err != REG_NOERROR, 0))
3725                 goto error_return;
3726               ++ndests;
3727             }
3728
3729           /* Put the position in the current group. */
3730           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3731           if (BE (result < 0, 0))
3732             goto error_return;
3733
3734           /* If all characters are consumed, go to next node. */
3735           if (!not_consumed)
3736             break;
3737         }
3738       /* Some characters remain, create a new group. */
3739       if (j == ndests)
3740         {
3741           bitset_copy (dests_ch[ndests], accepts);
3742           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3743           if (BE (err != REG_NOERROR, 0))
3744             goto error_return;
3745           ++ndests;
3746           bitset_empty (accepts);
3747         }
3748     }
3749   return ndests;
3750  error_return:
3751   for (j = 0; j < ndests; ++j)
3752     re_node_set_free (dests_node + j);
3753   return -1;
3754 }
3755
3756 #ifdef RE_ENABLE_I18N
3757 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3758    Return the number of the bytes the node accepts.
3759    STR_IDX is the current index of the input string.
3760
3761    This function handles the nodes which can accept one character, or
3762    one collating element like '.', '[a-z]', opposite to the other nodes
3763    can only accept one byte.  */
3764
3765 static int
3766 internal_function
3767 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3768                          const re_string_t *input, int str_idx)
3769 {
3770   const re_token_t *node = dfa->nodes + node_idx;
3771   int char_len, elem_len;
3772   int i;
3773   wint_t wc;
3774
3775   if (BE (node->type == OP_UTF8_PERIOD, 0))
3776     {
3777       unsigned char c = re_string_byte_at (input, str_idx), d;
3778       if (BE (c < 0xc2, 1))
3779         return 0;
3780
3781       if (str_idx + 2 > input->len)
3782         return 0;
3783
3784       d = re_string_byte_at (input, str_idx + 1);
3785       if (c < 0xe0)
3786         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3787       else if (c < 0xf0)
3788         {
3789           char_len = 3;
3790           if (c == 0xe0 && d < 0xa0)
3791             return 0;
3792         }
3793       else if (c < 0xf8)
3794         {
3795           char_len = 4;
3796           if (c == 0xf0 && d < 0x90)
3797             return 0;
3798         }
3799       else if (c < 0xfc)
3800         {
3801           char_len = 5;
3802           if (c == 0xf8 && d < 0x88)
3803             return 0;
3804         }
3805       else if (c < 0xfe)
3806         {
3807           char_len = 6;
3808           if (c == 0xfc && d < 0x84)
3809             return 0;
3810         }
3811       else
3812         return 0;
3813
3814       if (str_idx + char_len > input->len)
3815         return 0;
3816
3817       for (i = 1; i < char_len; ++i)
3818         {
3819           d = re_string_byte_at (input, str_idx + i);
3820           if (d < 0x80 || d > 0xbf)
3821             return 0;
3822         }
3823       return char_len;
3824     }
3825
3826   char_len = re_string_char_size_at (input, str_idx);
3827   if (node->type == OP_PERIOD)
3828     {
3829       if (char_len <= 1)
3830         return 0;
3831       /* FIXME: I don't think this if is needed, as both '\n'
3832          and '\0' are char_len == 1.  */
3833       /* '.' accepts any one character except the following two cases.  */
3834       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3835            re_string_byte_at (input, str_idx) == '\n') ||
3836           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3837            re_string_byte_at (input, str_idx) == '\0'))
3838         return 0;
3839       return char_len;
3840     }
3841
3842   elem_len = re_string_elem_size_at (input, str_idx);
3843   wc = __btowc(*(input->mbs+str_idx));
3844   if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3845     return 0;
3846
3847   if (node->type == COMPLEX_BRACKET)
3848     {
3849       const re_charset_t *cset = node->opr.mbcset;
3850 # ifdef _LIBC
3851       const unsigned char *pin
3852         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3853       int j;
3854       uint32_t nrules;
3855 # endif /* _LIBC */
3856       int match_len = 0;
3857       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3858                     ? re_string_wchar_at (input, str_idx) : 0);
3859
3860       /* match with multibyte character?  */
3861       for (i = 0; i < cset->nmbchars; ++i)
3862         if (wc == cset->mbchars[i])
3863           {
3864             match_len = char_len;
3865             goto check_node_accept_bytes_match;
3866           }
3867       /* match with character_class?  */
3868       for (i = 0; i < cset->nchar_classes; ++i)
3869         {
3870           wctype_t wt = cset->char_classes[i];
3871           if (__iswctype (wc, wt))
3872             {
3873               match_len = char_len;
3874               goto check_node_accept_bytes_match;
3875             }
3876         }
3877
3878 # ifdef _LIBC
3879       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3880       if (nrules != 0)
3881         {
3882           unsigned int in_collseq = 0;
3883           const int32_t *table, *indirect;
3884           const unsigned char *weights, *extra;
3885           const char *collseqwc;
3886           /* This #include defines a local function!  */
3887 #  include <locale/weight.h>
3888
3889           /* match with collating_symbol?  */
3890           if (cset->ncoll_syms)
3891             extra = (const unsigned char *)
3892               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3893           for (i = 0; i < cset->ncoll_syms; ++i)
3894             {
3895               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3896               /* Compare the length of input collating element and
3897                  the length of current collating element.  */
3898               if (*coll_sym != elem_len)
3899                 continue;
3900               /* Compare each bytes.  */
3901               for (j = 0; j < *coll_sym; j++)
3902                 if (pin[j] != coll_sym[1 + j])
3903                   break;
3904               if (j == *coll_sym)
3905                 {
3906                   /* Match if every bytes is equal.  */
3907                   match_len = j;
3908                   goto check_node_accept_bytes_match;
3909                 }
3910             }
3911
3912           if (cset->nranges)
3913             {
3914               if (elem_len <= char_len)
3915                 {
3916                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3917                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3918                 }
3919               else
3920                 in_collseq = find_collation_sequence_value (pin, elem_len);
3921             }
3922           /* match with range expression?  */
3923           for (i = 0; i < cset->nranges; ++i)
3924             if (cset->range_starts[i] <= in_collseq
3925                 && in_collseq <= cset->range_ends[i])
3926               {
3927                 match_len = elem_len;
3928                 goto check_node_accept_bytes_match;
3929               }
3930
3931           /* match with equivalence_class?  */
3932           if (cset->nequiv_classes)
3933             {
3934               const unsigned char *cp = pin;
3935               table = (const int32_t *)
3936                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3937               weights = (const unsigned char *)
3938                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3939               extra = (const unsigned char *)
3940                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3941               indirect = (const int32_t *)
3942                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3943               int32_t idx = findidx (&cp, elem_len);
3944               if (idx > 0)
3945                 for (i = 0; i < cset->nequiv_classes; ++i)
3946                   {
3947                     int32_t equiv_class_idx = cset->equiv_classes[i];
3948                     size_t weight_len = weights[idx & 0xffffff];
3949                     if (weight_len == weights[equiv_class_idx & 0xffffff]
3950                         && (idx >> 24) == (equiv_class_idx >> 24))
3951                       {
3952                         int cnt = 0;
3953
3954                         idx &= 0xffffff;
3955                         equiv_class_idx &= 0xffffff;
3956
3957                         while (cnt <= weight_len
3958                                && (weights[equiv_class_idx + 1 + cnt]
3959                                    == weights[idx + 1 + cnt]))
3960                           ++cnt;
3961                         if (cnt > weight_len)
3962                           {
3963                             match_len = elem_len;
3964                             goto check_node_accept_bytes_match;
3965                           }
3966                       }
3967                   }
3968             }
3969         }
3970       else
3971 # endif /* _LIBC */
3972         {
3973           /* match with range expression?  */
3974           for (i = 0; i < cset->nranges; ++i)
3975             {
3976               if (cset->range_starts[i] <= wc
3977                   && wc <= cset->range_ends[i])
3978                 {
3979                   match_len = char_len;
3980                   goto check_node_accept_bytes_match;
3981                 }
3982             }
3983         }
3984     check_node_accept_bytes_match:
3985       if (!cset->non_match)
3986         return match_len;
3987       else
3988         {
3989           if (match_len > 0)
3990             return 0;
3991           else
3992             return (elem_len > char_len) ? elem_len : char_len;
3993         }
3994     }
3995   return 0;
3996 }
3997
3998 # ifdef _LIBC
3999 static unsigned int
4000 internal_function
4001 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
4002 {
4003   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4004   if (nrules == 0)
4005     {
4006       if (mbs_len == 1)
4007         {
4008           /* No valid character.  Match it as a single byte character.  */
4009           const unsigned char *collseq = (const unsigned char *)
4010             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4011           return collseq[mbs[0]];
4012         }
4013       return UINT_MAX;
4014     }
4015   else
4016     {
4017       int32_t idx;
4018       const unsigned char *extra = (const unsigned char *)
4019         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4020       int32_t extrasize = (const unsigned char *)
4021         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4022
4023       for (idx = 0; idx < extrasize;)
4024         {
4025           int mbs_cnt, found = 0;
4026           int32_t elem_mbs_len;
4027           /* Skip the name of collating element name.  */
4028           idx = idx + extra[idx] + 1;
4029           elem_mbs_len = extra[idx++];
4030           if (mbs_len == elem_mbs_len)
4031             {
4032               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4033                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4034                   break;
4035               if (mbs_cnt == elem_mbs_len)
4036                 /* Found the entry.  */
4037                 found = 1;
4038             }
4039           /* Skip the byte sequence of the collating element.  */
4040           idx += elem_mbs_len;
4041           /* Adjust for the alignment.  */
4042           idx = (idx + 3) & ~3;
4043           /* Skip the collation sequence value.  */
4044           idx += sizeof (uint32_t);
4045           /* Skip the wide char sequence of the collating element.  */
4046           idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4047           /* If we found the entry, return the sequence value.  */
4048           if (found)
4049             return *(uint32_t *) (extra + idx);
4050           /* Skip the collation sequence value.  */
4051           idx += sizeof (uint32_t);
4052         }
4053       return UINT_MAX;
4054     }
4055 }
4056 # endif /* _LIBC */
4057 #endif /* RE_ENABLE_I18N */
4058
4059 /* Check whether the node accepts the byte which is IDX-th
4060    byte of the INPUT.  */
4061
4062 static int
4063 internal_function
4064 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4065                    int idx)
4066 {
4067   unsigned char ch;
4068   ch = re_string_byte_at (&mctx->input, idx);
4069   switch (node->type)
4070     {
4071     case CHARACTER:
4072       if (node->opr.c != ch)
4073         return 0;
4074       break;
4075
4076     case SIMPLE_BRACKET:
4077       if (!bitset_contain (node->opr.sbcset, ch))
4078         return 0;
4079       break;
4080
4081 #ifdef RE_ENABLE_I18N
4082     case OP_UTF8_PERIOD:
4083       if (ch >= 0x80)
4084         return 0;
4085       /* FALLTHROUGH */
4086 #endif
4087     case OP_PERIOD:
4088       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4089           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4090         return 0;
4091       break;
4092
4093     default:
4094       return 0;
4095     }
4096
4097   if (node->constraint)
4098     {
4099       /* The node has constraints.  Check whether the current context
4100          satisfies the constraints.  */
4101       unsigned int context = re_string_context_at (&mctx->input, idx,
4102                                                    mctx->eflags);
4103       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4104         return 0;
4105     }
4106
4107   return 1;
4108 }
4109
4110 /* Extend the buffers, if the buffers have run out.  */
4111
4112 static reg_errcode_t
4113 internal_function __attribute_warn_unused_result__
4114 extend_buffers (re_match_context_t *mctx, int min_len)
4115 {
4116   reg_errcode_t ret;
4117   re_string_t *pstr = &mctx->input;
4118
4119   /* Avoid overflow.  */
4120   if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4121     return REG_ESPACE;
4122
4123   /* Double the lengthes of the buffers, but allocate at least MIN_LEN.  */
4124   ret = re_string_realloc_buffers (pstr,
4125                                    MAX (min_len,
4126                                         MIN (pstr->len, pstr->bufs_len * 2)));
4127   if (BE (ret != REG_NOERROR, 0))
4128     return ret;
4129
4130   if (mctx->state_log != NULL)
4131     {
4132       /* And double the length of state_log.  */
4133       /* XXX We have no indication of the size of this buffer.  If this
4134          allocation fail we have no indication that the state_log array
4135          does not have the right size.  */
4136       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4137                                               pstr->bufs_len + 1);
4138       if (BE (new_array == NULL, 0))
4139         return REG_ESPACE;
4140       mctx->state_log = new_array;
4141     }
4142
4143   /* Then reconstruct the buffers.  */
4144   if (pstr->icase)
4145     {
4146 #ifdef RE_ENABLE_I18N
4147       if (pstr->mb_cur_max > 1)
4148         {
4149           ret = build_wcs_upper_buffer (pstr);
4150           if (BE (ret != REG_NOERROR, 0))
4151             return ret;
4152         }
4153       else
4154 #endif /* RE_ENABLE_I18N  */
4155         build_upper_buffer (pstr);
4156     }
4157   else
4158     {
4159 #ifdef RE_ENABLE_I18N
4160       if (pstr->mb_cur_max > 1)
4161         build_wcs_buffer (pstr);
4162       else
4163 #endif /* RE_ENABLE_I18N  */
4164         {
4165           if (pstr->trans != NULL)
4166             re_string_translate_buffer (pstr);
4167         }
4168     }
4169   return REG_NOERROR;
4170 }
4171
4172 \f
4173 /* Functions for matching context.  */
4174
4175 /* Initialize MCTX.  */
4176
4177 static reg_errcode_t
4178 internal_function __attribute_warn_unused_result__
4179 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4180 {
4181   mctx->eflags = eflags;
4182   mctx->match_last = -1;
4183   if (n > 0)
4184     {
4185       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4186       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4187       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4188         return REG_ESPACE;
4189     }
4190   /* Already zero-ed by the caller.
4191      else
4192        mctx->bkref_ents = NULL;
4193      mctx->nbkref_ents = 0;
4194      mctx->nsub_tops = 0;  */
4195   mctx->abkref_ents = n;
4196   mctx->max_mb_elem_len = 1;
4197   mctx->asub_tops = n;
4198   return REG_NOERROR;
4199 }
4200
4201 /* Clean the entries which depend on the current input in MCTX.
4202    This function must be invoked when the matcher changes the start index
4203    of the input, or changes the input string.  */
4204
4205 static void
4206 internal_function
4207 match_ctx_clean (re_match_context_t *mctx)
4208 {
4209   int st_idx;
4210   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4211     {
4212       int sl_idx;
4213       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4214       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4215         {
4216           re_sub_match_last_t *last = top->lasts[sl_idx];
4217           re_free (last->path.array);
4218           re_free (last);
4219         }
4220       re_free (top->lasts);
4221       if (top->path)
4222         {
4223           re_free (top->path->array);
4224           re_free (top->path);
4225         }
4226       free (top);
4227     }
4228
4229   mctx->nsub_tops = 0;
4230   mctx->nbkref_ents = 0;
4231 }
4232
4233 /* Free all the memory associated with MCTX.  */
4234
4235 static void
4236 internal_function
4237 match_ctx_free (re_match_context_t *mctx)
4238 {
4239   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4240   match_ctx_clean (mctx);
4241   re_free (mctx->sub_tops);
4242   re_free (mctx->bkref_ents);
4243 }
4244
4245 /* Add a new backreference entry to MCTX.
4246    Note that we assume that caller never call this function with duplicate
4247    entry, and call with STR_IDX which isn't smaller than any existing entry.
4248 */
4249
4250 static reg_errcode_t
4251 internal_function __attribute_warn_unused_result__
4252 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4253                      int to)
4254 {
4255   if (mctx->nbkref_ents >= mctx->abkref_ents)
4256     {
4257       struct re_backref_cache_entry* new_entry;
4258       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4259                               mctx->abkref_ents * 2);
4260       if (BE (new_entry == NULL, 0))
4261         {
4262           re_free (mctx->bkref_ents);
4263           return REG_ESPACE;
4264         }
4265       mctx->bkref_ents = new_entry;
4266       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4267               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4268       mctx->abkref_ents *= 2;
4269     }
4270   if (mctx->nbkref_ents > 0
4271       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4272     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4273
4274   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4275   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4276   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4277   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4278
4279   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4280      If bit N is clear, means that this entry won't epsilon-transition to
4281      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4282      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4283      such node.
4284
4285      A backreference does not epsilon-transition unless it is empty, so set
4286      to all zeros if FROM != TO.  */
4287   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4288     = (from == to ? ~0 : 0);
4289
4290   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4291   if (mctx->max_mb_elem_len < to - from)
4292     mctx->max_mb_elem_len = to - from;
4293   return REG_NOERROR;
4294 }
4295
4296 /* Search for the first entry which has the same str_idx, or -1 if none is
4297    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4298
4299 static int
4300 internal_function
4301 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4302 {
4303   int left, right, mid, last;
4304   last = right = mctx->nbkref_ents;
4305   for (left = 0; left < right;)
4306     {
4307       mid = (left + right) / 2;
4308       if (mctx->bkref_ents[mid].str_idx < str_idx)
4309         left = mid + 1;
4310       else
4311         right = mid;
4312     }
4313   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4314     return left;
4315   else
4316     return -1;
4317 }
4318
4319 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4320    at STR_IDX.  */
4321
4322 static reg_errcode_t
4323 internal_function __attribute_warn_unused_result__
4324 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4325 {
4326 #ifdef DEBUG
4327   assert (mctx->sub_tops != NULL);
4328   assert (mctx->asub_tops > 0);
4329 #endif
4330   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4331     {
4332       int new_asub_tops = mctx->asub_tops * 2;
4333       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4334                                                    re_sub_match_top_t *,
4335                                                    new_asub_tops);
4336       if (BE (new_array == NULL, 0))
4337         return REG_ESPACE;
4338       mctx->sub_tops = new_array;
4339       mctx->asub_tops = new_asub_tops;
4340     }
4341   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4342   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4343     return REG_ESPACE;
4344   mctx->sub_tops[mctx->nsub_tops]->node = node;
4345   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4346   return REG_NOERROR;
4347 }
4348
4349 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4350    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4351
4352 static re_sub_match_last_t *
4353 internal_function
4354 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4355 {
4356   re_sub_match_last_t *new_entry;
4357   if (BE (subtop->nlasts == subtop->alasts, 0))
4358     {
4359       int new_alasts = 2 * subtop->alasts + 1;
4360       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4361                                                     re_sub_match_last_t *,
4362                                                     new_alasts);
4363       if (BE (new_array == NULL, 0))
4364         return NULL;
4365       subtop->lasts = new_array;
4366       subtop->alasts = new_alasts;
4367     }
4368   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4369   if (BE (new_entry != NULL, 1))
4370     {
4371       subtop->lasts[subtop->nlasts] = new_entry;
4372       new_entry->node = node;
4373       new_entry->str_idx = str_idx;
4374       ++subtop->nlasts;
4375     }
4376   return new_entry;
4377 }
4378
4379 static void
4380 internal_function
4381 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4382                re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4383 {
4384   sctx->sifted_states = sifted_sts;
4385   sctx->limited_states = limited_sts;
4386   sctx->last_node = last_node;
4387   sctx->last_str_idx = last_str_idx;
4388   re_node_set_init_empty (&sctx->limits);
4389 }