posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                           int str_idx, int from, int to)
  27      internal_function;
  28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  29      internal_function;
  30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                            int str_idx) internal_function;
  32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                    int node, int str_idx)
  34      internal_function;
  35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                            re_dfastate_t **limited_sts, int last_node,
  37                            int last_str_idx)
  38      internal_function;
  39 static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                          const char *string, int length,
  41                                          int start, int range, int stop,
  42                                          size_t nmatch, regmatch_t pmatch[],
  43                                          int eflags) internal_function;
  44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  45                              const char *string1, int length1,
  46                              const char *string2, int length2,
  47                              int start, int range, struct re_registers *regs,
  48                              int stop, int ret_len) internal_function;
  49 static int re_search_stub (struct re_pattern_buffer *bufp,
  50                            const char *string, int length, int start,
  51                            int range, int stop, struct re_registers *regs,
  52                            int ret_len) internal_function;
  53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  54                               int nregs, int regs_allocated) internal_function;
  55 static inline re_dfastate_t *acquire_init_state_context
  56      (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
  57      __attribute ((always_inline)) internal_function;
  58 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  59      internal_function;
  60 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  61                            int *p_match_first) internal_function;
  62 static int check_halt_node_context (const re_dfa_t *dfa, int node,
  63                                     unsigned int context) internal_function;
  64 static int check_halt_state_context (const re_match_context_t *mctx,
  65                                      const re_dfastate_t *state, int idx)
  66      internal_function;
  67 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  68                          regmatch_t *prev_idx_match, int cur_node,
  69                          int cur_idx, int nmatch) internal_function;
  70 static int proceed_next_node (const re_match_context_t *mctx,
  71                               int nregs, regmatch_t *regs,
  72                               int *pidx, int node, re_node_set *eps_via_nodes,
  73                               struct re_fail_stack_t *fs) internal_function;
  74 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  75                                       int str_idx, int dest_node, int nregs,
  76                                       regmatch_t *regs,
  77                                       re_node_set *eps_via_nodes)
  78      internal_function;
  79 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
  80                            regmatch_t *regs, re_node_set *eps_via_nodes)
  81      internal_function;
  82 static reg_errcode_t set_regs (const regex_t *preg,
  83                                const re_match_context_t *mctx,
  84                                size_t nmatch, regmatch_t *pmatch,
  85                                int fl_backtrack) internal_function;
  86 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  87      internal_function;
  88
  89 #ifdef RE_ENABLE_I18N
  90 static int sift_states_iter_mb (const re_match_context_t *mctx,
  91                                 re_sift_context_t *sctx,
  92                                 int node_idx, int str_idx, int max_str_idx)
  93      internal_function;
  94 #endif /* RE_ENABLE_I18N */
  95 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  96                                            re_sift_context_t *sctx)
  97      internal_function;
  98 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  99                                           re_sift_context_t *sctx, int str_idx,
 100                                           re_node_set *cur_dest)
 101      internal_function;
 102 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
 103                                               re_sift_context_t *sctx,
 104                                               int str_idx,
 105                                               re_node_set *dest_nodes)
 106      internal_function;
 107 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
 108                                             re_node_set *dest_nodes,
 109                                             const re_node_set *candidates)
 110      internal_function;
 111 static reg_errcode_t sub_epsilon_src_nodes (const re_dfa_t *dfa, int node,
 112                                             re_node_set *dest_nodes,
 113                                             const re_node_set *and_nodes)
 114      internal_function;
 115 static int check_dst_limits (const re_match_context_t *mctx,
 116                              re_node_set *limits,
 117                              int dst_node, int dst_idx, int src_node,
 118                              int src_idx) internal_function;
 119 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 120                                         int boundaries, int subexp_idx,
 121                                         int from_node, int bkref_idx)
 122      internal_function;
 123 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 124                                       int limit, int subexp_idx,
 125                                       int node, int str_idx,
 126                                       int bkref_idx) internal_function;
 127 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 128                                           re_node_set *dest_nodes,
 129                                           const re_node_set *candidates,
 130                                           re_node_set *limits,
 131                                           struct re_backref_cache_entry *bkref_ents,
 132                                           int str_idx) internal_function;
 133 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 134                                         re_sift_context_t *sctx,
 135                                         int str_idx, const re_node_set *candidates)
 136      internal_function;
 137 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
 138                                                 int next_state_log_idx)
 139      internal_function;
 140 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 141                                         re_dfastate_t **dst,
 142                                         re_dfastate_t **src, int num)
 143      internal_function;
 144 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 145                                          re_match_context_t *mctx) internal_function;
 146 static re_dfastate_t *transit_state (reg_errcode_t *err,
 147                                      re_match_context_t *mctx,
 148                                      re_dfastate_t *state) internal_function;
 149 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 150                                             re_match_context_t *mctx,
 151                                             re_dfastate_t *next_state)
 152      internal_function;
 153 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 154                                                 re_node_set *cur_nodes,
 155                                                 int str_idx) internal_function;
 156 #if 0
 157 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 158                                         re_match_context_t *mctx,
 159                                         re_dfastate_t *pstate)
 160      internal_function;
 161 #endif
 162 #ifdef RE_ENABLE_I18N
 163 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 164                                        re_dfastate_t *pstate)
 165      internal_function;
 166 #endif /* RE_ENABLE_I18N */
 167 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 168                                           const re_node_set *nodes)
 169      internal_function;
 170 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 171                                  int bkref_node, int bkref_str_idx)
 172      internal_function;
 173 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 174                                      const re_sub_match_top_t *sub_top,
 175                                      re_sub_match_last_t *sub_last,
 176                                      int bkref_node, int bkref_str)
 177      internal_function;
 178 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 179                              int subexp_idx, int type) internal_function;
 180 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 181                                     state_array_t *path, int top_node,
 182                                     int top_str, int last_node, int last_str,
 183                                     int type) internal_function;
 184 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 185                                                    int str_idx,
 186                                                    re_node_set *cur_nodes,
 187                                                    re_node_set *next_nodes)
 188      internal_function;
 189 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 190                                                re_node_set *cur_nodes,
 191                                                int ex_subexp, int type)
 192      internal_function;
 193 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 194                                                    re_node_set *dst_nodes,
 195                                                    int target, int ex_subexp,
 196                                                    int type) internal_function;
 197 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 198                                          re_node_set *cur_nodes, int cur_str,
 199                                          int subexp_num, int type)
 200      internal_function;
 201 static int build_trtable (const re_dfa_t *dfa,
 202                           re_dfastate_t *state) internal_function;
 203 #ifdef RE_ENABLE_I18N
 204 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 205                                     const re_string_t *input, int idx)
 206      internal_function;
 207 # ifdef _LIBC
 208 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 209                                                    size_t name_len)
 210      internal_function;
 211 # endif /* _LIBC */
 212 #endif /* RE_ENABLE_I18N */
 213 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 214                                        const re_dfastate_t *state,
 215                                        re_node_set *states_node,
 216                                        bitset *states_ch) internal_function;
 217 static int check_node_accept (const re_match_context_t *mctx,
 218                               const re_token_t *node, int idx)
 219      internal_function;
 220 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
 221      internal_function;
 222 \f
 223 /* Entry point for POSIX code.  */
 224
 225 /* regexec searches for a given pattern, specified by PREG, in the
 226    string STRING.
 227
 228    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 229    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 230    least NMATCH elements, and we set them to the offsets of the
 231    corresponding matched substrings.
 232
 233    EFLAGS specifies `execution flags' which affect matching: if
 234    REG_NOTBOL is set, then ^ does not match at the beginning of the
 235    string; if REG_NOTEOL is set, then $ does not match at the end.
 236
 237    We return 0 if we find a match and REG_NOMATCH if not.  */
 238
 239 int
 240 regexec (preg, string, nmatch, pmatch, eflags)
 241     const regex_t *__restrict preg;
 242     const char *__restrict string;
 243     size_t nmatch;
 244     regmatch_t pmatch[];
 245     int eflags;
 246 {
 247   reg_errcode_t err;
 248   int start, length;
 249   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 250
 251   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 252     return REG_BADPAT;
 253
 254   if (eflags & REG_STARTEND)
 255     {
 256       start = pmatch[0].rm_so;
 257       length = pmatch[0].rm_eo;
 258     }
 259   else
 260     {
 261       start = 0;
 262       length = strlen (string);
 263     }
 264
 265   __libc_lock_lock (dfa->lock);
 266   if (preg->no_sub)
 267     err = re_search_internal (preg, string, length, start, length - start,
 268                               length, 0, NULL, eflags);
 269   else
 270     err = re_search_internal (preg, string, length, start, length - start,
 271                               length, nmatch, pmatch, eflags);
 272   __libc_lock_unlock (dfa->lock);
 273   return err != REG_NOERROR;
 274 }
 275
 276 #ifdef _LIBC
 277 # include <shlib-compat.h>
 278 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 279
 280 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 281 __typeof__ (__regexec) __compat_regexec;
 282
 283 int
 284 attribute_compat_text_section
 285 __compat_regexec (const regex_t *__restrict preg,
 286                   const char *__restrict string, size_t nmatch,
 287                   regmatch_t pmatch[], int eflags)
 288 {
 289   return regexec (preg, string, nmatch, pmatch,
 290                   eflags & (REG_NOTBOL | REG_NOTEOL));
 291 }
 292 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 293 # endif
 294 #endif
 295
 296 /* Entry points for GNU code.  */
 297
 298 /* re_match, re_search, re_match_2, re_search_2
 299
 300    The former two functions operate on STRING with length LENGTH,
 301    while the later two operate on concatenation of STRING1 and STRING2
 302    with lengths LENGTH1 and LENGTH2, respectively.
 303
 304    re_match() matches the compiled pattern in BUFP against the string,
 305    starting at index START.
 306
 307    re_search() first tries matching at index START, then it tries to match
 308    starting from index START + 1, and so on.  The last start position tried
 309    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 310    way as re_match().)
 311
 312    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 313    the first STOP characters of the concatenation of the strings should be
 314    concerned.
 315
 316    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 317    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 318    computed relative to the concatenation, not relative to the individual
 319    strings.)
 320
 321    On success, re_match* functions return the length of the match, re_search*
 322    return the position of the start of the match.  Return value -1 means no
 323    match was found and -2 indicates an internal error.  */
 324
 325 int
 326 re_match (bufp, string, length, start, regs)
 327     struct re_pattern_buffer *bufp;
 328     const char *string;
 329     int length, start;
 330     struct re_registers *regs;
 331 {
 332   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 333 }
 334 #ifdef _LIBC
 335 weak_alias (__re_match, re_match)
 336 #endif
 337
 338 int
 339 re_search (bufp, string, length, start, range, regs)
 340     struct re_pattern_buffer *bufp;
 341     const char *string;
 342     int length, start, range;
 343     struct re_registers *regs;
 344 {
 345   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 346 }
 347 #ifdef _LIBC
 348 weak_alias (__re_search, re_search)
 349 #endif
 350
 351 int
 352 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 353     struct re_pattern_buffer *bufp;
 354     const char *string1, *string2;
 355     int length1, length2, start, stop;
 356     struct re_registers *regs;
 357 {
 358   return re_search_2_stub (bufp, string1, length1, string2, length2,
 359                            start, 0, regs, stop, 1);
 360 }
 361 #ifdef _LIBC
 362 weak_alias (__re_match_2, re_match_2)
 363 #endif
 364
 365 int
 366 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 367     struct re_pattern_buffer *bufp;
 368     const char *string1, *string2;
 369     int length1, length2, start, range, stop;
 370     struct re_registers *regs;
 371 {
 372   return re_search_2_stub (bufp, string1, length1, string2, length2,
 373                            start, range, regs, stop, 0);
 374 }
 375 #ifdef _LIBC
 376 weak_alias (__re_search_2, re_search_2)
 377 #endif
 378
 379 static int
 380 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 381                   stop, ret_len)
 382     struct re_pattern_buffer *bufp;
 383     const char *string1, *string2;
 384     int length1, length2, start, range, stop, ret_len;
 385     struct re_registers *regs;
 386 {
 387   const char *str;
 388   int rval;
 389   int len = length1 + length2;
 390   int free_str = 0;
 391
 392   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 393     return -2;
 394
 395   /* Concatenate the strings.  */
 396   if (length2 > 0)
 397     if (length1 > 0)
 398       {
 399         char *s = re_malloc (char, len);
 400
 401         if (BE (s == NULL, 0))
 402           return -2;
 403 #ifdef _LIBC
 404         memcpy (__mempcpy (s, string1, length1), string2, length2);
 405 #else
 406         memcpy (s, string1, length1);
 407         memcpy (s + length1, string2, length2);
 408 #endif
 409         str = s;
 410         free_str = 1;
 411       }
 412     else
 413       str = string2;
 414   else
 415     str = string1;
 416
 417   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 418                          ret_len);
 419   if (free_str)
 420     re_free ((char *) str);
 421   return rval;
 422 }
 423
 424 /* The parameters have the same meaning as those of re_search.
 425    Additional parameters:
 426    If RET_LEN is nonzero the length of the match is returned (re_match style);
 427    otherwise the position of the match is returned.  */
 428
 429 static int
 430 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 431     struct re_pattern_buffer *bufp;
 432     const char *string;
 433     int length, start, range, stop, ret_len;
 434     struct re_registers *regs;
 435 {
 436   reg_errcode_t result;
 437   regmatch_t *pmatch;
 438   int nregs, rval;
 439   int eflags = 0;
 440   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 441
 442   /* Check for out-of-range.  */
 443   if (BE (start < 0 || start > length, 0))
 444     return -1;
 445   if (BE (start + range > length, 0))
 446     range = length - start;
 447   else if (BE (start + range < 0, 0))
 448     range = -start;
 449
 450   __libc_lock_lock (dfa->lock);
 451
 452   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 453   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 454
 455   /* Compile fastmap if we haven't yet.  */
 456   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 457     re_compile_fastmap (bufp);
 458
 459   if (BE (bufp->no_sub, 0))
 460     regs = NULL;
 461
 462   /* We need at least 1 register.  */
 463   if (regs == NULL)
 464     nregs = 1;
 465   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 466                regs->num_regs < bufp->re_nsub + 1, 0))
 467     {
 468       nregs = regs->num_regs;
 469       if (BE (nregs < 1, 0))
 470         {
 471           /* Nothing can be copied to regs.  */
 472           regs = NULL;
 473           nregs = 1;
 474         }
 475     }
 476   else
 477     nregs = bufp->re_nsub + 1;
 478   pmatch = re_malloc (regmatch_t, nregs);
 479   if (BE (pmatch == NULL, 0))
 480     {
 481       rval = -2;
 482       goto out;
 483     }
 484
 485   result = re_search_internal (bufp, string, length, start, range, stop,
 486                                nregs, pmatch, eflags);
 487
 488   rval = 0;
 489
 490   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 491   if (result != REG_NOERROR)
 492     rval = -1;
 493   else if (regs != NULL)
 494     {
 495       /* If caller wants register contents data back, copy them.  */
 496       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 497                                            bufp->regs_allocated);
 498       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 499         rval = -2;
 500     }
 501
 502   if (BE (rval == 0, 1))
 503     {
 504       if (ret_len)
 505         {
 506           assert (pmatch[0].rm_so == start);
 507           rval = pmatch[0].rm_eo - start;
 508         }
 509       else
 510         rval = pmatch[0].rm_so;
 511     }
 512   re_free (pmatch);
 513  out:
 514   __libc_lock_unlock (dfa->lock);
 515   return rval;
 516 }
 517
 518 static unsigned
 519 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 520     struct re_registers *regs;
 521     regmatch_t *pmatch;
 522     int nregs, regs_allocated;
 523 {
 524   int rval = REGS_REALLOCATE;
 525   int i;
 526   int need_regs = nregs + 1;
 527   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 528      uses.  */
 529
 530   /* Have the register data arrays been allocated?  */
 531   if (regs_allocated == REGS_UNALLOCATED)
 532     { /* No.  So allocate them with malloc.  */
 533       regs->start = re_malloc (regoff_t, need_regs);
 534       regs->end = re_malloc (regoff_t, need_regs);
 535       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
 536         return REGS_UNALLOCATED;
 537       regs->num_regs = need_regs;
 538     }
 539   else if (regs_allocated == REGS_REALLOCATE)
 540     { /* Yes.  If we need more elements than were already
 541          allocated, reallocate them.  If we need fewer, just
 542          leave it alone.  */
 543       if (BE (need_regs > regs->num_regs, 0))
 544         {
 545           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 546           regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
 547           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 548             return REGS_UNALLOCATED;
 549           regs->start = new_start;
 550           regs->end = new_end;
 551           regs->num_regs = need_regs;
 552         }
 553     }
 554   else
 555     {
 556       assert (regs_allocated == REGS_FIXED);
 557       /* This function may not be called with REGS_FIXED and nregs too big.  */
 558       assert (regs->num_regs >= nregs);
 559       rval = REGS_FIXED;
 560     }
 561
 562   /* Copy the regs.  */
 563   for (i = 0; i < nregs; ++i)
 564     {
 565       regs->start[i] = pmatch[i].rm_so;
 566       regs->end[i] = pmatch[i].rm_eo;
 567     }
 568   for ( ; i < regs->num_regs; ++i)
 569     regs->start[i] = regs->end[i] = -1;
 570
 571   return rval;
 572 }
 573
 574 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 575    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 576    this memory for recording register information.  STARTS and ENDS
 577    must be allocated using the malloc library routine, and must each
 578    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 579
 580    If NUM_REGS == 0, then subsequent matches should allocate their own
 581    register data.
 582
 583    Unless this function is called, the first search or match using
 584    PATTERN_BUFFER will allocate its own register data, without
 585    freeing the old data.  */
 586
 587 void
 588 re_set_registers (bufp, regs, num_regs, starts, ends)
 589     struct re_pattern_buffer *bufp;
 590     struct re_registers *regs;
 591     unsigned num_regs;
 592     regoff_t *starts, *ends;
 593 {
 594   if (num_regs)
 595     {
 596       bufp->regs_allocated = REGS_REALLOCATE;
 597       regs->num_regs = num_regs;
 598       regs->start = starts;
 599       regs->end = ends;
 600     }
 601   else
 602     {
 603       bufp->regs_allocated = REGS_UNALLOCATED;
 604       regs->num_regs = 0;
 605       regs->start = regs->end = (regoff_t *) 0;
 606     }
 607 }
 608 #ifdef _LIBC
 609 weak_alias (__re_set_registers, re_set_registers)
 610 #endif
 611 \f
 612 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 613    them unless specifically requested.  */
 614
 615 #if defined _REGEX_RE_COMP || defined _LIBC
 616 int
 617 # ifdef _LIBC
 618 weak_function
 619 # endif
 620 re_exec (s)
 621      const char *s;
 622 {
 623   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 624 }
 625 #endif /* _REGEX_RE_COMP */
 626 \f
 627 /* Internal entry point.  */
 628
 629 /* Searches for a compiled pattern PREG in the string STRING, whose
 630    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 631    mingings with regexec.  START, and RANGE have the same meanings
 632    with re_search.
 633    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 634    otherwise return the error code.
 635    Note: We assume front end functions already check ranges.
 636    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 637
 638 static reg_errcode_t
 639 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 640                     eflags)
 641     const regex_t *preg;
 642     const char *string;
 643     int length, start, range, stop, eflags;
 644     size_t nmatch;
 645     regmatch_t pmatch[];
 646 {
 647   reg_errcode_t err;
 648   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 649   int left_lim, right_lim, incr;
 650   int fl_longest_match, match_first, match_kind, match_last = -1;
 651   int extra_nmatch;
 652   int sb, ch;
 653 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 654   re_match_context_t mctx = { .dfa = dfa };
 655 #else
 656   re_match_context_t mctx;
 657 #endif
 658   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 659                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 660   RE_TRANSLATE_TYPE t = preg->translate;
 661
 662 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 663   memset (&mctx, '\0', sizeof (re_match_context_t));
 664   mctx.dfa = dfa;
 665 #endif
 666
 667   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 668   nmatch -= extra_nmatch;
 669
 670   /* Check if the DFA haven't been compiled.  */
 671   if (BE (preg->used == 0 || dfa->init_state == NULL
 672           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 673           || dfa->init_state_begbuf == NULL, 0))
 674     return REG_NOMATCH;
 675
 676 #ifdef DEBUG
 677   /* We assume front-end functions already check them.  */
 678   assert (start + range >= 0 && start + range <= length);
 679 #endif
 680
 681   /* If initial states with non-begbuf contexts have no elements,
 682      the regex must be anchored.  If preg->newline_anchor is set,
 683      we'll never use init_state_nl, so do not check it.  */
 684   if (dfa->init_state->nodes.nelem == 0
 685       && dfa->init_state_word->nodes.nelem == 0
 686       && (dfa->init_state_nl->nodes.nelem == 0
 687           || !preg->newline_anchor))
 688     {
 689       if (start != 0 && start + range != 0)
 690         return REG_NOMATCH;
 691       start = range = 0;
 692     }
 693
 694   /* We must check the longest matching, if nmatch > 0.  */
 695   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 696
 697   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 698                             preg->translate, preg->syntax & RE_ICASE, dfa);
 699   if (BE (err != REG_NOERROR, 0))
 700     goto free_return;
 701   mctx.input.stop = stop;
 702   mctx.input.raw_stop = stop;
 703   mctx.input.newline_anchor = preg->newline_anchor;
 704
 705   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 706   if (BE (err != REG_NOERROR, 0))
 707     goto free_return;
 708
 709   /* We will log all the DFA states through which the dfa pass,
 710      if nmatch > 1, or this dfa has "multibyte node", which is a
 711      back-reference or a node which can accept multibyte character or
 712      multi character collating element.  */
 713   if (nmatch > 1 || dfa->has_mb_node)
 714     {
 715       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 716       if (BE (mctx.state_log == NULL, 0))
 717         {
 718           err = REG_ESPACE;
 719           goto free_return;
 720         }
 721     }
 722   else
 723     mctx.state_log = NULL;
 724
 725   match_first = start;
 726   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 727                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 728
 729   /* Check incrementally whether of not the input string match.  */
 730   incr = (range < 0) ? -1 : 1;
 731   left_lim = (range < 0) ? start + range : start;
 732   right_lim = (range < 0) ? start : start + range;
 733   sb = dfa->mb_cur_max == 1;
 734   match_kind =
 735     (fastmap
 736      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 737         | (range >= 0 ? 2 : 0)
 738         | (t != NULL ? 1 : 0))
 739      : 8);
 740
 741   for (;; match_first += incr)
 742     {
 743       err = REG_NOMATCH;
 744       if (match_first < left_lim || right_lim < match_first)
 745         goto free_return;
 746
 747       /* Advance as rapidly as possible through the string, until we
 748          find a plausible place to start matching.  This may be done
 749          with varying efficiency, so there are various possibilities:
 750          only the most common of them are specialized, in order to
 751          save on code size.  We use a switch statement for speed.  */
 752       switch (match_kind)
 753         {
 754         case 8:
 755           /* No fastmap.  */
 756           break;
 757
 758         case 7:
 759           /* Fastmap with single-byte translation, match forward.  */
 760           while (BE (match_first < right_lim, 1)
 761                  && !fastmap[t[(unsigned char) string[match_first]]])
 762             ++match_first;
 763           goto forward_match_found_start_or_reached_end;
 764
 765         case 6:
 766           /* Fastmap without translation, match forward.  */
 767           while (BE (match_first < right_lim, 1)
 768                  && !fastmap[(unsigned char) string[match_first]])
 769             ++match_first;
 770
 771         forward_match_found_start_or_reached_end:
 772           if (BE (match_first == right_lim, 0))
 773             {
 774               ch = match_first >= length
 775                        ? 0 : (unsigned char) string[match_first];
 776               if (!fastmap[t ? t[ch] : ch])
 777                 goto free_return;
 778             }
 779           break;
 780
 781         case 4:
 782         case 5:
 783           /* Fastmap without multi-byte translation, match backwards.  */
 784           while (match_first >= left_lim)
 785             {
 786               ch = match_first >= length
 787                        ? 0 : (unsigned char) string[match_first];
 788               if (fastmap[t ? t[ch] : ch])
 789                 break;
 790               --match_first;
 791             }
 792           if (match_first < left_lim)
 793             goto free_return;
 794           break;
 795
 796         default:
 797           /* In this case, we can't determine easily the current byte,
 798              since it might be a component byte of a multibyte
 799              character.  Then we use the constructed buffer instead.  */
 800           for (;;)
 801             {
 802               /* If MATCH_FIRST is out of the valid range, reconstruct the
 803                  buffers.  */
 804               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 805               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 806                 {
 807                   err = re_string_reconstruct (&mctx.input, match_first,
 808                                                eflags);
 809                   if (BE (err != REG_NOERROR, 0))
 810                     goto free_return;
 811
 812                   offset = match_first - mctx.input.raw_mbs_idx;
 813                 }
 814               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 815                  Note that MATCH_FIRST must not be smaller than 0.  */
 816               ch = (match_first >= length
 817                     ? 0 : re_string_byte_at (&mctx.input, offset));
 818               if (fastmap[ch])
 819                 break;
 820               match_first += incr;
 821               if (match_first < left_lim || match_first > right_lim)
 822                 {
 823                   err = REG_NOMATCH;
 824                   goto free_return;
 825                 }
 826             }
 827           break;
 828         }
 829
 830       /* Reconstruct the buffers so that the matcher can assume that
 831          the matching starts from the beginning of the buffer.  */
 832       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 833       if (BE (err != REG_NOERROR, 0))
 834         goto free_return;
 835
 836 #ifdef RE_ENABLE_I18N
 837      /* Don't consider this char as a possible match start if it part,
 838         yet isn't the head, of a multibyte character.  */
 839       if (!sb && !re_string_first_byte (&mctx.input, 0))
 840         continue;
 841 #endif
 842
 843       /* It seems to be appropriate one, then use the matcher.  */
 844       /* We assume that the matching starts from 0.  */
 845       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 846       match_last = check_matching (&mctx, fl_longest_match,
 847                                    range >= 0 ? &match_first : NULL);
 848       if (match_last != -1)
 849         {
 850           if (BE (match_last == -2, 0))
 851             {
 852               err = REG_ESPACE;
 853               goto free_return;
 854             }
 855           else
 856             {
 857               mctx.match_last = match_last;
 858               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 859                 {
 860                   re_dfastate_t *pstate = mctx.state_log[match_last];
 861                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 862                                                              match_last);
 863                 }
 864               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 865                   || dfa->nbackref)
 866                 {
 867                   err = prune_impossible_nodes (&mctx);
 868                   if (err == REG_NOERROR)
 869                     break;
 870                   if (BE (err != REG_NOMATCH, 0))
 871                     goto free_return;
 872                   match_last = -1;
 873                 }
 874               else
 875                 break; /* We found a match.  */
 876             }
 877         }
 878
 879       match_ctx_clean (&mctx);
 880     }
 881
 882 #ifdef DEBUG
 883   assert (match_last != -1);
 884   assert (err == REG_NOERROR);
 885 #endif
 886
 887   /* Set pmatch[] if we need.  */
 888   if (nmatch > 0)
 889     {
 890       int reg_idx;
 891
 892       /* Initialize registers.  */
 893       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 894         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 895
 896       /* Set the points where matching start/end.  */
 897       pmatch[0].rm_so = 0;
 898       pmatch[0].rm_eo = mctx.match_last;
 899
 900       if (!preg->no_sub && nmatch > 1)
 901         {
 902           err = set_regs (preg, &mctx, nmatch, pmatch,
 903                           dfa->has_plural_match && dfa->nbackref > 0);
 904           if (BE (err != REG_NOERROR, 0))
 905             goto free_return;
 906         }
 907
 908       /* At last, add the offset to the each registers, since we slided
 909          the buffers so that we could assume that the matching starts
 910          from 0.  */
 911       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 912         if (pmatch[reg_idx].rm_so != -1)
 913           {
 914 #ifdef RE_ENABLE_I18N
 915             if (BE (mctx.input.offsets_needed != 0, 0))
 916               {
 917                 pmatch[reg_idx].rm_so =
 918                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 919                    ? mctx.input.valid_raw_len
 920                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 921                 pmatch[reg_idx].rm_eo =
 922                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 923                    ? mctx.input.valid_raw_len
 924                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 925               }
 926 #else
 927             assert (mctx.input.offsets_needed == 0);
 928 #endif
 929             pmatch[reg_idx].rm_so += match_first;
 930             pmatch[reg_idx].rm_eo += match_first;
 931           }
 932       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 933         {
 934           pmatch[nmatch + reg_idx].rm_so = -1;
 935           pmatch[nmatch + reg_idx].rm_eo = -1;
 936         }
 937
 938       if (dfa->subexp_map)
 939         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 940           if (dfa->subexp_map[reg_idx] != reg_idx)
 941             {
 942               pmatch[reg_idx + 1].rm_so
 943                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 944               pmatch[reg_idx + 1].rm_eo
 945                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 946             }
 947     }
 948
 949  free_return:
 950   re_free (mctx.state_log);
 951   if (dfa->nbackref)
 952     match_ctx_free (&mctx);
 953   re_string_destruct (&mctx.input);
 954   return err;
 955 }
 956
 957 static reg_errcode_t
 958 prune_impossible_nodes (mctx)
 959      re_match_context_t *mctx;
 960 {
 961   const re_dfa_t *const dfa = mctx->dfa;
 962   int halt_node, match_last;
 963   reg_errcode_t ret;
 964   re_dfastate_t **sifted_states;
 965   re_dfastate_t **lim_states = NULL;
 966   re_sift_context_t sctx;
 967 #ifdef DEBUG
 968   assert (mctx->state_log != NULL);
 969 #endif
 970   match_last = mctx->match_last;
 971   halt_node = mctx->last_node;
 972   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 973   if (BE (sifted_states == NULL, 0))
 974     {
 975       ret = REG_ESPACE;
 976       goto free_return;
 977     }
 978   if (dfa->nbackref)
 979     {
 980       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 981       if (BE (lim_states == NULL, 0))
 982         {
 983           ret = REG_ESPACE;
 984           goto free_return;
 985         }
 986       while (1)
 987         {
 988           memset (lim_states, '\0',
 989                   sizeof (re_dfastate_t *) * (match_last + 1));
 990           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 991                          match_last);
 992           ret = sift_states_backward (mctx, &sctx);
 993           re_node_set_free (&sctx.limits);
 994           if (BE (ret != REG_NOERROR, 0))
 995               goto free_return;
 996           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 997             break;
 998           do
 999             {
1000               --match_last;
1001               if (match_last < 0)
1002                 {
1003                   ret = REG_NOMATCH;
1004                   goto free_return;
1005                 }
1006             } while (mctx->state_log[match_last] == NULL
1007                      || !mctx->state_log[match_last]->halt);
1008           halt_node = check_halt_state_context (mctx,
1009                                                 mctx->state_log[match_last],
1010                                                 match_last);
1011         }
1012       ret = merge_state_array (dfa, sifted_states, lim_states,
1013                                match_last + 1);
1014       re_free (lim_states);
1015       lim_states = NULL;
1016       if (BE (ret != REG_NOERROR, 0))
1017         goto free_return;
1018     }
1019   else
1020     {
1021       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1022       ret = sift_states_backward (mctx, &sctx);
1023       re_node_set_free (&sctx.limits);
1024       if (BE (ret != REG_NOERROR, 0))
1025         goto free_return;
1026     }
1027   re_free (mctx->state_log);
1028   mctx->state_log = sifted_states;
1029   sifted_states = NULL;
1030   mctx->last_node = halt_node;
1031   mctx->match_last = match_last;
1032   ret = REG_NOERROR;
1033  free_return:
1034   re_free (sifted_states);
1035   re_free (lim_states);
1036   return ret;
1037 }
1038
1039 /* Acquire an initial state and return it.
1040    We must select appropriate initial state depending on the context,
1041    since initial states may have constraints like "\<", "^", etc..  */
1042
1043 static inline re_dfastate_t *
1044 acquire_init_state_context (err, mctx, idx)
1045      reg_errcode_t *err;
1046      const re_match_context_t *mctx;
1047      int idx;
1048 {
1049   const re_dfa_t *const dfa = mctx->dfa;
1050   if (dfa->init_state->has_constraint)
1051     {
1052       unsigned int context;
1053       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1054       if (IS_WORD_CONTEXT (context))
1055         return dfa->init_state_word;
1056       else if (IS_ORDINARY_CONTEXT (context))
1057         return dfa->init_state;
1058       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1059         return dfa->init_state_begbuf;
1060       else if (IS_NEWLINE_CONTEXT (context))
1061         return dfa->init_state_nl;
1062       else if (IS_BEGBUF_CONTEXT (context))
1063         {
1064           /* It is relatively rare case, then calculate on demand.  */
1065           return re_acquire_state_context (err, dfa,
1066                                            dfa->init_state->entrance_nodes,
1067                                            context);
1068         }
1069       else
1070         /* Must not happen?  */
1071         return dfa->init_state;
1072     }
1073   else
1074     return dfa->init_state;
1075 }
1076
1077 /* Check whether the regular expression match input string INPUT or not,
1078    and return the index where the matching end, return -1 if not match,
1079    or return -2 in case of an error.
1080    FL_LONGEST_MATCH means we want the POSIX longest matching.
1081    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1082    next place where we may want to try matching.
1083    Note that the matcher assume that the maching starts from the current
1084    index of the buffer.  */
1085
1086 static int
1087 check_matching (mctx, fl_longest_match, p_match_first)
1088      re_match_context_t *mctx;
1089      int fl_longest_match;
1090      int *p_match_first;
1091 {
1092   const re_dfa_t *const dfa = mctx->dfa;
1093   reg_errcode_t err;
1094   int match = 0;
1095   int match_last = -1;
1096   int cur_str_idx = re_string_cur_idx (&mctx->input);
1097   re_dfastate_t *cur_state;
1098   int at_init_state = p_match_first != NULL;
1099   int next_start_idx = cur_str_idx;
1100
1101   err = REG_NOERROR;
1102   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1103   /* An initial state must not be NULL (invalid).  */
1104   if (BE (cur_state == NULL, 0))
1105     {
1106       assert (err == REG_ESPACE);
1107       return -2;
1108     }
1109
1110   if (mctx->state_log != NULL)
1111     {
1112       mctx->state_log[cur_str_idx] = cur_state;
1113
1114       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1115          later.  E.g. Processing back references.  */
1116       if (BE (dfa->nbackref, 0))
1117         {
1118           at_init_state = 0;
1119           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1120           if (BE (err != REG_NOERROR, 0))
1121             return err;
1122
1123           if (cur_state->has_backref)
1124             {
1125               err = transit_state_bkref (mctx, &cur_state->nodes);
1126               if (BE (err != REG_NOERROR, 0))
1127                 return err;
1128             }
1129         }
1130     }
1131
1132   /* If the RE accepts NULL string.  */
1133   if (BE (cur_state->halt, 0))
1134     {
1135       if (!cur_state->has_constraint
1136           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1137         {
1138           if (!fl_longest_match)
1139             return cur_str_idx;
1140           else
1141             {
1142               match_last = cur_str_idx;
1143               match = 1;
1144             }
1145         }
1146     }
1147
1148   while (!re_string_eoi (&mctx->input))
1149     {
1150       re_dfastate_t *old_state = cur_state;
1151       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1152
1153       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1154           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1155               && mctx->input.valid_len < mctx->input.len))
1156         {
1157           err = extend_buffers (mctx);
1158           if (BE (err != REG_NOERROR, 0))
1159             {
1160               assert (err == REG_ESPACE);
1161               return -2;
1162             }
1163         }
1164
1165       cur_state = transit_state (&err, mctx, cur_state);
1166       if (mctx->state_log != NULL)
1167         cur_state = merge_state_with_log (&err, mctx, cur_state);
1168
1169       if (cur_state == NULL)
1170         {
1171           /* Reached the invalid state or an error.  Try to recover a valid
1172              state using the state log, if available and if we have not
1173              already found a valid (even if not the longest) match.  */
1174           if (BE (err != REG_NOERROR, 0))
1175             return -2;
1176
1177           if (mctx->state_log == NULL
1178               || (match && !fl_longest_match)
1179               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1180             break;
1181         }
1182
1183       if (BE (at_init_state, 0))
1184         {
1185           if (old_state == cur_state)
1186             next_start_idx = next_char_idx;
1187           else
1188             at_init_state = 0;
1189         }
1190
1191       if (cur_state->halt)
1192         {
1193           /* Reached a halt state.
1194              Check the halt state can satisfy the current context.  */
1195           if (!cur_state->has_constraint
1196               || check_halt_state_context (mctx, cur_state,
1197                                            re_string_cur_idx (&mctx->input)))
1198             {
1199               /* We found an appropriate halt state.  */
1200               match_last = re_string_cur_idx (&mctx->input);
1201               match = 1;
1202
1203               /* We found a match, do not modify match_first below.  */
1204               p_match_first = NULL;
1205               if (!fl_longest_match)
1206                 break;
1207             }
1208         }
1209     }
1210
1211   if (p_match_first)
1212     *p_match_first += next_start_idx;
1213
1214   return match_last;
1215 }
1216
1217 /* Check NODE match the current context.  */
1218
1219 static int check_halt_node_context (dfa, node, context)
1220     const re_dfa_t *dfa;
1221     int node;
1222     unsigned int context;
1223 {
1224   re_token_type_t type = dfa->nodes[node].type;
1225   unsigned int constraint = dfa->nodes[node].constraint;
1226   if (type != END_OF_RE)
1227     return 0;
1228   if (!constraint)
1229     return 1;
1230   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1231     return 0;
1232   return 1;
1233 }
1234
1235 /* Check the halt state STATE match the current context.
1236    Return 0 if not match, if the node, STATE has, is a halt node and
1237    match the context, return the node.  */
1238
1239 static int
1240 check_halt_state_context (mctx, state, idx)
1241     const re_match_context_t *mctx;
1242     const re_dfastate_t *state;
1243     int idx;
1244 {
1245   int i;
1246   unsigned int context;
1247 #ifdef DEBUG
1248   assert (state->halt);
1249 #endif
1250   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1251   for (i = 0; i < state->nodes.nelem; ++i)
1252     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1253       return state->nodes.elems[i];
1254   return 0;
1255 }
1256
1257 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1258    corresponding to the DFA).
1259    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1260    of errors.  */
1261
1262 static int
1263 proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1264     const re_match_context_t *mctx;
1265     regmatch_t *regs;
1266     int nregs, *pidx, node;
1267     re_node_set *eps_via_nodes;
1268     struct re_fail_stack_t *fs;
1269 {
1270   const re_dfa_t *const dfa = mctx->dfa;
1271   int i, err;
1272   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1273     {
1274       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1275       re_node_set *edests = &dfa->edests[node];
1276       int dest_node;
1277       err = re_node_set_insert (eps_via_nodes, node);
1278       if (BE (err < 0, 0))
1279         return -2;
1280       /* Pick up a valid destination, or return -1 if none is found.  */
1281       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1282         {
1283           int candidate = edests->elems[i];
1284           if (!re_node_set_contains (cur_nodes, candidate))
1285             continue;
1286           if (dest_node == -1)
1287             dest_node = candidate;
1288
1289           else
1290             {
1291               /* In order to avoid infinite loop like "(a*)*", return the second
1292                  epsilon-transition if the first was already considered.  */
1293               if (re_node_set_contains (eps_via_nodes, dest_node))
1294                 return candidate;
1295
1296               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1297               else if (fs != NULL
1298                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1299                                            eps_via_nodes))
1300                 return -2;
1301
1302               /* We know we are going to exit.  */
1303               break;
1304             }
1305         }
1306       return dest_node;
1307     }
1308   else
1309     {
1310       int naccepted = 0;
1311       re_token_type_t type = dfa->nodes[node].type;
1312
1313 #ifdef RE_ENABLE_I18N
1314       if (dfa->nodes[node].accept_mb)
1315         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1316       else
1317 #endif /* RE_ENABLE_I18N */
1318       if (type == OP_BACK_REF)
1319         {
1320           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1321           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1322           if (fs != NULL)
1323             {
1324               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1325                 return -1;
1326               else if (naccepted)
1327                 {
1328                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1329                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1330                               naccepted) != 0)
1331                     return -1;
1332                 }
1333             }
1334
1335           if (naccepted == 0)
1336             {
1337               int dest_node;
1338               err = re_node_set_insert (eps_via_nodes, node);
1339               if (BE (err < 0, 0))
1340                 return -2;
1341               dest_node = dfa->edests[node].elems[0];
1342               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1343                                         dest_node))
1344                 return dest_node;
1345             }
1346         }
1347
1348       if (naccepted != 0
1349           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1350         {
1351           int dest_node = dfa->nexts[node];
1352           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1353           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1354                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1355                                                dest_node)))
1356             return -1;
1357           re_node_set_empty (eps_via_nodes);
1358           return dest_node;
1359         }
1360     }
1361   return -1;
1362 }
1363
1364 static reg_errcode_t
1365 push_fail_stack (fs, str_idx, dest_node, nregs, regs, eps_via_nodes)
1366      struct re_fail_stack_t *fs;
1367      int str_idx, dest_node, nregs;
1368      regmatch_t *regs;
1369      re_node_set *eps_via_nodes;
1370 {
1371   reg_errcode_t err;
1372   int num = fs->num++;
1373   if (fs->num == fs->alloc)
1374     {
1375       struct re_fail_stack_ent_t *new_array;
1376       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1377                                        * fs->alloc * 2));
1378       if (new_array == NULL)
1379         return REG_ESPACE;
1380       fs->alloc *= 2;
1381       fs->stack = new_array;
1382     }
1383   fs->stack[num].idx = str_idx;
1384   fs->stack[num].node = dest_node;
1385   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1386   if (fs->stack[num].regs == NULL)
1387     return REG_ESPACE;
1388   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1389   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1390   return err;
1391 }
1392
1393 static int
1394 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1395      struct re_fail_stack_t *fs;
1396      int *pidx, nregs;
1397      regmatch_t *regs;
1398      re_node_set *eps_via_nodes;
1399 {
1400   int num = --fs->num;
1401   assert (num >= 0);
1402   *pidx = fs->stack[num].idx;
1403   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1404   re_node_set_free (eps_via_nodes);
1405   re_free (fs->stack[num].regs);
1406   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1407   return fs->stack[num].node;
1408 }
1409
1410 /* Set the positions where the subexpressions are starts/ends to registers
1411    PMATCH.
1412    Note: We assume that pmatch[0] is already set, and
1413    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1414
1415 static reg_errcode_t
1416 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1417      const regex_t *preg;
1418      const re_match_context_t *mctx;
1419      size_t nmatch;
1420      regmatch_t *pmatch;
1421      int fl_backtrack;
1422 {
1423   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1424   int idx, cur_node;
1425   re_node_set eps_via_nodes;
1426   struct re_fail_stack_t *fs;
1427   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1428   regmatch_t *prev_idx_match;
1429   int prev_idx_match_malloced = 0;
1430
1431 #ifdef DEBUG
1432   assert (nmatch > 1);
1433   assert (mctx->state_log != NULL);
1434 #endif
1435   if (fl_backtrack)
1436     {
1437       fs = &fs_body;
1438       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1439       if (fs->stack == NULL)
1440         return REG_ESPACE;
1441     }
1442   else
1443     fs = NULL;
1444
1445   cur_node = dfa->init_node;
1446   re_node_set_init_empty (&eps_via_nodes);
1447
1448   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1449     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1450   else
1451     {
1452       prev_idx_match = re_malloc (regmatch_t, nmatch);
1453       if (prev_idx_match == NULL)
1454         {
1455           free_fail_stack_return (fs);
1456           return REG_ESPACE;
1457         }
1458       prev_idx_match_malloced = 1;
1459     }
1460   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1461
1462   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1463     {
1464       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1465
1466       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1467         {
1468           int reg_idx;
1469           if (fs)
1470             {
1471               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1472                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1473                   break;
1474               if (reg_idx == nmatch)
1475                 {
1476                   re_node_set_free (&eps_via_nodes);
1477                   if (prev_idx_match_malloced)
1478                     re_free (prev_idx_match);
1479                   return free_fail_stack_return (fs);
1480                 }
1481               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1482                                          &eps_via_nodes);
1483             }
1484           else
1485             {
1486               re_node_set_free (&eps_via_nodes);
1487               if (prev_idx_match_malloced)
1488                 re_free (prev_idx_match);
1489               return REG_NOERROR;
1490             }
1491         }
1492
1493       /* Proceed to next node.  */
1494       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1495                                     &eps_via_nodes, fs);
1496
1497       if (BE (cur_node < 0, 0))
1498         {
1499           if (BE (cur_node == -2, 0))
1500             {
1501               re_node_set_free (&eps_via_nodes);
1502               if (prev_idx_match_malloced)
1503                 re_free (prev_idx_match);
1504               free_fail_stack_return (fs);
1505               return REG_ESPACE;
1506             }
1507           if (fs)
1508             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1509                                        &eps_via_nodes);
1510           else
1511             {
1512               re_node_set_free (&eps_via_nodes);
1513               if (prev_idx_match_malloced)
1514                 re_free (prev_idx_match);
1515               return REG_NOMATCH;
1516             }
1517         }
1518     }
1519   re_node_set_free (&eps_via_nodes);
1520   if (prev_idx_match_malloced)
1521     re_free (prev_idx_match);
1522   return free_fail_stack_return (fs);
1523 }
1524
1525 static reg_errcode_t
1526 free_fail_stack_return (fs)
1527      struct re_fail_stack_t *fs;
1528 {
1529   if (fs)
1530     {
1531       int fs_idx;
1532       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1533         {
1534           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1535           re_free (fs->stack[fs_idx].regs);
1536         }
1537       re_free (fs->stack);
1538     }
1539   return REG_NOERROR;
1540 }
1541
1542 static void
1543 update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1544      const re_dfa_t *dfa;
1545      regmatch_t *pmatch, *prev_idx_match;
1546      int cur_node, cur_idx, nmatch;
1547 {
1548   int type = dfa->nodes[cur_node].type;
1549   if (type == OP_OPEN_SUBEXP)
1550     {
1551       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1552
1553       /* We are at the first node of this sub expression.  */
1554       if (reg_num < nmatch)
1555         {
1556           pmatch[reg_num].rm_so = cur_idx;
1557           pmatch[reg_num].rm_eo = -1;
1558         }
1559     }
1560   else if (type == OP_CLOSE_SUBEXP)
1561     {
1562       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1563       if (reg_num < nmatch)
1564         {
1565           /* We are at the last node of this sub expression.  */
1566           if (pmatch[reg_num].rm_so < cur_idx)
1567             {
1568               pmatch[reg_num].rm_eo = cur_idx;
1569               /* This is a non-empty match or we are not inside an optional
1570                  subexpression.  Accept this right away.  */
1571               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1572             }
1573           else
1574             {
1575               if (dfa->nodes[cur_node].opt_subexp
1576                   && prev_idx_match[reg_num].rm_so != -1)
1577                 /* We transited through an empty match for an optional
1578                    subexpression, like (a?)*, and this is not the subexp's
1579                    first match.  Copy back the old content of the registers
1580                    so that matches of an inner subexpression are undone as
1581                    well, like in ((a?))*.  */
1582                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1583               else
1584                 /* We completed a subexpression, but it may be part of
1585                    an optional one, so do not update PREV_IDX_MATCH.  */
1586                 pmatch[reg_num].rm_eo = cur_idx;
1587             }
1588         }
1589     }
1590 }
1591
1592 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1593    and sift the nodes in each states according to the following rules.
1594    Updated state_log will be wrote to STATE_LOG.
1595
1596    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1597      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1598         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1599         the LAST_NODE, we throw away the node `a'.
1600      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1601         string `s' and transit to `b':
1602         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1603            away the node `a'.
1604         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1605             thrown away, we throw away the node `a'.
1606      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1607         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1608            node `a'.
1609         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1610             we throw away the node `a'.  */
1611
1612 #define STATE_NODE_CONTAINS(state,node) \
1613   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1614
1615 static reg_errcode_t
1616 sift_states_backward (mctx, sctx)
1617      const re_match_context_t *mctx;
1618      re_sift_context_t *sctx;
1619 {
1620   reg_errcode_t err;
1621   int null_cnt = 0;
1622   int str_idx = sctx->last_str_idx;
1623   re_node_set cur_dest;
1624
1625 #ifdef DEBUG
1626   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1627 #endif
1628
1629   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1630      transit to the last_node and the last_node itself.  */
1631   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1632   if (BE (err != REG_NOERROR, 0))
1633     return err;
1634   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1635   if (BE (err != REG_NOERROR, 0))
1636     goto free_return;
1637
1638   /* Then check each states in the state_log.  */
1639   while (str_idx > 0)
1640     {
1641       /* Update counters.  */
1642       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1643       if (null_cnt > mctx->max_mb_elem_len)
1644         {
1645           memset (sctx->sifted_states, '\0',
1646                   sizeof (re_dfastate_t *) * str_idx);
1647           re_node_set_free (&cur_dest);
1648           return REG_NOERROR;
1649         }
1650       re_node_set_empty (&cur_dest);
1651       --str_idx;
1652
1653       if (mctx->state_log[str_idx])
1654         {
1655           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1656           if (BE (err != REG_NOERROR, 0))
1657             goto free_return;
1658         }
1659
1660       /* Add all the nodes which satisfy the following conditions:
1661          - It can epsilon transit to a node in CUR_DEST.
1662          - It is in CUR_SRC.
1663          And update state_log.  */
1664       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1665       if (BE (err != REG_NOERROR, 0))
1666         goto free_return;
1667     }
1668   err = REG_NOERROR;
1669  free_return:
1670   re_node_set_free (&cur_dest);
1671   return err;
1672 }
1673
1674 static reg_errcode_t
1675 build_sifted_states (mctx, sctx, str_idx, cur_dest)
1676      const re_match_context_t *mctx;
1677      re_sift_context_t *sctx;
1678      int str_idx;
1679      re_node_set *cur_dest;
1680 {
1681   const re_dfa_t *const dfa = mctx->dfa;
1682   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1683   int i;
1684
1685   /* Then build the next sifted state.
1686      We build the next sifted state on `cur_dest', and update
1687      `sifted_states[str_idx]' with `cur_dest'.
1688      Note:
1689      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1690      `cur_src' points the node_set of the old `state_log[str_idx]'
1691      (with the epsilon nodes pre-filtered out).  */
1692   for (i = 0; i < cur_src->nelem; i++)
1693     {
1694       int prev_node = cur_src->elems[i];
1695       int naccepted = 0;
1696       int ret;
1697
1698 #ifdef DEBUG
1699       re_token_type_t type = dfa->nodes[prev_node].type;
1700       assert (!IS_EPSILON_NODE (type));
1701 #endif
1702 #ifdef RE_ENABLE_I18N
1703       /* If the node may accept `multi byte'.  */
1704       if (dfa->nodes[prev_node].accept_mb)
1705         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1706                                          str_idx, sctx->last_str_idx);
1707 #endif /* RE_ENABLE_I18N */
1708
1709       /* We don't check backreferences here.
1710          See update_cur_sifted_state().  */
1711       if (!naccepted
1712           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1713           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1714                                   dfa->nexts[prev_node]))
1715         naccepted = 1;
1716
1717       if (naccepted == 0)
1718         continue;
1719
1720       if (sctx->limits.nelem)
1721         {
1722           int to_idx = str_idx + naccepted;
1723           if (check_dst_limits (mctx, &sctx->limits,
1724                                 dfa->nexts[prev_node], to_idx,
1725                                 prev_node, str_idx))
1726             continue;
1727         }
1728       ret = re_node_set_insert (cur_dest, prev_node);
1729       if (BE (ret == -1, 0))
1730         return REG_ESPACE;
1731     }
1732
1733   return REG_NOERROR;
1734 }
1735
1736 /* Helper functions.  */
1737
1738 static reg_errcode_t
1739 clean_state_log_if_needed (mctx, next_state_log_idx)
1740     re_match_context_t *mctx;
1741     int next_state_log_idx;
1742 {
1743   int top = mctx->state_log_top;
1744
1745   if (next_state_log_idx >= mctx->input.bufs_len
1746       || (next_state_log_idx >= mctx->input.valid_len
1747           && mctx->input.valid_len < mctx->input.len))
1748     {
1749       reg_errcode_t err;
1750       err = extend_buffers (mctx);
1751       if (BE (err != REG_NOERROR, 0))
1752         return err;
1753     }
1754
1755   if (top < next_state_log_idx)
1756     {
1757       memset (mctx->state_log + top + 1, '\0',
1758               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1759       mctx->state_log_top = next_state_log_idx;
1760     }
1761   return REG_NOERROR;
1762 }
1763
1764 static reg_errcode_t
1765 merge_state_array (dfa, dst, src, num)
1766      const re_dfa_t *dfa;
1767      re_dfastate_t **dst;
1768      re_dfastate_t **src;
1769      int num;
1770 {
1771   int st_idx;
1772   reg_errcode_t err;
1773   for (st_idx = 0; st_idx < num; ++st_idx)
1774     {
1775       if (dst[st_idx] == NULL)
1776         dst[st_idx] = src[st_idx];
1777       else if (src[st_idx] != NULL)
1778         {
1779           re_node_set merged_set;
1780           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1781                                         &src[st_idx]->nodes);
1782           if (BE (err != REG_NOERROR, 0))
1783             return err;
1784           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1785           re_node_set_free (&merged_set);
1786           if (BE (err != REG_NOERROR, 0))
1787             return err;
1788         }
1789     }
1790   return REG_NOERROR;
1791 }
1792
1793 static reg_errcode_t
1794 update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1795      const re_match_context_t *mctx;
1796      re_sift_context_t *sctx;
1797      int str_idx;
1798      re_node_set *dest_nodes;
1799 {
1800   const re_dfa_t *const dfa = mctx->dfa;
1801   reg_errcode_t err;
1802   const re_node_set *candidates;
1803   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1804                 : &mctx->state_log[str_idx]->nodes);
1805
1806   if (dest_nodes->nelem == 0)
1807     sctx->sifted_states[str_idx] = NULL;
1808   else
1809     {
1810       if (candidates)
1811         {
1812           /* At first, add the nodes which can epsilon transit to a node in
1813              DEST_NODE.  */
1814           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1815           if (BE (err != REG_NOERROR, 0))
1816             return err;
1817
1818           /* Then, check the limitations in the current sift_context.  */
1819           if (sctx->limits.nelem)
1820             {
1821               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1822                                          mctx->bkref_ents, str_idx);
1823               if (BE (err != REG_NOERROR, 0))
1824                 return err;
1825             }
1826         }
1827
1828       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1829       if (BE (err != REG_NOERROR, 0))
1830         return err;
1831     }
1832
1833   if (candidates && mctx->state_log[str_idx]->has_backref)
1834     {
1835       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1836       if (BE (err != REG_NOERROR, 0))
1837         return err;
1838     }
1839   return REG_NOERROR;
1840 }
1841
1842 static reg_errcode_t
1843 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1844      const re_dfa_t *dfa;
1845      re_node_set *dest_nodes;
1846      const re_node_set *candidates;
1847 {
1848   reg_errcode_t err = REG_NOERROR;
1849   int i;
1850
1851   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1852   if (BE (err != REG_NOERROR, 0))
1853     return err;
1854
1855   if (!state->inveclosure.alloc)
1856     {
1857       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1858       if (BE (err != REG_NOERROR, 0))
1859         return REG_ESPACE;
1860       for (i = 0; i < dest_nodes->nelem; i++)
1861         re_node_set_merge (&state->inveclosure,
1862                            dfa->inveclosures + dest_nodes->elems[i]);
1863     }
1864   return re_node_set_add_intersect (dest_nodes, candidates,
1865                                     &state->inveclosure);
1866 }
1867
1868 static reg_errcode_t
1869 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1870      const re_dfa_t *dfa;
1871      int node;
1872      re_node_set *dest_nodes;
1873      const re_node_set *candidates;
1874 {
1875     int ecl_idx;
1876     reg_errcode_t err;
1877     re_node_set *inv_eclosure = dfa->inveclosures + node;
1878     re_node_set except_nodes;
1879     re_node_set_init_empty (&except_nodes);
1880     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1881       {
1882         int cur_node = inv_eclosure->elems[ecl_idx];
1883         if (cur_node == node)
1884           continue;
1885         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1886           {
1887             int edst1 = dfa->edests[cur_node].elems[0];
1888             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1889                          ? dfa->edests[cur_node].elems[1] : -1);
1890             if ((!re_node_set_contains (inv_eclosure, edst1)
1891                  && re_node_set_contains (dest_nodes, edst1))
1892                 || (edst2 > 0
1893                     && !re_node_set_contains (inv_eclosure, edst2)
1894                     && re_node_set_contains (dest_nodes, edst2)))
1895               {
1896                 err = re_node_set_add_intersect (&except_nodes, candidates,
1897                                                  dfa->inveclosures + cur_node);
1898                 if (BE (err != REG_NOERROR, 0))
1899                   {
1900                     re_node_set_free (&except_nodes);
1901                     return err;
1902                   }
1903               }
1904           }
1905       }
1906     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1907       {
1908         int cur_node = inv_eclosure->elems[ecl_idx];
1909         if (!re_node_set_contains (&except_nodes, cur_node))
1910           {
1911             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1912             re_node_set_remove_at (dest_nodes, idx);
1913           }
1914       }
1915     re_node_set_free (&except_nodes);
1916     return REG_NOERROR;
1917 }
1918
1919 static int
1920 check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1921      const re_match_context_t *mctx;
1922      re_node_set *limits;
1923      int dst_node, dst_idx, src_node, src_idx;
1924 {
1925   const re_dfa_t *const dfa = mctx->dfa;
1926   int lim_idx, src_pos, dst_pos;
1927
1928   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1929   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1930   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1931     {
1932       int subexp_idx;
1933       struct re_backref_cache_entry *ent;
1934       ent = mctx->bkref_ents + limits->elems[lim_idx];
1935       subexp_idx = dfa->nodes[ent->node].opr.idx;
1936
1937       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1938                                            subexp_idx, dst_node, dst_idx,
1939                                            dst_bkref_idx);
1940       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1941                                            subexp_idx, src_node, src_idx,
1942                                            src_bkref_idx);
1943
1944       /* In case of:
1945          <src> <dst> ( <subexp> )
1946          ( <subexp> ) <src> <dst>
1947          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1948       if (src_pos == dst_pos)
1949         continue; /* This is unrelated limitation.  */
1950       else
1951         return 1;
1952     }
1953   return 0;
1954 }
1955
1956 static int
1957 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
1958      const re_match_context_t *mctx;
1959      int boundaries, subexp_idx, from_node, bkref_idx;
1960 {
1961   const re_dfa_t *const dfa = mctx->dfa;
1962   const re_node_set *eclosures = dfa->eclosures + from_node;
1963   int node_idx;
1964
1965   /* Else, we are on the boundary: examine the nodes on the epsilon
1966      closure.  */
1967   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1968     {
1969       int node = eclosures->elems[node_idx];
1970       switch (dfa->nodes[node].type)
1971         {
1972         case OP_BACK_REF:
1973           if (bkref_idx != -1)
1974             {
1975               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1976               do
1977                 {
1978                   int dst, cpos;
1979
1980                   if (ent->node != node)
1981                     continue;
1982
1983                   if (subexp_idx
1984                       < CHAR_BIT * sizeof ent->eps_reachable_subexps_map
1985                       && !(ent->eps_reachable_subexps_map & (1u << subexp_idx)))
1986                     continue;
1987
1988                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1989                      OP_CLOSE_SUBEXP cases below.  But, if the
1990                      destination node is the same node as the source
1991                      node, don't recurse because it would cause an
1992                      infinite loop: a regex that exhibits this behavior
1993                      is ()\1*\1*  */
1994                   dst = dfa->edests[node].elems[0];
1995                   if (dst == from_node)
1996                     {
1997                       if (boundaries & 1)
1998                         return -1;
1999                       else /* if (boundaries & 2) */
2000                         return 0;
2001                     }
2002
2003                   cpos =
2004                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2005                                                  dst, bkref_idx);
2006                   if (cpos == -1 /* && (boundaries & 1) */)
2007                     return -1;
2008                   if (cpos == 0 && (boundaries & 2))
2009                     return 0;
2010
2011                   if (subexp_idx
2012                       < CHAR_BIT * sizeof ent->eps_reachable_subexps_map)
2013                     ent->eps_reachable_subexps_map &= ~(1u << subexp_idx);
2014                 }
2015               while (ent++->more);
2016             }
2017           break;
2018
2019         case OP_OPEN_SUBEXP:
2020           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2021             return -1;
2022           break;
2023
2024         case OP_CLOSE_SUBEXP:
2025           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2026             return 0;
2027           break;
2028
2029         default:
2030             break;
2031         }
2032     }
2033
2034   return (boundaries & 2) ? 1 : 0;
2035 }
2036
2037 static int
2038 check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
2039      const re_match_context_t *mctx;
2040      int limit, subexp_idx, from_node, str_idx, bkref_idx;
2041 {
2042   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2043   int boundaries;
2044
2045   /* If we are outside the range of the subexpression, return -1 or 1.  */
2046   if (str_idx < lim->subexp_from)
2047     return -1;
2048
2049   if (lim->subexp_to < str_idx)
2050     return 1;
2051
2052   /* If we are within the subexpression, return 0.  */
2053   boundaries = (str_idx == lim->subexp_from);
2054   boundaries |= (str_idx == lim->subexp_to) << 1;
2055   if (boundaries == 0)
2056     return 0;
2057
2058   /* Else, examine epsilon closure.  */
2059   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2060                                       from_node, bkref_idx);
2061 }
2062
2063 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2064    which are against limitations from DEST_NODES. */
2065
2066 static reg_errcode_t
2067 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
2068      const re_dfa_t *dfa;
2069      re_node_set *dest_nodes;
2070      const re_node_set *candidates;
2071      re_node_set *limits;
2072      struct re_backref_cache_entry *bkref_ents;
2073      int str_idx;
2074 {
2075   reg_errcode_t err;
2076   int node_idx, lim_idx;
2077
2078   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2079     {
2080       int subexp_idx;
2081       struct re_backref_cache_entry *ent;
2082       ent = bkref_ents + limits->elems[lim_idx];
2083
2084       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2085         continue; /* This is unrelated limitation.  */
2086
2087       subexp_idx = dfa->nodes[ent->node].opr.idx;
2088       if (ent->subexp_to == str_idx)
2089         {
2090           int ops_node = -1;
2091           int cls_node = -1;
2092           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2093             {
2094               int node = dest_nodes->elems[node_idx];
2095               re_token_type_t type = dfa->nodes[node].type;
2096               if (type == OP_OPEN_SUBEXP
2097                   && subexp_idx == dfa->nodes[node].opr.idx)
2098                 ops_node = node;
2099               else if (type == OP_CLOSE_SUBEXP
2100                        && subexp_idx == dfa->nodes[node].opr.idx)
2101                 cls_node = node;
2102             }
2103
2104           /* Check the limitation of the open subexpression.  */
2105           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2106           if (ops_node >= 0)
2107             {
2108               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2109                                            candidates);
2110               if (BE (err != REG_NOERROR, 0))
2111                 return err;
2112             }
2113
2114           /* Check the limitation of the close subexpression.  */
2115           if (cls_node >= 0)
2116             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2117               {
2118                 int node = dest_nodes->elems[node_idx];
2119                 if (!re_node_set_contains (dfa->inveclosures + node,
2120                                            cls_node)
2121                     && !re_node_set_contains (dfa->eclosures + node,
2122                                               cls_node))
2123                   {
2124                     /* It is against this limitation.
2125                        Remove it form the current sifted state.  */
2126                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2127                                                  candidates);
2128                     if (BE (err != REG_NOERROR, 0))
2129                       return err;
2130                     --node_idx;
2131                   }
2132               }
2133         }
2134       else /* (ent->subexp_to != str_idx)  */
2135         {
2136           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2137             {
2138               int node = dest_nodes->elems[node_idx];
2139               re_token_type_t type = dfa->nodes[node].type;
2140               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2141                 {
2142                   if (subexp_idx != dfa->nodes[node].opr.idx)
2143                     continue;
2144                   /* It is against this limitation.
2145                      Remove it form the current sifted state.  */
2146                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2147                                                candidates);
2148                   if (BE (err != REG_NOERROR, 0))
2149                     return err;
2150                 }
2151             }
2152         }
2153     }
2154   return REG_NOERROR;
2155 }
2156
2157 static reg_errcode_t
2158 sift_states_bkref (mctx, sctx, str_idx, candidates)
2159      const re_match_context_t *mctx;
2160      re_sift_context_t *sctx;
2161      int str_idx;
2162      const re_node_set *candidates;
2163 {
2164   const re_dfa_t *const dfa = mctx->dfa;
2165   reg_errcode_t err;
2166   int node_idx, node;
2167   re_sift_context_t local_sctx;
2168   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2169
2170   if (first_idx == -1)
2171     return REG_NOERROR;
2172
2173   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2174
2175   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2176     {
2177       int enabled_idx;
2178       re_token_type_t type;
2179       struct re_backref_cache_entry *entry;
2180       node = candidates->elems[node_idx];
2181       type = dfa->nodes[node].type;
2182       /* Avoid infinite loop for the REs like "()\1+".  */
2183       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2184         continue;
2185       if (type != OP_BACK_REF)
2186         continue;
2187
2188       entry = mctx->bkref_ents + first_idx;
2189       enabled_idx = first_idx;
2190       do
2191         {
2192           int subexp_len;
2193           int to_idx;
2194           int dst_node;
2195           int ret;
2196           re_dfastate_t *cur_state;
2197
2198           if (entry->node != node)
2199             continue;
2200           subexp_len = entry->subexp_to - entry->subexp_from;
2201           to_idx = str_idx + subexp_len;
2202           dst_node = (subexp_len ? dfa->nexts[node]
2203                       : dfa->edests[node].elems[0]);
2204
2205           if (to_idx > sctx->last_str_idx
2206               || sctx->sifted_states[to_idx] == NULL
2207               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2208               || check_dst_limits (mctx, &sctx->limits, node,
2209                                    str_idx, dst_node, to_idx))
2210             continue;
2211
2212           if (local_sctx.sifted_states == NULL)
2213             {
2214               local_sctx = *sctx;
2215               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2216               if (BE (err != REG_NOERROR, 0))
2217                 goto free_return;
2218             }
2219           local_sctx.last_node = node;
2220           local_sctx.last_str_idx = str_idx;
2221           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2222           if (BE (ret < 0, 0))
2223             {
2224               err = REG_ESPACE;
2225               goto free_return;
2226             }
2227           cur_state = local_sctx.sifted_states[str_idx];
2228           err = sift_states_backward (mctx, &local_sctx);
2229           if (BE (err != REG_NOERROR, 0))
2230             goto free_return;
2231           if (sctx->limited_states != NULL)
2232             {
2233               err = merge_state_array (dfa, sctx->limited_states,
2234                                        local_sctx.sifted_states,
2235                                        str_idx + 1);
2236               if (BE (err != REG_NOERROR, 0))
2237                 goto free_return;
2238             }
2239           local_sctx.sifted_states[str_idx] = cur_state;
2240           re_node_set_remove (&local_sctx.limits, enabled_idx);
2241
2242           /* mctx->bkref_ents may have changed, reload the pointer.  */
2243           entry = mctx->bkref_ents + enabled_idx;
2244         }
2245       while (enabled_idx++, entry++->more);
2246     }
2247   err = REG_NOERROR;
2248  free_return:
2249   if (local_sctx.sifted_states != NULL)
2250     {
2251       re_node_set_free (&local_sctx.limits);
2252     }
2253
2254   return err;
2255 }
2256
2257
2258 #ifdef RE_ENABLE_I18N
2259 static int
2260 sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2261     const re_match_context_t *mctx;
2262     re_sift_context_t *sctx;
2263     int node_idx, str_idx, max_str_idx;
2264 {
2265   const re_dfa_t *const dfa = mctx->dfa;
2266   int naccepted;
2267   /* Check the node can accept `multi byte'.  */
2268   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2269   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2270       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2271                             dfa->nexts[node_idx]))
2272     /* The node can't accept the `multi byte', or the
2273        destination was already thrown away, then the node
2274        could't accept the current input `multi byte'.   */
2275     naccepted = 0;
2276   /* Otherwise, it is sure that the node could accept
2277      `naccepted' bytes input.  */
2278   return naccepted;
2279 }
2280 #endif /* RE_ENABLE_I18N */
2281
2282 \f
2283 /* Functions for state transition.  */
2284
2285 /* Return the next state to which the current state STATE will transit by
2286    accepting the current input byte, and update STATE_LOG if necessary.
2287    If STATE can accept a multibyte char/collating element/back reference
2288    update the destination of STATE_LOG.  */
2289
2290 static re_dfastate_t *
2291 transit_state (err, mctx, state)
2292      reg_errcode_t *err;
2293      re_match_context_t *mctx;
2294      re_dfastate_t *state;
2295 {
2296   re_dfastate_t **trtable;
2297   unsigned char ch;
2298
2299 #ifdef RE_ENABLE_I18N
2300   /* If the current state can accept multibyte.  */
2301   if (BE (state->accept_mb, 0))
2302     {
2303       *err = transit_state_mb (mctx, state);
2304       if (BE (*err != REG_NOERROR, 0))
2305         return NULL;
2306     }
2307 #endif /* RE_ENABLE_I18N */
2308
2309   /* Then decide the next state with the single byte.  */
2310 #if 0
2311   if (0)
2312     /* don't use transition table  */
2313     return transit_state_sb (err, mctx, state);
2314 #endif
2315
2316   /* Use transition table  */
2317   ch = re_string_fetch_byte (&mctx->input);
2318   for (;;)
2319     {
2320       trtable = state->trtable;
2321       if (BE (trtable != NULL, 1))
2322         return trtable[ch];
2323
2324       trtable = state->word_trtable;
2325       if (BE (trtable != NULL, 1))
2326         {
2327           unsigned int context;
2328           context
2329             = re_string_context_at (&mctx->input,
2330                                     re_string_cur_idx (&mctx->input) - 1,
2331                                     mctx->eflags);
2332           if (IS_WORD_CONTEXT (context))
2333             return trtable[ch + SBC_MAX];
2334           else
2335             return trtable[ch];
2336         }
2337
2338       if (!build_trtable (mctx->dfa, state))
2339         {
2340           *err = REG_ESPACE;
2341           return NULL;
2342         }
2343
2344       /* Retry, we now have a transition table.  */
2345     }
2346 }
2347
2348 /* Update the state_log if we need */
2349 re_dfastate_t *
2350 merge_state_with_log (err, mctx, next_state)
2351      reg_errcode_t *err;
2352      re_match_context_t *mctx;
2353      re_dfastate_t *next_state;
2354 {
2355   const re_dfa_t *const dfa = mctx->dfa;
2356   int cur_idx = re_string_cur_idx (&mctx->input);
2357
2358   if (cur_idx > mctx->state_log_top)
2359     {
2360       mctx->state_log[cur_idx] = next_state;
2361       mctx->state_log_top = cur_idx;
2362     }
2363   else if (mctx->state_log[cur_idx] == 0)
2364     {
2365       mctx->state_log[cur_idx] = next_state;
2366     }
2367   else
2368     {
2369       re_dfastate_t *pstate;
2370       unsigned int context;
2371       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2372       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2373          the destination of a multibyte char/collating element/
2374          back reference.  Then the next state is the union set of
2375          these destinations and the results of the transition table.  */
2376       pstate = mctx->state_log[cur_idx];
2377       log_nodes = pstate->entrance_nodes;
2378       if (next_state != NULL)
2379         {
2380           table_nodes = next_state->entrance_nodes;
2381           *err = re_node_set_init_union (&next_nodes, table_nodes,
2382                                              log_nodes);
2383           if (BE (*err != REG_NOERROR, 0))
2384             return NULL;
2385         }
2386       else
2387         next_nodes = *log_nodes;
2388       /* Note: We already add the nodes of the initial state,
2389          then we don't need to add them here.  */
2390
2391       context = re_string_context_at (&mctx->input,
2392                                       re_string_cur_idx (&mctx->input) - 1,
2393                                       mctx->eflags);
2394       next_state = mctx->state_log[cur_idx]
2395         = re_acquire_state_context (err, dfa, &next_nodes, context);
2396       /* We don't need to check errors here, since the return value of
2397          this function is next_state and ERR is already set.  */
2398
2399       if (table_nodes != NULL)
2400         re_node_set_free (&next_nodes);
2401     }
2402
2403   if (BE (dfa->nbackref, 0) && next_state != NULL)
2404     {
2405       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2406          later.  We must check them here, since the back references in the
2407          next state might use them.  */
2408       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2409                                         cur_idx);
2410       if (BE (*err != REG_NOERROR, 0))
2411         return NULL;
2412
2413       /* If the next state has back references.  */
2414       if (next_state->has_backref)
2415         {
2416           *err = transit_state_bkref (mctx, &next_state->nodes);
2417           if (BE (*err != REG_NOERROR, 0))
2418             return NULL;
2419           next_state = mctx->state_log[cur_idx];
2420         }
2421     }
2422
2423   return next_state;
2424 }
2425
2426 /* Skip bytes in the input that correspond to part of a
2427    multi-byte match, then look in the log for a state
2428    from which to restart matching.  */
2429 re_dfastate_t *
2430 find_recover_state (err, mctx)
2431      reg_errcode_t *err;
2432      re_match_context_t *mctx;
2433 {
2434   re_dfastate_t *cur_state;
2435   do
2436     {
2437       int max = mctx->state_log_top;
2438       int cur_str_idx = re_string_cur_idx (&mctx->input);
2439
2440       do
2441         {
2442           if (++cur_str_idx > max)
2443             return NULL;
2444           re_string_skip_bytes (&mctx->input, 1);
2445         }
2446       while (mctx->state_log[cur_str_idx] == NULL);
2447
2448       cur_state = merge_state_with_log (err, mctx, NULL);
2449     }
2450   while (*err == REG_NOERROR && cur_state == NULL);
2451   return cur_state;
2452 }
2453
2454 /* Helper functions for transit_state.  */
2455
2456 /* From the node set CUR_NODES, pick up the nodes whose types are
2457    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2458    expression. And register them to use them later for evaluating the
2459    correspoding back references.  */
2460
2461 static reg_errcode_t
2462 check_subexp_matching_top (mctx, cur_nodes, str_idx)
2463      re_match_context_t *mctx;
2464      re_node_set *cur_nodes;
2465      int str_idx;
2466 {
2467   const re_dfa_t *const dfa = mctx->dfa;
2468   int node_idx;
2469   reg_errcode_t err;
2470
2471   /* TODO: This isn't efficient.
2472            Because there might be more than one nodes whose types are
2473            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2474            nodes.
2475            E.g. RE: (a){2}  */
2476   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2477     {
2478       int node = cur_nodes->elems[node_idx];
2479       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2480           && dfa->nodes[node].opr.idx < CHAR_BIT * sizeof dfa->used_bkref_map
2481           && dfa->used_bkref_map & (1u << dfa->nodes[node].opr.idx))
2482         {
2483           err = match_ctx_add_subtop (mctx, node, str_idx);
2484           if (BE (err != REG_NOERROR, 0))
2485             return err;
2486         }
2487     }
2488   return REG_NOERROR;
2489 }
2490
2491 #if 0
2492 /* Return the next state to which the current state STATE will transit by
2493    accepting the current input byte.  */
2494
2495 static re_dfastate_t *
2496 transit_state_sb (err, mctx, state)
2497      reg_errcode_t *err;
2498      re_match_context_t *mctx;
2499      re_dfastate_t *state;
2500 {
2501   const re_dfa_t *const dfa = mctx->dfa;
2502   re_node_set next_nodes;
2503   re_dfastate_t *next_state;
2504   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2505   unsigned int context;
2506
2507   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2508   if (BE (*err != REG_NOERROR, 0))
2509     return NULL;
2510   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2511     {
2512       int cur_node = state->nodes.elems[node_cnt];
2513       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2514         {
2515           *err = re_node_set_merge (&next_nodes,
2516                                     dfa->eclosures + dfa->nexts[cur_node]);
2517           if (BE (*err != REG_NOERROR, 0))
2518             {
2519               re_node_set_free (&next_nodes);
2520               return NULL;
2521             }
2522         }
2523     }
2524   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2525   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2526   /* We don't need to check errors here, since the return value of
2527      this function is next_state and ERR is already set.  */
2528
2529   re_node_set_free (&next_nodes);
2530   re_string_skip_bytes (&mctx->input, 1);
2531   return next_state;
2532 }
2533 #endif
2534
2535 #ifdef RE_ENABLE_I18N
2536 static reg_errcode_t
2537 transit_state_mb (mctx, pstate)
2538     re_match_context_t *mctx;
2539     re_dfastate_t *pstate;
2540 {
2541   const re_dfa_t *const dfa = mctx->dfa;
2542   reg_errcode_t err;
2543   int i;
2544
2545   for (i = 0; i < pstate->nodes.nelem; ++i)
2546     {
2547       re_node_set dest_nodes, *new_nodes;
2548       int cur_node_idx = pstate->nodes.elems[i];
2549       int naccepted, dest_idx;
2550       unsigned int context;
2551       re_dfastate_t *dest_state;
2552
2553       if (!dfa->nodes[cur_node_idx].accept_mb)
2554         continue;
2555
2556       if (dfa->nodes[cur_node_idx].constraint)
2557         {
2558           context = re_string_context_at (&mctx->input,
2559                                           re_string_cur_idx (&mctx->input),
2560                                           mctx->eflags);
2561           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2562                                            context))
2563             continue;
2564         }
2565
2566       /* How many bytes the node can accept?  */
2567       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2568                                            re_string_cur_idx (&mctx->input));
2569       if (naccepted == 0)
2570         continue;
2571
2572       /* The node can accepts `naccepted' bytes.  */
2573       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2574       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2575                                : mctx->max_mb_elem_len);
2576       err = clean_state_log_if_needed (mctx, dest_idx);
2577       if (BE (err != REG_NOERROR, 0))
2578         return err;
2579 #ifdef DEBUG
2580       assert (dfa->nexts[cur_node_idx] != -1);
2581 #endif
2582       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2583
2584       dest_state = mctx->state_log[dest_idx];
2585       if (dest_state == NULL)
2586         dest_nodes = *new_nodes;
2587       else
2588         {
2589           err = re_node_set_init_union (&dest_nodes,
2590                                         dest_state->entrance_nodes, new_nodes);
2591           if (BE (err != REG_NOERROR, 0))
2592             return err;
2593         }
2594       context = re_string_context_at (&mctx->input, dest_idx - 1,
2595                                       mctx->eflags);
2596       mctx->state_log[dest_idx]
2597         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2598       if (dest_state != NULL)
2599         re_node_set_free (&dest_nodes);
2600       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2601         return err;
2602     }
2603   return REG_NOERROR;
2604 }
2605 #endif /* RE_ENABLE_I18N */
2606
2607 static reg_errcode_t
2608 transit_state_bkref (mctx, nodes)
2609     re_match_context_t *mctx;
2610     const re_node_set *nodes;
2611 {
2612   const re_dfa_t *const dfa = mctx->dfa;
2613   reg_errcode_t err;
2614   int i;
2615   int cur_str_idx = re_string_cur_idx (&mctx->input);
2616
2617   for (i = 0; i < nodes->nelem; ++i)
2618     {
2619       int dest_str_idx, prev_nelem, bkc_idx;
2620       int node_idx = nodes->elems[i];
2621       unsigned int context;
2622       const re_token_t *node = dfa->nodes + node_idx;
2623       re_node_set *new_dest_nodes;
2624
2625       /* Check whether `node' is a backreference or not.  */
2626       if (node->type != OP_BACK_REF)
2627         continue;
2628
2629       if (node->constraint)
2630         {
2631           context = re_string_context_at (&mctx->input, cur_str_idx,
2632                                           mctx->eflags);
2633           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2634             continue;
2635         }
2636
2637       /* `node' is a backreference.
2638          Check the substring which the substring matched.  */
2639       bkc_idx = mctx->nbkref_ents;
2640       err = get_subexp (mctx, node_idx, cur_str_idx);
2641       if (BE (err != REG_NOERROR, 0))
2642         goto free_return;
2643
2644       /* And add the epsilon closures (which is `new_dest_nodes') of
2645          the backreference to appropriate state_log.  */
2646 #ifdef DEBUG
2647       assert (dfa->nexts[node_idx] != -1);
2648 #endif
2649       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2650         {
2651           int subexp_len;
2652           re_dfastate_t *dest_state;
2653           struct re_backref_cache_entry *bkref_ent;
2654           bkref_ent = mctx->bkref_ents + bkc_idx;
2655           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2656             continue;
2657           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2658           new_dest_nodes = (subexp_len == 0
2659                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2660                             : dfa->eclosures + dfa->nexts[node_idx]);
2661           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2662                           - bkref_ent->subexp_from);
2663           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2664                                           mctx->eflags);
2665           dest_state = mctx->state_log[dest_str_idx];
2666           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2667                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2668           /* Add `new_dest_node' to state_log.  */
2669           if (dest_state == NULL)
2670             {
2671               mctx->state_log[dest_str_idx]
2672                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2673                                             context);
2674               if (BE (mctx->state_log[dest_str_idx] == NULL
2675                       && err != REG_NOERROR, 0))
2676                 goto free_return;
2677             }
2678           else
2679             {
2680               re_node_set dest_nodes;
2681               err = re_node_set_init_union (&dest_nodes,
2682                                             dest_state->entrance_nodes,
2683                                             new_dest_nodes);
2684               if (BE (err != REG_NOERROR, 0))
2685                 {
2686                   re_node_set_free (&dest_nodes);
2687                   goto free_return;
2688                 }
2689               mctx->state_log[dest_str_idx]
2690                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2691               re_node_set_free (&dest_nodes);
2692               if (BE (mctx->state_log[dest_str_idx] == NULL
2693                       && err != REG_NOERROR, 0))
2694                 goto free_return;
2695             }
2696           /* We need to check recursively if the backreference can epsilon
2697              transit.  */
2698           if (subexp_len == 0
2699               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2700             {
2701               err = check_subexp_matching_top (mctx, new_dest_nodes,
2702                                                cur_str_idx);
2703               if (BE (err != REG_NOERROR, 0))
2704                 goto free_return;
2705               err = transit_state_bkref (mctx, new_dest_nodes);
2706               if (BE (err != REG_NOERROR, 0))
2707                 goto free_return;
2708             }
2709         }
2710     }
2711   err = REG_NOERROR;
2712  free_return:
2713   return err;
2714 }
2715
2716 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2717    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2718    Note that we might collect inappropriate candidates here.
2719    However, the cost of checking them strictly here is too high, then we
2720    delay these checking for prune_impossible_nodes().  */
2721
2722 static reg_errcode_t
2723 get_subexp (mctx, bkref_node, bkref_str_idx)
2724      re_match_context_t *mctx;
2725      int bkref_node, bkref_str_idx;
2726 {
2727   const re_dfa_t *const dfa = mctx->dfa;
2728   int subexp_num, sub_top_idx;
2729   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2730   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2731   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2732   if (cache_idx != -1)
2733     {
2734       const struct re_backref_cache_entry *entry
2735         = mctx->bkref_ents + cache_idx;
2736       do
2737         if (entry->node == bkref_node)
2738           return REG_NOERROR; /* We already checked it.  */
2739       while (entry++->more);
2740     }
2741
2742   subexp_num = dfa->nodes[bkref_node].opr.idx;
2743
2744   /* For each sub expression  */
2745   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2746     {
2747       reg_errcode_t err;
2748       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2749       re_sub_match_last_t *sub_last;
2750       int sub_last_idx, sl_str, bkref_str_off;
2751
2752       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2753         continue; /* It isn't related.  */
2754
2755       sl_str = sub_top->str_idx;
2756       bkref_str_off = bkref_str_idx;
2757       /* At first, check the last node of sub expressions we already
2758          evaluated.  */
2759       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2760         {
2761           int sl_str_diff;
2762           sub_last = sub_top->lasts[sub_last_idx];
2763           sl_str_diff = sub_last->str_idx - sl_str;
2764           /* The matched string by the sub expression match with the substring
2765              at the back reference?  */
2766           if (sl_str_diff > 0)
2767             {
2768               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2769                 {
2770                   /* Not enough chars for a successful match.  */
2771                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2772                     break;
2773
2774                   err = clean_state_log_if_needed (mctx,
2775                                                    bkref_str_off
2776                                                    + sl_str_diff);
2777                   if (BE (err != REG_NOERROR, 0))
2778                     return err;
2779                   buf = (const char *) re_string_get_buffer (&mctx->input);
2780                 }
2781               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2782                 /* We don't need to search this sub expression any more.  */
2783                 break;
2784             }
2785           bkref_str_off += sl_str_diff;
2786           sl_str += sl_str_diff;
2787           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2788                                 bkref_str_idx);
2789
2790           /* Reload buf, since the preceding call might have reallocated
2791              the buffer.  */
2792           buf = (const char *) re_string_get_buffer (&mctx->input);
2793
2794           if (err == REG_NOMATCH)
2795             continue;
2796           if (BE (err != REG_NOERROR, 0))
2797             return err;
2798         }
2799
2800       if (sub_last_idx < sub_top->nlasts)
2801         continue;
2802       if (sub_last_idx > 0)
2803         ++sl_str;
2804       /* Then, search for the other last nodes of the sub expression.  */
2805       for (; sl_str <= bkref_str_idx; ++sl_str)
2806         {
2807           int cls_node, sl_str_off;
2808           const re_node_set *nodes;
2809           sl_str_off = sl_str - sub_top->str_idx;
2810           /* The matched string by the sub expression match with the substring
2811              at the back reference?  */
2812           if (sl_str_off > 0)
2813             {
2814               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2815                 {
2816                   /* If we are at the end of the input, we cannot match.  */
2817                   if (bkref_str_off >= mctx->input.len)
2818                     break;
2819
2820                   err = extend_buffers (mctx);
2821                   if (BE (err != REG_NOERROR, 0))
2822                     return err;
2823
2824                   buf = (const char *) re_string_get_buffer (&mctx->input);
2825                 }
2826               if (buf [bkref_str_off++] != buf[sl_str - 1])
2827                 break; /* We don't need to search this sub expression
2828                           any more.  */
2829             }
2830           if (mctx->state_log[sl_str] == NULL)
2831             continue;
2832           /* Does this state have a ')' of the sub expression?  */
2833           nodes = &mctx->state_log[sl_str]->nodes;
2834           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2835                                        OP_CLOSE_SUBEXP);
2836           if (cls_node == -1)
2837             continue; /* No.  */
2838           if (sub_top->path == NULL)
2839             {
2840               sub_top->path = calloc (sizeof (state_array_t),
2841                                       sl_str - sub_top->str_idx + 1);
2842               if (sub_top->path == NULL)
2843                 return REG_ESPACE;
2844             }
2845           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2846              in the current context?  */
2847           err = check_arrival (mctx, sub_top->path, sub_top->node,
2848                                sub_top->str_idx, cls_node, sl_str,
2849                                OP_CLOSE_SUBEXP);
2850           if (err == REG_NOMATCH)
2851               continue;
2852           if (BE (err != REG_NOERROR, 0))
2853               return err;
2854           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2855           if (BE (sub_last == NULL, 0))
2856             return REG_ESPACE;
2857           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2858                                 bkref_str_idx);
2859           if (err == REG_NOMATCH)
2860             continue;
2861         }
2862     }
2863   return REG_NOERROR;
2864 }
2865
2866 /* Helper functions for get_subexp().  */
2867
2868 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2869    If it can arrive, register the sub expression expressed with SUB_TOP
2870    and SUB_LAST.  */
2871
2872 static reg_errcode_t
2873 get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2874      re_match_context_t *mctx;
2875      const re_sub_match_top_t *sub_top;
2876      re_sub_match_last_t *sub_last;
2877      int bkref_node, bkref_str;
2878 {
2879   reg_errcode_t err;
2880   int to_idx;
2881   /* Can the subexpression arrive the back reference?  */
2882   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2883                        sub_last->str_idx, bkref_node, bkref_str,
2884                        OP_OPEN_SUBEXP);
2885   if (err != REG_NOERROR)
2886     return err;
2887   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2888                              sub_last->str_idx);
2889   if (BE (err != REG_NOERROR, 0))
2890     return err;
2891   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2892   return clean_state_log_if_needed (mctx, to_idx);
2893 }
2894
2895 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2896    Search '(' if FL_OPEN, or search ')' otherwise.
2897    TODO: This function isn't efficient...
2898          Because there might be more than one nodes whose types are
2899          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2900          nodes.
2901          E.g. RE: (a){2}  */
2902
2903 static int
2904 find_subexp_node (dfa, nodes, subexp_idx, type)
2905      const re_dfa_t *dfa;
2906      const re_node_set *nodes;
2907      int subexp_idx, type;
2908 {
2909   int cls_idx;
2910   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2911     {
2912       int cls_node = nodes->elems[cls_idx];
2913       const re_token_t *node = dfa->nodes + cls_node;
2914       if (node->type == type
2915           && node->opr.idx == subexp_idx)
2916         return cls_node;
2917     }
2918   return -1;
2919 }
2920
2921 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2922    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2923    heavily reused.
2924    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2925
2926 static reg_errcode_t
2927 check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2928                type)
2929      re_match_context_t *mctx;
2930      state_array_t *path;
2931      int top_node, top_str, last_node, last_str, type;
2932 {
2933   const re_dfa_t *const dfa = mctx->dfa;
2934   reg_errcode_t err;
2935   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2936   re_dfastate_t *cur_state = NULL;
2937   re_node_set *cur_nodes, next_nodes;
2938   re_dfastate_t **backup_state_log;
2939   unsigned int context;
2940
2941   subexp_num = dfa->nodes[top_node].opr.idx;
2942   /* Extend the buffer if we need.  */
2943   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2944     {
2945       re_dfastate_t **new_array;
2946       int old_alloc = path->alloc;
2947       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2948       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2949       if (BE (new_array == NULL, 0))
2950         {
2951           path->alloc = old_alloc;
2952           return REG_ESPACE;
2953         }
2954       path->array = new_array;
2955       memset (new_array + old_alloc, '\0',
2956               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2957     }
2958
2959   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2960
2961   /* Temporary modify MCTX.  */
2962   backup_state_log = mctx->state_log;
2963   backup_cur_idx = mctx->input.cur_idx;
2964   mctx->state_log = path->array;
2965   mctx->input.cur_idx = str_idx;
2966
2967   /* Setup initial node set.  */
2968   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2969   if (str_idx == top_str)
2970     {
2971       err = re_node_set_init_1 (&next_nodes, top_node);
2972       if (BE (err != REG_NOERROR, 0))
2973         return err;
2974       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2975       if (BE (err != REG_NOERROR, 0))
2976         {
2977           re_node_set_free (&next_nodes);
2978           return err;
2979         }
2980     }
2981   else
2982     {
2983       cur_state = mctx->state_log[str_idx];
2984       if (cur_state && cur_state->has_backref)
2985         {
2986           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2987           if (BE (err != REG_NOERROR, 0))
2988             return err;
2989         }
2990       else
2991         re_node_set_init_empty (&next_nodes);
2992     }
2993   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2994     {
2995       if (next_nodes.nelem)
2996         {
2997           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2998                                     subexp_num, type);
2999           if (BE (err != REG_NOERROR, 0))
3000             {
3001               re_node_set_free (&next_nodes);
3002               return err;
3003             }
3004         }
3005       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3006       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3007         {
3008           re_node_set_free (&next_nodes);
3009           return err;
3010         }
3011       mctx->state_log[str_idx] = cur_state;
3012     }
3013
3014   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
3015     {
3016       re_node_set_empty (&next_nodes);
3017       if (mctx->state_log[str_idx + 1])
3018         {
3019           err = re_node_set_merge (&next_nodes,
3020                                    &mctx->state_log[str_idx + 1]->nodes);
3021           if (BE (err != REG_NOERROR, 0))
3022             {
3023               re_node_set_free (&next_nodes);
3024               return err;
3025             }
3026         }
3027       if (cur_state)
3028         {
3029           err = check_arrival_add_next_nodes (mctx, str_idx,
3030                                               &cur_state->non_eps_nodes,
3031                                               &next_nodes);
3032           if (BE (err != REG_NOERROR, 0))
3033             {
3034               re_node_set_free (&next_nodes);
3035               return err;
3036             }
3037         }
3038       ++str_idx;
3039       if (next_nodes.nelem)
3040         {
3041           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3042           if (BE (err != REG_NOERROR, 0))
3043             {
3044               re_node_set_free (&next_nodes);
3045               return err;
3046             }
3047           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3048                                     subexp_num, type);
3049           if (BE (err != REG_NOERROR, 0))
3050             {
3051               re_node_set_free (&next_nodes);
3052               return err;
3053             }
3054         }
3055       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3056       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3057       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3058         {
3059           re_node_set_free (&next_nodes);
3060           return err;
3061         }
3062       mctx->state_log[str_idx] = cur_state;
3063       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3064     }
3065   re_node_set_free (&next_nodes);
3066   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3067                : &mctx->state_log[last_str]->nodes);
3068   path->next_idx = str_idx;
3069
3070   /* Fix MCTX.  */
3071   mctx->state_log = backup_state_log;
3072   mctx->input.cur_idx = backup_cur_idx;
3073
3074   /* Then check the current node set has the node LAST_NODE.  */
3075   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3076     return REG_NOERROR;
3077
3078   return REG_NOMATCH;
3079 }
3080
3081 /* Helper functions for check_arrival.  */
3082
3083 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3084    to NEXT_NODES.
3085    TODO: This function is similar to the functions transit_state*(),
3086          however this function has many additional works.
3087          Can't we unify them?  */
3088
3089 static reg_errcode_t
3090 check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
3091      re_match_context_t *mctx;
3092      int str_idx;
3093      re_node_set *cur_nodes, *next_nodes;
3094 {
3095   const re_dfa_t *const dfa = mctx->dfa;
3096   int result;
3097   int cur_idx;
3098   reg_errcode_t err;
3099   re_node_set union_set;
3100   re_node_set_init_empty (&union_set);
3101   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3102     {
3103       int naccepted = 0;
3104       int cur_node = cur_nodes->elems[cur_idx];
3105 #ifdef DEBUG
3106       re_token_type_t type = dfa->nodes[cur_node].type;
3107       assert (!IS_EPSILON_NODE (type));
3108 #endif
3109 #ifdef RE_ENABLE_I18N
3110       /* If the node may accept `multi byte'.  */
3111       if (dfa->nodes[cur_node].accept_mb)
3112         {
3113           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3114                                                str_idx);
3115           if (naccepted > 1)
3116             {
3117               re_dfastate_t *dest_state;
3118               int next_node = dfa->nexts[cur_node];
3119               int next_idx = str_idx + naccepted;
3120               dest_state = mctx->state_log[next_idx];
3121               re_node_set_empty (&union_set);
3122               if (dest_state)
3123                 {
3124                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3125                   if (BE (err != REG_NOERROR, 0))
3126                     {
3127                       re_node_set_free (&union_set);
3128                       return err;
3129                     }
3130                 }
3131               result = re_node_set_insert (&union_set, next_node);
3132               if (BE (result < 0, 0))
3133                 {
3134                   re_node_set_free (&union_set);
3135                   return REG_ESPACE;
3136                 }
3137               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3138                                                             &union_set);
3139               if (BE (mctx->state_log[next_idx] == NULL
3140                       && err != REG_NOERROR, 0))
3141                 {
3142                   re_node_set_free (&union_set);
3143                   return err;
3144                 }
3145             }
3146         }
3147 #endif /* RE_ENABLE_I18N */
3148       if (naccepted
3149           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3150         {
3151           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3152           if (BE (result < 0, 0))
3153             {
3154               re_node_set_free (&union_set);
3155               return REG_ESPACE;
3156             }
3157         }
3158     }
3159   re_node_set_free (&union_set);
3160   return REG_NOERROR;
3161 }
3162
3163 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3164    CUR_NODES, however exclude the nodes which are:
3165     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3166     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3167 */
3168
3169 static reg_errcode_t
3170 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3171      const re_dfa_t *dfa;
3172      re_node_set *cur_nodes;
3173      int ex_subexp, type;
3174 {
3175   reg_errcode_t err;
3176   int idx, outside_node;
3177   re_node_set new_nodes;
3178 #ifdef DEBUG
3179   assert (cur_nodes->nelem);
3180 #endif
3181   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3182   if (BE (err != REG_NOERROR, 0))
3183     return err;
3184   /* Create a new node set NEW_NODES with the nodes which are epsilon
3185      closures of the node in CUR_NODES.  */
3186
3187   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3188     {
3189       int cur_node = cur_nodes->elems[idx];
3190       const re_node_set *eclosure = dfa->eclosures + cur_node;
3191       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3192       if (outside_node == -1)
3193         {
3194           /* There are no problematic nodes, just merge them.  */
3195           err = re_node_set_merge (&new_nodes, eclosure);
3196           if (BE (err != REG_NOERROR, 0))
3197             {
3198               re_node_set_free (&new_nodes);
3199               return err;
3200             }
3201         }
3202       else
3203         {
3204           /* There are problematic nodes, re-calculate incrementally.  */
3205           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3206                                               ex_subexp, type);
3207           if (BE (err != REG_NOERROR, 0))
3208             {
3209               re_node_set_free (&new_nodes);
3210               return err;
3211             }
3212         }
3213     }
3214   re_node_set_free (cur_nodes);
3215   *cur_nodes = new_nodes;
3216   return REG_NOERROR;
3217 }
3218
3219 /* Helper function for check_arrival_expand_ecl.
3220    Check incrementally the epsilon closure of TARGET, and if it isn't
3221    problematic append it to DST_NODES.  */
3222
3223 static reg_errcode_t
3224 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3225      const re_dfa_t *dfa;
3226      int target, ex_subexp, type;
3227      re_node_set *dst_nodes;
3228 {
3229   int cur_node;
3230   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3231     {
3232       int err;
3233
3234       if (dfa->nodes[cur_node].type == type
3235           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3236         {
3237           if (type == OP_CLOSE_SUBEXP)
3238             {
3239               err = re_node_set_insert (dst_nodes, cur_node);
3240               if (BE (err == -1, 0))
3241                 return REG_ESPACE;
3242             }
3243           break;
3244         }
3245       err = re_node_set_insert (dst_nodes, cur_node);
3246       if (BE (err == -1, 0))
3247         return REG_ESPACE;
3248       if (dfa->edests[cur_node].nelem == 0)
3249         break;
3250       if (dfa->edests[cur_node].nelem == 2)
3251         {
3252           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3253                                               dfa->edests[cur_node].elems[1],
3254                                               ex_subexp, type);
3255           if (BE (err != REG_NOERROR, 0))
3256             return err;
3257         }
3258       cur_node = dfa->edests[cur_node].elems[0];
3259     }
3260   return REG_NOERROR;
3261 }
3262
3263
3264 /* For all the back references in the current state, calculate the
3265    destination of the back references by the appropriate entry
3266    in MCTX->BKREF_ENTS.  */
3267
3268 static reg_errcode_t
3269 expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
3270                     type)
3271      re_match_context_t *mctx;
3272      int cur_str, subexp_num, type;
3273      re_node_set *cur_nodes;
3274 {
3275   const re_dfa_t *const dfa = mctx->dfa;
3276   reg_errcode_t err;
3277   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3278   struct re_backref_cache_entry *ent;
3279
3280   if (cache_idx_start == -1)
3281     return REG_NOERROR;
3282
3283  restart:
3284   ent = mctx->bkref_ents + cache_idx_start;
3285   do
3286     {
3287       int to_idx, next_node;
3288
3289       /* Is this entry ENT is appropriate?  */
3290       if (!re_node_set_contains (cur_nodes, ent->node))
3291         continue; /* No.  */
3292
3293       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3294       /* Calculate the destination of the back reference, and append it
3295          to MCTX->STATE_LOG.  */
3296       if (to_idx == cur_str)
3297         {
3298           /* The backreference did epsilon transit, we must re-check all the
3299              node in the current state.  */
3300           re_node_set new_dests;
3301           reg_errcode_t err2, err3;
3302           next_node = dfa->edests[ent->node].elems[0];
3303           if (re_node_set_contains (cur_nodes, next_node))
3304             continue;
3305           err = re_node_set_init_1 (&new_dests, next_node);
3306           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3307           err3 = re_node_set_merge (cur_nodes, &new_dests);
3308           re_node_set_free (&new_dests);
3309           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3310                   || err3 != REG_NOERROR, 0))
3311             {
3312               err = (err != REG_NOERROR ? err
3313                      : (err2 != REG_NOERROR ? err2 : err3));
3314               return err;
3315             }
3316           /* TODO: It is still inefficient...  */
3317           goto restart;
3318         }
3319       else
3320         {
3321           re_node_set union_set;
3322           next_node = dfa->nexts[ent->node];
3323           if (mctx->state_log[to_idx])
3324             {
3325               int ret;
3326               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3327                                         next_node))
3328                 continue;
3329               err = re_node_set_init_copy (&union_set,
3330                                            &mctx->state_log[to_idx]->nodes);
3331               ret = re_node_set_insert (&union_set, next_node);
3332               if (BE (err != REG_NOERROR || ret < 0, 0))
3333                 {
3334                   re_node_set_free (&union_set);
3335                   err = err != REG_NOERROR ? err : REG_ESPACE;
3336                   return err;
3337                 }
3338             }
3339           else
3340             {
3341               err = re_node_set_init_1 (&union_set, next_node);
3342               if (BE (err != REG_NOERROR, 0))
3343                 return err;
3344             }
3345           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3346           re_node_set_free (&union_set);
3347           if (BE (mctx->state_log[to_idx] == NULL
3348                   && err != REG_NOERROR, 0))
3349             return err;
3350         }
3351     }
3352   while (ent++->more);
3353   return REG_NOERROR;
3354 }
3355
3356 /* Build transition table for the state.
3357    Return 1 if succeeded, otherwise return NULL.  */
3358
3359 static int
3360 build_trtable (dfa, state)
3361      const re_dfa_t *dfa;
3362      re_dfastate_t *state;
3363 {
3364   reg_errcode_t err;
3365   int i, j, ch, need_word_trtable = 0;
3366   unsigned int elem, mask;
3367   int dests_node_malloced = 0, dest_states_malloced = 0;
3368   int ndests; /* Number of the destination states from `state'.  */
3369   re_dfastate_t **trtable;
3370   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3371   re_node_set follows, *dests_node;
3372   bitset *dests_ch;
3373   bitset acceptable;
3374
3375   /* We build DFA states which corresponds to the destination nodes
3376      from `state'.  `dests_node[i]' represents the nodes which i-th
3377      destination state contains, and `dests_ch[i]' represents the
3378      characters which i-th destination state accepts.  */
3379   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3380     dests_node = (re_node_set *)
3381       alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3382   else
3383     {
3384       dests_node = (re_node_set *)
3385         malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3386       if (BE (dests_node == NULL, 0))
3387         return 0;
3388       dests_node_malloced = 1;
3389     }
3390   dests_ch = (bitset *) (dests_node + SBC_MAX);
3391
3392   /* Initialize transiton table.  */
3393   state->word_trtable = state->trtable = NULL;
3394
3395   /* At first, group all nodes belonging to `state' into several
3396      destinations.  */
3397   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3398   if (BE (ndests <= 0, 0))
3399     {
3400       if (dests_node_malloced)
3401         free (dests_node);
3402       /* Return 0 in case of an error, 1 otherwise.  */
3403       if (ndests == 0)
3404         {
3405           state->trtable = (re_dfastate_t **)
3406             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3407           return 1;
3408         }
3409       return 0;
3410     }
3411
3412   err = re_node_set_alloc (&follows, ndests + 1);
3413   if (BE (err != REG_NOERROR, 0))
3414     goto out_free;
3415
3416   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3417                          + ndests * 3 * sizeof (re_dfastate_t *)))
3418     dest_states = (re_dfastate_t **)
3419       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3420   else
3421     {
3422       dest_states = (re_dfastate_t **)
3423         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3424       if (BE (dest_states == NULL, 0))
3425         {
3426 out_free:
3427           if (dest_states_malloced)
3428             free (dest_states);
3429           re_node_set_free (&follows);
3430           for (i = 0; i < ndests; ++i)
3431             re_node_set_free (dests_node + i);
3432           if (dests_node_malloced)
3433             free (dests_node);
3434           return 0;
3435         }
3436       dest_states_malloced = 1;
3437     }
3438   dest_states_word = dest_states + ndests;
3439   dest_states_nl = dest_states_word + ndests;
3440   bitset_empty (acceptable);
3441
3442   /* Then build the states for all destinations.  */
3443   for (i = 0; i < ndests; ++i)
3444     {
3445       int next_node;
3446       re_node_set_empty (&follows);
3447       /* Merge the follows of this destination states.  */
3448       for (j = 0; j < dests_node[i].nelem; ++j)
3449         {
3450           next_node = dfa->nexts[dests_node[i].elems[j]];
3451           if (next_node != -1)
3452             {
3453               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3454               if (BE (err != REG_NOERROR, 0))
3455                 goto out_free;
3456             }
3457         }
3458       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3459       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3460         goto out_free;
3461       /* If the new state has context constraint,
3462          build appropriate states for these contexts.  */
3463       if (dest_states[i]->has_constraint)
3464         {
3465           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3466                                                           CONTEXT_WORD);
3467           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3468             goto out_free;
3469
3470           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3471             need_word_trtable = 1;
3472
3473           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3474                                                         CONTEXT_NEWLINE);
3475           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3476             goto out_free;
3477         }
3478       else
3479         {
3480           dest_states_word[i] = dest_states[i];
3481           dest_states_nl[i] = dest_states[i];
3482         }
3483       bitset_merge (acceptable, dests_ch[i]);
3484     }
3485
3486   if (!BE (need_word_trtable, 0))
3487     {
3488       /* We don't care about whether the following character is a word
3489          character, or we are in a single-byte character set so we can
3490          discern by looking at the character code: allocate a
3491          256-entry transition table.  */
3492       trtable = state->trtable =
3493         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3494       if (BE (trtable == NULL, 0))
3495         goto out_free;
3496
3497       /* For all characters ch...:  */
3498       for (i = 0; i < BITSET_UINTS; ++i)
3499         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3500              elem;
3501              mask <<= 1, elem >>= 1, ++ch)
3502           if (BE (elem & 1, 0))
3503             {
3504               /* There must be exactly one destination which accepts
3505                  character ch.  See group_nodes_into_DFAstates.  */
3506               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3507                 ;
3508
3509               /* j-th destination accepts the word character ch.  */
3510               if (dfa->word_char[i] & mask)
3511                 trtable[ch] = dest_states_word[j];
3512               else
3513                 trtable[ch] = dest_states[j];
3514             }
3515     }
3516   else
3517     {
3518       /* We care about whether the following character is a word
3519          character, and we are in a multi-byte character set: discern
3520          by looking at the character code: build two 256-entry
3521          transition tables, one starting at trtable[0] and one
3522          starting at trtable[SBC_MAX].  */
3523       trtable = state->word_trtable =
3524         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3525       if (BE (trtable == NULL, 0))
3526         goto out_free;
3527
3528       /* For all characters ch...:  */
3529       for (i = 0; i < BITSET_UINTS; ++i)
3530         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3531              elem;
3532              mask <<= 1, elem >>= 1, ++ch)
3533           if (BE (elem & 1, 0))
3534             {
3535               /* There must be exactly one destination which accepts
3536                  character ch.  See group_nodes_into_DFAstates.  */
3537               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3538                 ;
3539
3540               /* j-th destination accepts the word character ch.  */
3541               trtable[ch] = dest_states[j];
3542               trtable[ch + SBC_MAX] = dest_states_word[j];
3543             }
3544     }
3545
3546   /* new line */
3547   if (bitset_contain (acceptable, NEWLINE_CHAR))
3548     {
3549       /* The current state accepts newline character.  */
3550       for (j = 0; j < ndests; ++j)
3551         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3552           {
3553             /* k-th destination accepts newline character.  */
3554             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3555             if (need_word_trtable)
3556               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3557             /* There must be only one destination which accepts
3558                newline.  See group_nodes_into_DFAstates.  */
3559             break;
3560           }
3561     }
3562
3563   if (dest_states_malloced)
3564     free (dest_states);
3565
3566   re_node_set_free (&follows);
3567   for (i = 0; i < ndests; ++i)
3568     re_node_set_free (dests_node + i);
3569
3570   if (dests_node_malloced)
3571     free (dests_node);
3572
3573   return 1;
3574 }
3575
3576 /* Group all nodes belonging to STATE into several destinations.
3577    Then for all destinations, set the nodes belonging to the destination
3578    to DESTS_NODE[i] and set the characters accepted by the destination
3579    to DEST_CH[i].  This function return the number of destinations.  */
3580
3581 static int
3582 group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3583      const re_dfa_t *dfa;
3584      const re_dfastate_t *state;
3585      re_node_set *dests_node;
3586      bitset *dests_ch;
3587 {
3588   reg_errcode_t err;
3589   int result;
3590   int i, j, k;
3591   int ndests; /* Number of the destinations from `state'.  */
3592   bitset accepts; /* Characters a node can accept.  */
3593   const re_node_set *cur_nodes = &state->nodes;
3594   bitset_empty (accepts);
3595   ndests = 0;
3596
3597   /* For all the nodes belonging to `state',  */
3598   for (i = 0; i < cur_nodes->nelem; ++i)
3599     {
3600       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3601       re_token_type_t type = node->type;
3602       unsigned int constraint = node->constraint;
3603
3604       /* Enumerate all single byte character this node can accept.  */
3605       if (type == CHARACTER)
3606         bitset_set (accepts, node->opr.c);
3607       else if (type == SIMPLE_BRACKET)
3608         {
3609           bitset_merge (accepts, node->opr.sbcset);
3610         }
3611       else if (type == OP_PERIOD)
3612         {
3613 #ifdef RE_ENABLE_I18N
3614           if (dfa->mb_cur_max > 1)
3615             bitset_merge (accepts, dfa->sb_char);
3616           else
3617 #endif
3618             bitset_set_all (accepts);
3619           if (!(dfa->syntax & RE_DOT_NEWLINE))
3620             bitset_clear (accepts, '\n');
3621           if (dfa->syntax & RE_DOT_NOT_NULL)
3622             bitset_clear (accepts, '\0');
3623         }
3624 #ifdef RE_ENABLE_I18N
3625       else if (type == OP_UTF8_PERIOD)
3626         {
3627           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3628           if (!(dfa->syntax & RE_DOT_NEWLINE))
3629             bitset_clear (accepts, '\n');
3630           if (dfa->syntax & RE_DOT_NOT_NULL)
3631             bitset_clear (accepts, '\0');
3632         }
3633 #endif
3634       else
3635         continue;
3636
3637       /* Check the `accepts' and sift the characters which are not
3638          match it the context.  */
3639       if (constraint)
3640         {
3641           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3642             {
3643               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3644               bitset_empty (accepts);
3645               if (accepts_newline)
3646                 bitset_set (accepts, NEWLINE_CHAR);
3647               else
3648                 continue;
3649             }
3650           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3651             {
3652               bitset_empty (accepts);
3653               continue;
3654             }
3655
3656           if (constraint & NEXT_WORD_CONSTRAINT)
3657             {
3658               unsigned int any_set = 0;
3659               if (type == CHARACTER && !node->word_char)
3660                 {
3661                   bitset_empty (accepts);
3662                   continue;
3663                 }
3664 #ifdef RE_ENABLE_I18N
3665               if (dfa->mb_cur_max > 1)
3666                 for (j = 0; j < BITSET_UINTS; ++j)
3667                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3668               else
3669 #endif
3670                 for (j = 0; j < BITSET_UINTS; ++j)
3671                   any_set |= (accepts[j] &= dfa->word_char[j]);
3672               if (!any_set)
3673                 continue;
3674             }
3675           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3676             {
3677               unsigned int any_set = 0;
3678               if (type == CHARACTER && node->word_char)
3679                 {
3680                   bitset_empty (accepts);
3681                   continue;
3682                 }
3683 #ifdef RE_ENABLE_I18N
3684               if (dfa->mb_cur_max > 1)
3685                 for (j = 0; j < BITSET_UINTS; ++j)
3686                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3687               else
3688 #endif
3689                 for (j = 0; j < BITSET_UINTS; ++j)
3690                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3691               if (!any_set)
3692                 continue;
3693             }
3694         }
3695
3696       /* Then divide `accepts' into DFA states, or create a new
3697          state.  Above, we make sure that accepts is not empty.  */
3698       for (j = 0; j < ndests; ++j)
3699         {
3700           bitset intersec; /* Intersection sets, see below.  */
3701           bitset remains;
3702           /* Flags, see below.  */
3703           int has_intersec, not_subset, not_consumed;
3704
3705           /* Optimization, skip if this state doesn't accept the character.  */
3706           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3707             continue;
3708
3709           /* Enumerate the intersection set of this state and `accepts'.  */
3710           has_intersec = 0;
3711           for (k = 0; k < BITSET_UINTS; ++k)
3712             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3713           /* And skip if the intersection set is empty.  */
3714           if (!has_intersec)
3715             continue;
3716
3717           /* Then check if this state is a subset of `accepts'.  */
3718           not_subset = not_consumed = 0;
3719           for (k = 0; k < BITSET_UINTS; ++k)
3720             {
3721               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3722               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3723             }
3724
3725           /* If this state isn't a subset of `accepts', create a
3726              new group state, which has the `remains'. */
3727           if (not_subset)
3728             {
3729               bitset_copy (dests_ch[ndests], remains);
3730               bitset_copy (dests_ch[j], intersec);
3731               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3732               if (BE (err != REG_NOERROR, 0))
3733                 goto error_return;
3734               ++ndests;
3735             }
3736
3737           /* Put the position in the current group. */
3738           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3739           if (BE (result < 0, 0))
3740             goto error_return;
3741
3742           /* If all characters are consumed, go to next node. */
3743           if (!not_consumed)
3744             break;
3745         }
3746       /* Some characters remain, create a new group. */
3747       if (j == ndests)
3748         {
3749           bitset_copy (dests_ch[ndests], accepts);
3750           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3751           if (BE (err != REG_NOERROR, 0))
3752             goto error_return;
3753           ++ndests;
3754           bitset_empty (accepts);
3755         }
3756     }
3757   return ndests;
3758  error_return:
3759   for (j = 0; j < ndests; ++j)
3760     re_node_set_free (dests_node + j);
3761   return -1;
3762 }
3763
3764 #ifdef RE_ENABLE_I18N
3765 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3766    Return the number of the bytes the node accepts.
3767    STR_IDX is the current index of the input string.
3768
3769    This function handles the nodes which can accept one character, or
3770    one collating element like '.', '[a-z]', opposite to the other nodes
3771    can only accept one byte.  */
3772
3773 static int
3774 check_node_accept_bytes (dfa, node_idx, input, str_idx)
3775      const re_dfa_t *dfa;
3776      int node_idx, str_idx;
3777      const re_string_t *input;
3778 {
3779   const re_token_t *node = dfa->nodes + node_idx;
3780   int char_len, elem_len;
3781   int i;
3782
3783   if (BE (node->type == OP_UTF8_PERIOD, 0))
3784     {
3785       unsigned char c = re_string_byte_at (input, str_idx), d;
3786       if (BE (c < 0xc2, 1))
3787         return 0;
3788
3789       if (str_idx + 2 > input->len)
3790         return 0;
3791
3792       d = re_string_byte_at (input, str_idx + 1);
3793       if (c < 0xe0)
3794         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3795       else if (c < 0xf0)
3796         {
3797           char_len = 3;
3798           if (c == 0xe0 && d < 0xa0)
3799             return 0;
3800         }
3801       else if (c < 0xf8)
3802         {
3803           char_len = 4;
3804           if (c == 0xf0 && d < 0x90)
3805             return 0;
3806         }
3807       else if (c < 0xfc)
3808         {
3809           char_len = 5;
3810           if (c == 0xf8 && d < 0x88)
3811             return 0;
3812         }
3813       else if (c < 0xfe)
3814         {
3815           char_len = 6;
3816           if (c == 0xfc && d < 0x84)
3817             return 0;
3818         }
3819       else
3820         return 0;
3821
3822       if (str_idx + char_len > input->len)
3823         return 0;
3824
3825       for (i = 1; i < char_len; ++i)
3826         {
3827           d = re_string_byte_at (input, str_idx + i);
3828           if (d < 0x80 || d > 0xbf)
3829             return 0;
3830         }
3831       return char_len;
3832     }
3833
3834   char_len = re_string_char_size_at (input, str_idx);
3835   if (node->type == OP_PERIOD)
3836     {
3837       if (char_len <= 1)
3838         return 0;
3839       /* FIXME: I don't think this if is needed, as both '\n'
3840          and '\0' are char_len == 1.  */
3841       /* '.' accepts any one character except the following two cases.  */
3842       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3843            re_string_byte_at (input, str_idx) == '\n') ||
3844           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3845            re_string_byte_at (input, str_idx) == '\0'))
3846         return 0;
3847       return char_len;
3848     }
3849
3850   elem_len = re_string_elem_size_at (input, str_idx);
3851   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3852     return 0;
3853
3854   if (node->type == COMPLEX_BRACKET)
3855     {
3856       const re_charset_t *cset = node->opr.mbcset;
3857 # ifdef _LIBC
3858       const unsigned char *pin
3859         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3860       int j;
3861       uint32_t nrules;
3862 # endif /* _LIBC */
3863       int match_len = 0;
3864       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3865                     ? re_string_wchar_at (input, str_idx) : 0);
3866
3867       /* match with multibyte character?  */
3868       for (i = 0; i < cset->nmbchars; ++i)
3869         if (wc == cset->mbchars[i])
3870           {
3871             match_len = char_len;
3872             goto check_node_accept_bytes_match;
3873           }
3874       /* match with character_class?  */
3875       for (i = 0; i < cset->nchar_classes; ++i)
3876         {
3877           wctype_t wt = cset->char_classes[i];
3878           if (__iswctype (wc, wt))
3879             {
3880               match_len = char_len;
3881               goto check_node_accept_bytes_match;
3882             }
3883         }
3884
3885 # ifdef _LIBC
3886       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3887       if (nrules != 0)
3888         {
3889           unsigned int in_collseq = 0;
3890           const int32_t *table, *indirect;
3891           const unsigned char *weights, *extra;
3892           const char *collseqwc;
3893           int32_t idx;
3894           /* This #include defines a local function!  */
3895 #  include <locale/weight.h>
3896
3897           /* match with collating_symbol?  */
3898           if (cset->ncoll_syms)
3899             extra = (const unsigned char *)
3900               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3901           for (i = 0; i < cset->ncoll_syms; ++i)
3902             {
3903               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3904               /* Compare the length of input collating element and
3905                  the length of current collating element.  */
3906               if (*coll_sym != elem_len)
3907                 continue;
3908               /* Compare each bytes.  */
3909               for (j = 0; j < *coll_sym; j++)
3910                 if (pin[j] != coll_sym[1 + j])
3911                   break;
3912               if (j == *coll_sym)
3913                 {
3914                   /* Match if every bytes is equal.  */
3915                   match_len = j;
3916                   goto check_node_accept_bytes_match;
3917                 }
3918             }
3919
3920           if (cset->nranges)
3921             {
3922               if (elem_len <= char_len)
3923                 {
3924                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3925                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3926                 }
3927               else
3928                 in_collseq = find_collation_sequence_value (pin, elem_len);
3929             }
3930           /* match with range expression?  */
3931           for (i = 0; i < cset->nranges; ++i)
3932             if (cset->range_starts[i] <= in_collseq
3933                 && in_collseq <= cset->range_ends[i])
3934               {
3935                 match_len = elem_len;
3936                 goto check_node_accept_bytes_match;
3937               }
3938
3939           /* match with equivalence_class?  */
3940           if (cset->nequiv_classes)
3941             {
3942               const unsigned char *cp = pin;
3943               table = (const int32_t *)
3944                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3945               weights = (const unsigned char *)
3946                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3947               extra = (const unsigned char *)
3948                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3949               indirect = (const int32_t *)
3950                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3951               idx = findidx (&cp);
3952               if (idx > 0)
3953                 for (i = 0; i < cset->nequiv_classes; ++i)
3954                   {
3955                     int32_t equiv_class_idx = cset->equiv_classes[i];
3956                     size_t weight_len = weights[idx];
3957                     if (weight_len == weights[equiv_class_idx])
3958                       {
3959                         int cnt = 0;
3960                         while (cnt <= weight_len
3961                                && (weights[equiv_class_idx + 1 + cnt]
3962                                    == weights[idx + 1 + cnt]))
3963                           ++cnt;
3964                         if (cnt > weight_len)
3965                           {
3966                             match_len = elem_len;
3967                             goto check_node_accept_bytes_match;
3968                           }
3969                       }
3970                   }
3971             }
3972         }
3973       else
3974 # endif /* _LIBC */
3975         {
3976           /* match with range expression?  */
3977 #if __GNUC__ >= 2
3978           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3979 #else
3980           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3981           cmp_buf[2] = wc;
3982 #endif
3983           for (i = 0; i < cset->nranges; ++i)
3984             {
3985               cmp_buf[0] = cset->range_starts[i];
3986               cmp_buf[4] = cset->range_ends[i];
3987               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3988                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3989                 {
3990                   match_len = char_len;
3991                   goto check_node_accept_bytes_match;
3992                 }
3993             }
3994         }
3995     check_node_accept_bytes_match:
3996       if (!cset->non_match)
3997         return match_len;
3998       else
3999         {
4000           if (match_len > 0)
4001             return 0;
4002           else
4003             return (elem_len > char_len) ? elem_len : char_len;
4004         }
4005     }
4006   return 0;
4007 }
4008
4009 # ifdef _LIBC
4010 static unsigned int
4011 find_collation_sequence_value (mbs, mbs_len)
4012     const unsigned char *mbs;
4013     size_t mbs_len;
4014 {
4015   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4016   if (nrules == 0)
4017     {
4018       if (mbs_len == 1)
4019         {
4020           /* No valid character.  Match it as a single byte character.  */
4021           const unsigned char *collseq = (const unsigned char *)
4022             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4023           return collseq[mbs[0]];
4024         }
4025       return UINT_MAX;
4026     }
4027   else
4028     {
4029       int32_t idx;
4030       const unsigned char *extra = (const unsigned char *)
4031         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4032       int32_t extrasize = (const unsigned char *)
4033         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4034
4035       for (idx = 0; idx < extrasize;)
4036         {
4037           int mbs_cnt, found = 0;
4038           int32_t elem_mbs_len;
4039           /* Skip the name of collating element name.  */
4040           idx = idx + extra[idx] + 1;
4041           elem_mbs_len = extra[idx++];
4042           if (mbs_len == elem_mbs_len)
4043             {
4044               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4045                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4046                   break;
4047               if (mbs_cnt == elem_mbs_len)
4048                 /* Found the entry.  */
4049                 found = 1;
4050             }
4051           /* Skip the byte sequence of the collating element.  */
4052           idx += elem_mbs_len;
4053           /* Adjust for the alignment.  */
4054           idx = (idx + 3) & ~3;
4055           /* Skip the collation sequence value.  */
4056           idx += sizeof (uint32_t);
4057           /* Skip the wide char sequence of the collating element.  */
4058           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4059           /* If we found the entry, return the sequence value.  */
4060           if (found)
4061             return *(uint32_t *) (extra + idx);
4062           /* Skip the collation sequence value.  */
4063           idx += sizeof (uint32_t);
4064         }
4065       return UINT_MAX;
4066     }
4067 }
4068 # endif /* _LIBC */
4069 #endif /* RE_ENABLE_I18N */
4070
4071 /* Check whether the node accepts the byte which is IDX-th
4072    byte of the INPUT.  */
4073
4074 static int
4075 check_node_accept (mctx, node, idx)
4076     const re_match_context_t *mctx;
4077     const re_token_t *node;
4078     int idx;
4079 {
4080   unsigned char ch;
4081   ch = re_string_byte_at (&mctx->input, idx);
4082   switch (node->type)
4083     {
4084     case CHARACTER:
4085       if (node->opr.c != ch)
4086         return 0;
4087       break;
4088
4089     case SIMPLE_BRACKET:
4090       if (!bitset_contain (node->opr.sbcset, ch))
4091         return 0;
4092       break;
4093
4094 #ifdef RE_ENABLE_I18N
4095     case OP_UTF8_PERIOD:
4096       if (ch >= 0x80)
4097         return 0;
4098       /* FALLTHROUGH */
4099 #endif
4100     case OP_PERIOD:
4101       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4102           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4103         return 0;
4104       break;
4105
4106     default:
4107       return 0;
4108     }
4109
4110   if (node->constraint)
4111     {
4112       /* The node has constraints.  Check whether the current context
4113          satisfies the constraints.  */
4114       unsigned int context = re_string_context_at (&mctx->input, idx,
4115                                                    mctx->eflags);
4116       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4117         return 0;
4118     }
4119
4120   return 1;
4121 }
4122
4123 /* Extend the buffers, if the buffers have run out.  */
4124
4125 static reg_errcode_t
4126 extend_buffers (mctx)
4127      re_match_context_t *mctx;
4128 {
4129   reg_errcode_t ret;
4130   re_string_t *pstr = &mctx->input;
4131
4132   /* Double the lengthes of the buffers.  */
4133   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4134   if (BE (ret != REG_NOERROR, 0))
4135     return ret;
4136
4137   if (mctx->state_log != NULL)
4138     {
4139       /* And double the length of state_log.  */
4140       /* XXX We have no indication of the size of this buffer.  If this
4141          allocation fail we have no indication that the state_log array
4142          does not have the right size.  */
4143       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4144                                               pstr->bufs_len + 1);
4145       if (BE (new_array == NULL, 0))
4146         return REG_ESPACE;
4147       mctx->state_log = new_array;
4148     }
4149
4150   /* Then reconstruct the buffers.  */
4151   if (pstr->icase)
4152     {
4153 #ifdef RE_ENABLE_I18N
4154       if (pstr->mb_cur_max > 1)
4155         {
4156           ret = build_wcs_upper_buffer (pstr);
4157           if (BE (ret != REG_NOERROR, 0))
4158             return ret;
4159         }
4160       else
4161 #endif /* RE_ENABLE_I18N  */
4162         build_upper_buffer (pstr);
4163     }
4164   else
4165     {
4166 #ifdef RE_ENABLE_I18N
4167       if (pstr->mb_cur_max > 1)
4168         build_wcs_buffer (pstr);
4169       else
4170 #endif /* RE_ENABLE_I18N  */
4171         {
4172           if (pstr->trans != NULL)
4173             re_string_translate_buffer (pstr);
4174         }
4175     }
4176   return REG_NOERROR;
4177 }
4178
4179 \f
4180 /* Functions for matching context.  */
4181
4182 /* Initialize MCTX.  */
4183
4184 static reg_errcode_t
4185 match_ctx_init (mctx, eflags, n)
4186     re_match_context_t *mctx;
4187     int eflags, n;
4188 {
4189   mctx->eflags = eflags;
4190   mctx->match_last = -1;
4191   if (n > 0)
4192     {
4193       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4194       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4195       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4196         return REG_ESPACE;
4197     }
4198   /* Already zero-ed by the caller.
4199      else
4200        mctx->bkref_ents = NULL;
4201      mctx->nbkref_ents = 0;
4202      mctx->nsub_tops = 0;  */
4203   mctx->abkref_ents = n;
4204   mctx->max_mb_elem_len = 1;
4205   mctx->asub_tops = n;
4206   return REG_NOERROR;
4207 }
4208
4209 /* Clean the entries which depend on the current input in MCTX.
4210    This function must be invoked when the matcher changes the start index
4211    of the input, or changes the input string.  */
4212
4213 static void
4214 match_ctx_clean (mctx)
4215     re_match_context_t *mctx;
4216 {
4217   int st_idx;
4218   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4219     {
4220       int sl_idx;
4221       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4222       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4223         {
4224           re_sub_match_last_t *last = top->lasts[sl_idx];
4225           re_free (last->path.array);
4226           re_free (last);
4227         }
4228       re_free (top->lasts);
4229       if (top->path)
4230         {
4231           re_free (top->path->array);
4232           re_free (top->path);
4233         }
4234       free (top);
4235     }
4236
4237   mctx->nsub_tops = 0;
4238   mctx->nbkref_ents = 0;
4239 }
4240
4241 /* Free all the memory associated with MCTX.  */
4242
4243 static void
4244 match_ctx_free (mctx)
4245     re_match_context_t *mctx;
4246 {
4247   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4248   match_ctx_clean (mctx);
4249   re_free (mctx->sub_tops);
4250   re_free (mctx->bkref_ents);
4251 }
4252
4253 /* Add a new backreference entry to MCTX.
4254    Note that we assume that caller never call this function with duplicate
4255    entry, and call with STR_IDX which isn't smaller than any existing entry.
4256 */
4257
4258 static reg_errcode_t
4259 match_ctx_add_entry (mctx, node, str_idx, from, to)
4260      re_match_context_t *mctx;
4261      int node, str_idx, from, to;
4262 {
4263   if (mctx->nbkref_ents >= mctx->abkref_ents)
4264     {
4265       struct re_backref_cache_entry* new_entry;
4266       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4267                               mctx->abkref_ents * 2);
4268       if (BE (new_entry == NULL, 0))
4269         {
4270           re_free (mctx->bkref_ents);
4271           return REG_ESPACE;
4272         }
4273       mctx->bkref_ents = new_entry;
4274       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4275               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4276       mctx->abkref_ents *= 2;
4277     }
4278   if (mctx->nbkref_ents > 0
4279       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4280     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4281
4282   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4283   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4284   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4285   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4286
4287   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4288      If bit N is clear, means that this entry won't epsilon-transition to
4289      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4290      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4291      such node.
4292
4293      A backreference does not epsilon-transition unless it is empty, so set
4294      to all zeros if FROM != TO.  */
4295   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4296     = (from == to ? ~0 : 0);
4297
4298   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4299   if (mctx->max_mb_elem_len < to - from)
4300     mctx->max_mb_elem_len = to - from;
4301   return REG_NOERROR;
4302 }
4303
4304 /* Search for the first entry which has the same str_idx, or -1 if none is
4305    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4306
4307 static int
4308 search_cur_bkref_entry (mctx, str_idx)
4309      const re_match_context_t *mctx;
4310      int str_idx;
4311 {
4312   int left, right, mid, last;
4313   last = right = mctx->nbkref_ents;
4314   for (left = 0; left < right;)
4315     {
4316       mid = (left + right) / 2;
4317       if (mctx->bkref_ents[mid].str_idx < str_idx)
4318         left = mid + 1;
4319       else
4320         right = mid;
4321     }
4322   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4323     return left;
4324   else
4325     return -1;
4326 }
4327
4328 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4329    at STR_IDX.  */
4330
4331 static reg_errcode_t
4332 match_ctx_add_subtop (mctx, node, str_idx)
4333      re_match_context_t *mctx;
4334      int node, str_idx;
4335 {
4336 #ifdef DEBUG
4337   assert (mctx->sub_tops != NULL);
4338   assert (mctx->asub_tops > 0);
4339 #endif
4340   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4341     {
4342       int new_asub_tops = mctx->asub_tops * 2;
4343       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4344                                                    re_sub_match_top_t *,
4345                                                    new_asub_tops);
4346       if (BE (new_array == NULL, 0))
4347         return REG_ESPACE;
4348       mctx->sub_tops = new_array;
4349       mctx->asub_tops = new_asub_tops;
4350     }
4351   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4352   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4353     return REG_ESPACE;
4354   mctx->sub_tops[mctx->nsub_tops]->node = node;
4355   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4356   return REG_NOERROR;
4357 }
4358
4359 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4360    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4361
4362 static re_sub_match_last_t *
4363 match_ctx_add_sublast (subtop, node, str_idx)
4364      re_sub_match_top_t *subtop;
4365      int node, str_idx;
4366 {
4367   re_sub_match_last_t *new_entry;
4368   if (BE (subtop->nlasts == subtop->alasts, 0))
4369     {
4370       int new_alasts = 2 * subtop->alasts + 1;
4371       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4372                                                     re_sub_match_last_t *,
4373                                                     new_alasts);
4374       if (BE (new_array == NULL, 0))
4375         return NULL;
4376       subtop->lasts = new_array;
4377       subtop->alasts = new_alasts;
4378     }
4379   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4380   if (BE (new_entry != NULL, 1))
4381     {
4382       subtop->lasts[subtop->nlasts] = new_entry;
4383       new_entry->node = node;
4384       new_entry->str_idx = str_idx;
4385       ++subtop->nlasts;
4386     }
4387   return new_entry;
4388 }
4389
4390 static void
4391 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx)
4392     re_sift_context_t *sctx;
4393     re_dfastate_t **sifted_sts, **limited_sts;
4394     int last_node, last_str_idx;
4395 {
4396   sctx->sifted_states = sifted_sts;
4397   sctx->limited_states = limited_sts;
4398   sctx->last_node = last_node;
4399   sctx->last_str_idx = last_str_idx;
4400   re_node_set_init_empty (&sctx->limits);
4401 }