1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2013 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 int n) internal_function;
22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
23 static void match_ctx_free (re_match_context_t *cache) internal_function;
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
25 int str_idx, int from, int to)
27 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
30 int str_idx) internal_function;
31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
32 int node, int str_idx)
34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
35 re_dfastate_t **limited_sts, int last_node,
38 static reg_errcode_t re_search_internal (const regex_t *preg,
39 const char *string, int length,
40 int start, int range, int stop,
41 size_t nmatch, regmatch_t pmatch[],
42 int eflags) internal_function;
43 static int re_search_2_stub (struct re_pattern_buffer *bufp,
44 const char *string1, int length1,
45 const char *string2, int length2,
46 int start, int range, struct re_registers *regs,
47 int stop, int ret_len) internal_function;
48 static int re_search_stub (struct re_pattern_buffer *bufp,
49 const char *string, int length, int start,
50 int range, int stop, struct re_registers *regs,
51 int ret_len) internal_function;
52 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
53 int nregs, int regs_allocated) internal_function;
54 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
56 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
57 int *p_match_first) internal_function;
58 static int check_halt_state_context (const re_match_context_t *mctx,
59 const re_dfastate_t *state, int idx)
61 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
62 regmatch_t *prev_idx_match, int cur_node,
63 int cur_idx, int nmatch) internal_function;
64 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
65 int str_idx, int dest_node, int nregs,
67 re_node_set *eps_via_nodes)
69 static reg_errcode_t set_regs (const regex_t *preg,
70 const re_match_context_t *mctx,
71 size_t nmatch, regmatch_t *pmatch,
72 int fl_backtrack) internal_function;
73 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
77 static int sift_states_iter_mb (const re_match_context_t *mctx,
78 re_sift_context_t *sctx,
79 int node_idx, int str_idx, int max_str_idx)
81 #endif /* RE_ENABLE_I18N */
82 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
83 re_sift_context_t *sctx)
85 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
86 re_sift_context_t *sctx, int str_idx,
87 re_node_set *cur_dest)
89 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
90 re_sift_context_t *sctx,
92 re_node_set *dest_nodes)
94 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
95 re_node_set *dest_nodes,
96 const re_node_set *candidates)
98 static int check_dst_limits (const re_match_context_t *mctx,
100 int dst_node, int dst_idx, int src_node,
101 int src_idx) internal_function;
102 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
103 int boundaries, int subexp_idx,
104 int from_node, int bkref_idx)
106 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
107 int limit, int subexp_idx,
108 int node, int str_idx,
109 int bkref_idx) internal_function;
110 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
111 re_node_set *dest_nodes,
112 const re_node_set *candidates,
114 struct re_backref_cache_entry *bkref_ents,
115 int str_idx) internal_function;
116 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
117 re_sift_context_t *sctx,
118 int str_idx, const re_node_set *candidates)
120 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
122 re_dfastate_t **src, int num)
124 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
125 re_match_context_t *mctx) internal_function;
126 static re_dfastate_t *transit_state (reg_errcode_t *err,
127 re_match_context_t *mctx,
128 re_dfastate_t *state) internal_function;
129 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
130 re_match_context_t *mctx,
131 re_dfastate_t *next_state)
133 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
134 re_node_set *cur_nodes,
135 int str_idx) internal_function;
137 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
138 re_match_context_t *mctx,
139 re_dfastate_t *pstate)
142 #ifdef RE_ENABLE_I18N
143 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
144 re_dfastate_t *pstate)
146 #endif /* RE_ENABLE_I18N */
147 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
148 const re_node_set *nodes)
150 static reg_errcode_t get_subexp (re_match_context_t *mctx,
151 int bkref_node, int bkref_str_idx)
153 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
154 const re_sub_match_top_t *sub_top,
155 re_sub_match_last_t *sub_last,
156 int bkref_node, int bkref_str)
158 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
159 int subexp_idx, int type) internal_function;
160 static reg_errcode_t check_arrival (re_match_context_t *mctx,
161 state_array_t *path, int top_node,
162 int top_str, int last_node, int last_str,
163 int type) internal_function;
164 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
166 re_node_set *cur_nodes,
167 re_node_set *next_nodes)
169 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
170 re_node_set *cur_nodes,
171 int ex_subexp, int type)
173 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
174 re_node_set *dst_nodes,
175 int target, int ex_subexp,
176 int type) internal_function;
177 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
178 re_node_set *cur_nodes, int cur_str,
179 int subexp_num, int type)
181 static int build_trtable (const re_dfa_t *dfa,
182 re_dfastate_t *state) internal_function;
183 #ifdef RE_ENABLE_I18N
184 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
185 const re_string_t *input, int idx)
188 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
192 #endif /* RE_ENABLE_I18N */
193 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
194 const re_dfastate_t *state,
195 re_node_set *states_node,
196 bitset_t *states_ch) internal_function;
197 static int check_node_accept (const re_match_context_t *mctx,
198 const re_token_t *node, int idx)
200 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len)
204 #undef MIN /* safety */
206 MIN(size_t a, size_t b)
208 return (a < b ? a : b);
212 /* Entry point for POSIX code. */
214 /* regexec searches for a given pattern, specified by PREG, in the
217 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
218 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
219 least NMATCH elements, and we set them to the offsets of the
220 corresponding matched substrings.
222 EFLAGS specifies `execution flags' which affect matching: if
223 REG_NOTBOL is set, then ^ does not match at the beginning of the
224 string; if REG_NOTEOL is set, then $ does not match at the end.
226 We return 0 if we find a match and REG_NOMATCH if not. */
229 regexec (preg, string, nmatch, pmatch, eflags)
230 const regex_t *__restrict preg;
231 const char *__restrict string;
239 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
242 if (eflags & REG_STARTEND)
244 start = pmatch[0].rm_so;
245 length = pmatch[0].rm_eo;
250 length = strlen (string);
253 __libc_lock_lock (dfa->lock);
255 err = re_search_internal (preg, string, length, start, length - start,
256 length, 0, NULL, eflags);
258 err = re_search_internal (preg, string, length, start, length - start,
259 length, nmatch, pmatch, eflags);
260 __libc_lock_unlock (dfa->lock);
261 return err != REG_NOERROR;
265 # include <shlib-compat.h>
266 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
268 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
269 __typeof__ (__regexec) __compat_regexec;
272 attribute_compat_text_section
273 __compat_regexec (const regex_t *__restrict preg,
274 const char *__restrict string, size_t nmatch,
275 regmatch_t pmatch[], int eflags)
277 return regexec (preg, string, nmatch, pmatch,
278 eflags & (REG_NOTBOL | REG_NOTEOL));
280 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
284 /* Entry points for GNU code. */
286 /* re_match, re_search, re_match_2, re_search_2
288 The former two functions operate on STRING with length LENGTH,
289 while the later two operate on concatenation of STRING1 and STRING2
290 with lengths LENGTH1 and LENGTH2, respectively.
292 re_match() matches the compiled pattern in BUFP against the string,
293 starting at index START.
295 re_search() first tries matching at index START, then it tries to match
296 starting from index START + 1, and so on. The last start position tried
297 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
300 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
301 the first STOP characters of the concatenation of the strings should be
304 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
305 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
306 computed relative to the concatenation, not relative to the individual
309 On success, re_match* functions return the length of the match, re_search*
310 return the position of the start of the match. Return value -1 means no
311 match was found and -2 indicates an internal error. */
314 re_match (bufp, string, length, start, regs)
315 struct re_pattern_buffer *bufp;
318 struct re_registers *regs;
320 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
323 weak_alias (__re_match, re_match)
327 re_search (bufp, string, length, start, range, regs)
328 struct re_pattern_buffer *bufp;
330 int length, start, range;
331 struct re_registers *regs;
333 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
336 weak_alias (__re_search, re_search)
340 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
341 struct re_pattern_buffer *bufp;
342 const char *string1, *string2;
343 int length1, length2, start, stop;
344 struct re_registers *regs;
346 return re_search_2_stub (bufp, string1, length1, string2, length2,
347 start, 0, regs, stop, 1);
350 weak_alias (__re_match_2, re_match_2)
354 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
355 struct re_pattern_buffer *bufp;
356 const char *string1, *string2;
357 int length1, length2, start, range, stop;
358 struct re_registers *regs;
360 return re_search_2_stub (bufp, string1, length1, string2, length2,
361 start, range, regs, stop, 0);
364 weak_alias (__re_search_2, re_search_2)
368 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
370 struct re_pattern_buffer *bufp;
371 const char *string1, *string2;
372 int length1, length2, start, range, stop, ret_len;
373 struct re_registers *regs;
377 int len = length1 + length2;
380 if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
383 /* Concatenate the strings. */
387 s = re_malloc (char, len);
389 if (BE (s == NULL, 0))
392 memcpy (__mempcpy (s, string1, length1), string2, length2);
394 memcpy (s, string1, length1);
395 memcpy (s + length1, string2, length2);
404 rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
409 /* The parameters have the same meaning as those of re_search.
410 Additional parameters:
411 If RET_LEN is nonzero the length of the match is returned (re_match style);
412 otherwise the position of the match is returned. */
415 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
416 struct re_pattern_buffer *bufp;
418 int length, start, range, stop, ret_len;
419 struct re_registers *regs;
421 reg_errcode_t result;
426 /* Check for out-of-range. */
427 if (BE (start < 0 || start > length, 0))
429 if (BE (start + range > length, 0))
430 range = length - start;
431 else if (BE (start + range < 0, 0))
434 __libc_lock_lock (dfa->lock);
436 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
437 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
439 /* Compile fastmap if we haven't yet. */
440 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
441 re_compile_fastmap (bufp);
443 if (BE (bufp->no_sub, 0))
446 /* We need at least 1 register. */
449 else if (BE (bufp->regs_allocated == REGS_FIXED &&
450 regs->num_regs < bufp->re_nsub + 1, 0))
452 nregs = regs->num_regs;
453 if (BE (nregs < 1, 0))
455 /* Nothing can be copied to regs. */
461 nregs = bufp->re_nsub + 1;
462 pmatch = re_malloc (regmatch_t, nregs);
463 if (BE (pmatch == NULL, 0))
469 result = re_search_internal (bufp, string, length, start, range, stop,
470 nregs, pmatch, eflags);
474 /* I hope we needn't fill ther regs with -1's when no match was found. */
475 if (result != REG_NOERROR)
477 else if (regs != NULL)
479 /* If caller wants register contents data back, copy them. */
480 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
481 bufp->regs_allocated);
482 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
486 if (BE (rval == 0, 1))
490 assert (pmatch[0].rm_so == start);
491 rval = pmatch[0].rm_eo - start;
494 rval = pmatch[0].rm_so;
498 __libc_lock_unlock (dfa->lock);
503 re_copy_regs (regs, pmatch, nregs, regs_allocated)
504 struct re_registers *regs;
506 int nregs, regs_allocated;
508 int rval = REGS_REALLOCATE;
510 int need_regs = nregs + 1;
511 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
514 /* Have the register data arrays been allocated? */
515 if (regs_allocated == REGS_UNALLOCATED)
516 { /* No. So allocate them with malloc. */
517 regs->start = re_malloc (regoff_t, need_regs);
518 if (BE (regs->start == NULL, 0))
519 return REGS_UNALLOCATED;
520 regs->end = re_malloc (regoff_t, need_regs);
521 if (BE (regs->end == NULL, 0))
523 re_free (regs->start);
524 return REGS_UNALLOCATED;
526 regs->num_regs = need_regs;
528 else if (regs_allocated == REGS_REALLOCATE)
529 { /* Yes. If we need more elements than were already
530 allocated, reallocate them. If we need fewer, just
532 if (BE (need_regs > regs->num_regs, 0))
534 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
536 if (BE (new_start == NULL, 0))
537 return REGS_UNALLOCATED;
538 new_end = re_realloc (regs->end, regoff_t, need_regs);
539 if (BE (new_end == NULL, 0))
542 return REGS_UNALLOCATED;
544 regs->start = new_start;
546 regs->num_regs = need_regs;
551 assert (regs_allocated == REGS_FIXED);
552 /* This function may not be called with REGS_FIXED and nregs too big. */
553 assert (regs->num_regs >= nregs);
558 for (i = 0; i < nregs; ++i)
560 regs->start[i] = pmatch[i].rm_so;
561 regs->end[i] = pmatch[i].rm_eo;
563 for ( ; i < regs->num_regs; ++i)
564 regs->start[i] = regs->end[i] = -1;
569 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
570 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
571 this memory for recording register information. STARTS and ENDS
572 must be allocated using the malloc library routine, and must each
573 be at least NUM_REGS * sizeof (regoff_t) bytes long.
575 If NUM_REGS == 0, then subsequent matches should allocate their own
578 Unless this function is called, the first search or match using
579 PATTERN_BUFFER will allocate its own register data, without
580 freeing the old data. */
583 re_set_registers (bufp, regs, num_regs, starts, ends)
584 struct re_pattern_buffer *bufp;
585 struct re_registers *regs;
587 regoff_t *starts, *ends;
591 bufp->regs_allocated = REGS_REALLOCATE;
592 regs->num_regs = num_regs;
593 regs->start = starts;
598 bufp->regs_allocated = REGS_UNALLOCATED;
600 regs->start = regs->end = (regoff_t *) 0;
604 weak_alias (__re_set_registers, re_set_registers)
607 /* Entry points compatible with 4.2 BSD regex library. We don't define
608 them unless specifically requested. */
610 #if defined _REGEX_RE_COMP || defined _LIBC
618 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
620 #endif /* _REGEX_RE_COMP */
622 /* Internal entry point. */
624 /* Searches for a compiled pattern PREG in the string STRING, whose
625 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
626 mingings with regexec. START, and RANGE have the same meanings
628 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
629 otherwise return the error code.
630 Note: We assume front end functions already check ranges.
631 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
634 __attribute_warn_unused_result__
635 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
639 int length, start, range, stop, eflags;
644 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
645 int left_lim, right_lim, incr;
646 int fl_longest_match, match_first, match_kind, match_last = -1;
649 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
650 re_match_context_t mctx = { .dfa = dfa };
652 re_match_context_t mctx;
654 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
655 && range && !preg->can_be_null) ? preg->fastmap : NULL;
656 RE_TRANSLATE_TYPE t = preg->translate;
658 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
659 memset (&mctx, '\0', sizeof (re_match_context_t));
663 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
664 nmatch -= extra_nmatch;
666 /* Check if the DFA haven't been compiled. */
667 if (BE (preg->used == 0 || dfa == NULL || dfa->init_state == NULL
668 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
669 || dfa->init_state_begbuf == NULL, 0))
673 /* We assume front-end functions already check them. */
674 assert (start + range >= 0 && start + range <= length);
677 /* If initial states with non-begbuf contexts have no elements,
678 the regex must be anchored. If preg->newline_anchor is set,
679 we'll never use init_state_nl, so do not check it. */
680 if (dfa->init_state->nodes.nelem == 0
681 && dfa->init_state_word->nodes.nelem == 0
682 && (dfa->init_state_nl->nodes.nelem == 0
683 || !preg->newline_anchor))
685 if (start != 0 && start + range != 0)
690 /* We must check the longest matching, if nmatch > 0. */
691 fl_longest_match = (nmatch != 0 || dfa->nbackref);
693 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
694 preg->translate, preg->syntax & RE_ICASE, dfa);
695 if (BE (err != REG_NOERROR, 0))
697 mctx.input.stop = stop;
698 mctx.input.raw_stop = stop;
699 mctx.input.newline_anchor = preg->newline_anchor;
701 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
702 if (BE (err != REG_NOERROR, 0))
705 /* We will log all the DFA states through which the dfa pass,
706 if nmatch > 1, or this dfa has "multibyte node", which is a
707 back-reference or a node which can accept multibyte character or
708 multi character collating element. */
709 if (nmatch > 1 || dfa->has_mb_node)
711 /* Avoid overflow. */
712 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
718 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
719 if (BE (mctx.state_log == NULL, 0))
726 mctx.state_log = NULL;
729 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
730 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
732 /* Check incrementally whether of not the input string match. */
733 incr = (range < 0) ? -1 : 1;
734 left_lim = (range < 0) ? start + range : start;
735 right_lim = (range < 0) ? start : start + range;
736 sb = dfa->mb_cur_max == 1;
739 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
740 | (range >= 0 ? 2 : 0)
741 | (t != NULL ? 1 : 0))
744 for (;; match_first += incr)
747 if (match_first < left_lim || right_lim < match_first)
750 /* Advance as rapidly as possible through the string, until we
751 find a plausible place to start matching. This may be done
752 with varying efficiency, so there are various possibilities:
753 only the most common of them are specialized, in order to
754 save on code size. We use a switch statement for speed. */
762 /* Fastmap with single-byte translation, match forward. */
763 while (BE (match_first < right_lim, 1)
764 && !fastmap[t[(unsigned char) string[match_first]]])
766 goto forward_match_found_start_or_reached_end;
769 /* Fastmap without translation, match forward. */
770 while (BE (match_first < right_lim, 1)
771 && !fastmap[(unsigned char) string[match_first]])
774 forward_match_found_start_or_reached_end:
775 if (BE (match_first == right_lim, 0))
777 ch = match_first >= length
778 ? 0 : (unsigned char) string[match_first];
779 if (!fastmap[t ? t[ch] : ch])
786 /* Fastmap without multi-byte translation, match backwards. */
787 while (match_first >= left_lim)
789 ch = match_first >= length
790 ? 0 : (unsigned char) string[match_first];
791 if (fastmap[t ? t[ch] : ch])
795 if (match_first < left_lim)
800 /* In this case, we can't determine easily the current byte,
801 since it might be a component byte of a multibyte
802 character. Then we use the constructed buffer instead. */
805 /* If MATCH_FIRST is out of the valid range, reconstruct the
807 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
808 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
810 err = re_string_reconstruct (&mctx.input, match_first,
812 if (BE (err != REG_NOERROR, 0))
815 offset = match_first - mctx.input.raw_mbs_idx;
817 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
818 Note that MATCH_FIRST must not be smaller than 0. */
819 ch = (match_first >= length
820 ? 0 : re_string_byte_at (&mctx.input, offset));
824 if (match_first < left_lim || match_first > right_lim)
833 /* Reconstruct the buffers so that the matcher can assume that
834 the matching starts from the beginning of the buffer. */
835 err = re_string_reconstruct (&mctx.input, match_first, eflags);
836 if (BE (err != REG_NOERROR, 0))
839 #ifdef RE_ENABLE_I18N
840 /* Don't consider this char as a possible match start if it part,
841 yet isn't the head, of a multibyte character. */
842 if (!sb && !re_string_first_byte (&mctx.input, 0))
846 /* It seems to be appropriate one, then use the matcher. */
847 /* We assume that the matching starts from 0. */
848 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
849 match_last = check_matching (&mctx, fl_longest_match,
850 range >= 0 ? &match_first : NULL);
851 if (match_last != -1)
853 if (BE (match_last == -2, 0))
860 mctx.match_last = match_last;
861 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
863 re_dfastate_t *pstate = mctx.state_log[match_last];
864 mctx.last_node = check_halt_state_context (&mctx, pstate,
867 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
870 err = prune_impossible_nodes (&mctx);
871 if (err == REG_NOERROR)
873 if (BE (err != REG_NOMATCH, 0))
878 break; /* We found a match. */
882 match_ctx_clean (&mctx);
886 assert (match_last != -1);
887 assert (err == REG_NOERROR);
890 /* Set pmatch[] if we need. */
895 /* Initialize registers. */
896 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
897 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
899 /* Set the points where matching start/end. */
901 pmatch[0].rm_eo = mctx.match_last;
903 if (!preg->no_sub && nmatch > 1)
905 err = set_regs (preg, &mctx, nmatch, pmatch,
906 dfa->has_plural_match && dfa->nbackref > 0);
907 if (BE (err != REG_NOERROR, 0))
911 /* At last, add the offset to the each registers, since we slided
912 the buffers so that we could assume that the matching starts
914 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
915 if (pmatch[reg_idx].rm_so != -1)
917 #ifdef RE_ENABLE_I18N
918 if (BE (mctx.input.offsets_needed != 0, 0))
920 pmatch[reg_idx].rm_so =
921 (pmatch[reg_idx].rm_so == mctx.input.valid_len
922 ? mctx.input.valid_raw_len
923 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
924 pmatch[reg_idx].rm_eo =
925 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
926 ? mctx.input.valid_raw_len
927 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
930 assert (mctx.input.offsets_needed == 0);
932 pmatch[reg_idx].rm_so += match_first;
933 pmatch[reg_idx].rm_eo += match_first;
935 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
937 pmatch[nmatch + reg_idx].rm_so = -1;
938 pmatch[nmatch + reg_idx].rm_eo = -1;
942 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
943 if (dfa->subexp_map[reg_idx] != reg_idx)
945 pmatch[reg_idx + 1].rm_so
946 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
947 pmatch[reg_idx + 1].rm_eo
948 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
953 re_free (mctx.state_log);
955 match_ctx_free (&mctx);
956 re_string_destruct (&mctx.input);
961 __attribute_warn_unused_result__
962 prune_impossible_nodes (mctx)
963 re_match_context_t *mctx;
965 const re_dfa_t *const dfa = mctx->dfa;
966 int halt_node, match_last;
968 re_dfastate_t **sifted_states;
969 re_dfastate_t **lim_states = NULL;
970 re_sift_context_t sctx;
972 assert (mctx->state_log != NULL);
974 match_last = mctx->match_last;
975 halt_node = mctx->last_node;
977 /* Avoid overflow. */
978 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
981 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
982 if (BE (sifted_states == NULL, 0))
989 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
990 if (BE (lim_states == NULL, 0))
997 memset (lim_states, '\0',
998 sizeof (re_dfastate_t *) * (match_last + 1));
999 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
1001 ret = sift_states_backward (mctx, &sctx);
1002 re_node_set_free (&sctx.limits);
1003 if (BE (ret != REG_NOERROR, 0))
1005 if (sifted_states[0] != NULL || lim_states[0] != NULL)
1015 } while (mctx->state_log[match_last] == NULL
1016 || !mctx->state_log[match_last]->halt);
1017 halt_node = check_halt_state_context (mctx,
1018 mctx->state_log[match_last],
1021 ret = merge_state_array (dfa, sifted_states, lim_states,
1023 re_free (lim_states);
1025 if (BE (ret != REG_NOERROR, 0))
1030 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1031 ret = sift_states_backward (mctx, &sctx);
1032 re_node_set_free (&sctx.limits);
1033 if (BE (ret != REG_NOERROR, 0))
1035 if (sifted_states[0] == NULL)
1041 re_free (mctx->state_log);
1042 mctx->state_log = sifted_states;
1043 sifted_states = NULL;
1044 mctx->last_node = halt_node;
1045 mctx->match_last = match_last;
1048 re_free (sifted_states);
1049 re_free (lim_states);
1053 /* Acquire an initial state and return it.
1054 We must select appropriate initial state depending on the context,
1055 since initial states may have constraints like "\<", "^", etc.. */
1057 static inline re_dfastate_t *
1058 __attribute ((always_inline)) internal_function
1059 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1062 const re_dfa_t *const dfa = mctx->dfa;
1063 if (dfa->init_state->has_constraint)
1065 unsigned int context;
1066 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1067 if (IS_WORD_CONTEXT (context))
1068 return dfa->init_state_word;
1069 else if (IS_ORDINARY_CONTEXT (context))
1070 return dfa->init_state;
1071 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1072 return dfa->init_state_begbuf;
1073 else if (IS_NEWLINE_CONTEXT (context))
1074 return dfa->init_state_nl;
1075 else if (IS_BEGBUF_CONTEXT (context))
1077 /* It is relatively rare case, then calculate on demand. */
1078 return re_acquire_state_context (err, dfa,
1079 dfa->init_state->entrance_nodes,
1083 /* Must not happen? */
1084 return dfa->init_state;
1087 return dfa->init_state;
1090 /* Check whether the regular expression match input string INPUT or not,
1091 and return the index where the matching end, return -1 if not match,
1092 or return -2 in case of an error.
1093 FL_LONGEST_MATCH means we want the POSIX longest matching.
1094 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1095 next place where we may want to try matching.
1096 Note that the matcher assume that the maching starts from the current
1097 index of the buffer. */
1100 internal_function __attribute_warn_unused_result__
1101 check_matching (re_match_context_t *mctx, int fl_longest_match,
1104 const re_dfa_t *const dfa = mctx->dfa;
1107 int match_last = -1;
1108 int cur_str_idx = re_string_cur_idx (&mctx->input);
1109 re_dfastate_t *cur_state;
1110 int at_init_state = p_match_first != NULL;
1111 int next_start_idx = cur_str_idx;
1114 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1115 /* An initial state must not be NULL (invalid). */
1116 if (BE (cur_state == NULL, 0))
1118 assert (err == REG_ESPACE);
1122 if (mctx->state_log != NULL)
1124 mctx->state_log[cur_str_idx] = cur_state;
1126 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1127 later. E.g. Processing back references. */
1128 if (BE (dfa->nbackref, 0))
1131 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1132 if (BE (err != REG_NOERROR, 0))
1135 if (cur_state->has_backref)
1137 err = transit_state_bkref (mctx, &cur_state->nodes);
1138 if (BE (err != REG_NOERROR, 0))
1144 /* If the RE accepts NULL string. */
1145 if (BE (cur_state->halt, 0))
1147 if (!cur_state->has_constraint
1148 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1150 if (!fl_longest_match)
1154 match_last = cur_str_idx;
1160 while (!re_string_eoi (&mctx->input))
1162 re_dfastate_t *old_state = cur_state;
1163 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1165 if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1166 && mctx->input.bufs_len < mctx->input.len)
1167 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1168 && mctx->input.valid_len < mctx->input.len))
1170 err = extend_buffers (mctx, next_char_idx + 1);
1171 if (BE (err != REG_NOERROR, 0))
1173 assert (err == REG_ESPACE);
1178 cur_state = transit_state (&err, mctx, cur_state);
1179 if (mctx->state_log != NULL)
1180 cur_state = merge_state_with_log (&err, mctx, cur_state);
1182 if (cur_state == NULL)
1184 /* Reached the invalid state or an error. Try to recover a valid
1185 state using the state log, if available and if we have not
1186 already found a valid (even if not the longest) match. */
1187 if (BE (err != REG_NOERROR, 0))
1190 if (mctx->state_log == NULL
1191 || (match && !fl_longest_match)
1192 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1196 if (BE (at_init_state, 0))
1198 if (old_state == cur_state)
1199 next_start_idx = next_char_idx;
1204 if (cur_state->halt)
1206 /* Reached a halt state.
1207 Check the halt state can satisfy the current context. */
1208 if (!cur_state->has_constraint
1209 || check_halt_state_context (mctx, cur_state,
1210 re_string_cur_idx (&mctx->input)))
1212 /* We found an appropriate halt state. */
1213 match_last = re_string_cur_idx (&mctx->input);
1216 /* We found a match, do not modify match_first below. */
1217 p_match_first = NULL;
1218 if (!fl_longest_match)
1225 *p_match_first += next_start_idx;
1230 /* Check NODE match the current context. */
1234 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1236 re_token_type_t type = dfa->nodes[node].type;
1237 unsigned int constraint = dfa->nodes[node].constraint;
1238 if (type != END_OF_RE)
1242 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1247 /* Check the halt state STATE match the current context.
1248 Return 0 if not match, if the node, STATE has, is a halt node and
1249 match the context, return the node. */
1253 check_halt_state_context (const re_match_context_t *mctx,
1254 const re_dfastate_t *state, int idx)
1257 unsigned int context;
1259 assert (state->halt);
1261 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1262 for (i = 0; i < state->nodes.nelem; ++i)
1263 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1264 return state->nodes.elems[i];
1268 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1269 corresponding to the DFA).
1270 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1275 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1276 int *pidx, int node, re_node_set *eps_via_nodes,
1277 struct re_fail_stack_t *fs)
1279 const re_dfa_t *const dfa = mctx->dfa;
1281 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1283 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1284 re_node_set *edests = &dfa->edests[node];
1286 err = re_node_set_insert (eps_via_nodes, node);
1287 if (BE (err < 0, 0))
1289 /* Pick up a valid destination, or return -1 if none is found. */
1290 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1292 int candidate = edests->elems[i];
1293 if (!re_node_set_contains (cur_nodes, candidate))
1295 if (dest_node == -1)
1296 dest_node = candidate;
1300 /* In order to avoid infinite loop like "(a*)*", return the second
1301 epsilon-transition if the first was already considered. */
1302 if (re_node_set_contains (eps_via_nodes, dest_node))
1305 /* Otherwise, push the second epsilon-transition on the fail stack. */
1307 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1311 /* We know we are going to exit. */
1320 re_token_type_t type = dfa->nodes[node].type;
1322 #ifdef RE_ENABLE_I18N
1323 if (dfa->nodes[node].accept_mb)
1324 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1326 #endif /* RE_ENABLE_I18N */
1327 if (type == OP_BACK_REF)
1329 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1330 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1333 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1337 char *buf = (char *) re_string_get_buffer (&mctx->input);
1338 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1347 err = re_node_set_insert (eps_via_nodes, node);
1348 if (BE (err < 0, 0))
1350 dest_node = dfa->edests[node].elems[0];
1351 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1358 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1360 int dest_node = dfa->nexts[node];
1361 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1362 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1363 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1366 re_node_set_empty (eps_via_nodes);
1373 static reg_errcode_t
1374 internal_function __attribute_warn_unused_result__
1375 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1376 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1379 int num = fs->num++;
1380 if (fs->num == fs->alloc)
1382 struct re_fail_stack_ent_t *new_array;
1383 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1385 if (new_array == NULL)
1388 fs->stack = new_array;
1390 fs->stack[num].idx = str_idx;
1391 fs->stack[num].node = dest_node;
1392 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1393 if (fs->stack[num].regs == NULL)
1395 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1396 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1402 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1403 regmatch_t *regs, re_node_set *eps_via_nodes)
1405 int num = --fs->num;
1407 *pidx = fs->stack[num].idx;
1408 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1409 re_node_set_free (eps_via_nodes);
1410 re_free (fs->stack[num].regs);
1411 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1412 return fs->stack[num].node;
1415 /* Set the positions where the subexpressions are starts/ends to registers
1417 Note: We assume that pmatch[0] is already set, and
1418 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1420 static reg_errcode_t
1421 internal_function __attribute_warn_unused_result__
1422 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1423 regmatch_t *pmatch, int fl_backtrack)
1425 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1427 re_node_set eps_via_nodes;
1428 struct re_fail_stack_t *fs;
1429 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1430 regmatch_t *prev_idx_match;
1431 int prev_idx_match_malloced = 0;
1434 assert (nmatch > 1);
1435 assert (mctx->state_log != NULL);
1440 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1441 if (fs->stack == NULL)
1447 cur_node = dfa->init_node;
1448 re_node_set_init_empty (&eps_via_nodes);
1451 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1452 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1456 prev_idx_match = re_malloc (regmatch_t, nmatch);
1457 if (prev_idx_match == NULL)
1459 free_fail_stack_return (fs);
1462 prev_idx_match_malloced = 1;
1464 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1466 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1468 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1470 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1475 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1476 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1478 if (reg_idx == nmatch)
1480 re_node_set_free (&eps_via_nodes);
1481 if (prev_idx_match_malloced)
1482 re_free (prev_idx_match);
1483 return free_fail_stack_return (fs);
1485 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1490 re_node_set_free (&eps_via_nodes);
1491 if (prev_idx_match_malloced)
1492 re_free (prev_idx_match);
1497 /* Proceed to next node. */
1498 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1499 &eps_via_nodes, fs);
1501 if (BE (cur_node < 0, 0))
1503 if (BE (cur_node == -2, 0))
1505 re_node_set_free (&eps_via_nodes);
1506 if (prev_idx_match_malloced)
1507 re_free (prev_idx_match);
1508 free_fail_stack_return (fs);
1512 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1516 re_node_set_free (&eps_via_nodes);
1517 if (prev_idx_match_malloced)
1518 re_free (prev_idx_match);
1523 re_node_set_free (&eps_via_nodes);
1524 if (prev_idx_match_malloced)
1525 re_free (prev_idx_match);
1526 return free_fail_stack_return (fs);
1529 static reg_errcode_t
1531 free_fail_stack_return (struct re_fail_stack_t *fs)
1536 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1538 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1539 re_free (fs->stack[fs_idx].regs);
1541 re_free (fs->stack);
1548 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1549 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1551 int type = dfa->nodes[cur_node].type;
1552 if (type == OP_OPEN_SUBEXP)
1554 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1556 /* We are at the first node of this sub expression. */
1557 if (reg_num < nmatch)
1559 pmatch[reg_num].rm_so = cur_idx;
1560 pmatch[reg_num].rm_eo = -1;
1563 else if (type == OP_CLOSE_SUBEXP)
1565 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1566 if (reg_num < nmatch)
1568 /* We are at the last node of this sub expression. */
1569 if (pmatch[reg_num].rm_so < cur_idx)
1571 pmatch[reg_num].rm_eo = cur_idx;
1572 /* This is a non-empty match or we are not inside an optional
1573 subexpression. Accept this right away. */
1574 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1578 if (dfa->nodes[cur_node].opt_subexp
1579 && prev_idx_match[reg_num].rm_so != -1)
1580 /* We transited through an empty match for an optional
1581 subexpression, like (a?)*, and this is not the subexp's
1582 first match. Copy back the old content of the registers
1583 so that matches of an inner subexpression are undone as
1584 well, like in ((a?))*. */
1585 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1587 /* We completed a subexpression, but it may be part of
1588 an optional one, so do not update PREV_IDX_MATCH. */
1589 pmatch[reg_num].rm_eo = cur_idx;
1595 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1596 and sift the nodes in each states according to the following rules.
1597 Updated state_log will be wrote to STATE_LOG.
1599 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1600 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1601 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1602 the LAST_NODE, we throw away the node `a'.
1603 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1604 string `s' and transit to `b':
1605 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1607 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1608 thrown away, we throw away the node `a'.
1609 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1610 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1612 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1613 we throw away the node `a'. */
1615 #define STATE_NODE_CONTAINS(state,node) \
1616 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1618 static reg_errcode_t
1620 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1624 int str_idx = sctx->last_str_idx;
1625 re_node_set cur_dest;
1628 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1631 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1632 transit to the last_node and the last_node itself. */
1633 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1634 if (BE (err != REG_NOERROR, 0))
1636 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1637 if (BE (err != REG_NOERROR, 0))
1640 /* Then check each states in the state_log. */
1643 /* Update counters. */
1644 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1645 if (null_cnt > mctx->max_mb_elem_len)
1647 memset (sctx->sifted_states, '\0',
1648 sizeof (re_dfastate_t *) * str_idx);
1649 re_node_set_free (&cur_dest);
1652 re_node_set_empty (&cur_dest);
1655 if (mctx->state_log[str_idx])
1657 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1658 if (BE (err != REG_NOERROR, 0))
1662 /* Add all the nodes which satisfy the following conditions:
1663 - It can epsilon transit to a node in CUR_DEST.
1665 And update state_log. */
1666 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1667 if (BE (err != REG_NOERROR, 0))
1672 re_node_set_free (&cur_dest);
1676 static reg_errcode_t
1677 internal_function __attribute_warn_unused_result__
1678 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1679 int str_idx, re_node_set *cur_dest)
1681 const re_dfa_t *const dfa = mctx->dfa;
1682 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1685 /* Then build the next sifted state.
1686 We build the next sifted state on `cur_dest', and update
1687 `sifted_states[str_idx]' with `cur_dest'.
1689 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1690 `cur_src' points the node_set of the old `state_log[str_idx]'
1691 (with the epsilon nodes pre-filtered out). */
1692 for (i = 0; i < cur_src->nelem; i++)
1694 int prev_node = cur_src->elems[i];
1699 re_token_type_t type = dfa->nodes[prev_node].type;
1700 assert (!IS_EPSILON_NODE (type));
1702 #ifdef RE_ENABLE_I18N
1703 /* If the node may accept `multi byte'. */
1704 if (dfa->nodes[prev_node].accept_mb)
1705 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1706 str_idx, sctx->last_str_idx);
1707 #endif /* RE_ENABLE_I18N */
1709 /* We don't check backreferences here.
1710 See update_cur_sifted_state(). */
1712 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1713 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1714 dfa->nexts[prev_node]))
1720 if (sctx->limits.nelem)
1722 int to_idx = str_idx + naccepted;
1723 if (check_dst_limits (mctx, &sctx->limits,
1724 dfa->nexts[prev_node], to_idx,
1725 prev_node, str_idx))
1728 ret = re_node_set_insert (cur_dest, prev_node);
1729 if (BE (ret == -1, 0))
1736 /* Helper functions. */
1738 static reg_errcode_t
1740 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1742 int top = mctx->state_log_top;
1744 if ((next_state_log_idx >= mctx->input.bufs_len
1745 && mctx->input.bufs_len < mctx->input.len)
1746 || (next_state_log_idx >= mctx->input.valid_len
1747 && mctx->input.valid_len < mctx->input.len))
1750 err = extend_buffers (mctx, next_state_log_idx + 1);
1751 if (BE (err != REG_NOERROR, 0))
1755 if (top < next_state_log_idx)
1757 memset (mctx->state_log + top + 1, '\0',
1758 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1759 mctx->state_log_top = next_state_log_idx;
1764 static reg_errcode_t
1766 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1767 re_dfastate_t **src, int num)
1771 for (st_idx = 0; st_idx < num; ++st_idx)
1773 if (dst[st_idx] == NULL)
1774 dst[st_idx] = src[st_idx];
1775 else if (src[st_idx] != NULL)
1777 re_node_set merged_set;
1778 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1779 &src[st_idx]->nodes);
1780 if (BE (err != REG_NOERROR, 0))
1782 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1783 re_node_set_free (&merged_set);
1784 if (BE (err != REG_NOERROR, 0))
1791 static reg_errcode_t
1793 update_cur_sifted_state (const re_match_context_t *mctx,
1794 re_sift_context_t *sctx, int str_idx,
1795 re_node_set *dest_nodes)
1797 const re_dfa_t *const dfa = mctx->dfa;
1798 reg_errcode_t err = REG_NOERROR;
1799 const re_node_set *candidates;
1800 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1801 : &mctx->state_log[str_idx]->nodes);
1803 if (dest_nodes->nelem == 0)
1804 sctx->sifted_states[str_idx] = NULL;
1809 /* At first, add the nodes which can epsilon transit to a node in
1811 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1812 if (BE (err != REG_NOERROR, 0))
1815 /* Then, check the limitations in the current sift_context. */
1816 if (sctx->limits.nelem)
1818 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1819 mctx->bkref_ents, str_idx);
1820 if (BE (err != REG_NOERROR, 0))
1825 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1826 if (BE (err != REG_NOERROR, 0))
1830 if (candidates && mctx->state_log[str_idx]->has_backref)
1832 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1833 if (BE (err != REG_NOERROR, 0))
1839 static reg_errcode_t
1840 internal_function __attribute_warn_unused_result__
1841 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1842 const re_node_set *candidates)
1844 reg_errcode_t err = REG_NOERROR;
1847 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1848 if (BE (err != REG_NOERROR, 0))
1851 if (!state->inveclosure.alloc)
1853 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1854 if (BE (err != REG_NOERROR, 0))
1856 for (i = 0; i < dest_nodes->nelem; i++)
1858 err = re_node_set_merge (&state->inveclosure,
1859 dfa->inveclosures + dest_nodes->elems[i]);
1860 if (BE (err != REG_NOERROR, 0))
1864 return re_node_set_add_intersect (dest_nodes, candidates,
1865 &state->inveclosure);
1868 static reg_errcode_t
1870 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1871 const re_node_set *candidates)
1875 re_node_set *inv_eclosure = dfa->inveclosures + node;
1876 re_node_set except_nodes;
1877 re_node_set_init_empty (&except_nodes);
1878 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1880 int cur_node = inv_eclosure->elems[ecl_idx];
1881 if (cur_node == node)
1883 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1885 int edst1 = dfa->edests[cur_node].elems[0];
1886 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1887 ? dfa->edests[cur_node].elems[1] : -1);
1888 if ((!re_node_set_contains (inv_eclosure, edst1)
1889 && re_node_set_contains (dest_nodes, edst1))
1891 && !re_node_set_contains (inv_eclosure, edst2)
1892 && re_node_set_contains (dest_nodes, edst2)))
1894 err = re_node_set_add_intersect (&except_nodes, candidates,
1895 dfa->inveclosures + cur_node);
1896 if (BE (err != REG_NOERROR, 0))
1898 re_node_set_free (&except_nodes);
1904 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1906 int cur_node = inv_eclosure->elems[ecl_idx];
1907 if (!re_node_set_contains (&except_nodes, cur_node))
1909 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1910 re_node_set_remove_at (dest_nodes, idx);
1913 re_node_set_free (&except_nodes);
1919 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1920 int dst_node, int dst_idx, int src_node, int src_idx)
1922 const re_dfa_t *const dfa = mctx->dfa;
1923 int lim_idx, src_pos, dst_pos;
1925 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1926 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1927 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1930 struct re_backref_cache_entry *ent;
1931 ent = mctx->bkref_ents + limits->elems[lim_idx];
1932 subexp_idx = dfa->nodes[ent->node].opr.idx;
1934 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1935 subexp_idx, dst_node, dst_idx,
1937 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1938 subexp_idx, src_node, src_idx,
1942 <src> <dst> ( <subexp> )
1943 ( <subexp> ) <src> <dst>
1944 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1945 if (src_pos == dst_pos)
1946 continue; /* This is unrelated limitation. */
1955 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1956 int subexp_idx, int from_node, int bkref_idx)
1958 const re_dfa_t *const dfa = mctx->dfa;
1959 const re_node_set *eclosures = dfa->eclosures + from_node;
1962 /* Else, we are on the boundary: examine the nodes on the epsilon
1964 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1966 int node = eclosures->elems[node_idx];
1967 switch (dfa->nodes[node].type)
1970 if (bkref_idx != -1)
1972 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1977 if (ent->node != node)
1980 if (subexp_idx < BITSET_WORD_BITS
1981 && !(ent->eps_reachable_subexps_map
1982 & ((bitset_word_t) 1 << subexp_idx)))
1985 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1986 OP_CLOSE_SUBEXP cases below. But, if the
1987 destination node is the same node as the source
1988 node, don't recurse because it would cause an
1989 infinite loop: a regex that exhibits this behavior
1991 dst = dfa->edests[node].elems[0];
1992 if (dst == from_node)
1996 else /* if (boundaries & 2) */
2001 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2003 if (cpos == -1 /* && (boundaries & 1) */)
2005 if (cpos == 0 && (boundaries & 2))
2008 if (subexp_idx < BITSET_WORD_BITS)
2009 ent->eps_reachable_subexps_map
2010 &= ~((bitset_word_t) 1 << subexp_idx);
2012 while (ent++->more);
2016 case OP_OPEN_SUBEXP:
2017 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2021 case OP_CLOSE_SUBEXP:
2022 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2031 return (boundaries & 2) ? 1 : 0;
2036 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2037 int subexp_idx, int from_node, int str_idx,
2040 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2043 /* If we are outside the range of the subexpression, return -1 or 1. */
2044 if (str_idx < lim->subexp_from)
2047 if (lim->subexp_to < str_idx)
2050 /* If we are within the subexpression, return 0. */
2051 boundaries = (str_idx == lim->subexp_from);
2052 boundaries |= (str_idx == lim->subexp_to) << 1;
2053 if (boundaries == 0)
2056 /* Else, examine epsilon closure. */
2057 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2058 from_node, bkref_idx);
2061 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2062 which are against limitations from DEST_NODES. */
2064 static reg_errcode_t
2066 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2067 const re_node_set *candidates, re_node_set *limits,
2068 struct re_backref_cache_entry *bkref_ents, int str_idx)
2071 int node_idx, lim_idx;
2073 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2076 struct re_backref_cache_entry *ent;
2077 ent = bkref_ents + limits->elems[lim_idx];
2079 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2080 continue; /* This is unrelated limitation. */
2082 subexp_idx = dfa->nodes[ent->node].opr.idx;
2083 if (ent->subexp_to == str_idx)
2087 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2089 int node = dest_nodes->elems[node_idx];
2090 re_token_type_t type = dfa->nodes[node].type;
2091 if (type == OP_OPEN_SUBEXP
2092 && subexp_idx == dfa->nodes[node].opr.idx)
2094 else if (type == OP_CLOSE_SUBEXP
2095 && subexp_idx == dfa->nodes[node].opr.idx)
2099 /* Check the limitation of the open subexpression. */
2100 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2103 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2105 if (BE (err != REG_NOERROR, 0))
2109 /* Check the limitation of the close subexpression. */
2111 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2113 int node = dest_nodes->elems[node_idx];
2114 if (!re_node_set_contains (dfa->inveclosures + node,
2116 && !re_node_set_contains (dfa->eclosures + node,
2119 /* It is against this limitation.
2120 Remove it form the current sifted state. */
2121 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2123 if (BE (err != REG_NOERROR, 0))
2129 else /* (ent->subexp_to != str_idx) */
2131 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2133 int node = dest_nodes->elems[node_idx];
2134 re_token_type_t type = dfa->nodes[node].type;
2135 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2137 if (subexp_idx != dfa->nodes[node].opr.idx)
2139 /* It is against this limitation.
2140 Remove it form the current sifted state. */
2141 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2143 if (BE (err != REG_NOERROR, 0))
2152 static reg_errcode_t
2153 internal_function __attribute_warn_unused_result__
2154 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2155 int str_idx, const re_node_set *candidates)
2157 const re_dfa_t *const dfa = mctx->dfa;
2160 re_sift_context_t local_sctx;
2161 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2163 if (first_idx == -1)
2166 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2168 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2171 re_token_type_t type;
2172 struct re_backref_cache_entry *entry;
2173 node = candidates->elems[node_idx];
2174 type = dfa->nodes[node].type;
2175 /* Avoid infinite loop for the REs like "()\1+". */
2176 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2178 if (type != OP_BACK_REF)
2181 entry = mctx->bkref_ents + first_idx;
2182 enabled_idx = first_idx;
2189 re_dfastate_t *cur_state;
2191 if (entry->node != node)
2193 subexp_len = entry->subexp_to - entry->subexp_from;
2194 to_idx = str_idx + subexp_len;
2195 dst_node = (subexp_len ? dfa->nexts[node]
2196 : dfa->edests[node].elems[0]);
2198 if (to_idx > sctx->last_str_idx
2199 || sctx->sifted_states[to_idx] == NULL
2200 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2201 || check_dst_limits (mctx, &sctx->limits, node,
2202 str_idx, dst_node, to_idx))
2205 if (local_sctx.sifted_states == NULL)
2208 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2209 if (BE (err != REG_NOERROR, 0))
2212 local_sctx.last_node = node;
2213 local_sctx.last_str_idx = str_idx;
2214 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2215 if (BE (ret < 0, 0))
2220 cur_state = local_sctx.sifted_states[str_idx];
2221 err = sift_states_backward (mctx, &local_sctx);
2222 if (BE (err != REG_NOERROR, 0))
2224 if (sctx->limited_states != NULL)
2226 err = merge_state_array (dfa, sctx->limited_states,
2227 local_sctx.sifted_states,
2229 if (BE (err != REG_NOERROR, 0))
2232 local_sctx.sifted_states[str_idx] = cur_state;
2233 re_node_set_remove (&local_sctx.limits, enabled_idx);
2235 /* mctx->bkref_ents may have changed, reload the pointer. */
2236 entry = mctx->bkref_ents + enabled_idx;
2238 while (enabled_idx++, entry++->more);
2242 if (local_sctx.sifted_states != NULL)
2244 re_node_set_free (&local_sctx.limits);
2251 #ifdef RE_ENABLE_I18N
2254 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2255 int node_idx, int str_idx, int max_str_idx)
2257 const re_dfa_t *const dfa = mctx->dfa;
2259 /* Check the node can accept `multi byte'. */
2260 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2261 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2262 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2263 dfa->nexts[node_idx]))
2264 /* The node can't accept the `multi byte', or the
2265 destination was already thrown away, then the node
2266 could't accept the current input `multi byte'. */
2268 /* Otherwise, it is sure that the node could accept
2269 `naccepted' bytes input. */
2272 #endif /* RE_ENABLE_I18N */
2275 /* Functions for state transition. */
2277 /* Return the next state to which the current state STATE will transit by
2278 accepting the current input byte, and update STATE_LOG if necessary.
2279 If STATE can accept a multibyte char/collating element/back reference
2280 update the destination of STATE_LOG. */
2282 static re_dfastate_t *
2283 internal_function __attribute_warn_unused_result__
2284 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2285 re_dfastate_t *state)
2287 re_dfastate_t **trtable;
2290 #ifdef RE_ENABLE_I18N
2291 /* If the current state can accept multibyte. */
2292 if (BE (state->accept_mb, 0))
2294 *err = transit_state_mb (mctx, state);
2295 if (BE (*err != REG_NOERROR, 0))
2298 #endif /* RE_ENABLE_I18N */
2300 /* Then decide the next state with the single byte. */
2303 /* don't use transition table */
2304 return transit_state_sb (err, mctx, state);
2307 /* Use transition table */
2308 ch = re_string_fetch_byte (&mctx->input);
2311 trtable = state->trtable;
2312 if (BE (trtable != NULL, 1))
2315 trtable = state->word_trtable;
2316 if (BE (trtable != NULL, 1))
2318 unsigned int context;
2320 = re_string_context_at (&mctx->input,
2321 re_string_cur_idx (&mctx->input) - 1,
2323 if (IS_WORD_CONTEXT (context))
2324 return trtable[ch + SBC_MAX];
2329 if (!build_trtable (mctx->dfa, state))
2335 /* Retry, we now have a transition table. */
2339 /* Update the state_log if we need */
2342 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2343 re_dfastate_t *next_state)
2345 const re_dfa_t *const dfa = mctx->dfa;
2346 int cur_idx = re_string_cur_idx (&mctx->input);
2348 if (cur_idx > mctx->state_log_top)
2350 mctx->state_log[cur_idx] = next_state;
2351 mctx->state_log_top = cur_idx;
2353 else if (mctx->state_log[cur_idx] == 0)
2355 mctx->state_log[cur_idx] = next_state;
2359 re_dfastate_t *pstate;
2360 unsigned int context;
2361 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2362 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2363 the destination of a multibyte char/collating element/
2364 back reference. Then the next state is the union set of
2365 these destinations and the results of the transition table. */
2366 pstate = mctx->state_log[cur_idx];
2367 log_nodes = pstate->entrance_nodes;
2368 if (next_state != NULL)
2370 table_nodes = next_state->entrance_nodes;
2371 *err = re_node_set_init_union (&next_nodes, table_nodes,
2373 if (BE (*err != REG_NOERROR, 0))
2377 next_nodes = *log_nodes;
2378 /* Note: We already add the nodes of the initial state,
2379 then we don't need to add them here. */
2381 context = re_string_context_at (&mctx->input,
2382 re_string_cur_idx (&mctx->input) - 1,
2384 next_state = mctx->state_log[cur_idx]
2385 = re_acquire_state_context (err, dfa, &next_nodes, context);
2386 /* We don't need to check errors here, since the return value of
2387 this function is next_state and ERR is already set. */
2389 if (table_nodes != NULL)
2390 re_node_set_free (&next_nodes);
2393 if (BE (dfa->nbackref, 0) && next_state != NULL)
2395 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2396 later. We must check them here, since the back references in the
2397 next state might use them. */
2398 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2400 if (BE (*err != REG_NOERROR, 0))
2403 /* If the next state has back references. */
2404 if (next_state->has_backref)
2406 *err = transit_state_bkref (mctx, &next_state->nodes);
2407 if (BE (*err != REG_NOERROR, 0))
2409 next_state = mctx->state_log[cur_idx];
2416 /* Skip bytes in the input that correspond to part of a
2417 multi-byte match, then look in the log for a state
2418 from which to restart matching. */
2421 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2423 re_dfastate_t *cur_state;
2426 int max = mctx->state_log_top;
2427 int cur_str_idx = re_string_cur_idx (&mctx->input);
2431 if (++cur_str_idx > max)
2433 re_string_skip_bytes (&mctx->input, 1);
2435 while (mctx->state_log[cur_str_idx] == NULL);
2437 cur_state = merge_state_with_log (err, mctx, NULL);
2439 while (*err == REG_NOERROR && cur_state == NULL);
2443 /* Helper functions for transit_state. */
2445 /* From the node set CUR_NODES, pick up the nodes whose types are
2446 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2447 expression. And register them to use them later for evaluating the
2448 correspoding back references. */
2450 static reg_errcode_t
2452 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2455 const re_dfa_t *const dfa = mctx->dfa;
2459 /* TODO: This isn't efficient.
2460 Because there might be more than one nodes whose types are
2461 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2464 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2466 int node = cur_nodes->elems[node_idx];
2467 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2468 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2469 && (dfa->used_bkref_map
2470 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2472 err = match_ctx_add_subtop (mctx, node, str_idx);
2473 if (BE (err != REG_NOERROR, 0))
2481 /* Return the next state to which the current state STATE will transit by
2482 accepting the current input byte. */
2484 static re_dfastate_t *
2485 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2486 re_dfastate_t *state)
2488 const re_dfa_t *const dfa = mctx->dfa;
2489 re_node_set next_nodes;
2490 re_dfastate_t *next_state;
2491 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2492 unsigned int context;
2494 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2495 if (BE (*err != REG_NOERROR, 0))
2497 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2499 int cur_node = state->nodes.elems[node_cnt];
2500 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2502 *err = re_node_set_merge (&next_nodes,
2503 dfa->eclosures + dfa->nexts[cur_node]);
2504 if (BE (*err != REG_NOERROR, 0))
2506 re_node_set_free (&next_nodes);
2511 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2512 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2513 /* We don't need to check errors here, since the return value of
2514 this function is next_state and ERR is already set. */
2516 re_node_set_free (&next_nodes);
2517 re_string_skip_bytes (&mctx->input, 1);
2522 #ifdef RE_ENABLE_I18N
2523 static reg_errcode_t
2525 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2527 const re_dfa_t *const dfa = mctx->dfa;
2531 for (i = 0; i < pstate->nodes.nelem; ++i)
2533 re_node_set dest_nodes, *new_nodes;
2534 int cur_node_idx = pstate->nodes.elems[i];
2535 int naccepted, dest_idx;
2536 unsigned int context;
2537 re_dfastate_t *dest_state;
2539 if (!dfa->nodes[cur_node_idx].accept_mb)
2542 if (dfa->nodes[cur_node_idx].constraint)
2544 context = re_string_context_at (&mctx->input,
2545 re_string_cur_idx (&mctx->input),
2547 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2552 /* How many bytes the node can accept? */
2553 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2554 re_string_cur_idx (&mctx->input));
2558 /* The node can accepts `naccepted' bytes. */
2559 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2560 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2561 : mctx->max_mb_elem_len);
2562 err = clean_state_log_if_needed (mctx, dest_idx);
2563 if (BE (err != REG_NOERROR, 0))
2566 assert (dfa->nexts[cur_node_idx] != -1);
2568 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2570 dest_state = mctx->state_log[dest_idx];
2571 if (dest_state == NULL)
2572 dest_nodes = *new_nodes;
2575 err = re_node_set_init_union (&dest_nodes,
2576 dest_state->entrance_nodes, new_nodes);
2577 if (BE (err != REG_NOERROR, 0))
2580 context = re_string_context_at (&mctx->input, dest_idx - 1,
2582 mctx->state_log[dest_idx]
2583 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2584 if (dest_state != NULL)
2585 re_node_set_free (&dest_nodes);
2586 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2591 #endif /* RE_ENABLE_I18N */
2593 static reg_errcode_t
2595 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2597 const re_dfa_t *const dfa = mctx->dfa;
2600 int cur_str_idx = re_string_cur_idx (&mctx->input);
2602 for (i = 0; i < nodes->nelem; ++i)
2604 int dest_str_idx, prev_nelem, bkc_idx;
2605 int node_idx = nodes->elems[i];
2606 unsigned int context;
2607 const re_token_t *node = dfa->nodes + node_idx;
2608 re_node_set *new_dest_nodes;
2610 /* Check whether `node' is a backreference or not. */
2611 if (node->type != OP_BACK_REF)
2614 if (node->constraint)
2616 context = re_string_context_at (&mctx->input, cur_str_idx,
2618 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2622 /* `node' is a backreference.
2623 Check the substring which the substring matched. */
2624 bkc_idx = mctx->nbkref_ents;
2625 err = get_subexp (mctx, node_idx, cur_str_idx);
2626 if (BE (err != REG_NOERROR, 0))
2629 /* And add the epsilon closures (which is `new_dest_nodes') of
2630 the backreference to appropriate state_log. */
2632 assert (dfa->nexts[node_idx] != -1);
2634 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2637 re_dfastate_t *dest_state;
2638 struct re_backref_cache_entry *bkref_ent;
2639 bkref_ent = mctx->bkref_ents + bkc_idx;
2640 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2642 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2643 new_dest_nodes = (subexp_len == 0
2644 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2645 : dfa->eclosures + dfa->nexts[node_idx]);
2646 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2647 - bkref_ent->subexp_from);
2648 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2650 dest_state = mctx->state_log[dest_str_idx];
2651 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2652 : mctx->state_log[cur_str_idx]->nodes.nelem);
2653 /* Add `new_dest_node' to state_log. */
2654 if (dest_state == NULL)
2656 mctx->state_log[dest_str_idx]
2657 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2659 if (BE (mctx->state_log[dest_str_idx] == NULL
2660 && err != REG_NOERROR, 0))
2665 re_node_set dest_nodes;
2666 err = re_node_set_init_union (&dest_nodes,
2667 dest_state->entrance_nodes,
2669 if (BE (err != REG_NOERROR, 0))
2671 re_node_set_free (&dest_nodes);
2674 mctx->state_log[dest_str_idx]
2675 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2676 re_node_set_free (&dest_nodes);
2677 if (BE (mctx->state_log[dest_str_idx] == NULL
2678 && err != REG_NOERROR, 0))
2681 /* We need to check recursively if the backreference can epsilon
2684 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2686 err = check_subexp_matching_top (mctx, new_dest_nodes,
2688 if (BE (err != REG_NOERROR, 0))
2690 err = transit_state_bkref (mctx, new_dest_nodes);
2691 if (BE (err != REG_NOERROR, 0))
2701 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2702 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2703 Note that we might collect inappropriate candidates here.
2704 However, the cost of checking them strictly here is too high, then we
2705 delay these checking for prune_impossible_nodes(). */
2707 static reg_errcode_t
2708 internal_function __attribute_warn_unused_result__
2709 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2711 const re_dfa_t *const dfa = mctx->dfa;
2712 int subexp_num, sub_top_idx;
2713 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2714 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2715 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2716 if (cache_idx != -1)
2718 const struct re_backref_cache_entry *entry
2719 = mctx->bkref_ents + cache_idx;
2721 if (entry->node == bkref_node)
2722 return REG_NOERROR; /* We already checked it. */
2723 while (entry++->more);
2726 subexp_num = dfa->nodes[bkref_node].opr.idx;
2728 /* For each sub expression */
2729 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2732 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2733 re_sub_match_last_t *sub_last;
2734 int sub_last_idx, sl_str, bkref_str_off;
2736 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2737 continue; /* It isn't related. */
2739 sl_str = sub_top->str_idx;
2740 bkref_str_off = bkref_str_idx;
2741 /* At first, check the last node of sub expressions we already
2743 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2746 sub_last = sub_top->lasts[sub_last_idx];
2747 sl_str_diff = sub_last->str_idx - sl_str;
2748 /* The matched string by the sub expression match with the substring
2749 at the back reference? */
2750 if (sl_str_diff > 0)
2752 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2754 /* Not enough chars for a successful match. */
2755 if (bkref_str_off + sl_str_diff > mctx->input.len)
2758 err = clean_state_log_if_needed (mctx,
2761 if (BE (err != REG_NOERROR, 0))
2763 buf = (const char *) re_string_get_buffer (&mctx->input);
2765 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2766 /* We don't need to search this sub expression any more. */
2769 bkref_str_off += sl_str_diff;
2770 sl_str += sl_str_diff;
2771 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2774 /* Reload buf, since the preceding call might have reallocated
2776 buf = (const char *) re_string_get_buffer (&mctx->input);
2778 if (err == REG_NOMATCH)
2780 if (BE (err != REG_NOERROR, 0))
2784 if (sub_last_idx < sub_top->nlasts)
2786 if (sub_last_idx > 0)
2788 /* Then, search for the other last nodes of the sub expression. */
2789 for (; sl_str <= bkref_str_idx; ++sl_str)
2791 int cls_node, sl_str_off;
2792 const re_node_set *nodes;
2793 sl_str_off = sl_str - sub_top->str_idx;
2794 /* The matched string by the sub expression match with the substring
2795 at the back reference? */
2798 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2800 /* If we are at the end of the input, we cannot match. */
2801 if (bkref_str_off >= mctx->input.len)
2804 err = extend_buffers (mctx, bkref_str_off + 1);
2805 if (BE (err != REG_NOERROR, 0))
2808 buf = (const char *) re_string_get_buffer (&mctx->input);
2810 if (buf [bkref_str_off++] != buf[sl_str - 1])
2811 break; /* We don't need to search this sub expression
2814 if (mctx->state_log[sl_str] == NULL)
2816 /* Does this state have a ')' of the sub expression? */
2817 nodes = &mctx->state_log[sl_str]->nodes;
2818 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2822 if (sub_top->path == NULL)
2824 sub_top->path = calloc (sizeof (state_array_t),
2825 sl_str - sub_top->str_idx + 1);
2826 if (sub_top->path == NULL)
2829 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2830 in the current context? */
2831 err = check_arrival (mctx, sub_top->path, sub_top->node,
2832 sub_top->str_idx, cls_node, sl_str,
2834 if (err == REG_NOMATCH)
2836 if (BE (err != REG_NOERROR, 0))
2838 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2839 if (BE (sub_last == NULL, 0))
2841 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2843 if (err == REG_NOMATCH)
2850 /* Helper functions for get_subexp(). */
2852 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2853 If it can arrive, register the sub expression expressed with SUB_TOP
2856 static reg_errcode_t
2858 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2859 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2863 /* Can the subexpression arrive the back reference? */
2864 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2865 sub_last->str_idx, bkref_node, bkref_str,
2867 if (err != REG_NOERROR)
2869 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2871 if (BE (err != REG_NOERROR, 0))
2873 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2874 return clean_state_log_if_needed (mctx, to_idx);
2877 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2878 Search '(' if FL_OPEN, or search ')' otherwise.
2879 TODO: This function isn't efficient...
2880 Because there might be more than one nodes whose types are
2881 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2887 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2888 int subexp_idx, int type)
2891 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2893 int cls_node = nodes->elems[cls_idx];
2894 const re_token_t *node = dfa->nodes + cls_node;
2895 if (node->type == type
2896 && node->opr.idx == subexp_idx)
2902 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2903 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2905 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2907 static reg_errcode_t
2908 internal_function __attribute_warn_unused_result__
2909 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2910 int top_str, int last_node, int last_str, int type)
2912 const re_dfa_t *const dfa = mctx->dfa;
2913 reg_errcode_t err = REG_NOERROR;
2914 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2915 re_dfastate_t *cur_state = NULL;
2916 re_node_set *cur_nodes, next_nodes;
2917 re_dfastate_t **backup_state_log;
2918 unsigned int context;
2920 subexp_num = dfa->nodes[top_node].opr.idx;
2921 /* Extend the buffer if we need. */
2922 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2924 re_dfastate_t **new_array;
2925 int old_alloc = path->alloc;
2926 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2927 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2928 if (BE (new_array == NULL, 0))
2930 path->alloc = old_alloc;
2933 path->array = new_array;
2934 memset (new_array + old_alloc, '\0',
2935 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2938 str_idx = path->next_idx ? path->next_idx : top_str;
2940 /* Temporary modify MCTX. */
2941 backup_state_log = mctx->state_log;
2942 backup_cur_idx = mctx->input.cur_idx;
2943 mctx->state_log = path->array;
2944 mctx->input.cur_idx = str_idx;
2946 /* Setup initial node set. */
2947 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2948 if (str_idx == top_str)
2950 err = re_node_set_init_1 (&next_nodes, top_node);
2951 if (BE (err != REG_NOERROR, 0))
2953 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2954 if (BE (err != REG_NOERROR, 0))
2956 re_node_set_free (&next_nodes);
2962 cur_state = mctx->state_log[str_idx];
2963 if (cur_state && cur_state->has_backref)
2965 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2966 if (BE (err != REG_NOERROR, 0))
2970 re_node_set_init_empty (&next_nodes);
2972 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2974 if (next_nodes.nelem)
2976 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2978 if (BE (err != REG_NOERROR, 0))
2980 re_node_set_free (&next_nodes);
2984 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2985 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2987 re_node_set_free (&next_nodes);
2990 mctx->state_log[str_idx] = cur_state;
2993 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2995 re_node_set_empty (&next_nodes);
2996 if (mctx->state_log[str_idx + 1])
2998 err = re_node_set_merge (&next_nodes,
2999 &mctx->state_log[str_idx + 1]->nodes);
3000 if (BE (err != REG_NOERROR, 0))
3002 re_node_set_free (&next_nodes);
3008 err = check_arrival_add_next_nodes (mctx, str_idx,
3009 &cur_state->non_eps_nodes,
3011 if (BE (err != REG_NOERROR, 0))
3013 re_node_set_free (&next_nodes);
3018 if (next_nodes.nelem)
3020 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3021 if (BE (err != REG_NOERROR, 0))
3023 re_node_set_free (&next_nodes);
3026 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3028 if (BE (err != REG_NOERROR, 0))
3030 re_node_set_free (&next_nodes);
3034 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3035 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3036 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3038 re_node_set_free (&next_nodes);
3041 mctx->state_log[str_idx] = cur_state;
3042 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3044 re_node_set_free (&next_nodes);
3045 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3046 : &mctx->state_log[last_str]->nodes);
3047 path->next_idx = str_idx;
3050 mctx->state_log = backup_state_log;
3051 mctx->input.cur_idx = backup_cur_idx;
3053 /* Then check the current node set has the node LAST_NODE. */
3054 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3060 /* Helper functions for check_arrival. */
3062 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3064 TODO: This function is similar to the functions transit_state*(),
3065 however this function has many additional works.
3066 Can't we unify them? */
3068 static reg_errcode_t
3069 internal_function __attribute_warn_unused_result__
3070 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3071 re_node_set *cur_nodes, re_node_set *next_nodes)
3073 const re_dfa_t *const dfa = mctx->dfa;
3076 #ifdef RE_ENABLE_I18N
3077 reg_errcode_t err = REG_NOERROR;
3079 re_node_set union_set;
3080 re_node_set_init_empty (&union_set);
3081 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3084 int cur_node = cur_nodes->elems[cur_idx];
3086 re_token_type_t type = dfa->nodes[cur_node].type;
3087 assert (!IS_EPSILON_NODE (type));
3089 #ifdef RE_ENABLE_I18N
3090 /* If the node may accept `multi byte'. */
3091 if (dfa->nodes[cur_node].accept_mb)
3093 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3097 re_dfastate_t *dest_state;
3098 int next_node = dfa->nexts[cur_node];
3099 int next_idx = str_idx + naccepted;
3100 dest_state = mctx->state_log[next_idx];
3101 re_node_set_empty (&union_set);
3104 err = re_node_set_merge (&union_set, &dest_state->nodes);
3105 if (BE (err != REG_NOERROR, 0))
3107 re_node_set_free (&union_set);
3111 result = re_node_set_insert (&union_set, next_node);
3112 if (BE (result < 0, 0))
3114 re_node_set_free (&union_set);
3117 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3119 if (BE (mctx->state_log[next_idx] == NULL
3120 && err != REG_NOERROR, 0))
3122 re_node_set_free (&union_set);
3127 #endif /* RE_ENABLE_I18N */
3129 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3131 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3132 if (BE (result < 0, 0))
3134 re_node_set_free (&union_set);
3139 re_node_set_free (&union_set);
3143 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3144 CUR_NODES, however exclude the nodes which are:
3145 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3146 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3149 static reg_errcode_t
3151 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3152 int ex_subexp, int type)
3155 int idx, outside_node;
3156 re_node_set new_nodes;
3158 assert (cur_nodes->nelem);
3160 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3161 if (BE (err != REG_NOERROR, 0))
3163 /* Create a new node set NEW_NODES with the nodes which are epsilon
3164 closures of the node in CUR_NODES. */
3166 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3168 int cur_node = cur_nodes->elems[idx];
3169 const re_node_set *eclosure = dfa->eclosures + cur_node;
3170 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3171 if (outside_node == -1)
3173 /* There are no problematic nodes, just merge them. */
3174 err = re_node_set_merge (&new_nodes, eclosure);
3175 if (BE (err != REG_NOERROR, 0))
3177 re_node_set_free (&new_nodes);
3183 /* There are problematic nodes, re-calculate incrementally. */
3184 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3186 if (BE (err != REG_NOERROR, 0))
3188 re_node_set_free (&new_nodes);
3193 re_node_set_free (cur_nodes);
3194 *cur_nodes = new_nodes;
3198 /* Helper function for check_arrival_expand_ecl.
3199 Check incrementally the epsilon closure of TARGET, and if it isn't
3200 problematic append it to DST_NODES. */
3202 static reg_errcode_t
3203 internal_function __attribute_warn_unused_result__
3204 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3205 int target, int ex_subexp, int type)
3208 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3212 if (dfa->nodes[cur_node].type == type
3213 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3215 if (type == OP_CLOSE_SUBEXP)
3217 err = re_node_set_insert (dst_nodes, cur_node);
3218 if (BE (err == -1, 0))
3223 err = re_node_set_insert (dst_nodes, cur_node);
3224 if (BE (err == -1, 0))
3226 if (dfa->edests[cur_node].nelem == 0)
3228 if (dfa->edests[cur_node].nelem == 2)
3230 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3231 dfa->edests[cur_node].elems[1],
3233 if (BE (err != REG_NOERROR, 0))
3236 cur_node = dfa->edests[cur_node].elems[0];
3242 /* For all the back references in the current state, calculate the
3243 destination of the back references by the appropriate entry
3244 in MCTX->BKREF_ENTS. */
3246 static reg_errcode_t
3247 internal_function __attribute_warn_unused_result__
3248 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3249 int cur_str, int subexp_num, int type)
3251 const re_dfa_t *const dfa = mctx->dfa;
3253 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3254 struct re_backref_cache_entry *ent;
3256 if (cache_idx_start == -1)
3260 ent = mctx->bkref_ents + cache_idx_start;
3263 int to_idx, next_node;
3265 /* Is this entry ENT is appropriate? */
3266 if (!re_node_set_contains (cur_nodes, ent->node))
3269 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3270 /* Calculate the destination of the back reference, and append it
3271 to MCTX->STATE_LOG. */
3272 if (to_idx == cur_str)
3274 /* The backreference did epsilon transit, we must re-check all the
3275 node in the current state. */
3276 re_node_set new_dests;
3277 reg_errcode_t err2, err3;
3278 next_node = dfa->edests[ent->node].elems[0];
3279 if (re_node_set_contains (cur_nodes, next_node))
3281 err = re_node_set_init_1 (&new_dests, next_node);
3282 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3283 err3 = re_node_set_merge (cur_nodes, &new_dests);
3284 re_node_set_free (&new_dests);
3285 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3286 || err3 != REG_NOERROR, 0))
3288 err = (err != REG_NOERROR ? err
3289 : (err2 != REG_NOERROR ? err2 : err3));
3292 /* TODO: It is still inefficient... */
3297 re_node_set union_set;
3298 next_node = dfa->nexts[ent->node];
3299 if (mctx->state_log[to_idx])
3302 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3305 err = re_node_set_init_copy (&union_set,
3306 &mctx->state_log[to_idx]->nodes);
3307 ret = re_node_set_insert (&union_set, next_node);
3308 if (BE (err != REG_NOERROR || ret < 0, 0))
3310 re_node_set_free (&union_set);
3311 err = err != REG_NOERROR ? err : REG_ESPACE;
3317 err = re_node_set_init_1 (&union_set, next_node);
3318 if (BE (err != REG_NOERROR, 0))
3321 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3322 re_node_set_free (&union_set);
3323 if (BE (mctx->state_log[to_idx] == NULL
3324 && err != REG_NOERROR, 0))
3328 while (ent++->more);
3332 /* Build transition table for the state.
3333 Return 1 if succeeded, otherwise return NULL. */
3337 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3340 int i, j, ch, need_word_trtable = 0;
3341 bitset_word_t elem, mask;
3342 bool dests_node_malloced = false;
3343 bool dest_states_malloced = false;
3344 int ndests; /* Number of the destination states from `state'. */
3345 re_dfastate_t **trtable;
3346 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3347 re_node_set follows, *dests_node;
3349 bitset_t acceptable;
3353 re_node_set dests_node[SBC_MAX];
3354 bitset_t dests_ch[SBC_MAX];
3357 /* We build DFA states which corresponds to the destination nodes
3358 from `state'. `dests_node[i]' represents the nodes which i-th
3359 destination state contains, and `dests_ch[i]' represents the
3360 characters which i-th destination state accepts. */
3362 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3363 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3367 dests_alloc = re_malloc (struct dests_alloc, 1);
3368 if (BE (dests_alloc == NULL, 0))
3370 dests_node_malloced = true;
3372 dests_node = dests_alloc->dests_node;
3373 dests_ch = dests_alloc->dests_ch;
3375 /* Initialize transiton table. */
3376 state->word_trtable = state->trtable = NULL;
3378 /* At first, group all nodes belonging to `state' into several
3380 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3381 if (BE (ndests <= 0, 0))
3383 if (dests_node_malloced)
3385 /* Return 0 in case of an error, 1 otherwise. */
3388 state->trtable = (re_dfastate_t **)
3389 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3390 if (BE (state->trtable == NULL, 0))
3397 err = re_node_set_alloc (&follows, ndests + 1);
3398 if (BE (err != REG_NOERROR, 0))
3401 /* Avoid arithmetic overflow in size calculation. */
3402 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3403 / (3 * sizeof (re_dfastate_t *)))
3409 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3410 + ndests * 3 * sizeof (re_dfastate_t *)))
3411 dest_states = (re_dfastate_t **)
3412 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3416 dest_states = (re_dfastate_t **)
3417 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3418 if (BE (dest_states == NULL, 0))
3421 if (dest_states_malloced)
3423 re_node_set_free (&follows);
3424 for (i = 0; i < ndests; ++i)
3425 re_node_set_free (dests_node + i);
3426 if (dests_node_malloced)
3430 dest_states_malloced = true;
3432 dest_states_word = dest_states + ndests;
3433 dest_states_nl = dest_states_word + ndests;
3434 bitset_empty (acceptable);
3436 /* Then build the states for all destinations. */
3437 for (i = 0; i < ndests; ++i)
3440 re_node_set_empty (&follows);
3441 /* Merge the follows of this destination states. */
3442 for (j = 0; j < dests_node[i].nelem; ++j)
3444 next_node = dfa->nexts[dests_node[i].elems[j]];
3445 if (next_node != -1)
3447 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3448 if (BE (err != REG_NOERROR, 0))
3452 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3453 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3455 /* If the new state has context constraint,
3456 build appropriate states for these contexts. */
3457 if (dest_states[i]->has_constraint)
3459 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3461 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3464 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3465 need_word_trtable = 1;
3467 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3469 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3474 dest_states_word[i] = dest_states[i];
3475 dest_states_nl[i] = dest_states[i];
3477 bitset_merge (acceptable, dests_ch[i]);
3480 if (!BE (need_word_trtable, 0))
3482 /* We don't care about whether the following character is a word
3483 character, or we are in a single-byte character set so we can
3484 discern by looking at the character code: allocate a
3485 256-entry transition table. */
3486 trtable = state->trtable =
3487 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3488 if (BE (trtable == NULL, 0))
3491 /* For all characters ch...: */
3492 for (i = 0; i < BITSET_WORDS; ++i)
3493 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3495 mask <<= 1, elem >>= 1, ++ch)
3496 if (BE (elem & 1, 0))
3498 /* There must be exactly one destination which accepts
3499 character ch. See group_nodes_into_DFAstates. */
3500 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3503 /* j-th destination accepts the word character ch. */
3504 if (dfa->word_char[i] & mask)
3505 trtable[ch] = dest_states_word[j];
3507 trtable[ch] = dest_states[j];
3512 /* We care about whether the following character is a word
3513 character, and we are in a multi-byte character set: discern
3514 by looking at the character code: build two 256-entry
3515 transition tables, one starting at trtable[0] and one
3516 starting at trtable[SBC_MAX]. */
3517 trtable = state->word_trtable =
3518 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3519 if (BE (trtable == NULL, 0))
3522 /* For all characters ch...: */
3523 for (i = 0; i < BITSET_WORDS; ++i)
3524 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3526 mask <<= 1, elem >>= 1, ++ch)
3527 if (BE (elem & 1, 0))
3529 /* There must be exactly one destination which accepts
3530 character ch. See group_nodes_into_DFAstates. */
3531 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3534 /* j-th destination accepts the word character ch. */
3535 trtable[ch] = dest_states[j];
3536 trtable[ch + SBC_MAX] = dest_states_word[j];
3541 if (bitset_contain (acceptable, NEWLINE_CHAR))
3543 /* The current state accepts newline character. */
3544 for (j = 0; j < ndests; ++j)
3545 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3547 /* k-th destination accepts newline character. */
3548 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3549 if (need_word_trtable)
3550 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3551 /* There must be only one destination which accepts
3552 newline. See group_nodes_into_DFAstates. */
3557 if (dest_states_malloced)
3560 re_node_set_free (&follows);
3561 for (i = 0; i < ndests; ++i)
3562 re_node_set_free (dests_node + i);
3564 if (dests_node_malloced)
3570 /* Group all nodes belonging to STATE into several destinations.
3571 Then for all destinations, set the nodes belonging to the destination
3572 to DESTS_NODE[i] and set the characters accepted by the destination
3573 to DEST_CH[i]. This function return the number of destinations. */
3577 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3578 re_node_set *dests_node, bitset_t *dests_ch)
3583 int ndests; /* Number of the destinations from `state'. */
3584 bitset_t accepts; /* Characters a node can accept. */
3585 const re_node_set *cur_nodes = &state->nodes;
3586 bitset_empty (accepts);
3589 /* For all the nodes belonging to `state', */
3590 for (i = 0; i < cur_nodes->nelem; ++i)
3592 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3593 re_token_type_t type = node->type;
3594 unsigned int constraint = node->constraint;
3596 /* Enumerate all single byte character this node can accept. */
3597 if (type == CHARACTER)
3598 bitset_set (accepts, node->opr.c);
3599 else if (type == SIMPLE_BRACKET)
3601 bitset_merge (accepts, node->opr.sbcset);
3603 else if (type == OP_PERIOD)
3605 #ifdef RE_ENABLE_I18N
3606 if (dfa->mb_cur_max > 1)
3607 bitset_merge (accepts, dfa->sb_char);
3610 bitset_set_all (accepts);
3611 if (!(dfa->syntax & RE_DOT_NEWLINE))
3612 bitset_clear (accepts, '\n');
3613 if (dfa->syntax & RE_DOT_NOT_NULL)
3614 bitset_clear (accepts, '\0');
3616 #ifdef RE_ENABLE_I18N
3617 else if (type == OP_UTF8_PERIOD)
3619 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3620 if (!(dfa->syntax & RE_DOT_NEWLINE))
3621 bitset_clear (accepts, '\n');
3622 if (dfa->syntax & RE_DOT_NOT_NULL)
3623 bitset_clear (accepts, '\0');
3629 /* Check the `accepts' and sift the characters which are not
3630 match it the context. */
3633 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3635 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3636 bitset_empty (accepts);
3637 if (accepts_newline)
3638 bitset_set (accepts, NEWLINE_CHAR);
3642 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3644 bitset_empty (accepts);
3648 if (constraint & NEXT_WORD_CONSTRAINT)
3650 bitset_word_t any_set = 0;
3651 if (type == CHARACTER && !node->word_char)
3653 bitset_empty (accepts);
3656 #ifdef RE_ENABLE_I18N
3657 if (dfa->mb_cur_max > 1)
3658 for (j = 0; j < BITSET_WORDS; ++j)
3659 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3662 for (j = 0; j < BITSET_WORDS; ++j)
3663 any_set |= (accepts[j] &= dfa->word_char[j]);
3667 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3669 bitset_word_t any_set = 0;
3670 if (type == CHARACTER && node->word_char)
3672 bitset_empty (accepts);
3675 #ifdef RE_ENABLE_I18N
3676 if (dfa->mb_cur_max > 1)
3677 for (j = 0; j < BITSET_WORDS; ++j)
3678 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3681 for (j = 0; j < BITSET_WORDS; ++j)
3682 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3688 /* Then divide `accepts' into DFA states, or create a new
3689 state. Above, we make sure that accepts is not empty. */
3690 for (j = 0; j < ndests; ++j)
3692 bitset_t intersec; /* Intersection sets, see below. */
3694 /* Flags, see below. */
3695 bitset_word_t has_intersec, not_subset, not_consumed;
3697 /* Optimization, skip if this state doesn't accept the character. */
3698 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3701 /* Enumerate the intersection set of this state and `accepts'. */
3703 for (k = 0; k < BITSET_WORDS; ++k)
3704 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3705 /* And skip if the intersection set is empty. */
3709 /* Then check if this state is a subset of `accepts'. */
3710 not_subset = not_consumed = 0;
3711 for (k = 0; k < BITSET_WORDS; ++k)
3713 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3714 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3717 /* If this state isn't a subset of `accepts', create a
3718 new group state, which has the `remains'. */
3721 bitset_copy (dests_ch[ndests], remains);
3722 bitset_copy (dests_ch[j], intersec);
3723 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3724 if (BE (err != REG_NOERROR, 0))
3729 /* Put the position in the current group. */
3730 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3731 if (BE (result < 0, 0))
3734 /* If all characters are consumed, go to next node. */
3738 /* Some characters remain, create a new group. */
3741 bitset_copy (dests_ch[ndests], accepts);
3742 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3743 if (BE (err != REG_NOERROR, 0))
3746 bitset_empty (accepts);
3751 for (j = 0; j < ndests; ++j)
3752 re_node_set_free (dests_node + j);
3756 #ifdef RE_ENABLE_I18N
3757 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3758 Return the number of the bytes the node accepts.
3759 STR_IDX is the current index of the input string.
3761 This function handles the nodes which can accept one character, or
3762 one collating element like '.', '[a-z]', opposite to the other nodes
3763 can only accept one byte. */
3767 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3768 const re_string_t *input, int str_idx)
3770 const re_token_t *node = dfa->nodes + node_idx;
3771 int char_len, elem_len;
3775 if (BE (node->type == OP_UTF8_PERIOD, 0))
3777 unsigned char c = re_string_byte_at (input, str_idx), d;
3778 if (BE (c < 0xc2, 1))
3781 if (str_idx + 2 > input->len)
3784 d = re_string_byte_at (input, str_idx + 1);
3786 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3790 if (c == 0xe0 && d < 0xa0)
3796 if (c == 0xf0 && d < 0x90)
3802 if (c == 0xf8 && d < 0x88)
3808 if (c == 0xfc && d < 0x84)
3814 if (str_idx + char_len > input->len)
3817 for (i = 1; i < char_len; ++i)
3819 d = re_string_byte_at (input, str_idx + i);
3820 if (d < 0x80 || d > 0xbf)
3826 char_len = re_string_char_size_at (input, str_idx);
3827 if (node->type == OP_PERIOD)
3831 /* FIXME: I don't think this if is needed, as both '\n'
3832 and '\0' are char_len == 1. */
3833 /* '.' accepts any one character except the following two cases. */
3834 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3835 re_string_byte_at (input, str_idx) == '\n') ||
3836 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3837 re_string_byte_at (input, str_idx) == '\0'))
3842 elem_len = re_string_elem_size_at (input, str_idx);
3843 wc = __btowc(*(input->mbs+str_idx));
3844 if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3847 if (node->type == COMPLEX_BRACKET)
3849 const re_charset_t *cset = node->opr.mbcset;
3851 const unsigned char *pin
3852 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3857 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3858 ? re_string_wchar_at (input, str_idx) : 0);
3860 /* match with multibyte character? */
3861 for (i = 0; i < cset->nmbchars; ++i)
3862 if (wc == cset->mbchars[i])
3864 match_len = char_len;
3865 goto check_node_accept_bytes_match;
3867 /* match with character_class? */
3868 for (i = 0; i < cset->nchar_classes; ++i)
3870 wctype_t wt = cset->char_classes[i];
3871 if (__iswctype (wc, wt))
3873 match_len = char_len;
3874 goto check_node_accept_bytes_match;
3879 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3882 unsigned int in_collseq = 0;
3883 const int32_t *table, *indirect;
3884 const unsigned char *weights, *extra;
3885 const char *collseqwc;
3886 /* This #include defines a local function! */
3887 # include <locale/weight.h>
3889 /* match with collating_symbol? */
3890 if (cset->ncoll_syms)
3891 extra = (const unsigned char *)
3892 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3893 for (i = 0; i < cset->ncoll_syms; ++i)
3895 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3896 /* Compare the length of input collating element and
3897 the length of current collating element. */
3898 if (*coll_sym != elem_len)
3900 /* Compare each bytes. */
3901 for (j = 0; j < *coll_sym; j++)
3902 if (pin[j] != coll_sym[1 + j])
3906 /* Match if every bytes is equal. */
3908 goto check_node_accept_bytes_match;
3914 if (elem_len <= char_len)
3916 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3917 in_collseq = __collseq_table_lookup (collseqwc, wc);
3920 in_collseq = find_collation_sequence_value (pin, elem_len);
3922 /* match with range expression? */
3923 for (i = 0; i < cset->nranges; ++i)
3924 if (cset->range_starts[i] <= in_collseq
3925 && in_collseq <= cset->range_ends[i])
3927 match_len = elem_len;
3928 goto check_node_accept_bytes_match;
3931 /* match with equivalence_class? */
3932 if (cset->nequiv_classes)
3934 const unsigned char *cp = pin;
3935 table = (const int32_t *)
3936 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3937 weights = (const unsigned char *)
3938 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3939 extra = (const unsigned char *)
3940 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3941 indirect = (const int32_t *)
3942 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3943 int32_t idx = findidx (&cp, elem_len);
3945 for (i = 0; i < cset->nequiv_classes; ++i)
3947 int32_t equiv_class_idx = cset->equiv_classes[i];
3948 size_t weight_len = weights[idx & 0xffffff];
3949 if (weight_len == weights[equiv_class_idx & 0xffffff]
3950 && (idx >> 24) == (equiv_class_idx >> 24))
3955 equiv_class_idx &= 0xffffff;
3957 while (cnt <= weight_len
3958 && (weights[equiv_class_idx + 1 + cnt]
3959 == weights[idx + 1 + cnt]))
3961 if (cnt > weight_len)
3963 match_len = elem_len;
3964 goto check_node_accept_bytes_match;
3973 /* match with range expression? */
3974 for (i = 0; i < cset->nranges; ++i)
3976 if (cset->range_starts[i] <= wc
3977 && wc <= cset->range_ends[i])
3979 match_len = char_len;
3980 goto check_node_accept_bytes_match;
3984 check_node_accept_bytes_match:
3985 if (!cset->non_match)
3992 return (elem_len > char_len) ? elem_len : char_len;
4001 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
4003 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4008 /* No valid character. Match it as a single byte character. */
4009 const unsigned char *collseq = (const unsigned char *)
4010 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4011 return collseq[mbs[0]];
4018 const unsigned char *extra = (const unsigned char *)
4019 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4020 int32_t extrasize = (const unsigned char *)
4021 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4023 for (idx = 0; idx < extrasize;)
4025 int mbs_cnt, found = 0;
4026 int32_t elem_mbs_len;
4027 /* Skip the name of collating element name. */
4028 idx = idx + extra[idx] + 1;
4029 elem_mbs_len = extra[idx++];
4030 if (mbs_len == elem_mbs_len)
4032 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4033 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4035 if (mbs_cnt == elem_mbs_len)
4036 /* Found the entry. */
4039 /* Skip the byte sequence of the collating element. */
4040 idx += elem_mbs_len;
4041 /* Adjust for the alignment. */
4042 idx = (idx + 3) & ~3;
4043 /* Skip the collation sequence value. */
4044 idx += sizeof (uint32_t);
4045 /* Skip the wide char sequence of the collating element. */
4046 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4047 /* If we found the entry, return the sequence value. */
4049 return *(uint32_t *) (extra + idx);
4050 /* Skip the collation sequence value. */
4051 idx += sizeof (uint32_t);
4057 #endif /* RE_ENABLE_I18N */
4059 /* Check whether the node accepts the byte which is IDX-th
4060 byte of the INPUT. */
4064 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4068 ch = re_string_byte_at (&mctx->input, idx);
4072 if (node->opr.c != ch)
4076 case SIMPLE_BRACKET:
4077 if (!bitset_contain (node->opr.sbcset, ch))
4081 #ifdef RE_ENABLE_I18N
4082 case OP_UTF8_PERIOD:
4088 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4089 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4097 if (node->constraint)
4099 /* The node has constraints. Check whether the current context
4100 satisfies the constraints. */
4101 unsigned int context = re_string_context_at (&mctx->input, idx,
4103 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4110 /* Extend the buffers, if the buffers have run out. */
4112 static reg_errcode_t
4113 internal_function __attribute_warn_unused_result__
4114 extend_buffers (re_match_context_t *mctx, int min_len)
4117 re_string_t *pstr = &mctx->input;
4119 /* Avoid overflow. */
4120 if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4123 /* Double the lengthes of the buffers, but allocate at least MIN_LEN. */
4124 ret = re_string_realloc_buffers (pstr,
4126 MIN (pstr->len, pstr->bufs_len * 2)));
4127 if (BE (ret != REG_NOERROR, 0))
4130 if (mctx->state_log != NULL)
4132 /* And double the length of state_log. */
4133 /* XXX We have no indication of the size of this buffer. If this
4134 allocation fail we have no indication that the state_log array
4135 does not have the right size. */
4136 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4137 pstr->bufs_len + 1);
4138 if (BE (new_array == NULL, 0))
4140 mctx->state_log = new_array;
4143 /* Then reconstruct the buffers. */
4146 #ifdef RE_ENABLE_I18N
4147 if (pstr->mb_cur_max > 1)
4149 ret = build_wcs_upper_buffer (pstr);
4150 if (BE (ret != REG_NOERROR, 0))
4154 #endif /* RE_ENABLE_I18N */
4155 build_upper_buffer (pstr);
4159 #ifdef RE_ENABLE_I18N
4160 if (pstr->mb_cur_max > 1)
4161 build_wcs_buffer (pstr);
4163 #endif /* RE_ENABLE_I18N */
4165 if (pstr->trans != NULL)
4166 re_string_translate_buffer (pstr);
4173 /* Functions for matching context. */
4175 /* Initialize MCTX. */
4177 static reg_errcode_t
4178 internal_function __attribute_warn_unused_result__
4179 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4181 mctx->eflags = eflags;
4182 mctx->match_last = -1;
4185 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4186 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4187 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4190 /* Already zero-ed by the caller.
4192 mctx->bkref_ents = NULL;
4193 mctx->nbkref_ents = 0;
4194 mctx->nsub_tops = 0; */
4195 mctx->abkref_ents = n;
4196 mctx->max_mb_elem_len = 1;
4197 mctx->asub_tops = n;
4201 /* Clean the entries which depend on the current input in MCTX.
4202 This function must be invoked when the matcher changes the start index
4203 of the input, or changes the input string. */
4207 match_ctx_clean (re_match_context_t *mctx)
4210 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4213 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4214 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4216 re_sub_match_last_t *last = top->lasts[sl_idx];
4217 re_free (last->path.array);
4220 re_free (top->lasts);
4223 re_free (top->path->array);
4224 re_free (top->path);
4229 mctx->nsub_tops = 0;
4230 mctx->nbkref_ents = 0;
4233 /* Free all the memory associated with MCTX. */
4237 match_ctx_free (re_match_context_t *mctx)
4239 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4240 match_ctx_clean (mctx);
4241 re_free (mctx->sub_tops);
4242 re_free (mctx->bkref_ents);
4245 /* Add a new backreference entry to MCTX.
4246 Note that we assume that caller never call this function with duplicate
4247 entry, and call with STR_IDX which isn't smaller than any existing entry.
4250 static reg_errcode_t
4251 internal_function __attribute_warn_unused_result__
4252 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4255 if (mctx->nbkref_ents >= mctx->abkref_ents)
4257 struct re_backref_cache_entry* new_entry;
4258 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4259 mctx->abkref_ents * 2);
4260 if (BE (new_entry == NULL, 0))
4262 re_free (mctx->bkref_ents);
4265 mctx->bkref_ents = new_entry;
4266 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4267 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4268 mctx->abkref_ents *= 2;
4270 if (mctx->nbkref_ents > 0
4271 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4272 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4274 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4275 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4276 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4277 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4279 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4280 If bit N is clear, means that this entry won't epsilon-transition to
4281 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4282 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4285 A backreference does not epsilon-transition unless it is empty, so set
4286 to all zeros if FROM != TO. */
4287 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4288 = (from == to ? ~0 : 0);
4290 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4291 if (mctx->max_mb_elem_len < to - from)
4292 mctx->max_mb_elem_len = to - from;
4296 /* Search for the first entry which has the same str_idx, or -1 if none is
4297 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4301 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4303 int left, right, mid, last;
4304 last = right = mctx->nbkref_ents;
4305 for (left = 0; left < right;)
4307 mid = (left + right) / 2;
4308 if (mctx->bkref_ents[mid].str_idx < str_idx)
4313 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4319 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4322 static reg_errcode_t
4323 internal_function __attribute_warn_unused_result__
4324 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4327 assert (mctx->sub_tops != NULL);
4328 assert (mctx->asub_tops > 0);
4330 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4332 int new_asub_tops = mctx->asub_tops * 2;
4333 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4334 re_sub_match_top_t *,
4336 if (BE (new_array == NULL, 0))
4338 mctx->sub_tops = new_array;
4339 mctx->asub_tops = new_asub_tops;
4341 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4342 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4344 mctx->sub_tops[mctx->nsub_tops]->node = node;
4345 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4349 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4350 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4352 static re_sub_match_last_t *
4354 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4356 re_sub_match_last_t *new_entry;
4357 if (BE (subtop->nlasts == subtop->alasts, 0))
4359 int new_alasts = 2 * subtop->alasts + 1;
4360 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4361 re_sub_match_last_t *,
4363 if (BE (new_array == NULL, 0))
4365 subtop->lasts = new_array;
4366 subtop->alasts = new_alasts;
4368 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4369 if (BE (new_entry != NULL, 1))
4371 subtop->lasts[subtop->nlasts] = new_entry;
4372 new_entry->node = node;
4373 new_entry->str_idx = str_idx;
4381 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4382 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4384 sctx->sifted_states = sifted_sts;
4385 sctx->limited_states = limited_sts;
4386 sctx->last_node = last_node;
4387 sctx->last_str_idx = last_str_idx;
4388 re_node_set_init_empty (&sctx->limits);