1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public
8 License as published by the Free Software Foundation; either
9 version 3 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
/* Forward declarations for the internal matcher engine.
   NOTE(review): this extracted view is missing several continuation
   lines of these prototypes (the embedded line numbers show gaps);
   the tokens below are preserved verbatim — confirm against the full
   file before relying on any single signature.  */
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 static void match_ctx_clean (re_match_context_t *mctx);
23 static void match_ctx_free (re_match_context_t *cache);
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to);
26 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
27 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
29 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
30 Idx node, Idx str_idx);
31 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
32 re_dfastate_t **limited_sts, Idx last_node,
/* Public-facing search/match plumbing.  */
34 static reg_errcode_t re_search_internal (const regex_t *preg,
35 const char *string, Idx length,
36 Idx start, Idx last_start, Idx stop,
37 size_t nmatch, regmatch_t pmatch[],
39 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
40 const char *string1, Idx length1,
41 const char *string2, Idx length2,
42 Idx start, regoff_t range,
43 struct re_registers *regs,
44 Idx stop, bool ret_len);
45 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
46 const char *string, Idx length, Idx start,
47 regoff_t range, Idx stop,
48 struct re_registers *regs,
50 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
51 Idx nregs, int regs_allocated);
52 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
53 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
55 static Idx check_halt_state_context (const re_match_context_t *mctx,
56 const re_dfastate_t *state, Idx idx);
57 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
58 regmatch_t *prev_idx_match, Idx cur_node,
59 Idx cur_idx, Idx nmatch);
60 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
61 Idx str_idx, Idx dest_node, Idx nregs,
62 regmatch_t *regs, regmatch_t *prevregs,
63 re_node_set *eps_via_nodes);
64 static reg_errcode_t set_regs (const regex_t *preg,
65 const re_match_context_t *mctx,
66 size_t nmatch, regmatch_t *pmatch,
68 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
/* State sifting (used to prune impossible NFA nodes when
   back-references are involved).  */
71 static int sift_states_iter_mb (const re_match_context_t *mctx,
72 re_sift_context_t *sctx,
73 Idx node_idx, Idx str_idx, Idx max_str_idx);
74 #endif /* RE_ENABLE_I18N */
75 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
76 re_sift_context_t *sctx);
77 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
78 re_sift_context_t *sctx, Idx str_idx,
79 re_node_set *cur_dest);
80 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
81 re_sift_context_t *sctx,
83 re_node_set *dest_nodes);
84 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
85 re_node_set *dest_nodes,
86 const re_node_set *candidates);
87 static bool check_dst_limits (const re_match_context_t *mctx,
88 const re_node_set *limits,
89 Idx dst_node, Idx dst_idx, Idx src_node,
91 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
92 int boundaries, Idx subexp_idx,
93 Idx from_node, Idx bkref_idx);
94 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
95 Idx limit, Idx subexp_idx,
96 Idx node, Idx str_idx,
98 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
99 re_node_set *dest_nodes,
100 const re_node_set *candidates,
102 struct re_backref_cache_entry *bkref_ents,
104 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
105 re_sift_context_t *sctx,
106 Idx str_idx, const re_node_set *candidates);
107 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
109 re_dfastate_t **src, Idx num);
/* DFA state transition machinery.  */
110 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
111 re_match_context_t *mctx);
112 static re_dfastate_t *transit_state (reg_errcode_t *err,
113 re_match_context_t *mctx,
114 re_dfastate_t *state);
115 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
116 re_match_context_t *mctx,
117 re_dfastate_t *next_state);
118 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
119 re_node_set *cur_nodes,
122 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
123 re_match_context_t *mctx,
124 re_dfastate_t *pstate);
126 #ifdef RE_ENABLE_I18N
127 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
128 re_dfastate_t *pstate);
129 #endif /* RE_ENABLE_I18N */
130 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
131 const re_node_set *nodes);
132 static reg_errcode_t get_subexp (re_match_context_t *mctx,
133 Idx bkref_node, Idx bkref_str_idx);
134 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
135 const re_sub_match_top_t *sub_top,
136 re_sub_match_last_t *sub_last,
137 Idx bkref_node, Idx bkref_str);
138 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
139 Idx subexp_idx, int type);
140 static reg_errcode_t check_arrival (re_match_context_t *mctx,
141 state_array_t *path, Idx top_node,
142 Idx top_str, Idx last_node, Idx last_str,
144 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
146 re_node_set *cur_nodes,
147 re_node_set *next_nodes);
148 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
149 re_node_set *cur_nodes,
150 Idx ex_subexp, int type);
151 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
152 re_node_set *dst_nodes,
153 Idx target, Idx ex_subexp,
155 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
156 re_node_set *cur_nodes, Idx cur_str,
157 Idx subexp_num, int type);
158 static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
159 #ifdef RE_ENABLE_I18N
160 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
161 const re_string_t *input, Idx idx);
163 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
166 #endif /* RE_ENABLE_I18N */
167 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
168 const re_dfastate_t *state,
169 re_node_set *states_node,
170 bitset_t *states_ch);
171 static bool check_node_accept (const re_match_context_t *mctx,
172 const re_token_t *node, Idx idx);
173 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
175 /* Entry point for POSIX code. */
177 /* regexec searches for a given pattern, specified by PREG, in the
   string STRING.
180 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
181 'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
182 least NMATCH elements, and we set them to the offsets of the
183 corresponding matched substrings.
185 EFLAGS specifies "execution flags" which affect matching: if
186 REG_NOTBOL is set, then ^ does not match at the beginning of the
187 string; if REG_NOTEOL is set, then $ does not match at the end.
189 Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
190 EFLAGS is invalid. */
193 regexec (const regex_t *__restrict preg, const char *__restrict string,
194 size_t nmatch, regmatch_t pmatch[], int eflags)
198 re_dfa_t *dfa = preg->buffer;
/* Reject any execution flag outside the supported set.  */
200 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
/* With REG_STARTEND the caller supplies the search window in
   pmatch[0] (rm_so = start offset, rm_eo = end offset) instead of
   searching the whole NUL-terminated string.  */
203 if (eflags & REG_STARTEND)
205 start = pmatch[0].rm_so;
206 length = pmatch[0].rm_eo;
211 length = strlen (string);
/* Serialize access to the shared DFA object for the duration of the
   search.  */
214 lock_lock (dfa->lock);
/* One call passes nmatch == 0 / NULL pmatch, the other passes them
   through — presumably selected by a (not visible here) check of
   nmatch/REG_NOSUB; confirm against the full file.  */
216 err = re_search_internal (preg, string, length, start, length,
217 length, 0, NULL, eflags);
219 err = re_search_internal (preg, string, length, start, length,
220 length, nmatch, pmatch, eflags);
221 lock_unlock (dfa->lock);
/* Collapse the detailed error code to the documented 0 / nonzero
   contract (0 only for REG_NOERROR).  */
222 return err != REG_NOERROR;
226 libc_hidden_def (__regexec)
228 # include <shlib-compat.h>
229 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
231 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
232 __typeof__ (__regexec) __compat_regexec;
/* Binary-compatibility wrapper for binaries linked against glibc
   older than 2.3.4: it masks EFLAGS down to REG_NOTBOL|REG_NOTEOL
   before delegating, so bits that newer regexec would reject are
   silently ignored for old callers.  */
235 attribute_compat_text_section
236 __compat_regexec (const regex_t *__restrict preg,
237 const char *__restrict string, size_t nmatch,
238 regmatch_t pmatch[], int eflags)
240 return regexec (preg, string, nmatch, pmatch,
241 eflags & (REG_NOTBOL | REG_NOTEOL));
243 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
247 /* Entry points for GNU code. */
249 /* re_match, re_search, re_match_2, re_search_2
251 The former two functions operate on STRING with length LENGTH,
252 while the later two operate on concatenation of STRING1 and STRING2
253 with lengths LENGTH1 and LENGTH2, respectively.
255 re_match() matches the compiled pattern in BUFP against the string,
256 starting at index START.
258 re_search() first tries matching at index START, then it tries to match
259 starting from index START + 1, and so on. The last start position tried
260 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
263 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
264 the first STOP characters of the concatenation of the strings should be
267 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
268 and all groups is stored in REGS. (For the "_2" variants, the offsets are
269 computed relative to the concatenation, not relative to the individual
272 On success, re_match* functions return the length of the match, re_search*
273 return the position of the start of the match. They return -1 on
274 match failure, -2 on error. */
/* GNU entry point: anchored match of BUFP's pattern at index START of
   STRING (range 0 => only START is tried).  Per the contract above,
   returns the match length, -1 on no match, -2 on error.  */
277 re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
278 Idx start, struct re_registers *regs)
280 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
283 weak_alias (__re_match, re_match)
/* GNU entry point: search for BUFP's pattern starting at each index
   from START through START + RANGE.  Returns the match position, -1
   on no match, -2 on error.  (The ret_len argument on the trailing
   line of the call is not visible in this extract.)  */
287 re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
288 Idx start, regoff_t range, struct re_registers *regs)
290 return re_search_stub (bufp, string, length, start, range, length, regs,
294 weak_alias (__re_search, re_search)
/* Like re_match, but operates on the concatenation of STRING1 and
   STRING2; STOP bounds how far a match may extend into the
   concatenation.  ret_len == true => return the match length.  */
298 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
299 const char *string2, Idx length2, Idx start,
300 struct re_registers *regs, Idx stop)
302 return re_search_2_stub (bufp, string1, length1, string2, length2,
303 start, 0, regs, stop, true);
306 weak_alias (__re_match_2, re_match_2)
/* Like re_search, but operates on the concatenation of STRING1 and
   STRING2.  ret_len == false => return the match position.  */
310 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
311 const char *string2, Idx length2, Idx start, regoff_t range,
312 struct re_registers *regs, Idx stop)
314 return re_search_2_stub (bufp, string1, length1, string2, length2,
315 start, range, regs, stop, false);
318 weak_alias (__re_search_2, re_search_2)
/* Common implementation of re_match_2/re_search_2: glue the two input
   strings into one buffer and delegate to re_search_stub.
   NOTE(review): several lines (the len/s declarations, the free of
   the temporary buffer, the return) are missing from this extract.  */
322 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
323 Idx length1, const char *string2, Idx length2, Idx start,
324 regoff_t range, struct re_registers *regs,
325 Idx stop, bool ret_len)
/* Reject negative lengths/stop and guard length1 + length2 against
   signed overflow (INT_ADD_WRAPV reports wraparound).  */
332 if (__glibc_unlikely ((length1 < 0 || length2 < 0 || stop < 0
333 || INT_ADD_WRAPV (length1, length2, &len))))
336 /* Concatenate the strings. */
340 s = re_malloc (char, len);
342 if (__glibc_unlikely (s == NULL))
/* Two copy variants: the __mempcpy form and the plain two-memcpy
   form — presumably selected by a preprocessor conditional not
   visible here (e.g. _LIBC); both produce string1 followed by
   string2 in S.  */
345 memcpy (__mempcpy (s, string1, length1), string2, length2);
347 memcpy (s, string1, length1);
348 memcpy (s + length1, string2, length2);
357 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
363 /* The parameters have the same meaning as those of re_search.
364 Additional parameters:
365 If RET_LEN is true the length of the match is returned (re_match style);
366 otherwise the position of the match is returned. */
369 re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
370 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
373 reg_errcode_t result;
378 re_dfa_t *dfa = bufp->buffer;
379 Idx last_start = start + range;
381 /* Check for out-of-range. */
382 if (__glibc_unlikely (start < 0 || start > length))
/* Clamp LAST_START: the second test in each pair also catches
   overflow of start + range (wrapped sum inverts the ordering).  */
384 if (__glibc_unlikely (length < last_start
385 || (0 <= range && last_start < start)))
387 else if (__glibc_unlikely (last_start < 0
388 || (range < 0 && start <= last_start)))
391 lock_lock (dfa->lock);
/* Fold the buffer's sticky not_bol/not_eol flags into EFLAGS.  */
393 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
394 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
396 /* Compile fastmap if we haven't yet. */
397 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
398 re_compile_fastmap (bufp);
400 if (__glibc_unlikely (bufp->no_sub))
403 /* We need at least 1 register. */
/* REGS_FIXED with too few registers: fall back to the caller's
   fixed count rather than allocating more.  */
406 else if (__glibc_unlikely (bufp->regs_allocated == REGS_FIXED
407 && regs->num_regs <= bufp->re_nsub))
409 nregs = regs->num_regs;
410 if (__glibc_unlikely (nregs < 1))
412 /* Nothing can be copied to regs. */
/* Normal case: one slot for the whole match plus one per subexpr.  */
418 nregs = bufp->re_nsub + 1;
419 pmatch = re_malloc (regmatch_t, nregs);
420 if (__glibc_unlikely (pmatch == NULL))
426 result = re_search_internal (bufp, string, length, start, last_start, stop,
427 nregs, pmatch, eflags);
431 /* I hope we needn't fill their regs with -1's when no match was found. */
432 if (result != REG_NOERROR)
433 rval = result == REG_NOMATCH ? -1 : -2;
434 else if (regs != NULL)
436 /* If caller wants register contents data back, copy them. */
437 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
438 bufp->regs_allocated);
439 if (__glibc_unlikely (bufp->regs_allocated == REGS_UNALLOCATED))
/* On success return either the match length (re_match style) or
   the match start position, depending on ret_len.  */
443 if (__glibc_likely (rval == 0))
447 DEBUG_ASSERT (pmatch[0].rm_so == start);
448 rval = pmatch[0].rm_eo - start;
451 rval = pmatch[0].rm_so;
455 lock_unlock (dfa->lock);
/* Copy the NREGS entries of PMATCH into the caller-visible REGS
   structure, honoring the REGS_* allocation policy, and return the
   (possibly updated) allocation state.  Returns REGS_UNALLOCATED if
   an allocation fails.  */
460 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
463 int rval = REGS_REALLOCATE;
465 Idx need_regs = nregs + 1;
466 /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
469 /* Have the register data arrays been allocated? */
470 if (regs_allocated == REGS_UNALLOCATED)
471 { /* No. So allocate them with malloc. */
472 regs->start = re_malloc (regoff_t, need_regs);
473 if (__glibc_unlikely (regs->start == NULL))
474 return REGS_UNALLOCATED;
475 regs->end = re_malloc (regoff_t, need_regs);
476 if (__glibc_unlikely (regs->end == NULL))
478 re_free (regs->start)
479 return REGS_UNALLOCATED;
481 regs->num_regs = need_regs;
483 else if (regs_allocated == REGS_REALLOCATE)
484 { /* Yes. If we need more elements than were already
485 allocated, reallocate them. If we need fewer, just
487 if (__glibc_unlikely (need_regs > regs->num_regs))
489 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
491 if (__glibc_unlikely (new_start == NULL))
492 return REGS_UNALLOCATED;
493 new_end = re_realloc (regs->end, regoff_t, need_regs);
494 if (__glibc_unlikely (new_end == NULL))
497 return REGS_UNALLOCATED;
499 regs->start = new_start;
501 regs->num_regs = need_regs;
/* REGS_FIXED: the caller owns the arrays; we may only fill them.  */
506 DEBUG_ASSERT (regs_allocated == REGS_FIXED);
507 /* This function may not be called with REGS_FIXED and nregs too big. */
508 DEBUG_ASSERT (nregs <= regs->num_regs);
/* Copy the offsets, then terminate the remainder with -1 markers.  */
513 for (i = 0; i < nregs; ++i)
515 regs->start[i] = pmatch[i].rm_so;
516 regs->end[i] = pmatch[i].rm_eo;
518 for ( ; i < regs->num_regs; ++i)
519 regs->start[i] = regs->end[i] = -1;
524 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
525 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
526 this memory for recording register information. STARTS and ENDS
527 must be allocated using the malloc library routine, and must each
528 be at least NUM_REGS * sizeof (regoff_t) bytes long.
530 If NUM_REGS == 0, then subsequent matches should allocate their own
533 Unless this function is called, the first search or match using
534 PATTERN_BUFFER will allocate its own register data, without
535 freeing the old data. */
/* Install caller-provided register arrays (see the contract in the
   comment above).  num_regs != 0 switches BUFP to REGS_REALLOCATE
   with the given arrays; num_regs == 0 resets to REGS_UNALLOCATED
   and clears the array pointers.  */
538 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
539 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
543 bufp->regs_allocated = REGS_REALLOCATE;
544 regs->num_regs = num_regs;
545 regs->start = starts;
550 bufp->regs_allocated = REGS_UNALLOCATED;
552 regs->start = regs->end = NULL;
556 weak_alias (__re_set_registers, re_set_registers)
559 /* Entry points compatible with 4.2 BSD regex library. We don't define
560 them unless specifically requested. */
562 #if defined _REGEX_RE_COMP || defined _LIBC
/* 4.2 BSD compatibility: match S against the pattern previously
   compiled into the global re_comp_buf; returns 1 on match, 0
   otherwise.  */
567 re_exec (const char *s)
569 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
571 #endif /* _REGEX_RE_COMP */
573 /* Internal entry point. */
575 /* Searches for a compiled pattern PREG in the string STRING, whose
576 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
577 meaning as with regexec. LAST_START is START + RANGE, where
578 START and RANGE have the same meaning as with re_search.
579 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
580 otherwise return the error code.
581 Note: We assume front end functions already check ranges.
582 (0 <= LAST_START && LAST_START <= LENGTH) */
585 __attribute_warn_unused_result__
586 re_search_internal (const regex_t *preg, const char *string, Idx length,
587 Idx start, Idx last_start, Idx stop, size_t nmatch,
588 regmatch_t pmatch[], int eflags)
591 const re_dfa_t *dfa = preg->buffer;
592 Idx left_lim, right_lim;
594 bool fl_longest_match;
601 re_match_context_t mctx = { .dfa = dfa };
/* Use the fastmap only when it is accurate, there is an actual range
   to scan, and the pattern cannot match the empty string.  */
602 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
603 && start != last_start && !preg->can_be_null)
604 ? preg->fastmap : NULL);
605 RE_TRANSLATE_TYPE t = preg->translate;
/* Registers beyond re_nsub+1 cannot correspond to subexpressions;
   split them off and fill them with -1 at the end.  */
607 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
608 nmatch -= extra_nmatch;
610 /* Check if the DFA haven't been compiled. */
611 if (__glibc_unlikely (preg->used == 0 || dfa->init_state == NULL
612 || dfa->init_state_word == NULL
613 || dfa->init_state_nl == NULL
614 || dfa->init_state_begbuf == NULL))
617 /* We assume front-end functions already check them. */
618 DEBUG_ASSERT (0 <= last_start && last_start <= length);
620 /* If initial states with non-begbuf contexts have no elements,
621 the regex must be anchored. If preg->newline_anchor is set,
622 we'll never use init_state_nl, so do not check it. */
623 if (dfa->init_state->nodes.nelem == 0
624 && dfa->init_state_word->nodes.nelem == 0
625 && (dfa->init_state_nl->nodes.nelem == 0
626 || !preg->newline_anchor))
/* Anchored pattern: only position 0 can match.  */
628 if (start != 0 && last_start != 0)
630 start = last_start = 0;
633 /* We must check the longest matching, if nmatch > 0. */
634 fl_longest_match = (nmatch != 0 || dfa->nbackref);
636 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
637 preg->translate, (preg->syntax & RE_ICASE) != 0,
639 if (__glibc_unlikely (err != REG_NOERROR))
641 mctx.input.stop = stop;
642 mctx.input.raw_stop = stop;
643 mctx.input.newline_anchor = preg->newline_anchor;
645 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
646 if (__glibc_unlikely (err != REG_NOERROR))
649 /* We will log all the DFA states through which the dfa pass,
650 if nmatch > 1, or this dfa has "multibyte node", which is a
651 back-reference or a node which can accept multibyte character or
652 multi character collating element. */
653 if (nmatch > 1 || dfa->has_mb_node)
655 /* Avoid overflow. */
656 if (__glibc_unlikely ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
657 <= mctx.input.bufs_len)))
663 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
664 if (__glibc_unlikely (mctx.state_log == NULL))
/* With REG_NOTBOL, position 0 is begin-of-buffer but not a "line
   start" for ^ purposes.  */
672 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
673 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
675 /* Check incrementally whether the input string matches. */
/* A negative RANGE (last_start < start) means scan backwards.  */
676 incr = (last_start < start) ? -1 : 1;
677 left_lim = (last_start < start) ? last_start : start;
678 right_lim = (last_start < start) ? start : last_start;
679 sb = dfa->mb_cur_max == 1;
/* Bit-encode the scan strategy: bit2 = single-byte/no-translation
   fast path usable, bit1 = forward scan, bit0 = translation table
   present; used by the switch mentioned below (cases not all
   visible in this extract).  */
682 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
683 | (start <= last_start ? 2 : 0)
684 | (t != NULL ? 1 : 0))
687 for (;; match_first += incr)
690 if (match_first < left_lim || right_lim < match_first)
693 /* Advance as rapidly as possible through the string, until we
694 find a plausible place to start matching. This may be done
695 with varying efficiency, so there are various possibilities:
696 only the most common of them are specialized, in order to
697 save on code size. We use a switch statement for speed. */
705 /* Fastmap with single-byte translation, match forward. */
706 while (__glibc_likely (match_first < right_lim)
707 && !fastmap[t[(unsigned char) string[match_first]]])
709 goto forward_match_found_start_or_reached_end;
712 /* Fastmap without translation, match forward. */
713 while (__glibc_likely (match_first < right_lim)
714 && !fastmap[(unsigned char) string[match_first]])
717 forward_match_found_start_or_reached_end:
718 if (__glibc_unlikely (match_first == right_lim))
720 ch = match_first >= length
721 ? 0 : (unsigned char) string[match_first];
722 if (!fastmap[t ? t[ch] : ch])
729 /* Fastmap without multi-byte translation, match backwards. */
730 while (match_first >= left_lim)
732 ch = match_first >= length
733 ? 0 : (unsigned char) string[match_first];
734 if (fastmap[t ? t[ch] : ch])
738 if (match_first < left_lim)
743 /* In this case, we can't determine easily the current byte,
744 since it might be a component byte of a multibyte
745 character. Then we use the constructed buffer instead. */
748 /* If MATCH_FIRST is out of the valid range, reconstruct the
750 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
751 if (__glibc_unlikely (offset
752 >= (__re_size_t) mctx.input.valid_raw_len))
754 err = re_string_reconstruct (&mctx.input, match_first,
756 if (__glibc_unlikely (err != REG_NOERROR))
759 offset = match_first - mctx.input.raw_mbs_idx;
761 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
762 Note that MATCH_FIRST must not be smaller than 0. */
763 ch = (match_first >= length
764 ? 0 : re_string_byte_at (&mctx.input, offset));
768 if (match_first < left_lim || match_first > right_lim)
777 /* Reconstruct the buffers so that the matcher can assume that
778 the matching starts from the beginning of the buffer. */
779 err = re_string_reconstruct (&mctx.input, match_first, eflags);
780 if (__glibc_unlikely (err != REG_NOERROR))
783 #ifdef RE_ENABLE_I18N
784 /* Don't consider this char as a possible match start if it part,
785 yet isn't the head, of a multibyte character. */
786 if (!sb && !re_string_first_byte (&mctx.input, 0))
790 /* It seems to be appropriate one, then use the matcher. */
791 /* We assume that the matching starts from 0. */
792 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
793 match_last = check_matching (&mctx, fl_longest_match,
794 start <= last_start ? &match_first : NULL);
795 if (match_last != -1)
797 if (__glibc_unlikely (match_last == -2))
804 mctx.match_last = match_last;
/* Subexpression or back-reference bookkeeping needed: find the
   halting node and possibly prune impossible paths.  */
805 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
807 re_dfastate_t *pstate = mctx.state_log[match_last];
808 mctx.last_node = check_halt_state_context (&mctx, pstate,
811 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
814 err = prune_impossible_nodes (&mctx);
815 if (err == REG_NOERROR)
817 if (__glibc_unlikely (err != REG_NOMATCH))
822 break; /* We found a match. */
/* No match at this start position: reset the context and try the
   next candidate.  */
826 match_ctx_clean (&mctx);
829 DEBUG_ASSERT (match_last != -1);
830 DEBUG_ASSERT (err == REG_NOERROR);
832 /* Set pmatch[] if we need. */
837 /* Initialize registers. */
838 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
839 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
841 /* Set the points where matching start/end. */
843 pmatch[0].rm_eo = mctx.match_last;
844 /* FIXME: This function should fail if mctx.match_last exceeds
845 the maximum possible regoff_t value. We need a new error
846 code REG_OVERFLOW. */
848 if (!preg->no_sub && nmatch > 1)
850 err = set_regs (preg, &mctx, nmatch, pmatch,
851 dfa->has_plural_match && dfa->nbackref > 0);
852 if (__glibc_unlikely (err != REG_NOERROR))
856 /* At last, add the offset to each register, since we slid
857 the buffers so that we could assume that the matching starts
859 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
860 if (pmatch[reg_idx].rm_so != -1)
862 #ifdef RE_ENABLE_I18N
/* Multibyte case: map offsets in the converted buffer back to raw
   byte offsets via the offsets table.  */
863 if (__glibc_unlikely (mctx.input.offsets_needed != 0))
865 pmatch[reg_idx].rm_so =
866 (pmatch[reg_idx].rm_so == mctx.input.valid_len
867 ? mctx.input.valid_raw_len
868 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
869 pmatch[reg_idx].rm_eo =
870 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
871 ? mctx.input.valid_raw_len
872 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
875 DEBUG_ASSERT (mctx.input.offsets_needed == 0);
877 pmatch[reg_idx].rm_so += match_first;
878 pmatch[reg_idx].rm_eo += match_first;
/* Registers beyond the pattern's subexpressions are unset.  */
880 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
882 pmatch[nmatch + reg_idx].rm_so = -1;
883 pmatch[nmatch + reg_idx].rm_eo = -1;
/* Equivalent subexpressions (subexp_map) copy their canonical
   sibling's offsets.  */
887 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
888 if (dfa->subexp_map[reg_idx] != reg_idx)
890 pmatch[reg_idx + 1].rm_so
891 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
892 pmatch[reg_idx + 1].rm_eo
893 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
/* Cleanup path (labels not visible in this extract).  */
898 re_free (mctx.state_log);
900 match_ctx_free (&mctx);
901 re_string_destruct (&mctx.input);
/* Re-run the match backwards ("sifting") to discard NFA nodes that
   cannot actually participate in a valid match — needed when the
   pattern has back-references or plural matches.  On success the
   pruned state log replaces mctx->state_log and last_node/match_last
   are updated; returns REG_NOMATCH if pruning removes all paths.  */
906 __attribute_warn_unused_result__
907 prune_impossible_nodes (re_match_context_t *mctx)
909 const re_dfa_t *const dfa = mctx->dfa;
910 Idx halt_node, match_last;
912 re_dfastate_t **sifted_states;
913 re_dfastate_t **lim_states = NULL;
914 re_sift_context_t sctx;
915 DEBUG_ASSERT (mctx->state_log != NULL);
916 match_last = mctx->match_last;
917 halt_node = mctx->last_node;
919 /* Avoid overflow. */
920 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
924 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
925 if (__glibc_unlikely (sifted_states == NULL))
/* lim_states is only needed on the back-reference path (the guard
   is not visible in this extract).  */
932 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
933 if (__glibc_unlikely (lim_states == NULL))
/* Loop: sift backwards from the current halt position; if position
   0 no longer survives, shorten MATCH_LAST and retry.  */
940 memset (lim_states, '\0',
941 sizeof (re_dfastate_t *) * (match_last + 1));
942 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
944 ret = sift_states_backward (mctx, &sctx);
945 re_node_set_free (&sctx.limits);
946 if (__glibc_unlikely (ret != REG_NOERROR))
948 if (sifted_states[0] != NULL || lim_states[0] != NULL)
958 } while (mctx->state_log[match_last] == NULL
959 || !mctx->state_log[match_last]->halt);
960 halt_node = check_halt_state_context (mctx,
961 mctx->state_log[match_last],
964 ret = merge_state_array (dfa, sifted_states, lim_states,
966 re_free (lim_states);
968 if (__glibc_unlikely (ret != REG_NOERROR))
/* Non-back-reference path: a single backward sift suffices.  */
973 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
974 ret = sift_states_backward (mctx, &sctx);
975 re_node_set_free (&sctx.limits);
976 if (__glibc_unlikely (ret != REG_NOERROR))
978 if (sifted_states[0] == NULL)
/* Commit: the sifted log becomes the authoritative state log.  */
984 re_free (mctx->state_log);
985 mctx->state_log = sifted_states;
986 sifted_states = NULL;
987 mctx->last_node = halt_node;
988 mctx->match_last = match_last;
/* Error/cleanup path.  */
991 re_free (sifted_states);
992 re_free (lim_states);
996 /* Acquire an initial state and return it.
997 We must select appropriate initial state depending on the context,
998 since initial states may have constraints like "\<", "^", etc.. */
1000 static inline re_dfastate_t *
1001 __attribute__ ((always_inline))
1002 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1005 const re_dfa_t *const dfa = mctx->dfa;
1006 if (dfa->init_state->has_constraint)
1008 unsigned int context;
/* Classify the character just before IDX to pick the precomputed
   initial state matching that context.  */
1009 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1010 if (IS_WORD_CONTEXT (context))
1011 return dfa->init_state_word;
1012 else if (IS_ORDINARY_CONTEXT (context))
1013 return dfa->init_state;
1014 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1015 return dfa->init_state_begbuf;
1016 else if (IS_NEWLINE_CONTEXT (context))
1017 return dfa->init_state_nl;
1018 else if (IS_BEGBUF_CONTEXT (context))
1020 /* It is relatively rare case, then calculate on demand. */
1021 return re_acquire_state_context (err, dfa,
1022 dfa->init_state->entrance_nodes,
1026 /* Must not happen? */
1027 return dfa->init_state;
/* Unconstrained initial state: context does not matter.  */
1030 return dfa->init_state;
1033 /* Check whether the regular expression match input string INPUT or not,
1034 and return the index where the matching end. Return -1 if
1035 there is no match, and return -2 in case of an error.
1036 FL_LONGEST_MATCH means we want the POSIX longest matching.
1037 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1038 next place where we may want to try matching.
1039 Note that the matcher assumes that the matching starts from the current
1040 index of the buffer. */
1043 __attribute_warn_unused_result__
1044 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1047 const re_dfa_t *const dfa = mctx->dfa;
1050 Idx match_last = -1;
1051 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1052 re_dfastate_t *cur_state;
1053 bool at_init_state = p_match_first != NULL;
1054 Idx next_start_idx = cur_str_idx;
1057 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1058 /* An initial state must not be NULL (invalid). */
1059 if (__glibc_unlikely (cur_state == NULL))
1061 DEBUG_ASSERT (err == REG_ESPACE);
1065 if (mctx->state_log != NULL)
1067 mctx->state_log[cur_str_idx] = cur_state;
1069 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1070 later. E.g. Processing back references. */
1071 if (__glibc_unlikely (dfa->nbackref))
1073 at_init_state = false;
1074 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1075 if (__glibc_unlikely (err != REG_NOERROR))
1078 if (cur_state->has_backref)
1080 err = transit_state_bkref (mctx, &cur_state->nodes);
1081 if (__glibc_unlikely (err != REG_NOERROR))
1087 /* If the RE accepts NULL string. */
1088 if (__glibc_unlikely (cur_state->halt))
1090 if (!cur_state->has_constraint
1091 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1093 if (!fl_longest_match)
1097 match_last = cur_str_idx;
/* Main DFA loop: consume one character per iteration until end of
   input or an unrecoverable dead state.  */
1103 while (!re_string_eoi (&mctx->input))
1105 re_dfastate_t *old_state = cur_state;
1106 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
/* Grow the input/state-log buffers before they would overflow.  */
1108 if ((__glibc_unlikely (next_char_idx >= mctx->input.bufs_len)
1109 && mctx->input.bufs_len < mctx->input.len)
1110 || (__glibc_unlikely (next_char_idx >= mctx->input.valid_len)
1111 && mctx->input.valid_len < mctx->input.len))
1113 err = extend_buffers (mctx, next_char_idx + 1);
1114 if (__glibc_unlikely (err != REG_NOERROR))
1116 DEBUG_ASSERT (err == REG_ESPACE);
1121 cur_state = transit_state (&err, mctx, cur_state);
1122 if (mctx->state_log != NULL)
1123 cur_state = merge_state_with_log (&err, mctx, cur_state);
1125 if (cur_state == NULL)
1127 /* Reached the invalid state or an error. Try to recover a valid
1128 state using the state log, if available and if we have not
1129 already found a valid (even if not the longest) match. */
1130 if (__glibc_unlikely (err != REG_NOERROR))
1133 if (mctx->state_log == NULL
1134 || (match && !fl_longest_match)
1135 || (cur_state = find_recover_state (&err, mctx)) == NULL)
/* While still effectively at the initial state, remember how far a
   failed match could safely skip ahead (reported via
   *p_match_first below).  */
1139 if (__glibc_unlikely (at_init_state))
1141 if (old_state == cur_state)
1142 next_start_idx = next_char_idx;
1144 at_init_state = false;
1147 if (cur_state->halt)
1149 /* Reached a halt state.
1150 Check the halt state can satisfy the current context. */
1151 if (!cur_state->has_constraint
1152 || check_halt_state_context (mctx, cur_state,
1153 re_string_cur_idx (&mctx->input)))
1155 /* We found an appropriate halt state. */
1156 match_last = re_string_cur_idx (&mctx->input);
1159 /* We found a match, do not modify match_first below. */
1160 p_match_first = NULL;
1161 if (!fl_longest_match)
/* On failure, tell the caller the next plausible start index.  */
1168 *p_match_first += next_start_idx;
1173 /* Check NODE match the current context.  */
/* Return nonzero iff NODE is an END_OF_RE node whose context constraint
   (anchor / word-boundary, if any) is satisfied by CONTEXT; only such a
   node may terminate a match.  */
1176 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1178   re_token_type_t type = dfa->nodes[node].type;
1179   unsigned int constraint = dfa->nodes[node].constraint;
  /* Only an END_OF_RE node can be a halt node.  */
1180   if (type != END_OF_RE)
  /* Reject the node if its constraint is not satisfied here.  */
1184   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1189 /* Check the halt state STATE match the current context.
1190    Return 0 if not match, if the node, STATE has, is a halt node and
1191    match the context, return the node.  */
/* Scans all nodes of the (asserted) halt state and returns the first node
   that passes check_halt_node_context at string index IDX.  */
1194 check_halt_state_context (const re_match_context_t *mctx,
1195 const re_dfastate_t *state, Idx idx)
1198   unsigned int context;
1199   DEBUG_ASSERT (state->halt);
  /* Compute the word/newline/buffer-boundary context at position IDX.  */
1200   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1201   for (i = 0; i < state->nodes.nelem; ++i)
1202     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1203       return state->nodes.elems[i];
1207 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1208    corresponding to the DFA).
1209    Return the destination node, and update EPS_VIA_NODES;
1210    return -1 on match failure, -2 on error.  */
/* Used while re-tracing a found match to recover subexpression registers.
   REGS/PREVREGS hold the current and previous register snapshots; FS, when
   non-NULL, is the fail stack used for backtracking over ambiguous
   epsilon transitions and back references.  */
1213 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1214 regmatch_t *prevregs,
1215 Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1216 struct re_fail_stack_t *fs)
1218   const re_dfa_t *const dfa = mctx->dfa;
  /* Case 1: epsilon node -- choose among its epsilon destinations without
     consuming input.  */
1219   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1221       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1222       re_node_set *edests = &dfa->edests[node];
  /* Remember that NODE was traversed via an epsilon transition, so the
     same node is not revisited at this string position.  */
1223       bool ok = re_node_set_insert (eps_via_nodes, node);
1224       if (__glibc_unlikely (! ok))
1227       /* Pick a valid destination, or return -1 if none is found.  */
1229       for (Idx i = 0; i < edests->nelem; i++)
1231           Idx candidate = edests->elems[i];
  /* Only destinations present in the state log at *PIDX are viable.  */
1232           if (!re_node_set_contains (cur_nodes, candidate))
1234           if (dest_node == -1)
1235             dest_node = candidate;
1239           /* In order to avoid infinite loop like "(a*)*", return the second
1240              epsilon-transition if the first was already considered.  */
1241           if (re_node_set_contains (eps_via_nodes, dest_node))
1244           /* Otherwise, push the second epsilon-transition on the fail stack.  */
1246               && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1247                                   prevregs, eps_via_nodes))
1250           /* We know we are going to exit.  */
  /* Case 2: consuming node (character, multibyte sequence, or backref).  */
1259       re_token_type_t type = dfa->nodes[node].type;
1261 #ifdef RE_ENABLE_I18N
1262       if (dfa->nodes[node].accept_mb)
1263         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1265 #endif /* RE_ENABLE_I18N */
1266       if (type == OP_BACK_REF)
  /* Back reference: the matched length is the extent of the referenced
     group as currently recorded in REGS.  */
1268           Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1269           if (subexp_idx < nregs)
1270             naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
  /* An unset or out-of-range group cannot satisfy the back reference.  */
1273               if (subexp_idx >= nregs
1274                   || regs[subexp_idx].rm_so == -1
1275                   || regs[subexp_idx].rm_eo == -1)
  /* Verify the input actually repeats the group's text at *PIDX.  */
1279                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1280                   if (mctx->input.valid_len - *pidx < naccepted
1281                       || (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
  /* An empty back-reference match behaves like an epsilon transition.  */
1291               bool ok = re_node_set_insert (eps_via_nodes, node);
1292               if (__glibc_unlikely (! ok))
1294               dest_node = dfa->edests[node].elems[0];
1295               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1302           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1304           Idx dest_node = dfa->nexts[node];
  /* Advance the string index by the number of bytes consumed.  */
1305           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1306           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1307                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
  /* Input was consumed: epsilon history restarts at the new position.  */
1310           re_node_set_empty (eps_via_nodes);
/* Push one backtracking entry onto FS: the string index, the alternative
   destination node, copies of both register arrays (REGS then PREVREGS,
   stored contiguously), and a copy of EPS_VIA_NODES.  Grows the stack as
   needed.  */
1317 static reg_errcode_t
1318 __attribute_warn_unused_result__
1319 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1320 Idx nregs, regmatch_t *regs, regmatch_t *prevregs,
1321 re_node_set *eps_via_nodes)
1324   Idx num = fs->num++;
1325   if (fs->num == fs->alloc)
1327       struct re_fail_stack_ent_t *new_array;
1328       new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1330       if (new_array == NULL)
1333       fs->stack = new_array;
1335   fs->stack[num].idx = str_idx;
1336   fs->stack[num].node = dest_node;
  /* One allocation holds both snapshots: [0,nregs) = regs,
     [nregs,2*nregs) = prevregs.  */
1337   fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
1338   if (fs->stack[num].regs == NULL)
1340   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1341   memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
1342   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
/* Pop the most recent entry from FS, restoring *PIDX, both register
   arrays, and *EPS_VIA_NODES (ownership of the stored node set is
   transferred to the caller).  Returns the saved destination node.  */
1347 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1348 regmatch_t *regs, regmatch_t *prevregs,
1349 re_node_set *eps_via_nodes)
1351   if (fs == NULL || fs->num == 0)
1353   Idx num = --fs->num;
1354   *pidx = fs->stack[num].idx;
  /* The two snapshots were stored back-to-back by push_fail_stack.  */
1355   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1356   memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
  /* Replace the caller's eps set with the saved one (free the old set
     first to avoid leaking it).  */
1357   re_node_set_free (eps_via_nodes);
1358   re_free (fs->stack[num].regs);
1359   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1360   DEBUG_ASSERT (0 <= fs->stack[num].node);
1361   return fs->stack[num].node;
1365 #define DYNARRAY_STRUCT regmatch_list
1366 #define DYNARRAY_ELEMENT regmatch_t
1367 #define DYNARRAY_PREFIX regmatch_list_
1368 #include <malloc/dynarray-skeleton.c>
1370 /* Set the positions where the subexpressions are starts/ends to registers
1372    Note: We assume that pmatch[0] is already set, and
1373    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
/* Re-traces the found match from pmatch[0].rm_so to pmatch[0].rm_eo,
   filling pmatch[1..nmatch-1].  When FL_BACKTRACK is set a fail stack is
   used so that ambiguous paths can be retried.  */
1375 static reg_errcode_t
1376 __attribute_warn_unused_result__
1377 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1378 regmatch_t *pmatch, bool fl_backtrack)
1380   const re_dfa_t *dfa = preg->buffer;
1382   re_node_set eps_via_nodes;
1383   struct re_fail_stack_t *fs;
  /* Stack-allocated fail-stack header; its entry array is heap-allocated
     below with an initial capacity of 2.  */
1384   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1385   struct regmatch_list prev_match;
1386   regmatch_list_init (&prev_match);
1388   DEBUG_ASSERT (nmatch > 1);
1389   DEBUG_ASSERT (mctx->state_log != NULL);
1393       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1394       if (fs->stack == NULL)
1400   cur_node = dfa->init_node;
1401   re_node_set_init_empty (&eps_via_nodes);
  /* PREV_MATCH holds the register snapshot of the previous position, used
     by update_regs to undo empty optional-subexpression matches.  */
1403   if (!regmatch_list_resize (&prev_match, nmatch))
1405       regmatch_list_free (&prev_match);
1406       free_fail_stack_return (fs);
1409   regmatch_t *prev_idx_match = regmatch_list_begin (&prev_match);
1410   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
  /* Walk the match, one node at a time, from start to end offset.  */
1412   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1414       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1416       if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1417           || re_node_set_contains (&eps_via_nodes, cur_node))
  /* Reached the end: accept only if every opened group was closed;
     otherwise backtrack via the fail stack.  */
1423           for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1424             if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1426               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1427                                          prev_idx_match, &eps_via_nodes);
1433               re_node_set_free (&eps_via_nodes);
1434               regmatch_list_free (&prev_match);
1435               return free_fail_stack_return (fs);
1439       /* Proceed to next node.  */
1440       cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
1442                                     &eps_via_nodes, fs);
  /* Negative result: -2 is a hard error, -1 means try to backtrack.  */
1444       if (__glibc_unlikely (cur_node < 0))
1446           if (__glibc_unlikely (cur_node == -2))
1448               re_node_set_free (&eps_via_nodes);
1449               regmatch_list_free (&prev_match);
1450               free_fail_stack_return (fs);
1453           cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1454                                      prev_idx_match, &eps_via_nodes);
1457               re_node_set_free (&eps_via_nodes);
1458               regmatch_list_free (&prev_match);
1459               free_fail_stack_return (fs);
1464   re_node_set_free (&eps_via_nodes);
1465   regmatch_list_free (&prev_match);
1466   return free_fail_stack_return (fs);
/* Release every entry of the fail stack FS (the per-entry node set and
   register snapshot) and then the entry array itself.  */
1469 static reg_errcode_t
1470 free_fail_stack_return (struct re_fail_stack_t *fs)
1475       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1477           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1478           re_free (fs->stack[fs_idx].regs);
1480       re_free (fs->stack);
/* Record subexpression boundaries while re-tracing the match: an
   OP_OPEN_SUBEXP node at CUR_IDX opens group (opr.idx + 1); an
   OP_CLOSE_SUBEXP node closes it.  PREV_IDX_MATCH is the snapshot used to
   undo empty matches of optional subexpressions.  */
1486 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1487 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1489   int type = dfa->nodes[cur_node].type;
1490   if (type == OP_OPEN_SUBEXP)
1492       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1494       /* We are at the first node of this sub expression.  */
1495       if (reg_num < nmatch)
1497           pmatch[reg_num].rm_so = cur_idx;
  /* Mark the group as open-but-unclosed until its close node is seen.  */
1498           pmatch[reg_num].rm_eo = -1;
1501   else if (type == OP_CLOSE_SUBEXP)
1503       /* We are at the last node of this sub expression.  */
1504       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1505       if (reg_num < nmatch)
1507           if (pmatch[reg_num].rm_so < cur_idx)
1509               pmatch[reg_num].rm_eo = cur_idx;
1510               /* This is a non-empty match or we are not inside an optional
1511                  subexpression.  Accept this right away.  */
1512               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1516               if (dfa->nodes[cur_node].opt_subexp
1517                   && prev_idx_match[reg_num].rm_so != -1)
1518                 /* We transited through an empty match for an optional
1519                    subexpression, like (a?)*, and this is not the subexp's
1520                    first match.  Copy back the old content of the registers
1521                    so that matches of an inner subexpression are undone as
1522                    well, like in ((a?))*.  */
1523                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1525                 /* We completed a subexpression, but it may be part of
1526                    an optional one, so do not update PREV_IDX_MATCH.  */
1527                 pmatch[reg_num].rm_eo = cur_idx;
1533 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1534    and sift the nodes in each states according to the following rules.
     Updated state_log will be written to STATE_LOG.
1537    Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1538      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1539         If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1540         the LAST_NODE, we throw away the node 'a'.
1541      2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1542         string 's' and transit to 'b':
1543         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1545         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1546            thrown away, we throw away the node 'a'.
1547      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1548         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1550         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1551            we throw away the node 'a'.  */
1553 #define STATE_NODE_CONTAINS(state,node) \
1554   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1556 static reg_errcode_t
1557 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1561   Idx str_idx = sctx->last_str_idx;
1562   re_node_set cur_dest;
1564   DEBUG_ASSERT (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1566   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1567      transit to the last_node and the last_node itself.  */
1568   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1569   if (__glibc_unlikely (err != REG_NOERROR))
1571   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1572   if (__glibc_unlikely (err != REG_NOERROR))
  /* Then check each state in the state_log, walking backward.  */
1578       /* Update counters.  */
  /* NULL_CNT counts consecutive empty sifted states; once it exceeds the
     longest possible multibyte element no node can bridge the gap, so the
     remaining prefix of the sifted log can be cleared wholesale.  */
1579       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1580       if (null_cnt > mctx->max_mb_elem_len)
1582           memset (sctx->sifted_states, '\0',
1583                   sizeof (re_dfastate_t *) * str_idx);
1584           re_node_set_free (&cur_dest);
1587       re_node_set_empty (&cur_dest);
1590       if (mctx->state_log[str_idx])
1592           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1593           if (__glibc_unlikely (err != REG_NOERROR))
1597       /* Add all the nodes which satisfy the following conditions:
1598          - It can epsilon transit to a node in CUR_DEST.
1600          And update state_log.  */
1601       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1602       if (__glibc_unlikely (err != REG_NOERROR))
1607   re_node_set_free (&cur_dest);
/* Keep in CUR_DEST only the non-epsilon nodes of state_log[STR_IDX] whose
   successor survives in the already-sifted state at the position they
   transit to (see the sifting rules above sift_states_backward).  */
1611 static reg_errcode_t
1612 __attribute_warn_unused_result__
1613 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1614 Idx str_idx, re_node_set *cur_dest)
1616   const re_dfa_t *const dfa = mctx->dfa;
1617   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1620   /* Then build the next sifted state.
1621      We build the next sifted state on 'cur_dest', and update
1622      'sifted_states[str_idx]' with 'cur_dest'.
1624      'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1625      'cur_src' points the node_set of the old 'state_log[str_idx]'
1626      (with the epsilon nodes pre-filtered out).  */
1627   for (i = 0; i < cur_src->nelem; i++)
1629       Idx prev_node = cur_src->elems[i];
1632       DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[prev_node].type));
1634 #ifdef RE_ENABLE_I18N
1635       /* If the node may accept "multi byte".  */
1636       if (dfa->nodes[prev_node].accept_mb)
1637         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1638                                          str_idx, sctx->last_str_idx);
1639 #endif /* RE_ENABLE_I18N */
1641       /* We don't check backreferences here.
1642          See update_cur_sifted_state().  */
1644           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1645           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1646                                   dfa->nexts[prev_node]))
  /* When back-reference limits are active, additionally reject nodes
     whose transition would violate a subexpression limitation.  */
1652       if (sctx->limits.nelem)
1654           Idx to_idx = str_idx + naccepted;
1655           if (check_dst_limits (mctx, &sctx->limits,
1656                                 dfa->nexts[prev_node], to_idx,
1657                                 prev_node, str_idx))
1660       ok = re_node_set_insert (cur_dest, prev_node);
1661       if (__glibc_unlikely (! ok))
1668 /* Helper functions.  */
/* Ensure the input buffers and the state log cover NEXT_STATE_LOG_IDX:
   extend the translated input if needed and zero-fill any state-log slots
   between the current top and the new index.  */
1670 static reg_errcode_t
1671 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1673   Idx top = mctx->state_log_top;
1675   if ((next_state_log_idx >= mctx->input.bufs_len
1676        && mctx->input.bufs_len < mctx->input.len)
1677       || (next_state_log_idx >= mctx->input.valid_len
1678           && mctx->input.valid_len < mctx->input.len))
1681       err = extend_buffers (mctx, next_state_log_idx + 1);
1682       if (__glibc_unlikely (err != REG_NOERROR))
1686   if (top < next_state_log_idx)
  /* Clear the not-yet-used slots so stale pointers are never read.  */
1688       memset (mctx->state_log + top + 1, '\0',
1689               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1690       mctx->state_log_top = next_state_log_idx;
/* Merge NUM states from SRC into DST element-wise: an empty DST slot takes
   the SRC state; when both are set, DST becomes the state whose node set
   is the union of the two.  */
1695 static reg_errcode_t
1696 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1697 re_dfastate_t **src, Idx num)
1701   for (st_idx = 0; st_idx < num; ++st_idx)
1703       if (dst[st_idx] == NULL)
1704         dst[st_idx] = src[st_idx];
1705       else if (src[st_idx] != NULL)
1707           re_node_set merged_set;
1708           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1709                                         &src[st_idx]->nodes);
1710           if (__glibc_unlikely (err != REG_NOERROR))
  /* States are interned: acquire (or create) the canonical state for
     the merged node set, then drop the temporary set.  */
1712           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1713           re_node_set_free (&merged_set);
1714           if (__glibc_unlikely (err != REG_NOERROR))
/* Finalize the sifted state for STR_IDX: extend DEST_NODES with epsilon
   predecessors, enforce subexpression limits, intern the resulting state
   into sctx->sifted_states, and handle back references if present.  */
1721 static reg_errcode_t
1722 update_cur_sifted_state (const re_match_context_t *mctx,
1723 re_sift_context_t *sctx, Idx str_idx,
1724 re_node_set *dest_nodes)
1726   const re_dfa_t *const dfa = mctx->dfa;
1727   reg_errcode_t err = REG_NOERROR;
1728   const re_node_set *candidates;
1729   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1730                 : &mctx->state_log[str_idx]->nodes);
1732   if (dest_nodes->nelem == 0)
1733     sctx->sifted_states[str_idx] = NULL;
1738       /* At first, add the nodes which can epsilon transit to a node in
1740       err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1741       if (__glibc_unlikely (err != REG_NOERROR))
1744       /* Then, check the limitations in the current sift_context.  */
1745       if (sctx->limits.nelem)
1747           err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1748                                      mctx->bkref_ents, str_idx);
1749           if (__glibc_unlikely (err != REG_NOERROR))
1754       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1755       if (__glibc_unlikely (err != REG_NOERROR))
  /* Back references are not sifted by build_sifted_states; do them here.  */
1759   if (candidates && mctx->state_log[str_idx]->has_backref)
1761       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1762       if (__glibc_unlikely (err != REG_NOERROR))
/* Add to DEST_NODES those CANDIDATES that can reach a member of
   DEST_NODES via epsilon transitions.  The inverse epsilon closure of the
   interned state is computed lazily and cached on the state.  */
1768 static reg_errcode_t
1769 __attribute_warn_unused_result__
1770 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1771 const re_node_set *candidates)
1773   reg_errcode_t err = REG_NOERROR;
1776   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1777   if (__glibc_unlikely (err != REG_NOERROR))
  /* First use of this state: build its cached inverse eclosure as the
     union of the inverse eclosures of all its nodes.  */
1780   if (!state->inveclosure.alloc)
1782       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1783       if (__glibc_unlikely (err != REG_NOERROR))
1785       for (i = 0; i < dest_nodes->nelem; i++)
1787           err = re_node_set_merge (&state->inveclosure,
1788                                    dfa->inveclosures + dest_nodes->elems[i]);
1789           if (__glibc_unlikely (err != REG_NOERROR))
1793   return re_node_set_add_intersect (dest_nodes, candidates,
1794                                     &state->inveclosure);
/* Remove from DEST_NODES the nodes that can only reach NODE via epsilon
   transitions.  Nodes that also reach some other member of DEST_NODES
   (outside NODE's inverse eclosure) are collected in EXCEPT_NODES and
   kept.  */
1797 static reg_errcode_t
1798 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1799 const re_node_set *candidates)
1803   re_node_set *inv_eclosure = dfa->inveclosures + node;
1804   re_node_set except_nodes;
1805   re_node_set_init_empty (&except_nodes);
  /* Pass 1: find candidates that have an epsilon path to a surviving
     destination other than NODE; they must not be removed.  */
1806   for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1808       Idx cur_node = inv_eclosure->elems[ecl_idx];
1809       if (cur_node == node)
1811       if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1813           Idx edst1 = dfa->edests[cur_node].elems[0];
1814           Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1815                        ? dfa->edests[cur_node].elems[1] : -1);
1816           if ((!re_node_set_contains (inv_eclosure, edst1)
1817                && re_node_set_contains (dest_nodes, edst1))
1819                   && !re_node_set_contains (inv_eclosure, edst2)
1820                   && re_node_set_contains (dest_nodes, edst2)))
1822               err = re_node_set_add_intersect (&except_nodes, candidates,
1823                                                dfa->inveclosures + cur_node);
1824               if (__glibc_unlikely (err != REG_NOERROR))
1826                   re_node_set_free (&except_nodes);
  /* Pass 2: remove every node of NODE's inverse eclosure that was not
     excepted above.  */
1832   for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1834       Idx cur_node = inv_eclosure->elems[ecl_idx];
1835       if (!re_node_set_contains (&except_nodes, cur_node))
  /* re_node_set_contains returns 1-based position; 0 means absent.  */
1837           Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1838           re_node_set_remove_at (dest_nodes, idx);
1841   re_node_set_free (&except_nodes);
/* Return nonzero iff the transition SRC_NODE@SRC_IDX -> DST_NODE@DST_IDX
   crosses a subexpression boundary recorded in LIMITS, i.e. the source
   and destination lie on different sides of some limited subexpression.  */
1846 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1847 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1849   const re_dfa_t *const dfa = mctx->dfa;
1850   Idx lim_idx, src_pos, dst_pos;
1852   Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1853   Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1854   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1857       struct re_backref_cache_entry *ent;
1858       ent = mctx->bkref_ents + limits->elems[lim_idx];
1859       subexp_idx = dfa->nodes[ent->node].opr.idx;
  /* Position relative to the limited subexpression:
     -1 = before, 0 = inside, 1 = after.  */
1861       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1862                                            subexp_idx, dst_node, dst_idx,
1864       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1865                                            subexp_idx, src_node, src_idx,
1869          <src> <dst> ( <subexp> )
1870          ( <subexp> ) <src> <dst>
1871          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1872       if (src_pos == dst_pos)
1873         continue; /* This is unrelated limitation.  */
/* Determine, for a node sitting exactly on a subexpression boundary,
   whether it belongs before/inside/after subexpression SUBEXP_IDX by
   examining the epsilon closure of FROM_NODE.  BOUNDARIES bit 0 means
   "on the open boundary", bit 1 means "on the close boundary".  */
1881 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1882 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1884   const re_dfa_t *const dfa = mctx->dfa;
1885   const re_node_set *eclosures = dfa->eclosures + from_node;
1888   /* Else, we are on the boundary: examine the nodes on the epsilon
1890   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1892       Idx node = eclosures->elems[node_idx];
1893       switch (dfa->nodes[node].type)
  /* Back-reference node in the closure: follow the cached entries.  */
1896           if (bkref_idx != -1)
1898               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1904                   if (ent->node != node)
  /* Skip entries already proven unable to reach this subexp.  */
1907                   if (subexp_idx < BITSET_WORD_BITS
1908                       && !(ent->eps_reachable_subexps_map
1909                            & ((bitset_word_t) 1 << subexp_idx)))
1912                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1913                      OP_CLOSE_SUBEXP cases below.  But, if the
1914                      destination node is the same node as the source
1915                      node, don't recurse because it would cause an
1916                      infinite loop: a regex that exhibits this behavior
1918                   dst = dfa->edests[node].elems[0];
1919                   if (dst == from_node)
1923                   else /* if (boundaries & 2) */
1928                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1930                   if (cpos == -1 /* && (boundaries & 1) */)
1932                   if (cpos == 0 && (boundaries & 2))
  /* Cache the negative result for future queries.  */
1935                   if (subexp_idx < BITSET_WORD_BITS)
1936                     ent->eps_reachable_subexps_map
1937                       &= ~((bitset_word_t) 1 << subexp_idx);
1939               while (ent++->more);
1943         case OP_OPEN_SUBEXP:
1944           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1948         case OP_CLOSE_SUBEXP:
1949           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
  /* Default: on the close boundary count as "after", else "inside".  */
1958   return (boundaries & 2) ? 1 : 0;
/* Classify STR_IDX relative to the subexpression span of limit entry
   LIMIT: return -1 if before, 1 if after, 0 if strictly inside; exactly
   on a boundary, defer to check_dst_limits_calc_pos_1.  */
1962 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1963 Idx subexp_idx, Idx from_node, Idx str_idx,
1966   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1969   /* If we are outside the range of the subexpression, return -1 or 1.  */
1970   if (str_idx < lim->subexp_from)
1973   if (lim->subexp_to < str_idx)
1976   /* If we are within the subexpression, return 0.  */
  /* Bit 0: sitting on the open boundary; bit 1: on the close boundary.  */
1977   boundaries = (str_idx == lim->subexp_from);
1978   boundaries |= (str_idx == lim->subexp_to) << 1;
1979   if (boundaries == 0)
1982   /* Else, examine epsilon closure.  */
1983   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1984                                       from_node, bkref_idx);
1987 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1988    which are against limitations from DEST_NODES.  */
1990 static reg_errcode_t
1991 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1992 const re_node_set *candidates, re_node_set *limits,
1993 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
1996   Idx node_idx, lim_idx;
1998   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2001       struct re_backref_cache_entry *ent;
2002       ent = bkref_ents + limits->elems[lim_idx];
  /* Limits only matter strictly inside (subexp_from, str_idx].  */
2004       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2005         continue; /* This is unrelated limitation.  */
2007       subexp_idx = dfa->nodes[ent->node].opr.idx;
2008       if (ent->subexp_to == str_idx)
  /* At the close position: locate this subexp's open/close nodes.  */
2012           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2014               Idx node = dest_nodes->elems[node_idx];
2015               re_token_type_t type = dfa->nodes[node].type;
2016               if (type == OP_OPEN_SUBEXP
2017                   && subexp_idx == dfa->nodes[node].opr.idx)
2019               else if (type == OP_CLOSE_SUBEXP
2020                        && subexp_idx == dfa->nodes[node].opr.idx)
2024           /* Check the limitation of the open subexpression.  */
2025           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2028               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2030               if (__glibc_unlikely (err != REG_NOERROR))
2034           /* Check the limitation of the close subexpression.  */
2036             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2038                 Idx node = dest_nodes->elems[node_idx];
2039                 if (!re_node_set_contains (dfa->inveclosures + node,
2041                     && !re_node_set_contains (dfa->eclosures + node,
2044                     /* It is against this limitation.
                         Remove it from the current sifted state.  */
2046                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2048                     if (__glibc_unlikely (err != REG_NOERROR))
2054       else /* (ent->subexp_to != str_idx)  */
  /* Strictly inside the subexp: no other open/close node of a different
     subexpression may survive here.  */
2056           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2058               Idx node = dest_nodes->elems[node_idx];
2059               re_token_type_t type = dfa->nodes[node].type;
2060               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2062                   if (subexp_idx != dfa->nodes[node].opr.idx)
2064                       /* It is against this limitation.
                           Remove it from the current sifted state.  */
2066                       err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2068                       if (__glibc_unlikely (err != REG_NOERROR))
/* Sift back-reference nodes at STR_IDX: for each OP_BACK_REF candidate
   with a cached entry, verify its destination survives in the sifted
   states; if so, re-run the backward sift with the entry added to the
   limits (using a local sift context) and merge the result.  */
2077 static reg_errcode_t
2078 __attribute_warn_unused_result__
2079 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2080 Idx str_idx, const re_node_set *candidates)
2082   const re_dfa_t *const dfa = mctx->dfa;
2085   re_sift_context_t local_sctx;
2086   Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
  /* No cached back-reference entry starts at STR_IDX: nothing to do.  */
2088   if (first_idx == -1)
2091   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2093   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2096       re_token_type_t type;
2097       struct re_backref_cache_entry *entry;
2098       node = candidates->elems[node_idx];
2099       type = dfa->nodes[node].type;
2100       /* Avoid infinite loop for the REs like "()\1+".  */
2101       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2103       if (type != OP_BACK_REF)
2106       entry = mctx->bkref_ents + first_idx;
2107       enabled_idx = first_idx;
  /* Iterate over all cache entries chained at this string index.  */
2114           re_dfastate_t *cur_state;
2116           if (entry->node != node)
2118           subexp_len = entry->subexp_to - entry->subexp_from;
2119           to_idx = str_idx + subexp_len;
  /* An empty back reference behaves like an epsilon edge.  */
2120           dst_node = (subexp_len ? dfa->nexts[node]
2121                       : dfa->edests[node].elems[0]);
2123           if (to_idx > sctx->last_str_idx
2124               || sctx->sifted_states[to_idx] == NULL
2125               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2126               || check_dst_limits (mctx, &sctx->limits, node,
2127                                    str_idx, dst_node, to_idx))
  /* Lazily set up the local sift context on first use.  */
2130           if (local_sctx.sifted_states == NULL)
2133               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2134               if (__glibc_unlikely (err != REG_NOERROR))
2137           local_sctx.last_node = node;
2138           local_sctx.last_str_idx = str_idx;
2139           ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2140           if (__glibc_unlikely (! ok))
2145           cur_state = local_sctx.sifted_states[str_idx];
2146           err = sift_states_backward (mctx, &local_sctx);
2147           if (__glibc_unlikely (err != REG_NOERROR))
2149           if (sctx->limited_states != NULL)
2151               err = merge_state_array (dfa, sctx->limited_states,
2152                                        local_sctx.sifted_states,
2154               if (__glibc_unlikely (err != REG_NOERROR))
  /* Restore the state clobbered by the recursive sift, and retract the
     limit we added for this entry.  */
2157           local_sctx.sifted_states[str_idx] = cur_state;
2158           re_node_set_remove (&local_sctx.limits, enabled_idx);
2160           /* mctx->bkref_ents may have changed, reload the pointer.  */
2161           entry = mctx->bkref_ents + enabled_idx;
2163       while (enabled_idx++, entry++->more);
2167   if (local_sctx.sifted_states != NULL)
2169       re_node_set_free (&local_sctx.limits);
2176 #ifdef RE_ENABLE_I18N
/* Sifting step for a node that may accept a multibyte sequence: return the
   number of bytes NODE_IDX accepts at STR_IDX, or 0 if it accepts nothing
   or its destination was already sifted away.  */
2178 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2179 Idx node_idx, Idx str_idx, Idx max_str_idx)
2181   const re_dfa_t *const dfa = mctx->dfa;
2183   /* Check the node can accept "multi byte".  */
2184   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2185   if (naccepted > 0 && str_idx + naccepted <= max_str_idx
2186       && !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2187                                dfa->nexts[node_idx]))
2188     /* The node can't accept the "multi byte", or the
2189        destination was already thrown away, then the node
2190        couldn't accept the current input "multi byte".  */
2192   /* Otherwise, it is sure that the node could accept
2193      'naccepted' bytes input.  */
2196 #endif /* RE_ENABLE_I18N */
2199 /* Functions for state transition.  */
2201 /* Return the next state to which the current state STATE will transit by
2202    accepting the current input byte, and update STATE_LOG if necessary.
2203    Return NULL on failure.
2204    If STATE can accept a multibyte char/collating element/back reference
2205    update the destination of STATE_LOG.  */
2207 static re_dfastate_t *
2208 __attribute_warn_unused_result__
2209 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2210 re_dfastate_t *state)
2212   re_dfastate_t **trtable;
2215 #ifdef RE_ENABLE_I18N
2216   /* If the current state can accept multibyte.  */
2217   if (__glibc_unlikely (state->accept_mb))
2219       *err = transit_state_mb (mctx, state);
2220       if (__glibc_unlikely (*err != REG_NOERROR))
2223 #endif /* RE_ENABLE_I18N */
2225   /* Then decide the next state with the single byte.  */
  /* Slow path: compute the transition without a table.  */
2228       /* don't use transition table  */
2229       return transit_state_sb (err, mctx, state);
2232   /* Use transition table  */
2233   ch = re_string_fetch_byte (&mctx->input);
2236   trtable = state->trtable;
2237   if (__glibc_likely (trtable != NULL))
  /* word_trtable distinguishes word/non-word context: the second half of
     the table (offset SBC_MAX) is used in word context.  */
2240   trtable = state->word_trtable;
2241   if (__glibc_likely (trtable != NULL))
2243       unsigned int context;
2245         = re_string_context_at (&mctx->input,
2246                                 re_string_cur_idx (&mctx->input) - 1,
2248       if (IS_WORD_CONTEXT (context))
2249         return trtable[ch + SBC_MAX];
  /* No table yet: build one, then retry the lookup.  */
2254   if (!build_trtable (mctx->dfa, state))
2260   /* Retry, we now have a transition table.  */
2264 /* Update the state_log if we need  */
/* Record NEXT_STATE in state_log at the current index; if a state was
   already logged there (destination of a multibyte char/collating
   element/back reference), replace both with the state for the union of
   their node sets.  Also processes subexpression tops and back
   references for the resulting state.  Returns the merged state.  */
2265 static re_dfastate_t *
2266 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2267 re_dfastate_t *next_state)
2269   const re_dfa_t *const dfa = mctx->dfa;
2270   Idx cur_idx = re_string_cur_idx (&mctx->input);
  /* Past the logged top: just append.  */
2272   if (cur_idx > mctx->state_log_top)
2274       mctx->state_log[cur_idx] = next_state;
2275       mctx->state_log_top = cur_idx;
2277   else if (mctx->state_log[cur_idx] == 0)
2279       mctx->state_log[cur_idx] = next_state;
2283       re_dfastate_t *pstate;
2284       unsigned int context;
2285       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2286       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2287          the destination of a multibyte char/collating element/
2288          back reference.  Then the next state is the union set of
2289          these destinations and the results of the transition table.  */
2290       pstate = mctx->state_log[cur_idx];
2291       log_nodes = pstate->entrance_nodes;
2292       if (next_state != NULL)
2294           table_nodes = next_state->entrance_nodes;
2295           *err = re_node_set_init_union (&next_nodes, table_nodes,
2297           if (__glibc_unlikely (*err != REG_NOERROR))
  /* No table transition: the logged nodes alone form the next state
     (NEXT_NODES aliases *LOG_NODES; do not free it in that case).  */
2301         next_nodes = *log_nodes;
2302       /* Note: We already add the nodes of the initial state,
2303          then we don't need to add them here.  */
2305       context = re_string_context_at (&mctx->input,
2306                                       re_string_cur_idx (&mctx->input) - 1,
2308       next_state = mctx->state_log[cur_idx]
2309         = re_acquire_state_context (err, dfa, &next_nodes, context);
2310       /* We don't need to check errors here, since the return value of
2311          this function is next_state and ERR is already set.  */
2313       if (table_nodes != NULL)
2314         re_node_set_free (&next_nodes);
  /* Only patterns containing back references need the extra work.  */
2317   if (__glibc_unlikely (dfa->nbackref) && next_state != NULL)
2319       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2320          later.  We must check them here, since the back references in the
2321          next state might use them.  */
2322       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2324       if (__glibc_unlikely (*err != REG_NOERROR))
2327       /* If the next state has back references.  */
2328       if (next_state->has_backref)
2330           *err = transit_state_bkref (mctx, &next_state->nodes);
2331           if (__glibc_unlikely (*err != REG_NOERROR))
  /* transit_state_bkref may have replaced the logged state.  */
2333           next_state = mctx->state_log[cur_idx];
2340 /* Skip bytes in the input that correspond to part of a
2341    multi-byte match, then look in the log for a state
2342    from which to restart matching.  */
2343 static re_dfastate_t *
2344 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2346   re_dfastate_t *cur_state;
2349       Idx max = mctx->state_log_top;
2350       Idx cur_str_idx = re_string_cur_idx (&mctx->input);
  /* Advance one byte at a time until a logged state is found; give up
     once past the top of the state log.  */
2354           if (++cur_str_idx > max)
2356           re_string_skip_bytes (&mctx->input, 1);
2358       while (mctx->state_log[cur_str_idx] == NULL);
  /* Merge the logged state in (NULL table transition).  */
2360       cur_state = merge_state_with_log (err, mctx, NULL);
2362   while (*err == REG_NOERROR && cur_state == NULL);
2366 /* Helper functions for transit_state.  */
2368 /* From the node set CUR_NODES, pick up the nodes whose types are
2369    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2370    expression.  And register them to use them later for evaluating the
2371    corresponding back references.  */
2373 static reg_errcode_t
2374 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2377   const re_dfa_t *const dfa = mctx->dfa;
2381   /* TODO: This isn't efficient.
2382            Because there might be more than one nodes whose types are
2383            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2386   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2388       Idx node = cur_nodes->elems[node_idx];
  /* Only subexpressions actually referenced by a back reference (per
     used_bkref_map) need to be tracked.  */
2389       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2390           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2391           && (dfa->used_bkref_map
2392               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2394           err = match_ctx_add_subtop (mctx, node, str_idx);
2395           if (__glibc_unlikely (err != REG_NOERROR))
2403 /* Return the next state to which the current state STATE will transit by
2404    accepting the current input byte.  Return NULL on failure.  */
/* Table-free single-byte transition: collect the epsilon closures of the
   successors of every node that accepts the current byte, then intern the
   resulting state for the current context.  */
2406 static re_dfastate_t *
2407 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2408 re_dfastate_t *state)
2410   const re_dfa_t *const dfa = mctx->dfa;
2411   re_node_set next_nodes;
2412   re_dfastate_t *next_state;
2413   Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2414   unsigned int context;
2416   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2417   if (__glibc_unlikely (*err != REG_NOERROR))
2419   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2421       Idx cur_node = state->nodes.elems[node_cnt];
2422       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2424           *err = re_node_set_merge (&next_nodes,
2425                                     dfa->eclosures + dfa->nexts[cur_node]);
2426           if (__glibc_unlikely (*err != REG_NOERROR))
2428               re_node_set_free (&next_nodes);
2433   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2434   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2435   /* We don't need to check errors here, since the return value of
2436      this function is next_state and ERR is already set.  */
2438   re_node_set_free (&next_nodes);
  /* Consume the byte we just transited on.  */
2439   re_string_skip_bytes (&mctx->input, 1);
2444 #ifdef RE_ENABLE_I18N
2445 static reg_errcode_t
2446 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
/* Multi-byte transition: for every node of PSTATE that can accept a
   multi-byte sequence, find how many bytes it accepts at the current
   position and record the resulting state in mctx->state_log at the
   destination index (current index + naccepted), merging with any
   state already logged there.  */
2448 const re_dfa_t *const dfa = mctx->dfa;
2452 for (i = 0; i < pstate->nodes.nelem; ++i)
2454 re_node_set dest_nodes, *new_nodes;
2455 Idx cur_node_idx = pstate->nodes.elems[i];
2458 unsigned int context;
2459 re_dfastate_t *dest_state;
/* Only nodes flagged accept_mb participate in multi-byte transitions.  */
2461 if (!dfa->nodes[cur_node_idx].accept_mb)
2464 if (dfa->nodes[cur_node_idx].constraint)
2466 context = re_string_context_at (&mctx->input,
2467 re_string_cur_idx (&mctx->input),
2469 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2474 /* How many bytes the node can accept? */
2475 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2476 re_string_cur_idx (&mctx->input));
2480 /* The node can accepts 'naccepted' bytes. */
2481 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
/* Track the longest multi-byte element seen; check_arrival and the
   recovery path size buffers from this.  */
2482 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2483 : mctx->max_mb_elem_len);
2484 err = clean_state_log_if_needed (mctx, dest_idx);
2485 if (__glibc_unlikely (err != REG_NOERROR))
2487 DEBUG_ASSERT (dfa->nexts[cur_node_idx] != -1);
2488 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2490 dest_state = mctx->state_log[dest_idx];
2491 if (dest_state == NULL)
2492 dest_nodes = *new_nodes;
2495 err = re_node_set_init_union (&dest_nodes,
2496 dest_state->entrance_nodes, new_nodes);
2497 if (__glibc_unlikely (err != REG_NOERROR))
2500 context = re_string_context_at (&mctx->input, dest_idx - 1,
2502 mctx->state_log[dest_idx]
2503 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
/* dest_nodes was only heap-initialized when merged with an existing
   state; otherwise it aliases *new_nodes and must not be freed.  */
2504 if (dest_state != NULL)
2505 re_node_set_free (&dest_nodes);
2506 if (__glibc_unlikely (mctx->state_log[dest_idx] == NULL
2507 && err != REG_NOERROR))
2512 #endif /* RE_ENABLE_I18N */
2514 static reg_errcode_t
2515 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
/* For every OP_BACK_REF node in NODES, enumerate the candidate
   substrings it may match (via get_subexp), then add the epsilon
   closure of the node's successor to mctx->state_log at each
   candidate destination index.  Recurses when a backreference makes
   an epsilon transition (zero-length match) that enlarges the state
   at the current index.  */
2517 const re_dfa_t *const dfa = mctx->dfa;
2520 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2522 for (i = 0; i < nodes->nelem; ++i)
2524 Idx dest_str_idx, prev_nelem, bkc_idx;
2525 Idx node_idx = nodes->elems[i];
2526 unsigned int context;
2527 const re_token_t *node = dfa->nodes + node_idx;
2528 re_node_set *new_dest_nodes;
2530 /* Check whether 'node' is a backreference or not. */
2531 if (node->type != OP_BACK_REF)
2534 if (node->constraint)
2536 context = re_string_context_at (&mctx->input, cur_str_idx,
2538 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2542 /* 'node' is a backreference.
2543 Check the substring which the substring matched. */
/* Remember where the cache ended; get_subexp appends new entries,
   and the loop below processes exactly those new entries.  */
2544 bkc_idx = mctx->nbkref_ents;
2545 err = get_subexp (mctx, node_idx, cur_str_idx);
2546 if (__glibc_unlikely (err != REG_NOERROR))
2549 /* And add the epsilon closures (which is 'new_dest_nodes') of
2550 the backreference to appropriate state_log. */
2551 DEBUG_ASSERT (dfa->nexts[node_idx] != -1);
2552 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2555 re_dfastate_t *dest_state;
2556 struct re_backref_cache_entry *bkref_ent;
2557 bkref_ent = mctx->bkref_ents + bkc_idx;
2558 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2560 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
/* Zero-length match: follow the epsilon edge instead of the
   normal successor.  */
2561 new_dest_nodes = (subexp_len == 0
2562 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2563 : dfa->eclosures + dfa->nexts[node_idx]);
2564 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2565 - bkref_ent->subexp_from);
2566 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2568 dest_state = mctx->state_log[dest_str_idx];
/* Snapshot the state-log size at the current index so we can
   detect below whether this transition enlarged it.  */
2569 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2570 : mctx->state_log[cur_str_idx]->nodes.nelem);
2571 /* Add 'new_dest_node' to state_log. */
2572 if (dest_state == NULL)
2574 mctx->state_log[dest_str_idx]
2575 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2577 if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2578 && err != REG_NOERROR))
2583 re_node_set dest_nodes;
2584 err = re_node_set_init_union (&dest_nodes,
2585 dest_state->entrance_nodes,
2587 if (__glibc_unlikely (err != REG_NOERROR))
2589 re_node_set_free (&dest_nodes);
2592 mctx->state_log[dest_str_idx]
2593 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2594 re_node_set_free (&dest_nodes);
2595 if (__glibc_unlikely (mctx->state_log[dest_str_idx] == NULL
2596 && err != REG_NOERROR))
2599 /* We need to check recursively if the backreference can epsilon
2602 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2604 err = check_subexp_matching_top (mctx, new_dest_nodes,
2606 if (__glibc_unlikely (err != REG_NOERROR))
2608 err = transit_state_bkref (mctx, new_dest_nodes);
2609 if (__glibc_unlikely (err != REG_NOERROR))
2619 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2620 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2621 Note that we might collect inappropriate candidates here.
2622 However, the cost of checking them strictly here is too high, then we
2623 delay these checking for prune_impossible_nodes(). */
2625 static reg_errcode_t
2626 __attribute_warn_unused_result__
2627 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2629 const re_dfa_t *const dfa = mctx->dfa;
2630 Idx subexp_num, sub_top_idx;
2631 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2632 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2633 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2634 if (cache_idx != -1)
2636 const struct re_backref_cache_entry *entry
2637 = mctx->bkref_ents + cache_idx;
/* Entries at the same string index are chained via the 'more' flag.  */
2639 if (entry->node == bkref_node)
2640 return REG_NOERROR; /* We already checked it. */
2641 while (entry++->more);
/* The subexpression number this backreference refers to.  */
2644 subexp_num = dfa->nodes[bkref_node].opr.idx;
2646 /* For each sub expression */
2647 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2650 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2651 re_sub_match_last_t *sub_last;
2652 Idx sub_last_idx, sl_str, bkref_str_off;
2654 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2655 continue; /* It isn't related. */
2657 sl_str = sub_top->str_idx;
2658 bkref_str_off = bkref_str_idx;
2659 /* At first, check the last node of sub expressions we already
2661 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2663 regoff_t sl_str_diff;
2664 sub_last = sub_top->lasts[sub_last_idx];
2665 sl_str_diff = sub_last->str_idx - sl_str;
2666 /* The matched string by the sub expression match with the substring
2667 at the back reference? */
2668 if (sl_str_diff > 0)
2670 if (__glibc_unlikely (bkref_str_off + sl_str_diff
2671 > mctx->input.valid_len))
2673 /* Not enough chars for a successful match. */
2674 if (bkref_str_off + sl_str_diff > mctx->input.len)
2677 err = clean_state_log_if_needed (mctx,
2680 if (__glibc_unlikely (err != REG_NOERROR))
/* clean_state_log_if_needed may extend/move the input buffer;
   re-fetch the raw pointer before comparing bytes.  */
2682 buf = (const char *) re_string_get_buffer (&mctx->input);
2684 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2685 /* We don't need to search this sub expression any more. */
2688 bkref_str_off += sl_str_diff;
2689 sl_str += sl_str_diff;
2690 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2693 /* Reload buf, since the preceding call might have reallocated
2695 buf = (const char *) re_string_get_buffer (&mctx->input);
2697 if (err == REG_NOMATCH)
2699 if (__glibc_unlikely (err != REG_NOERROR))
2703 if (sub_last_idx < sub_top->nlasts)
2705 if (sub_last_idx > 0)
2707 /* Then, search for the other last nodes of the sub expression. */
2708 for (; sl_str <= bkref_str_idx; ++sl_str)
2711 regoff_t sl_str_off;
2712 const re_node_set *nodes;
2713 sl_str_off = sl_str - sub_top->str_idx;
2714 /* The matched string by the sub expression match with the substring
2715 at the back reference? */
2718 if (__glibc_unlikely (bkref_str_off >= mctx->input.valid_len))
2720 /* If we are at the end of the input, we cannot match. */
2721 if (bkref_str_off >= mctx->input.len)
2724 err = extend_buffers (mctx, bkref_str_off + 1);
2725 if (__glibc_unlikely (err != REG_NOERROR))
2728 buf = (const char *) re_string_get_buffer (&mctx->input);
2730 if (buf [bkref_str_off++] != buf[sl_str - 1])
2731 break; /* We don't need to search this sub expression
2734 if (mctx->state_log[sl_str] == NULL)
2736 /* Does this state have a ')' of the sub expression? */
2737 nodes = &mctx->state_log[sl_str]->nodes;
2738 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2742 if (sub_top->path == NULL)
2744 sub_top->path = calloc (sizeof (state_array_t),
2745 sl_str - sub_top->str_idx + 1);
2746 if (sub_top->path == NULL)
2749 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2750 in the current context? */
2751 err = check_arrival (mctx, sub_top->path, sub_top->node,
2752 sub_top->str_idx, cls_node, sl_str,
2754 if (err == REG_NOMATCH)
2756 if (__glibc_unlikely (err != REG_NOERROR))
2758 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2759 if (__glibc_unlikely (sub_last == NULL))
2761 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2763 buf = (const char *) re_string_get_buffer (&mctx->input);
2764 if (err == REG_NOMATCH)
2766 if (__glibc_unlikely (err != REG_NOERROR))
2773 /* Helper functions for get_subexp(). */
2775 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2776 If it can arrive, register the sub expression expressed with SUB_TOP
2779 static reg_errcode_t
2780 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2781 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
/* Returns REG_NOERROR when the candidate was registered (then trims
   the state log up to the destination index), the error from
   check_arrival (e.g. REG_NOMATCH) otherwise.  */
2785 /* Can the subexpression arrive the back reference? */
2786 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2787 sub_last->str_idx, bkref_node, bkref_str,
2789 if (err != REG_NOERROR)
2791 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2793 if (__glibc_unlikely (err != REG_NOERROR))
/* Destination index = backref position + length of the matched
   subexpression (sub_last->str_idx - sub_top->str_idx).  */
2795 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2796 return clean_state_log_if_needed (mctx, to_idx);
2799 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2800 Search '(' if FL_OPEN, or search ')' otherwise.
2801 TODO: This function isn't efficient...
2802 Because there might be more than one nodes whose types are
2803 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2808 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2809 Idx subexp_idx, int type)
/* Linear scan over NODES; TYPE is OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP.
   NOTE(review): the return statements are elided from this view;
   callers (get_subexp) treat the result as a node index or -1.  */
2812 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2814 Idx cls_node = nodes->elems[cls_idx];
2815 const re_token_t *node = dfa->nodes + cls_node;
2816 if (node->type == type
2817 && node->opr.idx == subexp_idx)
2823 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2824 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2826 Return REG_NOERROR if it can arrive, REG_NOMATCH if it cannot,
2827 REG_ESPACE if memory is exhausted. */
2829 static reg_errcode_t
2830 __attribute_warn_unused_result__
2831 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2832 Idx top_str, Idx last_node, Idx last_str, int type)
2834 const re_dfa_t *const dfa = mctx->dfa;
2835 reg_errcode_t err = REG_NOERROR;
2836 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2837 re_dfastate_t *cur_state = NULL;
2838 re_node_set *cur_nodes, next_nodes;
2839 re_dfastate_t **backup_state_log;
2840 unsigned int context;
2842 subexp_num = dfa->nodes[top_node].opr.idx;
2843 /* Extend the buffer if we need. */
2844 if (__glibc_unlikely (path->alloc < last_str + mctx->max_mb_elem_len + 1))
2846 re_dfastate_t **new_array;
2847 Idx old_alloc = path->alloc;
2848 Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
/* Guard both the Idx addition and the byte-size multiplication
   against overflow before reallocating.  */
2850 if (__glibc_unlikely (IDX_MAX - old_alloc < incr_alloc))
2852 new_alloc = old_alloc + incr_alloc;
2853 if (__glibc_unlikely (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc))
2855 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2856 if (__glibc_unlikely (new_array == NULL))
2858 path->array = new_array;
2859 path->alloc = new_alloc;
2860 memset (new_array + old_alloc, '\0',
2861 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
/* Resume from where a previous call on this PATH stopped, if any.  */
2864 str_idx = path->next_idx ? path->next_idx : top_str;
2866 /* Temporary modify MCTX. */
2867 backup_state_log = mctx->state_log;
2868 backup_cur_idx = mctx->input.cur_idx;
2869 mctx->state_log = path->array;
2870 mctx->input.cur_idx = str_idx;
2872 /* Setup initial node set. */
2873 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2874 if (str_idx == top_str)
2876 err = re_node_set_init_1 (&next_nodes, top_node);
2877 if (__glibc_unlikely (err != REG_NOERROR))
2879 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2880 if (__glibc_unlikely (err != REG_NOERROR))
2882 re_node_set_free (&next_nodes);
2888 cur_state = mctx->state_log[str_idx];
2889 if (cur_state && cur_state->has_backref)
2891 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2892 if (__glibc_unlikely (err != REG_NOERROR))
2896 re_node_set_init_empty (&next_nodes);
2898 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2900 if (next_nodes.nelem)
2902 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2904 if (__glibc_unlikely (err != REG_NOERROR))
2906 re_node_set_free (&next_nodes);
2910 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2911 if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2913 re_node_set_free (&next_nodes);
2916 mctx->state_log[str_idx] = cur_state;
/* Main walk: advance until LAST_STR, giving up after more than
   max_mb_elem_len consecutive NULL states.  */
2919 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2921 re_node_set_empty (&next_nodes);
2922 if (mctx->state_log[str_idx + 1])
2924 err = re_node_set_merge (&next_nodes,
2925 &mctx->state_log[str_idx + 1]->nodes);
2926 if (__glibc_unlikely (err != REG_NOERROR))
2928 re_node_set_free (&next_nodes);
2934 err = check_arrival_add_next_nodes (mctx, str_idx,
2935 &cur_state->non_eps_nodes,
2937 if (__glibc_unlikely (err != REG_NOERROR))
2939 re_node_set_free (&next_nodes);
2944 if (next_nodes.nelem)
2946 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2947 if (__glibc_unlikely (err != REG_NOERROR))
2949 re_node_set_free (&next_nodes);
2952 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2954 if (__glibc_unlikely (err != REG_NOERROR))
2956 re_node_set_free (&next_nodes);
2960 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2961 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2962 if (__glibc_unlikely (cur_state == NULL && err != REG_NOERROR))
2964 re_node_set_free (&next_nodes);
2967 mctx->state_log[str_idx] = cur_state;
2968 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2970 re_node_set_free (&next_nodes);
2971 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2972 : &mctx->state_log[last_str]->nodes);
2973 path->next_idx = str_idx;
/* Restore the state log and input position saved above.  */
2976 mctx->state_log = backup_state_log;
2977 mctx->input.cur_idx = backup_cur_idx;
2979 /* Then check the current node set has the node LAST_NODE. */
2980 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2986 /* Helper functions for check_arrival. */
2988 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2990 TODO: This function is similar to the functions transit_state*(),
2991 however this function has many additional works.
2992 Can't we unify them? */
2994 static reg_errcode_t
2995 __attribute_warn_unused_result__
2996 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
2997 re_node_set *cur_nodes, re_node_set *next_nodes)
2999 const re_dfa_t *const dfa = mctx->dfa;
3002 #ifdef RE_ENABLE_I18N
3003 reg_errcode_t err = REG_NOERROR;
3005 re_node_set union_set;
3006 re_node_set_init_empty (&union_set);
3007 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3010 Idx cur_node = cur_nodes->elems[cur_idx];
3011 DEBUG_ASSERT (!IS_EPSILON_NODE (dfa->nodes[cur_node].type));
3013 #ifdef RE_ENABLE_I18N
3014 /* If the node may accept "multi byte". */
3015 if (dfa->nodes[cur_node].accept_mb)
/* Multi-byte acceptors are logged directly into mctx->state_log at
   str_idx + naccepted, merged with whatever state is already there.  */
3017 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3021 re_dfastate_t *dest_state;
3022 Idx next_node = dfa->nexts[cur_node];
3023 Idx next_idx = str_idx + naccepted;
3024 dest_state = mctx->state_log[next_idx];
3025 re_node_set_empty (&union_set);
3028 err = re_node_set_merge (&union_set, &dest_state->nodes);
3029 if (__glibc_unlikely (err != REG_NOERROR))
3031 re_node_set_free (&union_set);
3035 ok = re_node_set_insert (&union_set, next_node);
3036 if (__glibc_unlikely (! ok))
3038 re_node_set_free (&union_set);
3041 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3043 if (__glibc_unlikely (mctx->state_log[next_idx] == NULL
3044 && err != REG_NOERROR))
3046 re_node_set_free (&union_set);
3051 #endif /* RE_ENABLE_I18N */
/* Single-byte acceptors go into NEXT_NODES for the caller.  */
3053 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3055 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3056 if (__glibc_unlikely (! ok))
3058 re_node_set_free (&union_set);
3063 re_node_set_free (&union_set);
3067 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3068 CUR_NODES, however exclude the nodes which are:
3069 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3070 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3073 static reg_errcode_t
3074 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3075 Idx ex_subexp, int type)
/* Builds the filtered closure in NEW_NODES, then replaces *CUR_NODES
   with it (freeing the old set).  TYPE is the OP_*_SUBEXP token type
   marking the boundary to filter on.  */
3078 Idx idx, outside_node;
3079 re_node_set new_nodes;
3080 DEBUG_ASSERT (cur_nodes->nelem);
3081 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3082 if (__glibc_unlikely (err != REG_NOERROR))
3084 /* Create a new node set NEW_NODES with the nodes which are epsilon
3085 closures of the node in CUR_NODES. */
3087 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3089 Idx cur_node = cur_nodes->elems[idx];
3090 const re_node_set *eclosure = dfa->eclosures + cur_node;
3091 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3092 if (outside_node == -1)
3094 /* There are no problematic nodes, just merge them. */
3095 err = re_node_set_merge (&new_nodes, eclosure);
3096 if (__glibc_unlikely (err != REG_NOERROR))
3098 re_node_set_free (&new_nodes);
3104 /* There are problematic nodes, re-calculate incrementally. */
3105 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3107 if (__glibc_unlikely (err != REG_NOERROR))
3109 re_node_set_free (&new_nodes);
/* Hand ownership of NEW_NODES to the caller through *CUR_NODES.  */
3114 re_node_set_free (cur_nodes);
3115 *cur_nodes = new_nodes;
3119 /* Helper function for check_arrival_expand_ecl.
3120 Check incrementally the epsilon closure of TARGET, and if it isn't
3121 problematic append it to DST_NODES. */
3123 static reg_errcode_t
3124 __attribute_warn_unused_result__
3125 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3126 Idx target, Idx ex_subexp, int type)
/* Iteratively follows epsilon edges from TARGET, stopping when a node
   is already in DST_NODES.  A node with two epsilon destinations
   recurses on the second and continues on the first.  */
3129 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
/* Boundary node of the excluded subexpression reached.  */
3133 if (dfa->nodes[cur_node].type == type
3134 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3136 if (type == OP_CLOSE_SUBEXP)
3138 ok = re_node_set_insert (dst_nodes, cur_node);
3139 if (__glibc_unlikely (! ok))
3144 ok = re_node_set_insert (dst_nodes, cur_node);
3145 if (__glibc_unlikely (! ok))
3147 if (dfa->edests[cur_node].nelem == 0)
3149 if (dfa->edests[cur_node].nelem == 2)
3152 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3153 dfa->edests[cur_node].elems[1],
3155 if (__glibc_unlikely (err != REG_NOERROR))
3158 cur_node = dfa->edests[cur_node].elems[0];
3164 /* For all the back references in the current state, calculate the
3165 destination of the back references by the appropriate entry
3166 in MCTX->BKREF_ENTS. */
3168 static reg_errcode_t
3169 __attribute_warn_unused_result__
3170 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3171 Idx cur_str, Idx subexp_num, int type)
3173 const re_dfa_t *const dfa = mctx->dfa;
3175 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3176 struct re_backref_cache_entry *ent;
3178 if (cache_idx_start == -1)
3182 ent = mctx->bkref_ents + cache_idx_start;
/* Walk the chain of cache entries at CUR_STR (linked by 'more').  */
3185 Idx to_idx, next_node;
3187 /* Is this entry ENT is appropriate? */
3188 if (!re_node_set_contains (cur_nodes, ent->node))
3191 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3192 /* Calculate the destination of the back reference, and append it
3193 to MCTX->STATE_LOG. */
3194 if (to_idx == cur_str)
3196 /* The backreference did epsilon transit, we must re-check all the
3197 node in the current state. */
3198 re_node_set new_dests;
3199 reg_errcode_t err2, err3;
3200 next_node = dfa->edests[ent->node].elems[0];
3201 if (re_node_set_contains (cur_nodes, next_node))
3203 err = re_node_set_init_1 (&new_dests, next_node);
3204 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3205 err3 = re_node_set_merge (cur_nodes, &new_dests);
3206 re_node_set_free (&new_dests);
/* Propagate the first failure among the three calls above.  */
3207 if (__glibc_unlikely (err != REG_NOERROR || err2 != REG_NOERROR
3208 || err3 != REG_NOERROR))
3210 err = (err != REG_NOERROR ? err
3211 : (err2 != REG_NOERROR ? err2 : err3));
3214 /* TODO: It is still inefficient... */
3219 re_node_set union_set;
3220 next_node = dfa->nexts[ent->node];
3221 if (mctx->state_log[to_idx])
/* Skip if the successor is already logged at the destination.  */
3224 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3227 err = re_node_set_init_copy (&union_set,
3228 &mctx->state_log[to_idx]->nodes);
3229 ok = re_node_set_insert (&union_set, next_node);
3230 if (__glibc_unlikely (err != REG_NOERROR || ! ok))
3232 re_node_set_free (&union_set);
3233 err = err != REG_NOERROR ? err : REG_ESPACE;
3239 err = re_node_set_init_1 (&union_set, next_node);
3240 if (__glibc_unlikely (err != REG_NOERROR))
3243 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3244 re_node_set_free (&union_set);
3245 if (__glibc_unlikely (mctx->state_log[to_idx] == NULL
3246 && err != REG_NOERROR))
3250 while (ent++->more);
3254 /* Build transition table for the state.
3255 Return true if successful. */
3257 static bool __attribute_noinline__
3258 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
/* Groups STATE's nodes into destination states (one per accepted
   character class), then fills either state->trtable (256 entries) or,
   when word/non-word context matters in a multi-byte locale,
   state->word_trtable (2 * 256 entries: non-word then word half).  */
3263 bool need_word_trtable = false;
3264 bitset_word_t elem, mask;
3265 Idx ndests; /* Number of the destination states from 'state'. */
3266 re_dfastate_t **trtable;
3267 re_dfastate_t *dest_states[SBC_MAX];
3268 re_dfastate_t *dest_states_word[SBC_MAX];
3269 re_dfastate_t *dest_states_nl[SBC_MAX];
3270 re_node_set follows;
3271 bitset_t acceptable;
3273 /* We build DFA states which corresponds to the destination nodes
3274 from 'state'. 'dests_node[i]' represents the nodes which i-th
3275 destination state contains, and 'dests_ch[i]' represents the
3276 characters which i-th destination state accepts. */
3277 re_node_set dests_node[SBC_MAX];
3278 bitset_t dests_ch[SBC_MAX];
3280 /* Initialize transition table. */
3281 state->word_trtable = state->trtable = NULL;
3283 /* At first, group all nodes belonging to 'state' into several
3285 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3286 if (__glibc_unlikely (ndests <= 0))
3288 /* Return false in case of an error, true otherwise. */
/* No destinations: install an all-NULL table so lookups simply fail.  */
3291 state->trtable = (re_dfastate_t **)
3292 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3293 if (__glibc_unlikely (state->trtable == NULL))
3300 err = re_node_set_alloc (&follows, ndests + 1);
3301 if (__glibc_unlikely (err != REG_NOERROR))
3304 re_node_set_free (&follows);
3305 for (i = 0; i < ndests; ++i)
3306 re_node_set_free (dests_node + i);
3310 bitset_empty (acceptable);
3312 /* Then build the states for all destinations. */
3313 for (i = 0; i < ndests; ++i)
3316 re_node_set_empty (&follows);
3317 /* Merge the follows of this destination states. */
3318 for (j = 0; j < dests_node[i].nelem; ++j)
3320 next_node = dfa->nexts[dests_node[i].elems[j]];
3321 if (next_node != -1)
3323 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3324 if (__glibc_unlikely (err != REG_NOERROR))
3328 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3329 if (__glibc_unlikely (dest_states[i] == NULL && err != REG_NOERROR))
3331 /* If the new state has context constraint,
3332 build appropriate states for these contexts. */
3333 if (dest_states[i]->has_constraint)
3335 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3337 if (__glibc_unlikely (dest_states_word[i] == NULL
3338 && err != REG_NOERROR))
/* A word/non-word-sensitive destination in a multi-byte locale
   forces the doubled word transition table below.  */
3341 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3342 need_word_trtable = true;
3344 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3346 if (__glibc_unlikely (dest_states_nl[i] == NULL && err != REG_NOERROR))
3351 dest_states_word[i] = dest_states[i];
3352 dest_states_nl[i] = dest_states[i];
3354 bitset_merge (acceptable, dests_ch[i]);
3357 if (!__glibc_unlikely (need_word_trtable))
3359 /* We don't care about whether the following character is a word
3360 character, or we are in a single-byte character set so we can
3361 discern by looking at the character code: allocate a
3362 256-entry transition table. */
3363 trtable = state->trtable =
3364 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3365 if (__glibc_unlikely (trtable == NULL))
3368 /* For all characters ch...: */
3369 for (i = 0; i < BITSET_WORDS; ++i)
3370 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3372 mask <<= 1, elem >>= 1, ++ch)
3373 if (__glibc_unlikely (elem & 1))
3375 /* There must be exactly one destination which accepts
3376 character ch. See group_nodes_into_DFAstates. */
3377 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3380 /* j-th destination accepts the word character ch. */
3381 if (dfa->word_char[i] & mask)
3382 trtable[ch] = dest_states_word[j];
3384 trtable[ch] = dest_states[j];
3389 /* We care about whether the following character is a word
3390 character, and we are in a multi-byte character set: discern
3391 by looking at the character code: build two 256-entry
3392 transition tables, one starting at trtable[0] and one
3393 starting at trtable[SBC_MAX]. */
3394 trtable = state->word_trtable =
3395 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3396 if (__glibc_unlikely (trtable == NULL))
3399 /* For all characters ch...: */
3400 for (i = 0; i < BITSET_WORDS; ++i)
3401 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3403 mask <<= 1, elem >>= 1, ++ch)
3404 if (__glibc_unlikely (elem & 1))
3406 /* There must be exactly one destination which accepts
3407 character ch. See group_nodes_into_DFAstates. */
3408 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3411 /* j-th destination accepts the word character ch. */
3412 trtable[ch] = dest_states[j];
3413 trtable[ch + SBC_MAX] = dest_states_word[j];
3418 if (bitset_contain (acceptable, NEWLINE_CHAR))
3420 /* The current state accepts newline character. */
3421 for (j = 0; j < ndests; ++j)
3422 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3424 /* k-th destination accepts newline character. */
3425 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3426 if (need_word_trtable)
3427 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3428 /* There must be only one destination which accepts
3429 newline. See group_nodes_into_DFAstates. */
3434 re_node_set_free (&follows);
3435 for (i = 0; i < ndests; ++i)
3436 re_node_set_free (dests_node + i);
3440 /* Group all nodes belonging to STATE into several destinations.
3441 Then for all destinations, set the nodes belonging to the destination
3442 to DESTS_NODE[i] and set the characters accepted by the destination
3443 to DEST_CH[i]. Return the number of destinations if successful,
3444 -1 on internal error. */
3447 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3448 re_node_set *dests_node, bitset_t *dests_ch)
3453 Idx ndests; /* Number of the destinations from 'state'. */
3454 bitset_t accepts; /* Characters a node can accept. */
3455 const re_node_set *cur_nodes = &state->nodes;
3456 bitset_empty (accepts);
3459 /* For all the nodes belonging to 'state', */
3460 for (i = 0; i < cur_nodes->nelem; ++i)
3462 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3463 re_token_type_t type = node->type;
3464 unsigned int constraint = node->constraint;
3466 /* Enumerate all single byte character this node can accept. */
3467 if (type == CHARACTER)
3468 bitset_set (accepts, node->opr.c);
3469 else if (type == SIMPLE_BRACKET)
3471 bitset_merge (accepts, node->opr.sbcset);
3473 else if (type == OP_PERIOD)
3475 #ifdef RE_ENABLE_I18N
/* In a multi-byte locale '.' only accepts the single-byte chars;
   multi-byte sequences are handled elsewhere.  */
3476 if (dfa->mb_cur_max > 1)
3477 bitset_merge (accepts, dfa->sb_char);
3480 bitset_set_all (accepts);
3481 if (!(dfa->syntax & RE_DOT_NEWLINE))
3482 bitset_clear (accepts, '\n');
3483 if (dfa->syntax & RE_DOT_NOT_NULL)
3484 bitset_clear (accepts, '\0');
3486 #ifdef RE_ENABLE_I18N
3487 else if (type == OP_UTF8_PERIOD)
/* Fast path: set all ASCII bits at once when the ASCII range is
   word-aligned in the bitset.  */
3489 if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3490 memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3492 bitset_merge (accepts, utf8_sb_map);
3493 if (!(dfa->syntax & RE_DOT_NEWLINE))
3494 bitset_clear (accepts, '\n');
3495 if (dfa->syntax & RE_DOT_NOT_NULL)
3496 bitset_clear (accepts, '\0');
3502 /* Check the 'accepts' and sift the characters which are not
3503 match it the context. */
3506 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3508 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3509 bitset_empty (accepts);
3510 if (accepts_newline)
3511 bitset_set (accepts, NEWLINE_CHAR);
3515 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3517 bitset_empty (accepts);
3521 if (constraint & NEXT_WORD_CONSTRAINT)
3523 bitset_word_t any_set = 0;
3524 if (type == CHARACTER && !node->word_char)
3526 bitset_empty (accepts);
3529 #ifdef RE_ENABLE_I18N
3530 if (dfa->mb_cur_max > 1)
3531 for (j = 0; j < BITSET_WORDS; ++j)
3532 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3535 for (j = 0; j < BITSET_WORDS; ++j)
3536 any_set |= (accepts[j] &= dfa->word_char[j]);
3540 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3542 bitset_word_t any_set = 0;
3543 if (type == CHARACTER && node->word_char)
3545 bitset_empty (accepts);
3548 #ifdef RE_ENABLE_I18N
3549 if (dfa->mb_cur_max > 1)
3550 for (j = 0; j < BITSET_WORDS; ++j)
3551 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3554 for (j = 0; j < BITSET_WORDS; ++j)
3555 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3561 /* Then divide 'accepts' into DFA states, or create a new
3562 state. Above, we make sure that accepts is not empty. */
3563 for (j = 0; j < ndests; ++j)
3565 bitset_t intersec; /* Intersection sets, see below. */
3567 /* Flags, see below. */
3568 bitset_word_t has_intersec, not_subset, not_consumed;
3570 /* Optimization, skip if this state doesn't accept the character. */
3571 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3574 /* Enumerate the intersection set of this state and 'accepts'. */
3576 for (k = 0; k < BITSET_WORDS; ++k)
3577 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3578 /* And skip if the intersection set is empty. */
3582 /* Then check if this state is a subset of 'accepts'. */
3583 not_subset = not_consumed = 0;
3584 for (k = 0; k < BITSET_WORDS; ++k)
3586 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3587 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3590 /* If this state isn't a subset of 'accepts', create a
3591 new group state, which has the 'remains'. */
/* Split destination j: j keeps the intersection, a new group
   gets the remaining characters, with the same node set.  */
3594 bitset_copy (dests_ch[ndests], remains);
3595 bitset_copy (dests_ch[j], intersec);
3596 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3597 if (__glibc_unlikely (err != REG_NOERROR))
3602 /* Put the position in the current group. */
3603 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3604 if (__glibc_unlikely (! ok))
3607 /* If all characters are consumed, go to next node. */
3611 /* Some characters remain, create a new group. */
3614 bitset_copy (dests_ch[ndests], accepts);
3615 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3616 if (__glibc_unlikely (err != REG_NOERROR))
3619 bitset_empty (accepts);
3622 assume (ndests <= SBC_MAX);
/* Error path: release all node sets built so far.  */
3625 for (j = 0; j < ndests; ++j)
3626 re_node_set_free (dests_node + j);
3630 #ifdef RE_ENABLE_I18N
3631 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3632 Return the number of the bytes the node accepts.
3633 STR_IDX is the current index of the input string.
3635 This function handles the nodes which can accept one character, or
3636 one collating element like '.', '[a-z]', opposite to the other nodes
3637 can only accept one byte. */
3640 # include <locale/weight.h>
3644 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3645 const re_string_t *input, Idx str_idx)
/* Return how many bytes of INPUT, starting at STR_IDX, the multibyte
   node NODE_IDX accepts; 0 means it does not match here (see the block
   comment above this function).  */
3647 const re_token_t *node = dfa->nodes + node_idx;
3648 int char_len, elem_len;
/* Fast path: '.' in a UTF-8 locale.  Validate the byte sequence by hand
   instead of going through the generic wide-character machinery.  */
3651 if (__glibc_unlikely (node->type == OP_UTF8_PERIOD))
3653 unsigned char c = re_string_byte_at (input, str_idx), d;
/* Bytes below 0xC2 are either ASCII or cannot start a valid multibyte
   sequence (0x80-0xC1 are continuation or overlong lead bytes).  */
3654 if (__glibc_likely (c < 0xc2))
/* A multibyte sequence needs at least one continuation byte in range.  */
3657 if (str_idx + 2 > input->len)
3660 d = re_string_byte_at (input, str_idx + 1);
/* Two-byte sequence: the second byte must be a continuation byte.  */
3662 return (d < 0x80 || d > 0xbf) ? 0 : 2;
/* For longer sequences, reject second bytes below the minimum legal
   value for this lead byte (these would be overlong encodings).
   NOTE(review): char_len is assigned from C between the elided lines
   above -- confirm against the full source.  */
3666 if (c == 0xe0 && d < 0xa0)
3672 if (c == 0xf0 && d < 0x90)
3678 if (c == 0xf8 && d < 0x88)
3684 if (c == 0xfc && d < 0x84)
/* The whole sequence must fit inside the input buffer.  */
3690 if (str_idx + char_len > input->len)
/* Every remaining byte must be a continuation byte (0x80-0xBF).  */
3693 for (i = 1; i < char_len; ++i)
3695 d = re_string_byte_at (input, str_idx + i);
3696 if (d < 0x80 || d > 0xbf)
/* Generic path: ask the re_string layer how long the character is.  */
3702 char_len = re_string_char_size_at (input, str_idx);
3703 if (node->type == OP_PERIOD)
3707 /* FIXME: I don't think this if is needed, as both '\n'
3708 and '\0' are char_len == 1. */
3709 /* '.' accepts any one character except the following two cases. */
3710 if ((!(dfa->syntax & RE_DOT_NEWLINE)
3711 && re_string_byte_at (input, str_idx) == '\n')
3712 || ((dfa->syntax & RE_DOT_NOT_NULL)
3713 && re_string_byte_at (input, str_idx) == '\0'))
/* Beyond here the candidate may be a multi-byte collating element.  */
3718 elem_len = re_string_elem_size_at (input, str_idx);
3719 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3722 if (node->type == COMPLEX_BRACKET)
3724 const re_charset_t *cset = node->opr.mbcset;
/* Raw bytes of the candidate element, for byte-wise comparisons.  */
3726 const unsigned char *pin
3727 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
/* Decode the wide character only if one of the tests below needs it.  */
3732 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3733 ? re_string_wchar_at (input, str_idx) : 0);
3735 /* match with multibyte character? */
3736 for (i = 0; i < cset->nmbchars; ++i)
3737 if (wc == cset->mbchars[i])
3739 match_len = char_len;
3740 goto check_node_accept_bytes_match;
3742 /* match with character_class? */
3743 for (i = 0; i < cset->nchar_classes; ++i)
3745 wctype_t wt = cset->char_classes[i];
3746 if (__iswctype (wc, wt))
3748 match_len = char_len;
3749 goto check_node_accept_bytes_match;
/* The remaining tests (collating symbols, collation-sequence ranges,
   equivalence classes) consult the LC_COLLATE locale tables.  */
3754 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3757 unsigned int in_collseq = 0;
3758 const int32_t *table, *indirect;
3759 const unsigned char *weights, *extra;
3760 const char *collseqwc;
3762 /* match with collating_symbol? */
3763 if (cset->ncoll_syms)
3764 extra = (const unsigned char *)
3765 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3766 for (i = 0; i < cset->ncoll_syms; ++i)
/* Each entry starts with its length byte, then the bytes proper.  */
3768 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3769 /* Compare the length of input collating element and
3770 the length of current collating element. */
3771 if (*coll_sym != elem_len)
3773 /* Compare each bytes. */
3774 for (j = 0; j < *coll_sym; j++)
3775 if (pin[j] != coll_sym[1 + j])
3779 /* Match if every bytes is equal. */
3781 goto check_node_accept_bytes_match;
/* Compute the collation sequence value of the input element: via the
   wide-character table when it is a plain character, otherwise by
   searching the byte-sequence table.  */
3787 if (elem_len <= char_len)
3789 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3790 in_collseq = __collseq_table_lookup (collseqwc, wc);
3793 in_collseq = find_collation_sequence_value (pin, elem_len);
3795 /* match with range expression? */
3796 /* FIXME: Implement rational ranges here, too. */
3797 for (i = 0; i < cset->nranges; ++i)
3798 if (cset->range_starts[i] <= in_collseq
3799 && in_collseq <= cset->range_ends[i])
3801 match_len = elem_len;
3802 goto check_node_accept_bytes_match;
3805 /* match with equivalence_class? */
3806 if (cset->nequiv_classes)
3808 const unsigned char *cp = pin;
3809 table = (const int32_t *)
3810 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3811 weights = (const unsigned char *)
3812 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3813 extra = (const unsigned char *)
3814 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3815 indirect = (const int32_t *)
3816 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3817 int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
/* The top byte of the weight index encodes the collation rule.  */
3818 int32_t rule = idx >> 24;
3822 size_t weight_len = weights[idx];
3823 for (i = 0; i < cset->nequiv_classes; ++i)
3825 int32_t equiv_class_idx = cset->equiv_classes[i];
3826 int32_t equiv_class_rule = equiv_class_idx >> 24;
3827 equiv_class_idx &= 0xffffff;
/* Equivalent iff the rules agree and the weight vectors have the
   same length and the same contents.  */
3828 if (weights[equiv_class_idx] == weight_len
3829 && equiv_class_rule == rule
3830 && memcmp (weights + idx + 1,
3831 weights + equiv_class_idx + 1,
3834 match_len = elem_len;
3835 goto check_node_accept_bytes_match;
3844 /* match with range expression? */
3845 for (i = 0; i < cset->nranges; ++i)
3847 if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3849 match_len = char_len;
3850 goto check_node_accept_bytes_match;
3854 check_node_accept_bytes_match:
/* For a negated set ([^...]) an element that matched above must be
   rejected, and one that matched nothing is accepted.  */
3855 if (!cset->non_match)
3862 return (elem_len > char_len) ? elem_len : char_len;
3870 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
/* Look up the collation sequence value of the MBS_LEN-byte sequence MBS
   in the LC_COLLATE tables.  */
3872 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3877 /* No valid character. Match it as a single byte character. */
3878 const unsigned char *collseq = (const unsigned char *)
3879 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3880 return collseq[mbs[0]];
/* Otherwise scan the variable-length collating-element table.  Each
   entry is: name (length-prefixed), byte sequence (length-prefixed),
   padding to 4-byte alignment, the uint32_t sequence value, and the
   length-prefixed wide-character sequence.  */
3887 const unsigned char *extra = (const unsigned char *)
3888 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
/* Table size: the distance to the next locale data element.  */
3889 int32_t extrasize = (const unsigned char *)
3890 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3892 for (idx = 0; idx < extrasize;)
3896 int32_t elem_mbs_len;
3897 /* Skip the name of collating element name. */
3898 idx = idx + extra[idx] + 1;
3899 elem_mbs_len = extra[idx++];
3900 if (mbs_len == elem_mbs_len)
/* Byte-wise compare; mbs_cnt stops early on the first mismatch.  */
3902 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3903 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3905 if (mbs_cnt == elem_mbs_len)
3906 /* Found the entry. */
3909 /* Skip the byte sequence of the collating element. */
3910 idx += elem_mbs_len;
3911 /* Adjust for the alignment. */
3912 idx = (idx + 3) & ~3;
3913 /* Skip the collation sequence value. */
3914 idx += sizeof (uint32_t);
3915 /* Skip the wide char sequence of the collating element. */
3916 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3917 /* If we found the entry, return the sequence value. */
3919 return *(uint32_t *) (extra + idx);
3920 /* Skip the collation sequence value. */
3921 idx += sizeof (uint32_t);
3927 #endif /* RE_ENABLE_I18N */
3929 /* Check whether the node accepts the byte which is IDX-th
3930 byte of the INPUT. */
3933 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
/* Single-byte acceptance test for NODE against the IDX-th input byte;
   the multibyte cases are handled by check_node_accept_bytes.  */
3937 ch = re_string_byte_at (&mctx->input, idx);
/* Literal character node: the byte must match exactly.  */
3941 if (node->opr.c != ch)
3945 case SIMPLE_BRACKET:
/* Single-byte bracket expression: membership test in the bitset.  */
3946 if (!bitset_contain (node->opr.sbcset, ch))
3950 #ifdef RE_ENABLE_I18N
3951 case OP_UTF8_PERIOD:
/* In the single-byte path, UTF-8 '.' only accepts ASCII bytes;
   non-ASCII lead bytes go through the multibyte path instead.  */
3952 if (ch >= ASCII_CHARS)
/* '.' never matches newline unless RE_DOT_NEWLINE, and never NUL
   when RE_DOT_NOT_NULL is set.  */
3957 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3958 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3966 if (node->constraint)
3968 /* The node has constraints. Check whether the current context
3969 satisfies the constraints. */
3970 unsigned int context = re_string_context_at (&mctx->input, idx,
3972 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3979 /* Extend the buffers, if the buffers have run out. */
static reg_errcode_t
__attribute_warn_unused_result__
extend_buffers (re_match_context_t *mctx, int min_len)
/* Grow the input buffers (and the matcher's state log) when the current
   buffers have been exhausted; at least MIN_LEN bytes become available.  */
3986 re_string_t *pstr = &mctx->input;
3988 /* Avoid overflow. */
3989 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
3993 /* Double the lengths of the buffers, but allocate at least MIN_LEN. */
3994 ret = re_string_realloc_buffers (pstr,
3996 MIN (pstr->len, pstr->bufs_len * 2)));
3997 if (__glibc_unlikely (ret != REG_NOERROR))
4000 if (mctx->state_log != NULL)
4002 /* And double the length of state_log. */
4003 /* XXX We have no indication of the size of this buffer. If this
4004 allocation fail we have no indication that the state_log array
4005 does not have the right size. */
4006 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4007 pstr->bufs_len + 1);
4008 if (__glibc_unlikely (new_array == NULL))
4010 mctx->state_log = new_array;
4013 /* Then reconstruct the buffers. */
/* Case-folded matching needs the upper-cased shadow buffer rebuilt;
   the wide-character variant is used in multibyte locales.  */
4016 #ifdef RE_ENABLE_I18N
4017 if (pstr->mb_cur_max > 1)
4019 ret = build_wcs_upper_buffer (pstr);
4020 if (__glibc_unlikely (ret != REG_NOERROR))
4024 #endif /* RE_ENABLE_I18N */
4025 build_upper_buffer (pstr);
/* Case-sensitive matching: rebuild only the wide-character buffer
   (multibyte locales) and apply any user translation table.  */
4029 #ifdef RE_ENABLE_I18N
4030 if (pstr->mb_cur_max > 1)
4031 build_wcs_buffer (pstr);
4033 #endif /* RE_ENABLE_I18N */
4035 if (pstr->trans != NULL)
4036 re_string_translate_buffer (pstr);
4043 /* Functions for matching context. */
4045 /* Initialize MCTX. */
static reg_errcode_t
__attribute_warn_unused_result__
match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
/* Initialize MCTX for a new match: allocate room for N backreference
   cache entries and N subexpression tops.  Returns REG_ESPACE-style
   errors on overflow or allocation failure (returns are elided here).  */
4051 mctx->eflags = eflags;
4052 mctx->match_last = -1;
4055 /* Avoid overflow. */
/* Bound N by the larger element size so both allocations are safe.  */
4056 size_t max_object_size =
4057 MAX (sizeof (struct re_backref_cache_entry),
4058 sizeof (re_sub_match_top_t *));
4059 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n))
4062 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4063 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4064 if (__glibc_unlikely (mctx->bkref_ents == NULL || mctx->sub_tops == NULL))
4067 /* Already zero-ed by the caller.
4069 mctx->bkref_ents = NULL;
4070 mctx->nbkref_ents = 0;
4071 mctx->nsub_tops = 0; */
4072 mctx->abkref_ents = n;
4073 mctx->max_mb_elem_len = 1;
4074 mctx->asub_tops = n;
4078 /* Clean the entries which depend on the current input in MCTX.
4079 This function must be invoked when the matcher changes the start index
4080 of the input, or changes the input string. */
match_ctx_clean (re_match_context_t *mctx)
/* Release all per-input state (sub_tops trees and their paths) and reset
   the counters, keeping the arrays themselves for reuse.  */
4086 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4089 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
/* Free each recorded OP_CLOSE_SUBEXP entry and its saved path.  */
4090 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4092 re_sub_match_last_t *last = top->lasts[sl_idx];
4093 re_free (last->path.array);
4096 re_free (top->lasts);
/* The top's own path is heap-allocated separately (may be NULL;
   a guard is presumably in the elided lines -- verify).  */
4099 re_free (top->path->array);
4100 re_free (top->path);
4105 mctx->nsub_tops = 0;
4106 mctx->nbkref_ents = 0;
4109 /* Free all the memory associated with MCTX. */
match_ctx_free (re_match_context_t *mctx)
4114 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4115 match_ctx_clean (mctx)
4116 re_free (mctx->sub_tops);
4117 re_free (mctx->bkref_ents);
4120 /* Add a new backreference entry to MCTX.
4121 Note that we assume that caller never call this function with duplicate
4122 entry, and call with STR_IDX which isn't smaller than any existing entry.
static reg_errcode_t
__attribute_warn_unused_result__
match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
/* Append a backreference cache entry for NODE at STR_IDX whose referenced
   subexpression spans [FROM, TO); see the header comment above.  */
4130 if (mctx->nbkref_ents >= mctx->abkref_ents)
/* Grow the cache geometrically; on failure free the old buffer
   (presumably the error return is in the elided lines -- verify).  */
4132 struct re_backref_cache_entry* new_entry;
4133 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4134 mctx->abkref_ents * 2);
4135 if (__glibc_unlikely (new_entry == NULL))
4137 re_free (mctx->bkref_ents);
4140 mctx->bkref_ents = new_entry;
/* Zero the newly-added half so unused entries read as empty.  */
4141 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4142 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4143 mctx->abkref_ents *= 2;
/* Chain entries that share the same STR_IDX via the 'more' flag, so a
   lookup can walk all entries for one position.  */
4145 if (mctx->nbkref_ents > 0
4146 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4147 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4149 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4150 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4151 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4152 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4154 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4155 If bit N is clear, means that this entry won't epsilon-transition to
4156 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4157 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4160 A backreference does not epsilon-transition unless it is empty, so set
4161 to all zeros if FROM != TO. */
4162 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4163 = (from == to ? -1 : 0);
4165 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
/* Track the longest referenced span; used when sizing buffers.  */
4166 if (mctx->max_mb_elem_len < to - from)
4167 mctx->max_mb_elem_len = to - from;
4171 /* Return the first entry with the same str_idx, or -1 if none is
4172 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
/* Binary search for the first cache entry with .str_idx == STR_IDX;
   the entries are sorted by str_idx (see the comment above).  */
4177 Idx left, right, mid, last;
4178 last = right = mctx->nbkref_ents;
4179 for (left = 0; left < right;)
4181 mid = (left + right) / 2;
4182 if (mctx->bkref_ents[mid].str_idx < str_idx)
/* LEFT now holds the lower bound; check it actually matches.  */
4187 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4193 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
static reg_errcode_t
__attribute_warn_unused_result__
match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
/* Record that OP_OPEN_SUBEXP node NODE matched at STR_IDX by appending a
   fresh re_sub_match_top_t; grows the sub_tops array on demand.  */
4200 DEBUG_ASSERT (mctx->sub_tops != NULL);
4201 DEBUG_ASSERT (mctx->asub_tops > 0);
4202 if (__glibc_unlikely (mctx->nsub_tops == mctx->asub_tops))
4204 Idx new_asub_tops = mctx->asub_tops * 2;
4205 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4206 re_sub_match_top_t *,
4208 if (__glibc_unlikely (new_array == NULL))
4210 mctx->sub_tops = new_array;
4211 mctx->asub_tops = new_asub_tops;
/* calloc zero-fills, so all other fields of the new top start empty.  */
4213 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4214 if (__glibc_unlikely (mctx->sub_tops[mctx->nsub_tops] == NULL))
4216 mctx->sub_tops[mctx->nsub_tops]->node = node;
4217 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4221 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4222 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.
4223 Return the new entry if successful, NULL if memory is exhausted. */
static re_sub_match_last_t *
match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
/* Append an OP_CLOSE_SUBEXP record (NODE matched at STR_IDX) to SUBTOP's
   lasts array; returns the new entry, or NULL on allocation failure.  */
4228 re_sub_match_last_t *new_entry;
4229 if (__glibc_unlikely (subtop->nlasts == subtop->alasts))
/* 2n+1 growth also handles the initial alasts == 0 case.  */
4231 Idx new_alasts = 2 * subtop->alasts + 1;
4232 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4233 re_sub_match_last_t *,
4235 if (__glibc_unlikely (new_array == NULL))
4237 subtop->lasts = new_array;
4238 subtop->alasts = new_alasts;
/* calloc zero-fills, so the entry's path starts out empty.  */
4240 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4241 if (__glibc_likely (new_entry != NULL))
4243 subtop->lasts[subtop->nlasts] = new_entry;
4244 new_entry->node = node;
4245 new_entry->str_idx = str_idx;
4252 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4253 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4255 sctx->sifted_states = sifted_sts;
4256 sctx->limited_states = limited_sts;
4257 sctx->last_node = last_node;
4258 sctx->last_str_idx = last_str_idx;
4259 re_node_set_init_empty (&sctx->limits);