1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2016 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
21 Idx n) internal_function;
22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
23 static void match_ctx_free (re_match_context_t *cache) internal_function;
24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
25 Idx str_idx, Idx from, Idx to)
27 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
30 Idx str_idx) internal_function;
31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
32 Idx node, Idx str_idx)
34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
35 re_dfastate_t **limited_sts, Idx last_node,
38 static reg_errcode_t re_search_internal (const regex_t *preg,
39 const char *string, Idx length,
40 Idx start, Idx last_start, Idx stop,
41 size_t nmatch, regmatch_t pmatch[],
42 int eflags) internal_function;
43 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
44 const char *string1, Idx length1,
45 const char *string2, Idx length2,
46 Idx start, regoff_t range,
47 struct re_registers *regs,
48 Idx stop, bool ret_len) internal_function;
49 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, Idx length, Idx start,
51 regoff_t range, Idx stop,
52 struct re_registers *regs,
53 bool ret_len) internal_function;
54 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
55 Idx nregs, int regs_allocated) internal_function;
56 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
58 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
59 Idx *p_match_first) internal_function;
60 static Idx check_halt_state_context (const re_match_context_t *mctx,
61 const re_dfastate_t *state, Idx idx)
63 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
64 regmatch_t *prev_idx_match, Idx cur_node,
65 Idx cur_idx, Idx nmatch) internal_function;
66 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
67 Idx str_idx, Idx dest_node, Idx nregs,
69 re_node_set *eps_via_nodes)
71 static reg_errcode_t set_regs (const regex_t *preg,
72 const re_match_context_t *mctx,
73 size_t nmatch, regmatch_t *pmatch,
74 bool fl_backtrack) internal_function;
75 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
79 static int sift_states_iter_mb (const re_match_context_t *mctx,
80 re_sift_context_t *sctx,
81 Idx node_idx, Idx str_idx, Idx max_str_idx)
83 #endif /* RE_ENABLE_I18N */
84 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
85 re_sift_context_t *sctx)
87 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
88 re_sift_context_t *sctx, Idx str_idx,
89 re_node_set *cur_dest)
91 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
92 re_sift_context_t *sctx,
94 re_node_set *dest_nodes)
96 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
97 re_node_set *dest_nodes,
98 const re_node_set *candidates)
100 static bool check_dst_limits (const re_match_context_t *mctx,
101 const re_node_set *limits,
102 Idx dst_node, Idx dst_idx, Idx src_node,
103 Idx src_idx) internal_function;
104 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
105 int boundaries, Idx subexp_idx,
106 Idx from_node, Idx bkref_idx)
108 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
109 Idx limit, Idx subexp_idx,
110 Idx node, Idx str_idx,
111 Idx bkref_idx) internal_function;
112 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
113 re_node_set *dest_nodes,
114 const re_node_set *candidates,
116 struct re_backref_cache_entry *bkref_ents,
117 Idx str_idx) internal_function;
118 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
119 re_sift_context_t *sctx,
120 Idx str_idx, const re_node_set *candidates)
122 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
124 re_dfastate_t **src, Idx num)
126 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
127 re_match_context_t *mctx) internal_function;
128 static re_dfastate_t *transit_state (reg_errcode_t *err,
129 re_match_context_t *mctx,
130 re_dfastate_t *state) internal_function;
131 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
132 re_match_context_t *mctx,
133 re_dfastate_t *next_state)
135 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
136 re_node_set *cur_nodes,
137 Idx str_idx) internal_function;
139 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
140 re_match_context_t *mctx,
141 re_dfastate_t *pstate)
144 #ifdef RE_ENABLE_I18N
145 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
146 re_dfastate_t *pstate)
148 #endif /* RE_ENABLE_I18N */
149 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
150 const re_node_set *nodes)
152 static reg_errcode_t get_subexp (re_match_context_t *mctx,
153 Idx bkref_node, Idx bkref_str_idx)
155 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
156 const re_sub_match_top_t *sub_top,
157 re_sub_match_last_t *sub_last,
158 Idx bkref_node, Idx bkref_str)
160 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
161 Idx subexp_idx, int type) internal_function;
162 static reg_errcode_t check_arrival (re_match_context_t *mctx,
163 state_array_t *path, Idx top_node,
164 Idx top_str, Idx last_node, Idx last_str,
165 int type) internal_function;
166 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
168 re_node_set *cur_nodes,
169 re_node_set *next_nodes)
171 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
172 re_node_set *cur_nodes,
173 Idx ex_subexp, int type)
175 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
176 re_node_set *dst_nodes,
177 Idx target, Idx ex_subexp,
178 int type) internal_function;
179 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
180 re_node_set *cur_nodes, Idx cur_str,
181 Idx subexp_num, int type)
183 static bool build_trtable (const re_dfa_t *dfa,
184 re_dfastate_t *state) internal_function;
185 #ifdef RE_ENABLE_I18N
186 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
187 const re_string_t *input, Idx idx)
190 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
194 #endif /* RE_ENABLE_I18N */
195 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
196 const re_dfastate_t *state,
197 re_node_set *states_node,
198 bitset_t *states_ch) internal_function;
199 static bool check_node_accept (const re_match_context_t *mctx,
200 const re_token_t *node, Idx idx)
202 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len)
205 /* Entry point for POSIX code. */
207 /* regexec searches for a given pattern, specified by PREG, in the
210 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
211 'regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
212 least NMATCH elements, and we set them to the offsets of the
213 corresponding matched substrings.
215 EFLAGS specifies "execution flags" which affect matching: if
216 REG_NOTBOL is set, then ^ does not match at the beginning of the
217 string; if REG_NOTEOL is set, then $ does not match at the end.
219 We return 0 if we find a match and REG_NOMATCH if not. */
222 regexec (const regex_t *_Restrict_ preg, const char *_Restrict_ string,
223 size_t nmatch, regmatch_t pmatch[], int eflags)
227 re_dfa_t *dfa = preg->buffer;
229 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
232 if (eflags & REG_STARTEND)
234 start = pmatch[0].rm_so;
235 length = pmatch[0].rm_eo;
240 length = strlen (string);
243 lock_lock (dfa->lock);
245 err = re_search_internal (preg, string, length, start, length,
246 length, 0, NULL, eflags);
248 err = re_search_internal (preg, string, length, start, length,
249 length, nmatch, pmatch, eflags);
250 lock_unlock (dfa->lock);
251 return err != REG_NOERROR;
255 # include <shlib-compat.h>
256 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
258 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
259 __typeof__ (__regexec) __compat_regexec;
262 attribute_compat_text_section
263 __compat_regexec (const regex_t *_Restrict_ preg,
264 const char *_Restrict_ string, size_t nmatch,
265 regmatch_t pmatch[], int eflags)
267 return regexec (preg, string, nmatch, pmatch,
268 eflags & (REG_NOTBOL | REG_NOTEOL));
270 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
274 /* Entry points for GNU code. */
276 /* re_match, re_search, re_match_2, re_search_2
278 The former two functions operate on STRING with length LENGTH,
279 while the later two operate on concatenation of STRING1 and STRING2
280 with lengths LENGTH1 and LENGTH2, respectively.
282 re_match() matches the compiled pattern in BUFP against the string,
283 starting at index START.
285 re_search() first tries matching at index START, then it tries to match
286 starting from index START + 1, and so on. The last start position tried
287 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
290 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
291 the first STOP characters of the concatenation of the strings should be
294 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
295 and all groups is stored in REGS. (For the "_2" variants, the offsets are
296 computed relative to the concatenation, not relative to the individual
299 On success, re_match* functions return the length of the match, re_search*
300 return the position of the start of the match. Return value -1 means no
301 match was found and -2 indicates an internal error. */
304 re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
305 Idx start, struct re_registers *regs)
307 return re_search_stub (bufp, string, length, start, 0, length, regs, true);
310 weak_alias (__re_match, re_match)
314 re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
315 Idx start, regoff_t range, struct re_registers *regs)
317 return re_search_stub (bufp, string, length, start, range, length, regs,
321 weak_alias (__re_search, re_search)
325 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
326 const char *string2, Idx length2, Idx start,
327 struct re_registers *regs, Idx stop)
329 return re_search_2_stub (bufp, string1, length1, string2, length2,
330 start, 0, regs, stop, true);
333 weak_alias (__re_match_2, re_match_2)
337 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
338 const char *string2, Idx length2, Idx start, regoff_t range,
339 struct re_registers *regs, Idx stop)
341 return re_search_2_stub (bufp, string1, length1, string2, length2,
342 start, range, regs, stop, false);
345 weak_alias (__re_search_2, re_search_2)
350 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
351 Idx length1, const char *string2, Idx length2, Idx start,
352 regoff_t range, struct re_registers *regs,
353 Idx stop, bool ret_len)
357 Idx len = length1 + length2;
360 if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
363 /* Concatenate the strings. */
367 s = re_malloc (char, len);
369 if (BE (s == NULL, 0))
372 memcpy (__mempcpy (s, string1, length1), string2, length2);
374 memcpy (s, string1, length1);
375 memcpy (s + length1, string2, length2);
384 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
390 /* The parameters have the same meaning as those of re_search.
391 Additional parameters:
392 If RET_LEN is true the length of the match is returned (re_match style);
393 otherwise the position of the match is returned. */
397 re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
398 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
401 reg_errcode_t result;
406 re_dfa_t *dfa = bufp->buffer;
407 Idx last_start = start + range;
409 /* Check for out-of-range. */
410 if (BE (start < 0 || start > length, 0))
412 if (BE (length < last_start || (0 <= range && last_start < start), 0))
414 else if (BE (last_start < 0 || (range < 0 && start <= last_start), 0))
417 lock_lock (dfa->lock);
419 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
420 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
422 /* Compile fastmap if we haven't yet. */
423 if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
424 re_compile_fastmap (bufp);
426 if (BE (bufp->no_sub, 0))
429 /* We need at least 1 register. */
432 else if (BE (bufp->regs_allocated == REGS_FIXED
433 && regs->num_regs <= bufp->re_nsub, 0))
435 nregs = regs->num_regs;
436 if (BE (nregs < 1, 0))
438 /* Nothing can be copied to regs. */
444 nregs = bufp->re_nsub + 1;
445 pmatch = re_malloc (regmatch_t, nregs);
446 if (BE (pmatch == NULL, 0))
452 result = re_search_internal (bufp, string, length, start, last_start, stop,
453 nregs, pmatch, eflags);
457 /* I hope we needn't fill their regs with -1's when no match was found. */
458 if (result != REG_NOERROR)
459 rval = result == REG_NOMATCH ? -1 : -2;
460 else if (regs != NULL)
462 /* If caller wants register contents data back, copy them. */
463 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
464 bufp->regs_allocated);
465 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
469 if (BE (rval == 0, 1))
473 assert (pmatch[0].rm_so == start);
474 rval = pmatch[0].rm_eo - start;
477 rval = pmatch[0].rm_so;
481 lock_unlock (dfa->lock);
487 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
490 int rval = REGS_REALLOCATE;
492 Idx need_regs = nregs + 1;
493 /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
496 /* Have the register data arrays been allocated? */
497 if (regs_allocated == REGS_UNALLOCATED)
498 { /* No. So allocate them with malloc. */
499 regs->start = re_malloc (regoff_t, need_regs);
500 if (BE (regs->start == NULL, 0))
501 return REGS_UNALLOCATED;
502 regs->end = re_malloc (regoff_t, need_regs);
503 if (BE (regs->end == NULL, 0))
505 re_free (regs->start);
506 return REGS_UNALLOCATED;
508 regs->num_regs = need_regs;
510 else if (regs_allocated == REGS_REALLOCATE)
511 { /* Yes. If we need more elements than were already
512 allocated, reallocate them. If we need fewer, just
514 if (BE (need_regs > regs->num_regs, 0))
516 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
518 if (BE (new_start == NULL, 0))
519 return REGS_UNALLOCATED;
520 new_end = re_realloc (regs->end, regoff_t, need_regs);
521 if (BE (new_end == NULL, 0))
524 return REGS_UNALLOCATED;
526 regs->start = new_start;
528 regs->num_regs = need_regs;
533 assert (regs_allocated == REGS_FIXED);
534 /* This function may not be called with REGS_FIXED and nregs too big. */
535 assert (regs->num_regs >= nregs);
540 for (i = 0; i < nregs; ++i)
542 regs->start[i] = pmatch[i].rm_so;
543 regs->end[i] = pmatch[i].rm_eo;
545 for ( ; i < regs->num_regs; ++i)
546 regs->start[i] = regs->end[i] = -1;
551 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
552 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
553 this memory for recording register information. STARTS and ENDS
554 must be allocated using the malloc library routine, and must each
555 be at least NUM_REGS * sizeof (regoff_t) bytes long.
557 If NUM_REGS == 0, then subsequent matches should allocate their own
560 Unless this function is called, the first search or match using
561 PATTERN_BUFFER will allocate its own register data, without
562 freeing the old data. */
565 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
566 __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
570 bufp->regs_allocated = REGS_REALLOCATE;
571 regs->num_regs = num_regs;
572 regs->start = starts;
577 bufp->regs_allocated = REGS_UNALLOCATED;
579 regs->start = regs->end = NULL;
583 weak_alias (__re_set_registers, re_set_registers)
586 /* Entry points compatible with 4.2 BSD regex library. We don't define
587 them unless specifically requested. */
589 #if defined _REGEX_RE_COMP || defined _LIBC
594 re_exec (const char *s)
596 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
598 #endif /* _REGEX_RE_COMP */
600 /* Internal entry point. */
602 /* Searches for a compiled pattern PREG in the string STRING, whose
603 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
604 meaning as with regexec. LAST_START is START + RANGE, where
605 START and RANGE have the same meaning as with re_search.
606 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
607 otherwise return the error code.
608 Note: We assume front end functions already check ranges.
609 (0 <= LAST_START && LAST_START <= LENGTH) */
612 __attribute_warn_unused_result__ internal_function
613 re_search_internal (const regex_t *preg, const char *string, Idx length,
614 Idx start, Idx last_start, Idx stop, size_t nmatch,
615 regmatch_t pmatch[], int eflags)
618 const re_dfa_t *dfa = preg->buffer;
619 Idx left_lim, right_lim;
621 bool fl_longest_match;
628 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
629 re_match_context_t mctx = { .dfa = dfa };
631 re_match_context_t mctx;
633 char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
634 && start != last_start && !preg->can_be_null)
635 ? preg->fastmap : NULL);
636 RE_TRANSLATE_TYPE t = preg->translate;
638 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
639 memset (&mctx, '\0', sizeof (re_match_context_t));
643 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
644 nmatch -= extra_nmatch;
646 /* Check if the DFA haven't been compiled. */
647 if (BE (preg->used == 0 || dfa->init_state == NULL
648 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
649 || dfa->init_state_begbuf == NULL, 0))
653 /* We assume front-end functions already check them. */
654 assert (0 <= last_start && last_start <= length);
657 /* If initial states with non-begbuf contexts have no elements,
658 the regex must be anchored. If preg->newline_anchor is set,
659 we'll never use init_state_nl, so do not check it. */
660 if (dfa->init_state->nodes.nelem == 0
661 && dfa->init_state_word->nodes.nelem == 0
662 && (dfa->init_state_nl->nodes.nelem == 0
663 || !preg->newline_anchor))
665 if (start != 0 && last_start != 0)
667 start = last_start = 0;
670 /* We must check the longest matching, if nmatch > 0. */
671 fl_longest_match = (nmatch != 0 || dfa->nbackref);
673 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
674 preg->translate, (preg->syntax & RE_ICASE) != 0,
676 if (BE (err != REG_NOERROR, 0))
678 mctx.input.stop = stop;
679 mctx.input.raw_stop = stop;
680 mctx.input.newline_anchor = preg->newline_anchor;
682 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
683 if (BE (err != REG_NOERROR, 0))
686 /* We will log all the DFA states through which the dfa pass,
687 if nmatch > 1, or this dfa has "multibyte node", which is a
688 back-reference or a node which can accept multibyte character or
689 multi character collating element. */
690 if (nmatch > 1 || dfa->has_mb_node)
692 /* Avoid overflow. */
693 if (BE ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
694 <= mctx.input.bufs_len), 0))
700 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
701 if (BE (mctx.state_log == NULL, 0))
708 mctx.state_log = NULL;
711 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
712 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
714 /* Check incrementally whether the input string matches. */
715 incr = (last_start < start) ? -1 : 1;
716 left_lim = (last_start < start) ? last_start : start;
717 right_lim = (last_start < start) ? start : last_start;
718 sb = dfa->mb_cur_max == 1;
721 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
722 | (start <= last_start ? 2 : 0)
723 | (t != NULL ? 1 : 0))
726 for (;; match_first += incr)
729 if (match_first < left_lim || right_lim < match_first)
732 /* Advance as rapidly as possible through the string, until we
733 find a plausible place to start matching. This may be done
734 with varying efficiency, so there are various possibilities:
735 only the most common of them are specialized, in order to
736 save on code size. We use a switch statement for speed. */
744 /* Fastmap with single-byte translation, match forward. */
745 while (BE (match_first < right_lim, 1)
746 && !fastmap[t[(unsigned char) string[match_first]]])
748 goto forward_match_found_start_or_reached_end;
751 /* Fastmap without translation, match forward. */
752 while (BE (match_first < right_lim, 1)
753 && !fastmap[(unsigned char) string[match_first]])
756 forward_match_found_start_or_reached_end:
757 if (BE (match_first == right_lim, 0))
759 ch = match_first >= length
760 ? 0 : (unsigned char) string[match_first];
761 if (!fastmap[t ? t[ch] : ch])
768 /* Fastmap without multi-byte translation, match backwards. */
769 while (match_first >= left_lim)
771 ch = match_first >= length
772 ? 0 : (unsigned char) string[match_first];
773 if (fastmap[t ? t[ch] : ch])
777 if (match_first < left_lim)
782 /* In this case, we can't determine easily the current byte,
783 since it might be a component byte of a multibyte
784 character. Then we use the constructed buffer instead. */
787 /* If MATCH_FIRST is out of the valid range, reconstruct the
789 __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
790 if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
792 err = re_string_reconstruct (&mctx.input, match_first,
794 if (BE (err != REG_NOERROR, 0))
797 offset = match_first - mctx.input.raw_mbs_idx;
799 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
800 Note that MATCH_FIRST must not be smaller than 0. */
801 ch = (match_first >= length
802 ? 0 : re_string_byte_at (&mctx.input, offset));
806 if (match_first < left_lim || match_first > right_lim)
815 /* Reconstruct the buffers so that the matcher can assume that
816 the matching starts from the beginning of the buffer. */
817 err = re_string_reconstruct (&mctx.input, match_first, eflags);
818 if (BE (err != REG_NOERROR, 0))
821 #ifdef RE_ENABLE_I18N
822 /* Don't consider this char as a possible match start if it part,
823 yet isn't the head, of a multibyte character. */
824 if (!sb && !re_string_first_byte (&mctx.input, 0))
828 /* It seems to be appropriate one, then use the matcher. */
829 /* We assume that the matching starts from 0. */
830 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
831 match_last = check_matching (&mctx, fl_longest_match,
832 start <= last_start ? &match_first : NULL);
833 if (match_last != -1)
835 if (BE (match_last == -2, 0))
842 mctx.match_last = match_last;
843 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
845 re_dfastate_t *pstate = mctx.state_log[match_last];
846 mctx.last_node = check_halt_state_context (&mctx, pstate,
849 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
852 err = prune_impossible_nodes (&mctx);
853 if (err == REG_NOERROR)
855 if (BE (err != REG_NOMATCH, 0))
860 break; /* We found a match. */
864 match_ctx_clean (&mctx);
868 assert (match_last != -1);
869 assert (err == REG_NOERROR);
872 /* Set pmatch[] if we need. */
877 /* Initialize registers. */
878 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
879 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
881 /* Set the points where matching start/end. */
883 pmatch[0].rm_eo = mctx.match_last;
884 /* FIXME: This function should fail if mctx.match_last exceeds
885 the maximum possible regoff_t value. We need a new error
886 code REG_OVERFLOW. */
888 if (!preg->no_sub && nmatch > 1)
890 err = set_regs (preg, &mctx, nmatch, pmatch,
891 dfa->has_plural_match && dfa->nbackref > 0);
892 if (BE (err != REG_NOERROR, 0))
896 /* At last, add the offset to each register, since we slid
897 the buffers so that we could assume that the matching starts
899 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
900 if (pmatch[reg_idx].rm_so != -1)
902 #ifdef RE_ENABLE_I18N
903 if (BE (mctx.input.offsets_needed != 0, 0))
905 pmatch[reg_idx].rm_so =
906 (pmatch[reg_idx].rm_so == mctx.input.valid_len
907 ? mctx.input.valid_raw_len
908 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
909 pmatch[reg_idx].rm_eo =
910 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
911 ? mctx.input.valid_raw_len
912 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
915 assert (mctx.input.offsets_needed == 0);
917 pmatch[reg_idx].rm_so += match_first;
918 pmatch[reg_idx].rm_eo += match_first;
920 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
922 pmatch[nmatch + reg_idx].rm_so = -1;
923 pmatch[nmatch + reg_idx].rm_eo = -1;
927 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
928 if (dfa->subexp_map[reg_idx] != reg_idx)
930 pmatch[reg_idx + 1].rm_so
931 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
932 pmatch[reg_idx + 1].rm_eo
933 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
938 re_free (mctx.state_log);
940 match_ctx_free (&mctx);
941 re_string_destruct (&mctx.input);
946 internal_function __attribute_warn_unused_result__
947 prune_impossible_nodes (re_match_context_t *mctx)
949 const re_dfa_t *const dfa = mctx->dfa;
950 Idx halt_node, match_last;
952 re_dfastate_t **sifted_states;
953 re_dfastate_t **lim_states = NULL;
954 re_sift_context_t sctx;
956 assert (mctx->state_log != NULL);
958 match_last = mctx->match_last;
959 halt_node = mctx->last_node;
961 /* Avoid overflow. */
962 if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) <= match_last, 0))
965 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
966 if (BE (sifted_states == NULL, 0))
973 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
974 if (BE (lim_states == NULL, 0))
981 memset (lim_states, '\0',
982 sizeof (re_dfastate_t *) * (match_last + 1));
983 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
985 ret = sift_states_backward (mctx, &sctx);
986 re_node_set_free (&sctx.limits);
987 if (BE (ret != REG_NOERROR, 0))
989 if (sifted_states[0] != NULL || lim_states[0] != NULL)
999 } while (mctx->state_log[match_last] == NULL
1000 || !mctx->state_log[match_last]->halt);
1001 halt_node = check_halt_state_context (mctx,
1002 mctx->state_log[match_last],
1005 ret = merge_state_array (dfa, sifted_states, lim_states,
1007 re_free (lim_states);
1009 if (BE (ret != REG_NOERROR, 0))
1014 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1015 ret = sift_states_backward (mctx, &sctx);
1016 re_node_set_free (&sctx.limits);
1017 if (BE (ret != REG_NOERROR, 0))
1019 if (sifted_states[0] == NULL)
1025 re_free (mctx->state_log);
1026 mctx->state_log = sifted_states;
1027 sifted_states = NULL;
1028 mctx->last_node = halt_node;
1029 mctx->match_last = match_last;
1032 re_free (sifted_states);
1033 re_free (lim_states);
1037 /* Acquire an initial state and return it.
1038 We must select appropriate initial state depending on the context,
1039 since initial states may have constraints like "\<", "^", etc.. */
1041 static inline re_dfastate_t *
1042 __attribute__ ((always_inline)) internal_function
1043 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1046 const re_dfa_t *const dfa = mctx->dfa;
1047 if (dfa->init_state->has_constraint)
1049 unsigned int context;
1050 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1051 if (IS_WORD_CONTEXT (context))
1052 return dfa->init_state_word;
1053 else if (IS_ORDINARY_CONTEXT (context))
1054 return dfa->init_state;
1055 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1056 return dfa->init_state_begbuf;
1057 else if (IS_NEWLINE_CONTEXT (context))
1058 return dfa->init_state_nl;
1059 else if (IS_BEGBUF_CONTEXT (context))
1061 /* It is relatively rare case, then calculate on demand. */
1062 return re_acquire_state_context (err, dfa,
1063 dfa->init_state->entrance_nodes,
1067 /* Must not happen? */
1068 return dfa->init_state;
1071 return dfa->init_state;
1074 /* Check whether the regular expression match input string INPUT or not,
1075 and return the index where the matching end. Return -1 if
1076 there is no match, and return -2 in case of an error.
1077 FL_LONGEST_MATCH means we want the POSIX longest matching.
1078 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1079 next place where we may want to try matching.
1080 Note that the matcher assumes that the matching starts from the current
1081 index of the buffer. */
1084 internal_function __attribute_warn_unused_result__
1085 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1088 const re_dfa_t *const dfa = mctx->dfa;
1091 Idx match_last = -1;
1092 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1093 re_dfastate_t *cur_state;
1094 bool at_init_state = p_match_first != NULL;
1095 Idx next_start_idx = cur_str_idx;
1098 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1099 /* An initial state must not be NULL (invalid). */
1100 if (BE (cur_state == NULL, 0))
1102 assert (err == REG_ESPACE);
1106 if (mctx->state_log != NULL)
1108 mctx->state_log[cur_str_idx] = cur_state;
1110 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1111 later. E.g. Processing back references. */
1112 if (BE (dfa->nbackref, 0))
1114 at_init_state = false;
1115 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1116 if (BE (err != REG_NOERROR, 0))
1119 if (cur_state->has_backref)
1121 err = transit_state_bkref (mctx, &cur_state->nodes);
1122 if (BE (err != REG_NOERROR, 0))
1128 /* If the RE accepts NULL string. */
1129 if (BE (cur_state->halt, 0))
1131 if (!cur_state->has_constraint
1132 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1134 if (!fl_longest_match)
1138 match_last = cur_str_idx;
1144 while (!re_string_eoi (&mctx->input))
1146 re_dfastate_t *old_state = cur_state;
1147 Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1149 if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1150 && mctx->input.bufs_len < mctx->input.len)
1151 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1152 && mctx->input.valid_len < mctx->input.len))
1154 err = extend_buffers (mctx, next_char_idx + 1);
1155 if (BE (err != REG_NOERROR, 0))
1157 assert (err == REG_ESPACE);
1162 cur_state = transit_state (&err, mctx, cur_state);
1163 if (mctx->state_log != NULL)
1164 cur_state = merge_state_with_log (&err, mctx, cur_state);
1166 if (cur_state == NULL)
1168 /* Reached the invalid state or an error. Try to recover a valid
1169 state using the state log, if available and if we have not
1170 already found a valid (even if not the longest) match. */
1171 if (BE (err != REG_NOERROR, 0))
1174 if (mctx->state_log == NULL
1175 || (match && !fl_longest_match)
1176 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1180 if (BE (at_init_state, 0))
1182 if (old_state == cur_state)
1183 next_start_idx = next_char_idx;
1185 at_init_state = false;
1188 if (cur_state->halt)
1190 /* Reached a halt state.
1191 Check the halt state can satisfy the current context. */
1192 if (!cur_state->has_constraint
1193 || check_halt_state_context (mctx, cur_state,
1194 re_string_cur_idx (&mctx->input)))
1196 /* We found an appropriate halt state. */
1197 match_last = re_string_cur_idx (&mctx->input);
1200 /* We found a match, do not modify match_first below. */
1201 p_match_first = NULL;
1202 if (!fl_longest_match)
1209 *p_match_first += next_start_idx;
1214 /* Check NODE match the current context. */
1218 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1220 re_token_type_t type = dfa->nodes[node].type;
1221 unsigned int constraint = dfa->nodes[node].constraint;
1222 if (type != END_OF_RE)
1226 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1231 /* Check the halt state STATE match the current context.
1232 Return 0 if not match, if the node, STATE has, is a halt node and
1233 match the context, return the node. */
1237 check_halt_state_context (const re_match_context_t *mctx,
1238 const re_dfastate_t *state, Idx idx)
1241 unsigned int context;
1243 assert (state->halt);
1245 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1246 for (i = 0; i < state->nodes.nelem; ++i)
1247 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1248 return state->nodes.elems[i];
1252 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1253 corresponding to the DFA).
1254 Return the destination node, and update EPS_VIA_NODES;
1255 return -1 in case of errors. */
1259 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1260 Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1261 struct re_fail_stack_t *fs)
1263 const re_dfa_t *const dfa = mctx->dfa;
1266 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1268 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1269 re_node_set *edests = &dfa->edests[node];
1271 ok = re_node_set_insert (eps_via_nodes, node);
1274 /* Pick up a valid destination, or return -1 if none
1276 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1278 Idx candidate = edests->elems[i];
1279 if (!re_node_set_contains (cur_nodes, candidate))
1281 if (dest_node == -1)
1282 dest_node = candidate;
1286 /* In order to avoid infinite loop like "(a*)*", return the second
1287 epsilon-transition if the first was already considered. */
1288 if (re_node_set_contains (eps_via_nodes, dest_node))
1291 /* Otherwise, push the second epsilon-transition on the fail stack. */
1293 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1297 /* We know we are going to exit. */
1306 re_token_type_t type = dfa->nodes[node].type;
1308 #ifdef RE_ENABLE_I18N
1309 if (dfa->nodes[node].accept_mb)
1310 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1312 #endif /* RE_ENABLE_I18N */
1313 if (type == OP_BACK_REF)
1315 Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1316 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1319 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1323 char *buf = (char *) re_string_get_buffer (&mctx->input);
1324 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1333 ok = re_node_set_insert (eps_via_nodes, node);
1336 dest_node = dfa->edests[node].elems[0];
1337 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1344 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1346 Idx dest_node = dfa->nexts[node];
1347 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1348 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1349 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1352 re_node_set_empty (eps_via_nodes);
1359 static reg_errcode_t
1360 internal_function __attribute_warn_unused_result__
1361 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1362 Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1365 Idx num = fs->num++;
1366 if (fs->num == fs->alloc)
1368 struct re_fail_stack_ent_t *new_array;
1369 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1371 if (new_array == NULL)
1374 fs->stack = new_array;
1376 fs->stack[num].idx = str_idx;
1377 fs->stack[num].node = dest_node;
1378 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1379 if (fs->stack[num].regs == NULL)
1381 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1382 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1388 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1389 regmatch_t *regs, re_node_set *eps_via_nodes)
1391 Idx num = --fs->num;
1393 *pidx = fs->stack[num].idx;
1394 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1395 re_node_set_free (eps_via_nodes);
1396 re_free (fs->stack[num].regs);
1397 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1398 return fs->stack[num].node;
1401 /* Set the positions where the subexpressions are starts/ends to registers
1403 Note: We assume that pmatch[0] is already set, and
1404 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1406 static reg_errcode_t
1407 internal_function __attribute_warn_unused_result__
1408 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1409 regmatch_t *pmatch, bool fl_backtrack)
1411 const re_dfa_t *dfa = preg->buffer;
1413 re_node_set eps_via_nodes;
1414 struct re_fail_stack_t *fs;
1415 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1416 regmatch_t *prev_idx_match;
1417 bool prev_idx_match_malloced = false;
1420 assert (nmatch > 1);
1421 assert (mctx->state_log != NULL);
1426 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1427 if (fs->stack == NULL)
1433 cur_node = dfa->init_node;
1434 re_node_set_init_empty (&eps_via_nodes);
1436 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1437 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1440 prev_idx_match = re_malloc (regmatch_t, nmatch);
1441 if (prev_idx_match == NULL)
1443 free_fail_stack_return (fs);
1446 prev_idx_match_malloced = true;
1448 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1450 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1452 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1454 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1459 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1460 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1462 if (reg_idx == nmatch)
1464 re_node_set_free (&eps_via_nodes);
1465 if (prev_idx_match_malloced)
1466 re_free (prev_idx_match);
1467 return free_fail_stack_return (fs);
1469 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1474 re_node_set_free (&eps_via_nodes);
1475 if (prev_idx_match_malloced)
1476 re_free (prev_idx_match);
1481 /* Proceed to next node. */
1482 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1483 &eps_via_nodes, fs);
1485 if (BE (cur_node < 0, 0))
1487 if (BE (cur_node == -2, 0))
1489 re_node_set_free (&eps_via_nodes);
1490 if (prev_idx_match_malloced)
1491 re_free (prev_idx_match);
1492 free_fail_stack_return (fs);
1496 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1500 re_node_set_free (&eps_via_nodes);
1501 if (prev_idx_match_malloced)
1502 re_free (prev_idx_match);
1507 re_node_set_free (&eps_via_nodes);
1508 if (prev_idx_match_malloced)
1509 re_free (prev_idx_match);
1510 return free_fail_stack_return (fs);
1513 static reg_errcode_t
1515 free_fail_stack_return (struct re_fail_stack_t *fs)
1520 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1522 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1523 re_free (fs->stack[fs_idx].regs);
1525 re_free (fs->stack);
1532 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1533 regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1535 int type = dfa->nodes[cur_node].type;
1536 if (type == OP_OPEN_SUBEXP)
1538 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1540 /* We are at the first node of this sub expression. */
1541 if (reg_num < nmatch)
1543 pmatch[reg_num].rm_so = cur_idx;
1544 pmatch[reg_num].rm_eo = -1;
1547 else if (type == OP_CLOSE_SUBEXP)
1549 Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1550 if (reg_num < nmatch)
1552 /* We are at the last node of this sub expression. */
1553 if (pmatch[reg_num].rm_so < cur_idx)
1555 pmatch[reg_num].rm_eo = cur_idx;
1556 /* This is a non-empty match or we are not inside an optional
1557 subexpression. Accept this right away. */
1558 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1562 if (dfa->nodes[cur_node].opt_subexp
1563 && prev_idx_match[reg_num].rm_so != -1)
1564 /* We transited through an empty match for an optional
1565 subexpression, like (a?)*, and this is not the subexp's
1566 first match. Copy back the old content of the registers
1567 so that matches of an inner subexpression are undone as
1568 well, like in ((a?))*. */
1569 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1571 /* We completed a subexpression, but it may be part of
1572 an optional one, so do not update PREV_IDX_MATCH. */
1573 pmatch[reg_num].rm_eo = cur_idx;
1579 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1580 and sift the nodes in each states according to the following rules.
1581 Updated state_log will be wrote to STATE_LOG.
1583 Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1584 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1585 If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1586 the LAST_NODE, we throw away the node 'a'.
1587 2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1588 string 's' and transit to 'b':
1589 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1591 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1592 thrown away, we throw away the node 'a'.
1593 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1594 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1596 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1597 we throw away the node 'a'. */
1599 #define STATE_NODE_CONTAINS(state,node) \
1600 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1602 static reg_errcode_t
1604 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1608 Idx str_idx = sctx->last_str_idx;
1609 re_node_set cur_dest;
1612 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1615 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1616 transit to the last_node and the last_node itself. */
1617 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1618 if (BE (err != REG_NOERROR, 0))
1620 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1621 if (BE (err != REG_NOERROR, 0))
1624 /* Then check each states in the state_log. */
1627 /* Update counters. */
1628 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1629 if (null_cnt > mctx->max_mb_elem_len)
1631 memset (sctx->sifted_states, '\0',
1632 sizeof (re_dfastate_t *) * str_idx);
1633 re_node_set_free (&cur_dest);
1636 re_node_set_empty (&cur_dest);
1639 if (mctx->state_log[str_idx])
1641 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1642 if (BE (err != REG_NOERROR, 0))
1646 /* Add all the nodes which satisfy the following conditions:
1647 - It can epsilon transit to a node in CUR_DEST.
1649 And update state_log. */
1650 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1651 if (BE (err != REG_NOERROR, 0))
1656 re_node_set_free (&cur_dest);
1660 static reg_errcode_t
1661 internal_function __attribute_warn_unused_result__
1662 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1663 Idx str_idx, re_node_set *cur_dest)
1665 const re_dfa_t *const dfa = mctx->dfa;
1666 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1669 /* Then build the next sifted state.
1670 We build the next sifted state on 'cur_dest', and update
1671 'sifted_states[str_idx]' with 'cur_dest'.
1673 'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1674 'cur_src' points the node_set of the old 'state_log[str_idx]'
1675 (with the epsilon nodes pre-filtered out). */
1676 for (i = 0; i < cur_src->nelem; i++)
1678 Idx prev_node = cur_src->elems[i];
1683 re_token_type_t type = dfa->nodes[prev_node].type;
1684 assert (!IS_EPSILON_NODE (type));
1686 #ifdef RE_ENABLE_I18N
1687 /* If the node may accept "multi byte". */
1688 if (dfa->nodes[prev_node].accept_mb)
1689 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1690 str_idx, sctx->last_str_idx);
1691 #endif /* RE_ENABLE_I18N */
1693 /* We don't check backreferences here.
1694 See update_cur_sifted_state(). */
1696 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1697 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1698 dfa->nexts[prev_node]))
1704 if (sctx->limits.nelem)
1706 Idx to_idx = str_idx + naccepted;
1707 if (check_dst_limits (mctx, &sctx->limits,
1708 dfa->nexts[prev_node], to_idx,
1709 prev_node, str_idx))
1712 ok = re_node_set_insert (cur_dest, prev_node);
1720 /* Helper functions. */
1722 static reg_errcode_t
1724 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1726 Idx top = mctx->state_log_top;
1728 if ((next_state_log_idx >= mctx->input.bufs_len
1729 && mctx->input.bufs_len < mctx->input.len)
1730 || (next_state_log_idx >= mctx->input.valid_len
1731 && mctx->input.valid_len < mctx->input.len))
1734 err = extend_buffers (mctx, next_state_log_idx + 1);
1735 if (BE (err != REG_NOERROR, 0))
1739 if (top < next_state_log_idx)
1741 memset (mctx->state_log + top + 1, '\0',
1742 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1743 mctx->state_log_top = next_state_log_idx;
1748 static reg_errcode_t
1750 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1751 re_dfastate_t **src, Idx num)
1755 for (st_idx = 0; st_idx < num; ++st_idx)
1757 if (dst[st_idx] == NULL)
1758 dst[st_idx] = src[st_idx];
1759 else if (src[st_idx] != NULL)
1761 re_node_set merged_set;
1762 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1763 &src[st_idx]->nodes);
1764 if (BE (err != REG_NOERROR, 0))
1766 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1767 re_node_set_free (&merged_set);
1768 if (BE (err != REG_NOERROR, 0))
1775 static reg_errcode_t
1777 update_cur_sifted_state (const re_match_context_t *mctx,
1778 re_sift_context_t *sctx, Idx str_idx,
1779 re_node_set *dest_nodes)
1781 const re_dfa_t *const dfa = mctx->dfa;
1782 reg_errcode_t err = REG_NOERROR;
1783 const re_node_set *candidates;
1784 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1785 : &mctx->state_log[str_idx]->nodes);
1787 if (dest_nodes->nelem == 0)
1788 sctx->sifted_states[str_idx] = NULL;
1793 /* At first, add the nodes which can epsilon transit to a node in
1795 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1796 if (BE (err != REG_NOERROR, 0))
1799 /* Then, check the limitations in the current sift_context. */
1800 if (sctx->limits.nelem)
1802 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1803 mctx->bkref_ents, str_idx);
1804 if (BE (err != REG_NOERROR, 0))
1809 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1810 if (BE (err != REG_NOERROR, 0))
1814 if (candidates && mctx->state_log[str_idx]->has_backref)
1816 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1817 if (BE (err != REG_NOERROR, 0))
1823 static reg_errcode_t
1824 internal_function __attribute_warn_unused_result__
1825 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1826 const re_node_set *candidates)
1828 reg_errcode_t err = REG_NOERROR;
1831 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1832 if (BE (err != REG_NOERROR, 0))
1835 if (!state->inveclosure.alloc)
1837 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1838 if (BE (err != REG_NOERROR, 0))
1840 for (i = 0; i < dest_nodes->nelem; i++)
1842 err = re_node_set_merge (&state->inveclosure,
1843 dfa->inveclosures + dest_nodes->elems[i]);
1844 if (BE (err != REG_NOERROR, 0))
1848 return re_node_set_add_intersect (dest_nodes, candidates,
1849 &state->inveclosure);
1852 static reg_errcode_t
1854 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1855 const re_node_set *candidates)
1859 re_node_set *inv_eclosure = dfa->inveclosures + node;
1860 re_node_set except_nodes;
1861 re_node_set_init_empty (&except_nodes);
1862 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1864 Idx cur_node = inv_eclosure->elems[ecl_idx];
1865 if (cur_node == node)
1867 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1869 Idx edst1 = dfa->edests[cur_node].elems[0];
1870 Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1871 ? dfa->edests[cur_node].elems[1] : -1);
1872 if ((!re_node_set_contains (inv_eclosure, edst1)
1873 && re_node_set_contains (dest_nodes, edst1))
1875 && !re_node_set_contains (inv_eclosure, edst2)
1876 && re_node_set_contains (dest_nodes, edst2)))
1878 err = re_node_set_add_intersect (&except_nodes, candidates,
1879 dfa->inveclosures + cur_node);
1880 if (BE (err != REG_NOERROR, 0))
1882 re_node_set_free (&except_nodes);
1888 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1890 Idx cur_node = inv_eclosure->elems[ecl_idx];
1891 if (!re_node_set_contains (&except_nodes, cur_node))
1893 Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1894 re_node_set_remove_at (dest_nodes, idx);
1897 re_node_set_free (&except_nodes);
1903 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1904 Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1906 const re_dfa_t *const dfa = mctx->dfa;
1907 Idx lim_idx, src_pos, dst_pos;
1909 Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1910 Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1911 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1914 struct re_backref_cache_entry *ent;
1915 ent = mctx->bkref_ents + limits->elems[lim_idx];
1916 subexp_idx = dfa->nodes[ent->node].opr.idx;
1918 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1919 subexp_idx, dst_node, dst_idx,
1921 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1922 subexp_idx, src_node, src_idx,
1926 <src> <dst> ( <subexp> )
1927 ( <subexp> ) <src> <dst>
1928 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1929 if (src_pos == dst_pos)
1930 continue; /* This is unrelated limitation. */
1939 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1940 Idx subexp_idx, Idx from_node, Idx bkref_idx)
1942 const re_dfa_t *const dfa = mctx->dfa;
1943 const re_node_set *eclosures = dfa->eclosures + from_node;
1946 /* Else, we are on the boundary: examine the nodes on the epsilon
1948 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1950 Idx node = eclosures->elems[node_idx];
1951 switch (dfa->nodes[node].type)
1954 if (bkref_idx != -1)
1956 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1962 if (ent->node != node)
1965 if (subexp_idx < BITSET_WORD_BITS
1966 && !(ent->eps_reachable_subexps_map
1967 & ((bitset_word_t) 1 << subexp_idx)))
1970 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1971 OP_CLOSE_SUBEXP cases below. But, if the
1972 destination node is the same node as the source
1973 node, don't recurse because it would cause an
1974 infinite loop: a regex that exhibits this behavior
1976 dst = dfa->edests[node].elems[0];
1977 if (dst == from_node)
1981 else /* if (boundaries & 2) */
1986 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1988 if (cpos == -1 /* && (boundaries & 1) */)
1990 if (cpos == 0 && (boundaries & 2))
1993 if (subexp_idx < BITSET_WORD_BITS)
1994 ent->eps_reachable_subexps_map
1995 &= ~((bitset_word_t) 1 << subexp_idx);
1997 while (ent++->more);
2001 case OP_OPEN_SUBEXP:
2002 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2006 case OP_CLOSE_SUBEXP:
2007 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2016 return (boundaries & 2) ? 1 : 0;
2021 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
2022 Idx subexp_idx, Idx from_node, Idx str_idx,
2025 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2028 /* If we are outside the range of the subexpression, return -1 or 1. */
2029 if (str_idx < lim->subexp_from)
2032 if (lim->subexp_to < str_idx)
2035 /* If we are within the subexpression, return 0. */
2036 boundaries = (str_idx == lim->subexp_from);
2037 boundaries |= (str_idx == lim->subexp_to) << 1;
2038 if (boundaries == 0)
2041 /* Else, examine epsilon closure. */
2042 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2043 from_node, bkref_idx);
2046 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2047 which are against limitations from DEST_NODES. */
2049 static reg_errcode_t
2051 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2052 const re_node_set *candidates, re_node_set *limits,
2053 struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2056 Idx node_idx, lim_idx;
2058 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2061 struct re_backref_cache_entry *ent;
2062 ent = bkref_ents + limits->elems[lim_idx];
2064 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2065 continue; /* This is unrelated limitation. */
2067 subexp_idx = dfa->nodes[ent->node].opr.idx;
2068 if (ent->subexp_to == str_idx)
2072 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2074 Idx node = dest_nodes->elems[node_idx];
2075 re_token_type_t type = dfa->nodes[node].type;
2076 if (type == OP_OPEN_SUBEXP
2077 && subexp_idx == dfa->nodes[node].opr.idx)
2079 else if (type == OP_CLOSE_SUBEXP
2080 && subexp_idx == dfa->nodes[node].opr.idx)
2084 /* Check the limitation of the open subexpression. */
2085 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2088 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2090 if (BE (err != REG_NOERROR, 0))
2094 /* Check the limitation of the close subexpression. */
2096 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2098 Idx node = dest_nodes->elems[node_idx];
2099 if (!re_node_set_contains (dfa->inveclosures + node,
2101 && !re_node_set_contains (dfa->eclosures + node,
2104 /* It is against this limitation.
2105 Remove it form the current sifted state. */
2106 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2108 if (BE (err != REG_NOERROR, 0))
2114 else /* (ent->subexp_to != str_idx) */
2116 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2118 Idx node = dest_nodes->elems[node_idx];
2119 re_token_type_t type = dfa->nodes[node].type;
2120 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2122 if (subexp_idx != dfa->nodes[node].opr.idx)
2124 /* It is against this limitation.
2125 Remove it form the current sifted state. */
2126 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2128 if (BE (err != REG_NOERROR, 0))
2137 static reg_errcode_t
2138 internal_function __attribute_warn_unused_result__
2139 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2140 Idx str_idx, const re_node_set *candidates)
2142 const re_dfa_t *const dfa = mctx->dfa;
2145 re_sift_context_t local_sctx;
2146 Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2148 if (first_idx == -1)
2151 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2153 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2156 re_token_type_t type;
2157 struct re_backref_cache_entry *entry;
2158 node = candidates->elems[node_idx];
2159 type = dfa->nodes[node].type;
2160 /* Avoid infinite loop for the REs like "()\1+". */
2161 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2163 if (type != OP_BACK_REF)
2166 entry = mctx->bkref_ents + first_idx;
2167 enabled_idx = first_idx;
2174 re_dfastate_t *cur_state;
2176 if (entry->node != node)
2178 subexp_len = entry->subexp_to - entry->subexp_from;
2179 to_idx = str_idx + subexp_len;
2180 dst_node = (subexp_len ? dfa->nexts[node]
2181 : dfa->edests[node].elems[0]);
2183 if (to_idx > sctx->last_str_idx
2184 || sctx->sifted_states[to_idx] == NULL
2185 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2186 || check_dst_limits (mctx, &sctx->limits, node,
2187 str_idx, dst_node, to_idx))
2190 if (local_sctx.sifted_states == NULL)
2193 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2194 if (BE (err != REG_NOERROR, 0))
2197 local_sctx.last_node = node;
2198 local_sctx.last_str_idx = str_idx;
2199 ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2205 cur_state = local_sctx.sifted_states[str_idx];
2206 err = sift_states_backward (mctx, &local_sctx);
2207 if (BE (err != REG_NOERROR, 0))
2209 if (sctx->limited_states != NULL)
2211 err = merge_state_array (dfa, sctx->limited_states,
2212 local_sctx.sifted_states,
2214 if (BE (err != REG_NOERROR, 0))
2217 local_sctx.sifted_states[str_idx] = cur_state;
2218 re_node_set_remove (&local_sctx.limits, enabled_idx);
2220 /* mctx->bkref_ents may have changed, reload the pointer. */
2221 entry = mctx->bkref_ents + enabled_idx;
2223 while (enabled_idx++, entry++->more);
2227 if (local_sctx.sifted_states != NULL)
2229 re_node_set_free (&local_sctx.limits);
2236 #ifdef RE_ENABLE_I18N
2239 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2240 Idx node_idx, Idx str_idx, Idx max_str_idx)
2242 const re_dfa_t *const dfa = mctx->dfa;
2244 /* Check the node can accept "multi byte". */
2245 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2246 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2247 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2248 dfa->nexts[node_idx]))
2249 /* The node can't accept the "multi byte", or the
2250 destination was already thrown away, then the node
2251 could't accept the current input "multi byte". */
2253 /* Otherwise, it is sure that the node could accept
2254 'naccepted' bytes input. */
2257 #endif /* RE_ENABLE_I18N */
2260 /* Functions for state transition. */
2262 /* Return the next state to which the current state STATE will transit by
2263 accepting the current input byte, and update STATE_LOG if necessary.
2264 If STATE can accept a multibyte char/collating element/back reference
2265 update the destination of STATE_LOG. */
2267 static re_dfastate_t *
2268 internal_function __attribute_warn_unused_result__
2269 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2270 re_dfastate_t *state)
2272 re_dfastate_t **trtable;
2275 #ifdef RE_ENABLE_I18N
2276 /* If the current state can accept multibyte. */
2277 if (BE (state->accept_mb, 0))
2279 *err = transit_state_mb (mctx, state);
2280 if (BE (*err != REG_NOERROR, 0))
2283 #endif /* RE_ENABLE_I18N */
2285 /* Then decide the next state with the single byte. */
2288 /* don't use transition table */
2289 return transit_state_sb (err, mctx, state);
2292 /* Use transition table */
2293 ch = re_string_fetch_byte (&mctx->input);
2296 trtable = state->trtable;
2297 if (BE (trtable != NULL, 1))
2300 trtable = state->word_trtable;
2301 if (BE (trtable != NULL, 1))
2303 unsigned int context;
2305 = re_string_context_at (&mctx->input,
2306 re_string_cur_idx (&mctx->input) - 1,
2308 if (IS_WORD_CONTEXT (context))
2309 return trtable[ch + SBC_MAX];
2314 if (!build_trtable (mctx->dfa, state))
2320 /* Retry, we now have a transition table. */
2324 /* Update the state_log if we need */
2325 static re_dfastate_t *
2327 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2328 re_dfastate_t *next_state)
2330 const re_dfa_t *const dfa = mctx->dfa;
2331 Idx cur_idx = re_string_cur_idx (&mctx->input);
2333 if (cur_idx > mctx->state_log_top)
2335 mctx->state_log[cur_idx] = next_state;
2336 mctx->state_log_top = cur_idx;
2338 else if (mctx->state_log[cur_idx] == 0)
2340 mctx->state_log[cur_idx] = next_state;
2344 re_dfastate_t *pstate;
2345 unsigned int context;
2346 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2347 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2348 the destination of a multibyte char/collating element/
2349 back reference. Then the next state is the union set of
2350 these destinations and the results of the transition table. */
2351 pstate = mctx->state_log[cur_idx];
2352 log_nodes = pstate->entrance_nodes;
2353 if (next_state != NULL)
2355 table_nodes = next_state->entrance_nodes;
2356 *err = re_node_set_init_union (&next_nodes, table_nodes,
2358 if (BE (*err != REG_NOERROR, 0))
2362 next_nodes = *log_nodes;
2363 /* Note: We already add the nodes of the initial state,
2364 then we don't need to add them here. */
2366 context = re_string_context_at (&mctx->input,
2367 re_string_cur_idx (&mctx->input) - 1,
2369 next_state = mctx->state_log[cur_idx]
2370 = re_acquire_state_context (err, dfa, &next_nodes, context);
2371 /* We don't need to check errors here, since the return value of
2372 this function is next_state and ERR is already set. */
2374 if (table_nodes != NULL)
2375 re_node_set_free (&next_nodes);
2378 if (BE (dfa->nbackref, 0) && next_state != NULL)
2380 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2381 later. We must check them here, since the back references in the
2382 next state might use them. */
2383 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2385 if (BE (*err != REG_NOERROR, 0))
2388 /* If the next state has back references. */
2389 if (next_state->has_backref)
2391 *err = transit_state_bkref (mctx, &next_state->nodes);
2392 if (BE (*err != REG_NOERROR, 0))
2394 next_state = mctx->state_log[cur_idx];
2401 /* Skip bytes in the input that correspond to part of a
2402 multi-byte match, then look in the log for a state
2403 from which to restart matching. */
2404 static re_dfastate_t *
2406 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2408 re_dfastate_t *cur_state;
2411 Idx max = mctx->state_log_top;
2412 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2416 if (++cur_str_idx > max)
2418 re_string_skip_bytes (&mctx->input, 1);
2420 while (mctx->state_log[cur_str_idx] == NULL);
2422 cur_state = merge_state_with_log (err, mctx, NULL);
2424 while (*err == REG_NOERROR && cur_state == NULL);
2428 /* Helper functions for transit_state. */
2430 /* From the node set CUR_NODES, pick up the nodes whose types are
2431 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2432 expression. And register them to use them later for evaluating the
2433 corresponding back references. */
2435 static reg_errcode_t
2437 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2440 const re_dfa_t *const dfa = mctx->dfa;
2444 /* TODO: This isn't efficient.
2445 Because there might be more than one nodes whose types are
2446 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2449 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2451 Idx node = cur_nodes->elems[node_idx];
2452 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2453 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2454 && (dfa->used_bkref_map
2455 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2457 err = match_ctx_add_subtop (mctx, node, str_idx);
2458 if (BE (err != REG_NOERROR, 0))
2466 /* Return the next state to which the current state STATE will transit by
2467 accepting the current input byte. */
2469 static re_dfastate_t *
2470 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2471 re_dfastate_t *state)
2473 const re_dfa_t *const dfa = mctx->dfa;
2474 re_node_set next_nodes;
2475 re_dfastate_t *next_state;
2476 Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2477 unsigned int context;
2479 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2480 if (BE (*err != REG_NOERROR, 0))
2482 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2484 Idx cur_node = state->nodes.elems[node_cnt];
2485 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2487 *err = re_node_set_merge (&next_nodes,
2488 dfa->eclosures + dfa->nexts[cur_node]);
2489 if (BE (*err != REG_NOERROR, 0))
2491 re_node_set_free (&next_nodes);
2496 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2497 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2498 /* We don't need to check errors here, since the return value of
2499 this function is next_state and ERR is already set. */
2501 re_node_set_free (&next_nodes);
2502 re_string_skip_bytes (&mctx->input, 1);
2507 #ifdef RE_ENABLE_I18N
2508 static reg_errcode_t
2510 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2512 const re_dfa_t *const dfa = mctx->dfa;
2516 for (i = 0; i < pstate->nodes.nelem; ++i)
2518 re_node_set dest_nodes, *new_nodes;
2519 Idx cur_node_idx = pstate->nodes.elems[i];
2522 unsigned int context;
2523 re_dfastate_t *dest_state;
2525 if (!dfa->nodes[cur_node_idx].accept_mb)
2528 if (dfa->nodes[cur_node_idx].constraint)
2530 context = re_string_context_at (&mctx->input,
2531 re_string_cur_idx (&mctx->input),
2533 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2538 /* How many bytes the node can accept? */
2539 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2540 re_string_cur_idx (&mctx->input));
2544 /* The node can accepts 'naccepted' bytes. */
2545 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2546 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2547 : mctx->max_mb_elem_len);
2548 err = clean_state_log_if_needed (mctx, dest_idx);
2549 if (BE (err != REG_NOERROR, 0))
2552 assert (dfa->nexts[cur_node_idx] != -1);
2554 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2556 dest_state = mctx->state_log[dest_idx];
2557 if (dest_state == NULL)
2558 dest_nodes = *new_nodes;
2561 err = re_node_set_init_union (&dest_nodes,
2562 dest_state->entrance_nodes, new_nodes);
2563 if (BE (err != REG_NOERROR, 0))
2566 context = re_string_context_at (&mctx->input, dest_idx - 1,
2568 mctx->state_log[dest_idx]
2569 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2570 if (dest_state != NULL)
2571 re_node_set_free (&dest_nodes);
2572 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2577 #endif /* RE_ENABLE_I18N */
2579 static reg_errcode_t
2581 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2583 const re_dfa_t *const dfa = mctx->dfa;
2586 Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2588 for (i = 0; i < nodes->nelem; ++i)
2590 Idx dest_str_idx, prev_nelem, bkc_idx;
2591 Idx node_idx = nodes->elems[i];
2592 unsigned int context;
2593 const re_token_t *node = dfa->nodes + node_idx;
2594 re_node_set *new_dest_nodes;
2596 /* Check whether 'node' is a backreference or not. */
2597 if (node->type != OP_BACK_REF)
2600 if (node->constraint)
2602 context = re_string_context_at (&mctx->input, cur_str_idx,
2604 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2608 /* 'node' is a backreference.
2609 Check the substring which the substring matched. */
2610 bkc_idx = mctx->nbkref_ents;
2611 err = get_subexp (mctx, node_idx, cur_str_idx);
2612 if (BE (err != REG_NOERROR, 0))
2615 /* And add the epsilon closures (which is 'new_dest_nodes') of
2616 the backreference to appropriate state_log. */
2618 assert (dfa->nexts[node_idx] != -1);
2620 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2623 re_dfastate_t *dest_state;
2624 struct re_backref_cache_entry *bkref_ent;
2625 bkref_ent = mctx->bkref_ents + bkc_idx;
2626 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2628 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2629 new_dest_nodes = (subexp_len == 0
2630 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2631 : dfa->eclosures + dfa->nexts[node_idx]);
2632 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2633 - bkref_ent->subexp_from);
2634 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2636 dest_state = mctx->state_log[dest_str_idx];
2637 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2638 : mctx->state_log[cur_str_idx]->nodes.nelem);
2639 /* Add 'new_dest_node' to state_log. */
2640 if (dest_state == NULL)
2642 mctx->state_log[dest_str_idx]
2643 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2645 if (BE (mctx->state_log[dest_str_idx] == NULL
2646 && err != REG_NOERROR, 0))
2651 re_node_set dest_nodes;
2652 err = re_node_set_init_union (&dest_nodes,
2653 dest_state->entrance_nodes,
2655 if (BE (err != REG_NOERROR, 0))
2657 re_node_set_free (&dest_nodes);
2660 mctx->state_log[dest_str_idx]
2661 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2662 re_node_set_free (&dest_nodes);
2663 if (BE (mctx->state_log[dest_str_idx] == NULL
2664 && err != REG_NOERROR, 0))
2667 /* We need to check recursively if the backreference can epsilon
2670 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2672 err = check_subexp_matching_top (mctx, new_dest_nodes,
2674 if (BE (err != REG_NOERROR, 0))
2676 err = transit_state_bkref (mctx, new_dest_nodes);
2677 if (BE (err != REG_NOERROR, 0))
2687 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2688 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2689 Note that we might collect inappropriate candidates here.
2690 However, the cost of checking them strictly here is too high, then we
2691 delay these checking for prune_impossible_nodes(). */
2693 static reg_errcode_t
2694 internal_function __attribute_warn_unused_result__
2695 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2697 const re_dfa_t *const dfa = mctx->dfa;
2698 Idx subexp_num, sub_top_idx;
2699 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2700 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2701 Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2702 if (cache_idx != -1)
2704 const struct re_backref_cache_entry *entry
2705 = mctx->bkref_ents + cache_idx;
2707 if (entry->node == bkref_node)
2708 return REG_NOERROR; /* We already checked it. */
2709 while (entry++->more);
2712 subexp_num = dfa->nodes[bkref_node].opr.idx;
2714 /* For each sub expression */
2715 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2718 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2719 re_sub_match_last_t *sub_last;
2720 Idx sub_last_idx, sl_str, bkref_str_off;
2722 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2723 continue; /* It isn't related. */
2725 sl_str = sub_top->str_idx;
2726 bkref_str_off = bkref_str_idx;
2727 /* At first, check the last node of sub expressions we already
2729 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2731 regoff_t sl_str_diff;
2732 sub_last = sub_top->lasts[sub_last_idx];
2733 sl_str_diff = sub_last->str_idx - sl_str;
2734 /* The matched string by the sub expression match with the substring
2735 at the back reference? */
2736 if (sl_str_diff > 0)
2738 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2740 /* Not enough chars for a successful match. */
2741 if (bkref_str_off + sl_str_diff > mctx->input.len)
2744 err = clean_state_log_if_needed (mctx,
2747 if (BE (err != REG_NOERROR, 0))
2749 buf = (const char *) re_string_get_buffer (&mctx->input);
2751 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2752 /* We don't need to search this sub expression any more. */
2755 bkref_str_off += sl_str_diff;
2756 sl_str += sl_str_diff;
2757 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2760 /* Reload buf, since the preceding call might have reallocated
2762 buf = (const char *) re_string_get_buffer (&mctx->input);
2764 if (err == REG_NOMATCH)
2766 if (BE (err != REG_NOERROR, 0))
2770 if (sub_last_idx < sub_top->nlasts)
2772 if (sub_last_idx > 0)
2774 /* Then, search for the other last nodes of the sub expression. */
2775 for (; sl_str <= bkref_str_idx; ++sl_str)
2778 regoff_t sl_str_off;
2779 const re_node_set *nodes;
2780 sl_str_off = sl_str - sub_top->str_idx;
2781 /* The matched string by the sub expression match with the substring
2782 at the back reference? */
2785 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2787 /* If we are at the end of the input, we cannot match. */
2788 if (bkref_str_off >= mctx->input.len)
2791 err = extend_buffers (mctx, bkref_str_off + 1);
2792 if (BE (err != REG_NOERROR, 0))
2795 buf = (const char *) re_string_get_buffer (&mctx->input);
2797 if (buf [bkref_str_off++] != buf[sl_str - 1])
2798 break; /* We don't need to search this sub expression
2801 if (mctx->state_log[sl_str] == NULL)
2803 /* Does this state have a ')' of the sub expression? */
2804 nodes = &mctx->state_log[sl_str]->nodes;
2805 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2809 if (sub_top->path == NULL)
2811 sub_top->path = calloc (sizeof (state_array_t),
2812 sl_str - sub_top->str_idx + 1);
2813 if (sub_top->path == NULL)
2816 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2817 in the current context? */
2818 err = check_arrival (mctx, sub_top->path, sub_top->node,
2819 sub_top->str_idx, cls_node, sl_str,
2821 if (err == REG_NOMATCH)
2823 if (BE (err != REG_NOERROR, 0))
2825 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2826 if (BE (sub_last == NULL, 0))
2828 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2830 if (err == REG_NOMATCH)
2837 /* Helper functions for get_subexp(). */
2839 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2840 If it can arrive, register the sub expression expressed with SUB_TOP
2843 static reg_errcode_t
2845 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2846 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2850 /* Can the subexpression arrive the back reference? */
2851 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2852 sub_last->str_idx, bkref_node, bkref_str,
2854 if (err != REG_NOERROR)
2856 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2858 if (BE (err != REG_NOERROR, 0))
2860 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2861 return clean_state_log_if_needed (mctx, to_idx);
2864 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2865 Search '(' if FL_OPEN, or search ')' otherwise.
2866 TODO: This function isn't efficient...
2867 Because there might be more than one nodes whose types are
2868 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2874 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2875 Idx subexp_idx, int type)
2878 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2880 Idx cls_node = nodes->elems[cls_idx];
2881 const re_token_t *node = dfa->nodes + cls_node;
2882 if (node->type == type
2883 && node->opr.idx == subexp_idx)
2889 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2890 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2892 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2894 static reg_errcode_t
2895 internal_function __attribute_warn_unused_result__
2896 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2897 Idx top_str, Idx last_node, Idx last_str, int type)
2899 const re_dfa_t *const dfa = mctx->dfa;
2900 reg_errcode_t err = REG_NOERROR;
2901 Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2902 re_dfastate_t *cur_state = NULL;
2903 re_node_set *cur_nodes, next_nodes;
2904 re_dfastate_t **backup_state_log;
2905 unsigned int context;
2907 subexp_num = dfa->nodes[top_node].opr.idx;
2908 /* Extend the buffer if we need. */
2909 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2911 re_dfastate_t **new_array;
2912 Idx old_alloc = path->alloc;
2913 Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2915 if (BE (IDX_MAX - old_alloc < incr_alloc, 0))
2917 new_alloc = old_alloc + incr_alloc;
2918 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2920 new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2921 if (BE (new_array == NULL, 0))
2923 path->array = new_array;
2924 path->alloc = new_alloc;
2925 memset (new_array + old_alloc, '\0',
2926 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2929 str_idx = path->next_idx ? path->next_idx : top_str;
2931 /* Temporary modify MCTX. */
2932 backup_state_log = mctx->state_log;
2933 backup_cur_idx = mctx->input.cur_idx;
2934 mctx->state_log = path->array;
2935 mctx->input.cur_idx = str_idx;
2937 /* Setup initial node set. */
2938 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2939 if (str_idx == top_str)
2941 err = re_node_set_init_1 (&next_nodes, top_node);
2942 if (BE (err != REG_NOERROR, 0))
2944 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2945 if (BE (err != REG_NOERROR, 0))
2947 re_node_set_free (&next_nodes);
2953 cur_state = mctx->state_log[str_idx];
2954 if (cur_state && cur_state->has_backref)
2956 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2957 if (BE (err != REG_NOERROR, 0))
2961 re_node_set_init_empty (&next_nodes);
2963 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2965 if (next_nodes.nelem)
2967 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2969 if (BE (err != REG_NOERROR, 0))
2971 re_node_set_free (&next_nodes);
2975 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2976 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2978 re_node_set_free (&next_nodes);
2981 mctx->state_log[str_idx] = cur_state;
2984 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2986 re_node_set_empty (&next_nodes);
2987 if (mctx->state_log[str_idx + 1])
2989 err = re_node_set_merge (&next_nodes,
2990 &mctx->state_log[str_idx + 1]->nodes);
2991 if (BE (err != REG_NOERROR, 0))
2993 re_node_set_free (&next_nodes);
2999 err = check_arrival_add_next_nodes (mctx, str_idx,
3000 &cur_state->non_eps_nodes,
3002 if (BE (err != REG_NOERROR, 0))
3004 re_node_set_free (&next_nodes);
3009 if (next_nodes.nelem)
3011 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3012 if (BE (err != REG_NOERROR, 0))
3014 re_node_set_free (&next_nodes);
3017 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3019 if (BE (err != REG_NOERROR, 0))
3021 re_node_set_free (&next_nodes);
3025 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3026 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3027 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3029 re_node_set_free (&next_nodes);
3032 mctx->state_log[str_idx] = cur_state;
3033 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3035 re_node_set_free (&next_nodes);
3036 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3037 : &mctx->state_log[last_str]->nodes);
3038 path->next_idx = str_idx;
3041 mctx->state_log = backup_state_log;
3042 mctx->input.cur_idx = backup_cur_idx;
3044 /* Then check the current node set has the node LAST_NODE. */
3045 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3051 /* Helper functions for check_arrival. */
3053 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3055 TODO: This function is similar to the functions transit_state*(),
3056 however this function has many additional works.
3057 Can't we unify them? */
3059 static reg_errcode_t
3060 internal_function __attribute_warn_unused_result__
3061 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3062 re_node_set *cur_nodes, re_node_set *next_nodes)
3064 const re_dfa_t *const dfa = mctx->dfa;
3067 #ifdef RE_ENABLE_I18N
3068 reg_errcode_t err = REG_NOERROR;
3070 re_node_set union_set;
3071 re_node_set_init_empty (&union_set);
3072 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3075 Idx cur_node = cur_nodes->elems[cur_idx];
3077 re_token_type_t type = dfa->nodes[cur_node].type;
3078 assert (!IS_EPSILON_NODE (type));
3080 #ifdef RE_ENABLE_I18N
3081 /* If the node may accept "multi byte". */
3082 if (dfa->nodes[cur_node].accept_mb)
3084 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3088 re_dfastate_t *dest_state;
3089 Idx next_node = dfa->nexts[cur_node];
3090 Idx next_idx = str_idx + naccepted;
3091 dest_state = mctx->state_log[next_idx];
3092 re_node_set_empty (&union_set);
3095 err = re_node_set_merge (&union_set, &dest_state->nodes);
3096 if (BE (err != REG_NOERROR, 0))
3098 re_node_set_free (&union_set);
3102 ok = re_node_set_insert (&union_set, next_node);
3105 re_node_set_free (&union_set);
3108 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3110 if (BE (mctx->state_log[next_idx] == NULL
3111 && err != REG_NOERROR, 0))
3113 re_node_set_free (&union_set);
3118 #endif /* RE_ENABLE_I18N */
3120 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3122 ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3125 re_node_set_free (&union_set);
3130 re_node_set_free (&union_set);
3134 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3135 CUR_NODES, however exclude the nodes which are:
3136 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3137 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3140 static reg_errcode_t
3142 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3143 Idx ex_subexp, int type)
3146 Idx idx, outside_node;
3147 re_node_set new_nodes;
3149 assert (cur_nodes->nelem);
3151 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3152 if (BE (err != REG_NOERROR, 0))
3154 /* Create a new node set NEW_NODES with the nodes which are epsilon
3155 closures of the node in CUR_NODES. */
3157 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3159 Idx cur_node = cur_nodes->elems[idx];
3160 const re_node_set *eclosure = dfa->eclosures + cur_node;
3161 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3162 if (outside_node == -1)
3164 /* There are no problematic nodes, just merge them. */
3165 err = re_node_set_merge (&new_nodes, eclosure);
3166 if (BE (err != REG_NOERROR, 0))
3168 re_node_set_free (&new_nodes);
3174 /* There are problematic nodes, re-calculate incrementally. */
3175 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3177 if (BE (err != REG_NOERROR, 0))
3179 re_node_set_free (&new_nodes);
3184 re_node_set_free (cur_nodes);
3185 *cur_nodes = new_nodes;
3189 /* Helper function for check_arrival_expand_ecl.
3190 Check incrementally the epsilon closure of TARGET, and if it isn't
3191 problematic append it to DST_NODES. */
3193 static reg_errcode_t
3194 internal_function __attribute_warn_unused_result__
3195 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3196 Idx target, Idx ex_subexp, int type)
3199 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3203 if (dfa->nodes[cur_node].type == type
3204 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3206 if (type == OP_CLOSE_SUBEXP)
3208 ok = re_node_set_insert (dst_nodes, cur_node);
3214 ok = re_node_set_insert (dst_nodes, cur_node);
3217 if (dfa->edests[cur_node].nelem == 0)
3219 if (dfa->edests[cur_node].nelem == 2)
3222 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3223 dfa->edests[cur_node].elems[1],
3225 if (BE (err != REG_NOERROR, 0))
3228 cur_node = dfa->edests[cur_node].elems[0];
3234 /* For all the back references in the current state, calculate the
3235 destination of the back references by the appropriate entry
3236 in MCTX->BKREF_ENTS. */
3238 static reg_errcode_t
3239 internal_function __attribute_warn_unused_result__
3240 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3241 Idx cur_str, Idx subexp_num, int type)
3243 const re_dfa_t *const dfa = mctx->dfa;
3245 Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3246 struct re_backref_cache_entry *ent;
3248 if (cache_idx_start == -1)
3252 ent = mctx->bkref_ents + cache_idx_start;
3255 Idx to_idx, next_node;
3257 /* Is this entry ENT is appropriate? */
3258 if (!re_node_set_contains (cur_nodes, ent->node))
3261 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3262 /* Calculate the destination of the back reference, and append it
3263 to MCTX->STATE_LOG. */
3264 if (to_idx == cur_str)
3266 /* The backreference did epsilon transit, we must re-check all the
3267 node in the current state. */
3268 re_node_set new_dests;
3269 reg_errcode_t err2, err3;
3270 next_node = dfa->edests[ent->node].elems[0];
3271 if (re_node_set_contains (cur_nodes, next_node))
3273 err = re_node_set_init_1 (&new_dests, next_node);
3274 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3275 err3 = re_node_set_merge (cur_nodes, &new_dests);
3276 re_node_set_free (&new_dests);
3277 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3278 || err3 != REG_NOERROR, 0))
3280 err = (err != REG_NOERROR ? err
3281 : (err2 != REG_NOERROR ? err2 : err3));
3284 /* TODO: It is still inefficient... */
3289 re_node_set union_set;
3290 next_node = dfa->nexts[ent->node];
3291 if (mctx->state_log[to_idx])
3294 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3297 err = re_node_set_init_copy (&union_set,
3298 &mctx->state_log[to_idx]->nodes);
3299 ok = re_node_set_insert (&union_set, next_node);
3300 if (BE (err != REG_NOERROR || ! ok, 0))
3302 re_node_set_free (&union_set);
3303 err = err != REG_NOERROR ? err : REG_ESPACE;
3309 err = re_node_set_init_1 (&union_set, next_node);
3310 if (BE (err != REG_NOERROR, 0))
3313 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3314 re_node_set_free (&union_set);
3315 if (BE (mctx->state_log[to_idx] == NULL
3316 && err != REG_NOERROR, 0))
3320 while (ent++->more);
3324 /* Build transition table for the state.
3325 Return true if successful. */
3329 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3334 bool need_word_trtable = false;
3335 bitset_word_t elem, mask;
3336 bool dests_node_malloced = false;
3337 bool dest_states_malloced = false;
3338 Idx ndests; /* Number of the destination states from 'state'. */
3339 re_dfastate_t **trtable;
3340 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3341 re_node_set follows, *dests_node;
3343 bitset_t acceptable;
3347 re_node_set dests_node[SBC_MAX];
3348 bitset_t dests_ch[SBC_MAX];
3351 /* We build DFA states which corresponds to the destination nodes
3352 from 'state'. 'dests_node[i]' represents the nodes which i-th
3353 destination state contains, and 'dests_ch[i]' represents the
3354 characters which i-th destination state accepts. */
3355 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3356 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3359 dests_alloc = re_malloc (struct dests_alloc, 1);
3360 if (BE (dests_alloc == NULL, 0))
3362 dests_node_malloced = true;
3364 dests_node = dests_alloc->dests_node;
3365 dests_ch = dests_alloc->dests_ch;
3367 /* Initialize transition table. */
3368 state->word_trtable = state->trtable = NULL;
3370 /* At first, group all nodes belonging to 'state' into several
3372 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3373 if (BE (ndests <= 0, 0))
3375 if (dests_node_malloced)
3377 /* Return false in case of an error, true otherwise. */
3380 state->trtable = (re_dfastate_t **)
3381 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3382 if (BE (state->trtable == NULL, 0))
3389 err = re_node_set_alloc (&follows, ndests + 1);
3390 if (BE (err != REG_NOERROR, 0))
3393 /* Avoid arithmetic overflow in size calculation. */
3394 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3395 / (3 * sizeof (re_dfastate_t *)))
3400 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3401 + ndests * 3 * sizeof (re_dfastate_t *)))
3402 dest_states = (re_dfastate_t **)
3403 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3406 dest_states = (re_dfastate_t **)
3407 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3408 if (BE (dest_states == NULL, 0))
3411 if (dest_states_malloced)
3413 re_node_set_free (&follows);
3414 for (i = 0; i < ndests; ++i)
3415 re_node_set_free (dests_node + i);
3416 if (dests_node_malloced)
3420 dest_states_malloced = true;
3422 dest_states_word = dest_states + ndests;
3423 dest_states_nl = dest_states_word + ndests;
3424 bitset_empty (acceptable);
3426 /* Then build the states for all destinations. */
3427 for (i = 0; i < ndests; ++i)
3430 re_node_set_empty (&follows);
3431 /* Merge the follows of this destination states. */
3432 for (j = 0; j < dests_node[i].nelem; ++j)
3434 next_node = dfa->nexts[dests_node[i].elems[j]];
3435 if (next_node != -1)
3437 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3438 if (BE (err != REG_NOERROR, 0))
3442 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3443 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3445 /* If the new state has context constraint,
3446 build appropriate states for these contexts. */
3447 if (dest_states[i]->has_constraint)
3449 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3451 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3454 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3455 need_word_trtable = true;
3457 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3459 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3464 dest_states_word[i] = dest_states[i];
3465 dest_states_nl[i] = dest_states[i];
3467 bitset_merge (acceptable, dests_ch[i]);
3470 if (!BE (need_word_trtable, 0))
3472 /* We don't care about whether the following character is a word
3473 character, or we are in a single-byte character set so we can
3474 discern by looking at the character code: allocate a
3475 256-entry transition table. */
3476 trtable = state->trtable =
3477 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3478 if (BE (trtable == NULL, 0))
3481 /* For all characters ch...: */
3482 for (i = 0; i < BITSET_WORDS; ++i)
3483 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3485 mask <<= 1, elem >>= 1, ++ch)
3486 if (BE (elem & 1, 0))
3488 /* There must be exactly one destination which accepts
3489 character ch. See group_nodes_into_DFAstates. */
3490 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3493 /* j-th destination accepts the word character ch. */
3494 if (dfa->word_char[i] & mask)
3495 trtable[ch] = dest_states_word[j];
3497 trtable[ch] = dest_states[j];
3502 /* We care about whether the following character is a word
3503 character, and we are in a multi-byte character set: discern
3504 by looking at the character code: build two 256-entry
3505 transition tables, one starting at trtable[0] and one
3506 starting at trtable[SBC_MAX]. */
3507 trtable = state->word_trtable =
3508 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3509 if (BE (trtable == NULL, 0))
3512 /* For all characters ch...: */
3513 for (i = 0; i < BITSET_WORDS; ++i)
3514 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3516 mask <<= 1, elem >>= 1, ++ch)
3517 if (BE (elem & 1, 0))
3519 /* There must be exactly one destination which accepts
3520 character ch. See group_nodes_into_DFAstates. */
3521 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3524 /* j-th destination accepts the word character ch. */
3525 trtable[ch] = dest_states[j];
3526 trtable[ch + SBC_MAX] = dest_states_word[j];
3531 if (bitset_contain (acceptable, NEWLINE_CHAR))
3533 /* The current state accepts newline character. */
3534 for (j = 0; j < ndests; ++j)
3535 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3537 /* k-th destination accepts newline character. */
3538 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3539 if (need_word_trtable)
3540 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3541 /* There must be only one destination which accepts
3542 newline. See group_nodes_into_DFAstates. */
3547 if (dest_states_malloced)
3550 re_node_set_free (&follows);
3551 for (i = 0; i < ndests; ++i)
3552 re_node_set_free (dests_node + i);
3554 if (dests_node_malloced)
3560 /* Group all nodes belonging to STATE into several destinations.
3561 Then for all destinations, set the nodes belonging to the destination
3562 to DESTS_NODE[i] and set the characters accepted by the destination
3563 to DEST_CH[i]. This function return the number of destinations. */
3567 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3568 re_node_set *dests_node, bitset_t *dests_ch)
3573 Idx ndests; /* Number of the destinations from 'state'. */
3574 bitset_t accepts; /* Characters a node can accept. */
3575 const re_node_set *cur_nodes = &state->nodes;
3576 bitset_empty (accepts);
3579 /* For all the nodes belonging to 'state', */
3580 for (i = 0; i < cur_nodes->nelem; ++i)
3582 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3583 re_token_type_t type = node->type;
3584 unsigned int constraint = node->constraint;
3586 /* Enumerate all single byte character this node can accept. */
3587 if (type == CHARACTER)
3588 bitset_set (accepts, node->opr.c);
3589 else if (type == SIMPLE_BRACKET)
3591 bitset_merge (accepts, node->opr.sbcset);
3593 else if (type == OP_PERIOD)
3595 #ifdef RE_ENABLE_I18N
3596 if (dfa->mb_cur_max > 1)
3597 bitset_merge (accepts, dfa->sb_char);
3600 bitset_set_all (accepts);
3601 if (!(dfa->syntax & RE_DOT_NEWLINE))
3602 bitset_clear (accepts, '\n');
3603 if (dfa->syntax & RE_DOT_NOT_NULL)
3604 bitset_clear (accepts, '\0');
3606 #ifdef RE_ENABLE_I18N
3607 else if (type == OP_UTF8_PERIOD)
3609 if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3610 memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3612 bitset_merge (accepts, utf8_sb_map);
3613 if (!(dfa->syntax & RE_DOT_NEWLINE))
3614 bitset_clear (accepts, '\n');
3615 if (dfa->syntax & RE_DOT_NOT_NULL)
3616 bitset_clear (accepts, '\0');
3622 /* Check the 'accepts' and sift the characters which are not
3623 match it the context. */
3626 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3628 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3629 bitset_empty (accepts);
3630 if (accepts_newline)
3631 bitset_set (accepts, NEWLINE_CHAR);
3635 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3637 bitset_empty (accepts);
3641 if (constraint & NEXT_WORD_CONSTRAINT)
3643 bitset_word_t any_set = 0;
3644 if (type == CHARACTER && !node->word_char)
3646 bitset_empty (accepts);
3649 #ifdef RE_ENABLE_I18N
3650 if (dfa->mb_cur_max > 1)
3651 for (j = 0; j < BITSET_WORDS; ++j)
3652 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3655 for (j = 0; j < BITSET_WORDS; ++j)
3656 any_set |= (accepts[j] &= dfa->word_char[j]);
3660 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3662 bitset_word_t any_set = 0;
3663 if (type == CHARACTER && node->word_char)
3665 bitset_empty (accepts);
3668 #ifdef RE_ENABLE_I18N
3669 if (dfa->mb_cur_max > 1)
3670 for (j = 0; j < BITSET_WORDS; ++j)
3671 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3674 for (j = 0; j < BITSET_WORDS; ++j)
3675 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3681 /* Then divide 'accepts' into DFA states, or create a new
3682 state. Above, we make sure that accepts is not empty. */
3683 for (j = 0; j < ndests; ++j)
3685 bitset_t intersec; /* Intersection sets, see below. */
3687 /* Flags, see below. */
3688 bitset_word_t has_intersec, not_subset, not_consumed;
3690 /* Optimization, skip if this state doesn't accept the character. */
3691 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3694 /* Enumerate the intersection set of this state and 'accepts'. */
3696 for (k = 0; k < BITSET_WORDS; ++k)
3697 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3698 /* And skip if the intersection set is empty. */
3702 /* Then check if this state is a subset of 'accepts'. */
3703 not_subset = not_consumed = 0;
3704 for (k = 0; k < BITSET_WORDS; ++k)
3706 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3707 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3710 /* If this state isn't a subset of 'accepts', create a
3711 new group state, which has the 'remains'. */
3714 bitset_copy (dests_ch[ndests], remains);
3715 bitset_copy (dests_ch[j], intersec);
3716 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3717 if (BE (err != REG_NOERROR, 0))
3722 /* Put the position in the current group. */
3723 ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3727 /* If all characters are consumed, go to next node. */
3731 /* Some characters remain, create a new group. */
3734 bitset_copy (dests_ch[ndests], accepts);
3735 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3736 if (BE (err != REG_NOERROR, 0))
3739 bitset_empty (accepts);
3744 for (j = 0; j < ndests; ++j)
3745 re_node_set_free (dests_node + j);
3749 #ifdef RE_ENABLE_I18N
3750 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3751 Return the number of the bytes the node accepts.
3752 STR_IDX is the current index of the input string.
3754 This function handles the nodes which can accept one character, or
3755 one collating element like '.', '[a-z]', opposite to the other nodes
3756 can only accept one byte. */
3759 # include <locale/weight.h>
3764 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3765 const re_string_t *input, Idx str_idx)
3767 const re_token_t *node = dfa->nodes + node_idx;
3768 int char_len, elem_len;
3771 if (BE (node->type == OP_UTF8_PERIOD, 0))
3773 unsigned char c = re_string_byte_at (input, str_idx), d;
3774 if (BE (c < 0xc2, 1))
3777 if (str_idx + 2 > input->len)
3780 d = re_string_byte_at (input, str_idx + 1);
3782 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3786 if (c == 0xe0 && d < 0xa0)
3792 if (c == 0xf0 && d < 0x90)
3798 if (c == 0xf8 && d < 0x88)
3804 if (c == 0xfc && d < 0x84)
3810 if (str_idx + char_len > input->len)
3813 for (i = 1; i < char_len; ++i)
3815 d = re_string_byte_at (input, str_idx + i);
3816 if (d < 0x80 || d > 0xbf)
3822 char_len = re_string_char_size_at (input, str_idx);
3823 if (node->type == OP_PERIOD)
3827 /* FIXME: I don't think this if is needed, as both '\n'
3828 and '\0' are char_len == 1. */
3829 /* '.' accepts any one character except the following two cases. */
3830 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3831 re_string_byte_at (input, str_idx) == '\n') ||
3832 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3833 re_string_byte_at (input, str_idx) == '\0'))
3838 elem_len = re_string_elem_size_at (input, str_idx);
3839 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3842 if (node->type == COMPLEX_BRACKET)
3844 const re_charset_t *cset = node->opr.mbcset;
3846 const unsigned char *pin
3847 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3852 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3853 ? re_string_wchar_at (input, str_idx) : 0);
3855 /* match with multibyte character? */
3856 for (i = 0; i < cset->nmbchars; ++i)
3857 if (wc == cset->mbchars[i])
3859 match_len = char_len;
3860 goto check_node_accept_bytes_match;
3862 /* match with character_class? */
3863 for (i = 0; i < cset->nchar_classes; ++i)
3865 wctype_t wt = cset->char_classes[i];
3866 if (__iswctype (wc, wt))
3868 match_len = char_len;
3869 goto check_node_accept_bytes_match;
3874 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3877 unsigned int in_collseq = 0;
3878 const int32_t *table, *indirect;
3879 const unsigned char *weights, *extra;
3880 const char *collseqwc;
3882 /* match with collating_symbol? */
3883 if (cset->ncoll_syms)
3884 extra = (const unsigned char *)
3885 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3886 for (i = 0; i < cset->ncoll_syms; ++i)
3888 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3889 /* Compare the length of input collating element and
3890 the length of current collating element. */
3891 if (*coll_sym != elem_len)
3893 /* Compare each bytes. */
3894 for (j = 0; j < *coll_sym; j++)
3895 if (pin[j] != coll_sym[1 + j])
3899 /* Match if every bytes is equal. */
3901 goto check_node_accept_bytes_match;
3907 if (elem_len <= char_len)
3909 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3910 in_collseq = __collseq_table_lookup (collseqwc, wc);
3913 in_collseq = find_collation_sequence_value (pin, elem_len);
3915 /* match with range expression? */
3916 /* FIXME: Implement rational ranges here, too. */
3917 for (i = 0; i < cset->nranges; ++i)
3918 if (cset->range_starts[i] <= in_collseq
3919 && in_collseq <= cset->range_ends[i])
3921 match_len = elem_len;
3922 goto check_node_accept_bytes_match;
3925 /* match with equivalence_class? */
3926 if (cset->nequiv_classes)
3928 const unsigned char *cp = pin;
3929 table = (const int32_t *)
3930 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3931 weights = (const unsigned char *)
3932 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3933 extra = (const unsigned char *)
3934 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3935 indirect = (const int32_t *)
3936 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3937 int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3939 for (i = 0; i < cset->nequiv_classes; ++i)
3941 int32_t equiv_class_idx = cset->equiv_classes[i];
3942 size_t weight_len = weights[idx & 0xffffff];
3943 if (weight_len == weights[equiv_class_idx & 0xffffff]
3944 && (idx >> 24) == (equiv_class_idx >> 24))
3949 equiv_class_idx &= 0xffffff;
3951 while (cnt <= weight_len
3952 && (weights[equiv_class_idx + 1 + cnt]
3953 == weights[idx + 1 + cnt]))
3955 if (cnt > weight_len)
3957 match_len = elem_len;
3958 goto check_node_accept_bytes_match;
3967 /* match with range expression? */
3968 for (i = 0; i < cset->nranges; ++i)
3970 if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3972 match_len = char_len;
3973 goto check_node_accept_bytes_match;
3977 check_node_accept_bytes_match:
3978 if (!cset->non_match)
3985 return (elem_len > char_len) ? elem_len : char_len;
3994 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3996 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4001 /* No valid character. Match it as a single byte character. */
4002 const unsigned char *collseq = (const unsigned char *)
4003 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4004 return collseq[mbs[0]];
4011 const unsigned char *extra = (const unsigned char *)
4012 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4013 int32_t extrasize = (const unsigned char *)
4014 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4016 for (idx = 0; idx < extrasize;)
4020 int32_t elem_mbs_len;
4021 /* Skip the name of collating element name. */
4022 idx = idx + extra[idx] + 1;
4023 elem_mbs_len = extra[idx++];
4024 if (mbs_len == elem_mbs_len)
4026 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4027 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4029 if (mbs_cnt == elem_mbs_len)
4030 /* Found the entry. */
4033 /* Skip the byte sequence of the collating element. */
4034 idx += elem_mbs_len;
4035 /* Adjust for the alignment. */
4036 idx = (idx + 3) & ~3;
4037 /* Skip the collation sequence value. */
4038 idx += sizeof (uint32_t);
4039 /* Skip the wide char sequence of the collating element. */
4040 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4041 /* If we found the entry, return the sequence value. */
4043 return *(uint32_t *) (extra + idx);
4044 /* Skip the collation sequence value. */
4045 idx += sizeof (uint32_t);
4051 #endif /* RE_ENABLE_I18N */
4053 /* Check whether the node accepts the byte which is IDX-th
4054 byte of the INPUT. */
4058 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4062 ch = re_string_byte_at (&mctx->input, idx);
4066 if (node->opr.c != ch)
4070 case SIMPLE_BRACKET:
4071 if (!bitset_contain (node->opr.sbcset, ch))
4075 #ifdef RE_ENABLE_I18N
4076 case OP_UTF8_PERIOD:
4077 if (ch >= ASCII_CHARS)
4082 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4083 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4091 if (node->constraint)
4093 /* The node has constraints. Check whether the current context
4094 satisfies the constraints. */
4095 unsigned int context = re_string_context_at (&mctx->input, idx,
4097 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4104 /* Extend the buffers, if the buffers have run out. */
4106 static reg_errcode_t
4107 internal_function __attribute_warn_unused_result__
4108 extend_buffers (re_match_context_t *mctx, int min_len)
4111 re_string_t *pstr = &mctx->input;
4113 /* Avoid overflow. */
4114 if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
4115 <= pstr->bufs_len, 0))
4118 /* Double the lengths of the buffers, but allocate at least MIN_LEN. */
4119 ret = re_string_realloc_buffers (pstr,
4121 MIN (pstr->len, pstr->bufs_len * 2)));
4122 if (BE (ret != REG_NOERROR, 0))
4125 if (mctx->state_log != NULL)
4127 /* And double the length of state_log. */
4128 /* XXX We have no indication of the size of this buffer. If this
4129 allocation fail we have no indication that the state_log array
4130 does not have the right size. */
4131 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4132 pstr->bufs_len + 1);
4133 if (BE (new_array == NULL, 0))
4135 mctx->state_log = new_array;
4138 /* Then reconstruct the buffers. */
4141 #ifdef RE_ENABLE_I18N
4142 if (pstr->mb_cur_max > 1)
4144 ret = build_wcs_upper_buffer (pstr);
4145 if (BE (ret != REG_NOERROR, 0))
4149 #endif /* RE_ENABLE_I18N */
4150 build_upper_buffer (pstr);
4154 #ifdef RE_ENABLE_I18N
4155 if (pstr->mb_cur_max > 1)
4156 build_wcs_buffer (pstr);
4158 #endif /* RE_ENABLE_I18N */
4160 if (pstr->trans != NULL)
4161 re_string_translate_buffer (pstr);
4168 /* Functions for matching context. */
4170 /* Initialize MCTX. */
4172 static reg_errcode_t
4173 internal_function __attribute_warn_unused_result__
4174 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4176 mctx->eflags = eflags;
4177 mctx->match_last = -1;
4180 /* Avoid overflow. */
4181 size_t max_object_size =
4182 MAX (sizeof (struct re_backref_cache_entry),
4183 sizeof (re_sub_match_top_t *));
4184 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n, 0))
4187 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4188 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4189 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4192 /* Already zero-ed by the caller.
4194 mctx->bkref_ents = NULL;
4195 mctx->nbkref_ents = 0;
4196 mctx->nsub_tops = 0; */
4197 mctx->abkref_ents = n;
4198 mctx->max_mb_elem_len = 1;
4199 mctx->asub_tops = n;
4203 /* Clean the entries which depend on the current input in MCTX.
4204 This function must be invoked when the matcher changes the start index
4205 of the input, or changes the input string. */
4209 match_ctx_clean (re_match_context_t *mctx)
4212 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4215 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4216 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4218 re_sub_match_last_t *last = top->lasts[sl_idx];
4219 re_free (last->path.array);
4222 re_free (top->lasts);
4225 re_free (top->path->array);
4226 re_free (top->path);
4231 mctx->nsub_tops = 0;
4232 mctx->nbkref_ents = 0;
4235 /* Free all the memory associated with MCTX. */
4239 match_ctx_free (re_match_context_t *mctx)
4241 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4242 match_ctx_clean (mctx);
4243 re_free (mctx->sub_tops);
4244 re_free (mctx->bkref_ents);
4247 /* Add a new backreference entry to MCTX.
4248 Note that we assume that caller never call this function with duplicate
4249 entry, and call with STR_IDX which isn't smaller than any existing entry.
4252 static reg_errcode_t
4253 internal_function __attribute_warn_unused_result__
4254 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4257 if (mctx->nbkref_ents >= mctx->abkref_ents)
4259 struct re_backref_cache_entry* new_entry;
4260 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4261 mctx->abkref_ents * 2);
4262 if (BE (new_entry == NULL, 0))
4264 re_free (mctx->bkref_ents);
4267 mctx->bkref_ents = new_entry;
4268 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4269 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4270 mctx->abkref_ents *= 2;
4272 if (mctx->nbkref_ents > 0
4273 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4274 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4276 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4277 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4278 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4279 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4281 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4282 If bit N is clear, means that this entry won't epsilon-transition to
4283 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4284 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4287 A backreference does not epsilon-transition unless it is empty, so set
4288 to all zeros if FROM != TO. */
4289 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4290 = (from == to ? -1 : 0);
4292 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4293 if (mctx->max_mb_elem_len < to - from)
4294 mctx->max_mb_elem_len = to - from;
4298 /* Return the first entry with the same str_idx, or -1 if none is
4299 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4303 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4305 Idx left, right, mid, last;
4306 last = right = mctx->nbkref_ents;
4307 for (left = 0; left < right;)
4309 mid = (left + right) / 2;
4310 if (mctx->bkref_ents[mid].str_idx < str_idx)
4315 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4321 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4324 static reg_errcode_t
4325 internal_function __attribute_warn_unused_result__
4326 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4329 assert (mctx->sub_tops != NULL);
4330 assert (mctx->asub_tops > 0);
4332 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4334 Idx new_asub_tops = mctx->asub_tops * 2;
4335 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4336 re_sub_match_top_t *,
4338 if (BE (new_array == NULL, 0))
4340 mctx->sub_tops = new_array;
4341 mctx->asub_tops = new_asub_tops;
4343 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4344 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4346 mctx->sub_tops[mctx->nsub_tops]->node = node;
4347 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4351 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4352 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4354 static re_sub_match_last_t *
4356 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4358 re_sub_match_last_t *new_entry;
4359 if (BE (subtop->nlasts == subtop->alasts, 0))
4361 Idx new_alasts = 2 * subtop->alasts + 1;
4362 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4363 re_sub_match_last_t *,
4365 if (BE (new_array == NULL, 0))
4367 subtop->lasts = new_array;
4368 subtop->alasts = new_alasts;
4370 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4371 if (BE (new_entry != NULL, 1))
4373 subtop->lasts[subtop->nlasts] = new_entry;
4374 new_entry->node = node;
4375 new_entry->str_idx = str_idx;
4383 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4384 re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4386 sctx->sifted_states = sifted_sts;
4387 sctx->limited_states = limited_sts;
4388 sctx->last_node = last_node;
4389 sctx->last_str_idx = last_str_idx;
4390 re_node_set_init_empty (&sctx->limits);