1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2005,2007,2009,2010,2011 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24 static void match_ctx_free (re_match_context_t *cache) internal_function;
25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
39 static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len) internal_function;
49 static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len) internal_function;
53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated) internal_function;
55 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
57 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
58 int *p_match_first) internal_function;
59 static int check_halt_state_context (const re_match_context_t *mctx,
60 const re_dfastate_t *state, int idx)
62 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
63 regmatch_t *prev_idx_match, int cur_node,
64 int cur_idx, int nmatch) internal_function;
65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
66 int str_idx, int dest_node, int nregs,
68 re_node_set *eps_via_nodes)
70 static reg_errcode_t set_regs (const regex_t *preg,
71 const re_match_context_t *mctx,
72 size_t nmatch, regmatch_t *pmatch,
73 int fl_backtrack) internal_function;
74 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
78 static int sift_states_iter_mb (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 int node_idx, int str_idx, int max_str_idx)
82 #endif /* RE_ENABLE_I18N */
83 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
84 re_sift_context_t *sctx)
86 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
87 re_sift_context_t *sctx, int str_idx,
88 re_node_set *cur_dest)
90 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
91 re_sift_context_t *sctx,
93 re_node_set *dest_nodes)
95 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
96 re_node_set *dest_nodes,
97 const re_node_set *candidates)
99 static int check_dst_limits (const re_match_context_t *mctx,
101 int dst_node, int dst_idx, int src_node,
102 int src_idx) internal_function;
103 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
104 int boundaries, int subexp_idx,
105 int from_node, int bkref_idx)
107 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
108 int limit, int subexp_idx,
109 int node, int str_idx,
110 int bkref_idx) internal_function;
111 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
112 re_node_set *dest_nodes,
113 const re_node_set *candidates,
115 struct re_backref_cache_entry *bkref_ents,
116 int str_idx) internal_function;
117 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
118 re_sift_context_t *sctx,
119 int str_idx, const re_node_set *candidates)
121 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
123 re_dfastate_t **src, int num)
125 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
126 re_match_context_t *mctx) internal_function;
127 static re_dfastate_t *transit_state (reg_errcode_t *err,
128 re_match_context_t *mctx,
129 re_dfastate_t *state) internal_function;
130 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
131 re_match_context_t *mctx,
132 re_dfastate_t *next_state)
134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
135 re_node_set *cur_nodes,
136 int str_idx) internal_function;
138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
139 re_match_context_t *mctx,
140 re_dfastate_t *pstate)
143 #ifdef RE_ENABLE_I18N
144 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
145 re_dfastate_t *pstate)
147 #endif /* RE_ENABLE_I18N */
148 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
149 const re_node_set *nodes)
151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
152 int bkref_node, int bkref_str_idx)
154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 const re_sub_match_top_t *sub_top,
156 re_sub_match_last_t *sub_last,
157 int bkref_node, int bkref_str)
159 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
160 int subexp_idx, int type) internal_function;
161 static reg_errcode_t check_arrival (re_match_context_t *mctx,
162 state_array_t *path, int top_node,
163 int top_str, int last_node, int last_str,
164 int type) internal_function;
165 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
167 re_node_set *cur_nodes,
168 re_node_set *next_nodes)
170 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
171 re_node_set *cur_nodes,
172 int ex_subexp, int type)
174 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
175 re_node_set *dst_nodes,
176 int target, int ex_subexp,
177 int type) internal_function;
178 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
179 re_node_set *cur_nodes, int cur_str,
180 int subexp_num, int type)
182 static int build_trtable (const re_dfa_t *dfa,
183 re_dfastate_t *state) internal_function;
184 #ifdef RE_ENABLE_I18N
185 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
186 const re_string_t *input, int idx)
189 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
193 #endif /* RE_ENABLE_I18N */
194 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
195 const re_dfastate_t *state,
196 re_node_set *states_node,
197 bitset_t *states_ch) internal_function;
198 static int check_node_accept (const re_match_context_t *mctx,
199 const re_token_t *node, int idx)
201 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
204 /* Entry point for POSIX code. */
206 /* regexec searches for a given pattern, specified by PREG, in the
209 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
210 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
211 least NMATCH elements, and we set them to the offsets of the
212 corresponding matched substrings.
214 EFLAGS specifies `execution flags' which affect matching: if
215 REG_NOTBOL is set, then ^ does not match at the beginning of the
216 string; if REG_NOTEOL is set, then $ does not match at the end.
218 We return 0 if we find a match and REG_NOMATCH if not. */
221 regexec (preg, string, nmatch, pmatch, eflags)
222 const regex_t *__restrict preg;
223 const char *__restrict string;
230 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
232 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
235 if (eflags & REG_STARTEND)
237 start = pmatch[0].rm_so;
238 length = pmatch[0].rm_eo;
243 length = strlen (string);
246 __libc_lock_lock (dfa->lock);
248 err = re_search_internal (preg, string, length, start, length - start,
249 length, 0, NULL, eflags);
251 err = re_search_internal (preg, string, length, start, length - start,
252 length, nmatch, pmatch, eflags);
253 __libc_lock_unlock (dfa->lock);
254 return err != REG_NOERROR;
258 # include <shlib-compat.h>
259 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
261 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
262 __typeof__ (__regexec) __compat_regexec;
265 attribute_compat_text_section
266 __compat_regexec (const regex_t *__restrict preg,
267 const char *__restrict string, size_t nmatch,
268 regmatch_t pmatch[], int eflags)
270 return regexec (preg, string, nmatch, pmatch,
271 eflags & (REG_NOTBOL | REG_NOTEOL));
273 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
277 /* Entry points for GNU code. */
279 /* re_match, re_search, re_match_2, re_search_2
281 The former two functions operate on STRING with length LENGTH,
282 while the later two operate on concatenation of STRING1 and STRING2
283 with lengths LENGTH1 and LENGTH2, respectively.
285 re_match() matches the compiled pattern in BUFP against the string,
286 starting at index START.
288 re_search() first tries matching at index START, then it tries to match
289 starting from index START + 1, and so on. The last start position tried
290 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
293 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
294 the first STOP characters of the concatenation of the strings should be
297 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
298 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
299 computed relative to the concatenation, not relative to the individual
302 On success, re_match* functions return the length of the match, re_search*
303 return the position of the start of the match. Return value -1 means no
304 match was found and -2 indicates an internal error. */
307 re_match (bufp, string, length, start, regs)
308 struct re_pattern_buffer *bufp;
311 struct re_registers *regs;
313 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
316 weak_alias (__re_match, re_match)
320 re_search (bufp, string, length, start, range, regs)
321 struct re_pattern_buffer *bufp;
323 int length, start, range;
324 struct re_registers *regs;
326 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
329 weak_alias (__re_search, re_search)
333 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
334 struct re_pattern_buffer *bufp;
335 const char *string1, *string2;
336 int length1, length2, start, stop;
337 struct re_registers *regs;
339 return re_search_2_stub (bufp, string1, length1, string2, length2,
340 start, 0, regs, stop, 1);
343 weak_alias (__re_match_2, re_match_2)
347 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
348 struct re_pattern_buffer *bufp;
349 const char *string1, *string2;
350 int length1, length2, start, range, stop;
351 struct re_registers *regs;
353 return re_search_2_stub (bufp, string1, length1, string2, length2,
354 start, range, regs, stop, 0);
357 weak_alias (__re_search_2, re_search_2)
361 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
363 struct re_pattern_buffer *bufp;
364 const char *string1, *string2;
365 int length1, length2, start, range, stop, ret_len;
366 struct re_registers *regs;
370 int len = length1 + length2;
373 if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
376 /* Concatenate the strings. */
380 s = re_malloc (char, len);
382 if (BE (s == NULL, 0))
385 memcpy (__mempcpy (s, string1, length1), string2, length2);
387 memcpy (s, string1, length1);
388 memcpy (s + length1, string2, length2);
397 rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
402 /* The parameters have the same meaning as those of re_search.
403 Additional parameters:
404 If RET_LEN is nonzero the length of the match is returned (re_match style);
405 otherwise the position of the match is returned. */
408 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
409 struct re_pattern_buffer *bufp;
411 int length, start, range, stop, ret_len;
412 struct re_registers *regs;
414 reg_errcode_t result;
418 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
420 /* Check for out-of-range. */
421 if (BE (start < 0 || start > length, 0))
423 if (BE (start + range > length, 0))
424 range = length - start;
425 else if (BE (start + range < 0, 0))
428 __libc_lock_lock (dfa->lock);
430 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
431 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
433 /* Compile fastmap if we haven't yet. */
434 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
435 re_compile_fastmap (bufp);
437 if (BE (bufp->no_sub, 0))
440 /* We need at least 1 register. */
443 else if (BE (bufp->regs_allocated == REGS_FIXED &&
444 regs->num_regs < bufp->re_nsub + 1, 0))
446 nregs = regs->num_regs;
447 if (BE (nregs < 1, 0))
449 /* Nothing can be copied to regs. */
455 nregs = bufp->re_nsub + 1;
456 pmatch = re_malloc (regmatch_t, nregs);
457 if (BE (pmatch == NULL, 0))
463 result = re_search_internal (bufp, string, length, start, range, stop,
464 nregs, pmatch, eflags);
468 /* I hope we needn't fill ther regs with -1's when no match was found. */
469 if (result != REG_NOERROR)
471 else if (regs != NULL)
473 /* If caller wants register contents data back, copy them. */
474 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
475 bufp->regs_allocated);
476 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
480 if (BE (rval == 0, 1))
484 assert (pmatch[0].rm_so == start);
485 rval = pmatch[0].rm_eo - start;
488 rval = pmatch[0].rm_so;
492 __libc_lock_unlock (dfa->lock);
497 re_copy_regs (regs, pmatch, nregs, regs_allocated)
498 struct re_registers *regs;
500 int nregs, regs_allocated;
502 int rval = REGS_REALLOCATE;
504 int need_regs = nregs + 1;
505 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
508 /* Have the register data arrays been allocated? */
509 if (regs_allocated == REGS_UNALLOCATED)
510 { /* No. So allocate them with malloc. */
511 regs->start = re_malloc (regoff_t, need_regs);
512 if (BE (regs->start == NULL, 0))
513 return REGS_UNALLOCATED;
514 regs->end = re_malloc (regoff_t, need_regs);
515 if (BE (regs->end == NULL, 0))
517 re_free (regs->start);
518 return REGS_UNALLOCATED;
520 regs->num_regs = need_regs;
522 else if (regs_allocated == REGS_REALLOCATE)
523 { /* Yes. If we need more elements than were already
524 allocated, reallocate them. If we need fewer, just
526 if (BE (need_regs > regs->num_regs, 0))
528 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
530 if (BE (new_start == NULL, 0))
531 return REGS_UNALLOCATED;
532 new_end = re_realloc (regs->end, regoff_t, need_regs);
533 if (BE (new_end == NULL, 0))
536 return REGS_UNALLOCATED;
538 regs->start = new_start;
540 regs->num_regs = need_regs;
545 assert (regs_allocated == REGS_FIXED);
546 /* This function may not be called with REGS_FIXED and nregs too big. */
547 assert (regs->num_regs >= nregs);
552 for (i = 0; i < nregs; ++i)
554 regs->start[i] = pmatch[i].rm_so;
555 regs->end[i] = pmatch[i].rm_eo;
557 for ( ; i < regs->num_regs; ++i)
558 regs->start[i] = regs->end[i] = -1;
563 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
564 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
565 this memory for recording register information. STARTS and ENDS
566 must be allocated using the malloc library routine, and must each
567 be at least NUM_REGS * sizeof (regoff_t) bytes long.
569 If NUM_REGS == 0, then subsequent matches should allocate their own
572 Unless this function is called, the first search or match using
573 PATTERN_BUFFER will allocate its own register data, without
574 freeing the old data. */
577 re_set_registers (bufp, regs, num_regs, starts, ends)
578 struct re_pattern_buffer *bufp;
579 struct re_registers *regs;
581 regoff_t *starts, *ends;
585 bufp->regs_allocated = REGS_REALLOCATE;
586 regs->num_regs = num_regs;
587 regs->start = starts;
592 bufp->regs_allocated = REGS_UNALLOCATED;
594 regs->start = regs->end = (regoff_t *) 0;
598 weak_alias (__re_set_registers, re_set_registers)
601 /* Entry points compatible with 4.2 BSD regex library. We don't define
602 them unless specifically requested. */
604 #if defined _REGEX_RE_COMP || defined _LIBC
612 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
614 #endif /* _REGEX_RE_COMP */
616 /* Internal entry point. */
618 /* Searches for a compiled pattern PREG in the string STRING, whose
619 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
620 mingings with regexec. START, and RANGE have the same meanings
622 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
623 otherwise return the error code.
624 Note: We assume front end functions already check ranges.
625 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
628 __attribute_warn_unused_result__
629 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
633 int length, start, range, stop, eflags;
638 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
639 int left_lim, right_lim, incr;
640 int fl_longest_match, match_first, match_kind, match_last = -1;
643 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
644 re_match_context_t mctx = { .dfa = dfa };
646 re_match_context_t mctx;
648 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
649 && range && !preg->can_be_null) ? preg->fastmap : NULL;
650 RE_TRANSLATE_TYPE t = preg->translate;
652 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
653 memset (&mctx, '\0', sizeof (re_match_context_t));
657 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
658 nmatch -= extra_nmatch;
660 /* Check if the DFA haven't been compiled. */
661 if (BE (preg->used == 0 || dfa->init_state == NULL
662 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
663 || dfa->init_state_begbuf == NULL, 0))
667 /* We assume front-end functions already check them. */
668 assert (start + range >= 0 && start + range <= length);
671 /* If initial states with non-begbuf contexts have no elements,
672 the regex must be anchored. If preg->newline_anchor is set,
673 we'll never use init_state_nl, so do not check it. */
674 if (dfa->init_state->nodes.nelem == 0
675 && dfa->init_state_word->nodes.nelem == 0
676 && (dfa->init_state_nl->nodes.nelem == 0
677 || !preg->newline_anchor))
679 if (start != 0 && start + range != 0)
684 /* We must check the longest matching, if nmatch > 0. */
685 fl_longest_match = (nmatch != 0 || dfa->nbackref);
687 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
688 preg->translate, preg->syntax & RE_ICASE, dfa);
689 if (BE (err != REG_NOERROR, 0))
691 mctx.input.stop = stop;
692 mctx.input.raw_stop = stop;
693 mctx.input.newline_anchor = preg->newline_anchor;
695 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
696 if (BE (err != REG_NOERROR, 0))
699 /* We will log all the DFA states through which the dfa pass,
700 if nmatch > 1, or this dfa has "multibyte node", which is a
701 back-reference or a node which can accept multibyte character or
702 multi character collating element. */
703 if (nmatch > 1 || dfa->has_mb_node)
705 /* Avoid overflow. */
706 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
712 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
713 if (BE (mctx.state_log == NULL, 0))
720 mctx.state_log = NULL;
723 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
724 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
726 /* Check incrementally whether of not the input string match. */
727 incr = (range < 0) ? -1 : 1;
728 left_lim = (range < 0) ? start + range : start;
729 right_lim = (range < 0) ? start : start + range;
730 sb = dfa->mb_cur_max == 1;
733 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
734 | (range >= 0 ? 2 : 0)
735 | (t != NULL ? 1 : 0))
738 for (;; match_first += incr)
741 if (match_first < left_lim || right_lim < match_first)
744 /* Advance as rapidly as possible through the string, until we
745 find a plausible place to start matching. This may be done
746 with varying efficiency, so there are various possibilities:
747 only the most common of them are specialized, in order to
748 save on code size. We use a switch statement for speed. */
756 /* Fastmap with single-byte translation, match forward. */
757 while (BE (match_first < right_lim, 1)
758 && !fastmap[t[(unsigned char) string[match_first]]])
760 goto forward_match_found_start_or_reached_end;
763 /* Fastmap without translation, match forward. */
764 while (BE (match_first < right_lim, 1)
765 && !fastmap[(unsigned char) string[match_first]])
768 forward_match_found_start_or_reached_end:
769 if (BE (match_first == right_lim, 0))
771 ch = match_first >= length
772 ? 0 : (unsigned char) string[match_first];
773 if (!fastmap[t ? t[ch] : ch])
780 /* Fastmap without multi-byte translation, match backwards. */
781 while (match_first >= left_lim)
783 ch = match_first >= length
784 ? 0 : (unsigned char) string[match_first];
785 if (fastmap[t ? t[ch] : ch])
789 if (match_first < left_lim)
794 /* In this case, we can't determine easily the current byte,
795 since it might be a component byte of a multibyte
796 character. Then we use the constructed buffer instead. */
799 /* If MATCH_FIRST is out of the valid range, reconstruct the
801 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
802 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
804 err = re_string_reconstruct (&mctx.input, match_first,
806 if (BE (err != REG_NOERROR, 0))
809 offset = match_first - mctx.input.raw_mbs_idx;
811 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
812 Note that MATCH_FIRST must not be smaller than 0. */
813 ch = (match_first >= length
814 ? 0 : re_string_byte_at (&mctx.input, offset));
818 if (match_first < left_lim || match_first > right_lim)
827 /* Reconstruct the buffers so that the matcher can assume that
828 the matching starts from the beginning of the buffer. */
829 err = re_string_reconstruct (&mctx.input, match_first, eflags);
830 if (BE (err != REG_NOERROR, 0))
833 #ifdef RE_ENABLE_I18N
834 /* Don't consider this char as a possible match start if it part,
835 yet isn't the head, of a multibyte character. */
836 if (!sb && !re_string_first_byte (&mctx.input, 0))
840 /* It seems to be appropriate one, then use the matcher. */
841 /* We assume that the matching starts from 0. */
842 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
843 match_last = check_matching (&mctx, fl_longest_match,
844 range >= 0 ? &match_first : NULL);
845 if (match_last != -1)
847 if (BE (match_last == -2, 0))
854 mctx.match_last = match_last;
855 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
857 re_dfastate_t *pstate = mctx.state_log[match_last];
858 mctx.last_node = check_halt_state_context (&mctx, pstate,
861 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
864 err = prune_impossible_nodes (&mctx);
865 if (err == REG_NOERROR)
867 if (BE (err != REG_NOMATCH, 0))
872 break; /* We found a match. */
876 match_ctx_clean (&mctx);
880 assert (match_last != -1);
881 assert (err == REG_NOERROR);
884 /* Set pmatch[] if we need. */
889 /* Initialize registers. */
890 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
891 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
893 /* Set the points where matching start/end. */
895 pmatch[0].rm_eo = mctx.match_last;
897 if (!preg->no_sub && nmatch > 1)
899 err = set_regs (preg, &mctx, nmatch, pmatch,
900 dfa->has_plural_match && dfa->nbackref > 0);
901 if (BE (err != REG_NOERROR, 0))
905 /* At last, add the offset to the each registers, since we slided
906 the buffers so that we could assume that the matching starts
908 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
909 if (pmatch[reg_idx].rm_so != -1)
911 #ifdef RE_ENABLE_I18N
912 if (BE (mctx.input.offsets_needed != 0, 0))
914 pmatch[reg_idx].rm_so =
915 (pmatch[reg_idx].rm_so == mctx.input.valid_len
916 ? mctx.input.valid_raw_len
917 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
918 pmatch[reg_idx].rm_eo =
919 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
920 ? mctx.input.valid_raw_len
921 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
924 assert (mctx.input.offsets_needed == 0);
926 pmatch[reg_idx].rm_so += match_first;
927 pmatch[reg_idx].rm_eo += match_first;
929 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
931 pmatch[nmatch + reg_idx].rm_so = -1;
932 pmatch[nmatch + reg_idx].rm_eo = -1;
936 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
937 if (dfa->subexp_map[reg_idx] != reg_idx)
939 pmatch[reg_idx + 1].rm_so
940 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
941 pmatch[reg_idx + 1].rm_eo
942 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
947 re_free (mctx.state_log);
949 match_ctx_free (&mctx);
950 re_string_destruct (&mctx.input);
955 __attribute_warn_unused_result__
956 prune_impossible_nodes (mctx)
957 re_match_context_t *mctx;
959 const re_dfa_t *const dfa = mctx->dfa;
960 int halt_node, match_last;
962 re_dfastate_t **sifted_states;
963 re_dfastate_t **lim_states = NULL;
964 re_sift_context_t sctx;
966 assert (mctx->state_log != NULL);
968 match_last = mctx->match_last;
969 halt_node = mctx->last_node;
971 /* Avoid overflow. */
972 if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
975 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
976 if (BE (sifted_states == NULL, 0))
983 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
984 if (BE (lim_states == NULL, 0))
991 memset (lim_states, '\0',
992 sizeof (re_dfastate_t *) * (match_last + 1));
993 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
995 ret = sift_states_backward (mctx, &sctx);
996 re_node_set_free (&sctx.limits);
997 if (BE (ret != REG_NOERROR, 0))
999 if (sifted_states[0] != NULL || lim_states[0] != NULL)
1009 } while (mctx->state_log[match_last] == NULL
1010 || !mctx->state_log[match_last]->halt);
1011 halt_node = check_halt_state_context (mctx,
1012 mctx->state_log[match_last],
1015 ret = merge_state_array (dfa, sifted_states, lim_states,
1017 re_free (lim_states);
1019 if (BE (ret != REG_NOERROR, 0))
1024 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1025 ret = sift_states_backward (mctx, &sctx);
1026 re_node_set_free (&sctx.limits);
1027 if (BE (ret != REG_NOERROR, 0))
1029 if (sifted_states[0] == NULL)
1035 re_free (mctx->state_log);
1036 mctx->state_log = sifted_states;
1037 sifted_states = NULL;
1038 mctx->last_node = halt_node;
1039 mctx->match_last = match_last;
1042 re_free (sifted_states);
1043 re_free (lim_states);
1047 /* Acquire an initial state and return it.
1048 We must select appropriate initial state depending on the context,
1049 since initial states may have constraints like "\<", "^", etc.. */
1051 static inline re_dfastate_t *
1052 __attribute ((always_inline)) internal_function
1053 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1056 const re_dfa_t *const dfa = mctx->dfa;
1057 if (dfa->init_state->has_constraint)
1059 unsigned int context;
1060 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1061 if (IS_WORD_CONTEXT (context))
1062 return dfa->init_state_word;
1063 else if (IS_ORDINARY_CONTEXT (context))
1064 return dfa->init_state;
1065 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1066 return dfa->init_state_begbuf;
1067 else if (IS_NEWLINE_CONTEXT (context))
1068 return dfa->init_state_nl;
1069 else if (IS_BEGBUF_CONTEXT (context))
1071 /* It is relatively rare case, then calculate on demand. */
1072 return re_acquire_state_context (err, dfa,
1073 dfa->init_state->entrance_nodes,
1077 /* Must not happen? */
1078 return dfa->init_state;
1081 return dfa->init_state;
1084 /* Check whether the regular expression match input string INPUT or not,
1085 and return the index where the matching end, return -1 if not match,
1086 or return -2 in case of an error.
1087 FL_LONGEST_MATCH means we want the POSIX longest matching.
1088 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1089 next place where we may want to try matching.
1090 Note that the matcher assume that the maching starts from the current
1091 index of the buffer. */
1094 internal_function __attribute_warn_unused_result__
1095 check_matching (re_match_context_t *mctx, int fl_longest_match,
1098 const re_dfa_t *const dfa = mctx->dfa;
1101 int match_last = -1;
1102 int cur_str_idx = re_string_cur_idx (&mctx->input);
1103 re_dfastate_t *cur_state;
1104 int at_init_state = p_match_first != NULL;
1105 int next_start_idx = cur_str_idx;
1108 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1109 /* An initial state must not be NULL (invalid). */
1110 if (BE (cur_state == NULL, 0))
1112 assert (err == REG_ESPACE);
1116 if (mctx->state_log != NULL)
1118 mctx->state_log[cur_str_idx] = cur_state;
1120 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1121 later. E.g. Processing back references. */
1122 if (BE (dfa->nbackref, 0))
1125 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1126 if (BE (err != REG_NOERROR, 0))
1129 if (cur_state->has_backref)
1131 err = transit_state_bkref (mctx, &cur_state->nodes);
1132 if (BE (err != REG_NOERROR, 0))
1138 /* If the RE accepts NULL string. */
1139 if (BE (cur_state->halt, 0))
1141 if (!cur_state->has_constraint
1142 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1144 if (!fl_longest_match)
1148 match_last = cur_str_idx;
1154 while (!re_string_eoi (&mctx->input))
1156 re_dfastate_t *old_state = cur_state;
1157 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1159 if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1160 && mctx->input.bufs_len < mctx->input.len)
1161 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1162 && mctx->input.valid_len < mctx->input.len))
1164 err = extend_buffers (mctx);
1165 if (BE (err != REG_NOERROR, 0))
1167 assert (err == REG_ESPACE);
1172 cur_state = transit_state (&err, mctx, cur_state);
1173 if (mctx->state_log != NULL)
1174 cur_state = merge_state_with_log (&err, mctx, cur_state);
1176 if (cur_state == NULL)
1178 /* Reached the invalid state or an error. Try to recover a valid
1179 state using the state log, if available and if we have not
1180 already found a valid (even if not the longest) match. */
1181 if (BE (err != REG_NOERROR, 0))
1184 if (mctx->state_log == NULL
1185 || (match && !fl_longest_match)
1186 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1190 if (BE (at_init_state, 0))
1192 if (old_state == cur_state)
1193 next_start_idx = next_char_idx;
1198 if (cur_state->halt)
1200 /* Reached a halt state.
1201 Check the halt state can satisfy the current context. */
1202 if (!cur_state->has_constraint
1203 || check_halt_state_context (mctx, cur_state,
1204 re_string_cur_idx (&mctx->input)))
1206 /* We found an appropriate halt state. */
1207 match_last = re_string_cur_idx (&mctx->input);
1210 /* We found a match, do not modify match_first below. */
1211 p_match_first = NULL;
1212 if (!fl_longest_match)
1219 *p_match_first += next_start_idx;
1224 /* Check NODE match the current context. */
1228 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1230 re_token_type_t type = dfa->nodes[node].type;
1231 unsigned int constraint = dfa->nodes[node].constraint;
1232 if (type != END_OF_RE)
1236 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1241 /* Check the halt state STATE match the current context.
1242 Return 0 if not match, if the node, STATE has, is a halt node and
1243 match the context, return the node. */
1247 check_halt_state_context (const re_match_context_t *mctx,
1248 const re_dfastate_t *state, int idx)
1251 unsigned int context;
1253 assert (state->halt);
1255 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1256 for (i = 0; i < state->nodes.nelem; ++i)
1257 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1258 return state->nodes.elems[i];
1262 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1263 corresponding to the DFA).
1264 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1269 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1270 int *pidx, int node, re_node_set *eps_via_nodes,
1271 struct re_fail_stack_t *fs)
1273 const re_dfa_t *const dfa = mctx->dfa;
1275 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1277 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1278 re_node_set *edests = &dfa->edests[node];
1280 err = re_node_set_insert (eps_via_nodes, node);
1281 if (BE (err < 0, 0))
1283 /* Pick up a valid destination, or return -1 if none is found. */
1284 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1286 int candidate = edests->elems[i];
1287 if (!re_node_set_contains (cur_nodes, candidate))
1289 if (dest_node == -1)
1290 dest_node = candidate;
1294 /* In order to avoid infinite loop like "(a*)*", return the second
1295 epsilon-transition if the first was already considered. */
1296 if (re_node_set_contains (eps_via_nodes, dest_node))
1299 /* Otherwise, push the second epsilon-transition on the fail stack. */
1301 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1305 /* We know we are going to exit. */
1314 re_token_type_t type = dfa->nodes[node].type;
1316 #ifdef RE_ENABLE_I18N
1317 if (dfa->nodes[node].accept_mb)
1318 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1320 #endif /* RE_ENABLE_I18N */
1321 if (type == OP_BACK_REF)
1323 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1324 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1327 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1331 char *buf = (char *) re_string_get_buffer (&mctx->input);
1332 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1341 err = re_node_set_insert (eps_via_nodes, node);
1342 if (BE (err < 0, 0))
1344 dest_node = dfa->edests[node].elems[0];
1345 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1352 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1354 int dest_node = dfa->nexts[node];
1355 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1356 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1357 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1360 re_node_set_empty (eps_via_nodes);
1367 static reg_errcode_t
1368 internal_function __attribute_warn_unused_result__
1369 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1370 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1373 int num = fs->num++;
1374 if (fs->num == fs->alloc)
1376 struct re_fail_stack_ent_t *new_array;
1377 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1379 if (new_array == NULL)
1382 fs->stack = new_array;
1384 fs->stack[num].idx = str_idx;
1385 fs->stack[num].node = dest_node;
1386 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1387 if (fs->stack[num].regs == NULL)
1389 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1390 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1396 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1397 regmatch_t *regs, re_node_set *eps_via_nodes)
1399 int num = --fs->num;
1401 *pidx = fs->stack[num].idx;
1402 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1403 re_node_set_free (eps_via_nodes);
1404 re_free (fs->stack[num].regs);
1405 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1406 return fs->stack[num].node;
1409 /* Set the positions where the subexpressions are starts/ends to registers
1411 Note: We assume that pmatch[0] is already set, and
1412 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1414 static reg_errcode_t
1415 internal_function __attribute_warn_unused_result__
1416 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1417 regmatch_t *pmatch, int fl_backtrack)
1419 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1421 re_node_set eps_via_nodes;
1422 struct re_fail_stack_t *fs;
1423 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1424 regmatch_t *prev_idx_match;
1425 int prev_idx_match_malloced = 0;
1428 assert (nmatch > 1);
1429 assert (mctx->state_log != NULL);
1434 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1435 if (fs->stack == NULL)
1441 cur_node = dfa->init_node;
1442 re_node_set_init_empty (&eps_via_nodes);
1444 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1445 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1448 prev_idx_match = re_malloc (regmatch_t, nmatch);
1449 if (prev_idx_match == NULL)
1451 free_fail_stack_return (fs);
1454 prev_idx_match_malloced = 1;
1456 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1458 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1460 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1462 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1467 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1468 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1470 if (reg_idx == nmatch)
1472 re_node_set_free (&eps_via_nodes);
1473 if (prev_idx_match_malloced)
1474 re_free (prev_idx_match);
1475 return free_fail_stack_return (fs);
1477 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1482 re_node_set_free (&eps_via_nodes);
1483 if (prev_idx_match_malloced)
1484 re_free (prev_idx_match);
1489 /* Proceed to next node. */
1490 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1491 &eps_via_nodes, fs);
1493 if (BE (cur_node < 0, 0))
1495 if (BE (cur_node == -2, 0))
1497 re_node_set_free (&eps_via_nodes);
1498 if (prev_idx_match_malloced)
1499 re_free (prev_idx_match);
1500 free_fail_stack_return (fs);
1504 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1508 re_node_set_free (&eps_via_nodes);
1509 if (prev_idx_match_malloced)
1510 re_free (prev_idx_match);
1515 re_node_set_free (&eps_via_nodes);
1516 if (prev_idx_match_malloced)
1517 re_free (prev_idx_match);
1518 return free_fail_stack_return (fs);
1521 static reg_errcode_t
1523 free_fail_stack_return (struct re_fail_stack_t *fs)
1528 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1530 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1531 re_free (fs->stack[fs_idx].regs);
1533 re_free (fs->stack);
1540 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1541 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1543 int type = dfa->nodes[cur_node].type;
1544 if (type == OP_OPEN_SUBEXP)
1546 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1548 /* We are at the first node of this sub expression. */
1549 if (reg_num < nmatch)
1551 pmatch[reg_num].rm_so = cur_idx;
1552 pmatch[reg_num].rm_eo = -1;
1555 else if (type == OP_CLOSE_SUBEXP)
1557 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1558 if (reg_num < nmatch)
1560 /* We are at the last node of this sub expression. */
1561 if (pmatch[reg_num].rm_so < cur_idx)
1563 pmatch[reg_num].rm_eo = cur_idx;
1564 /* This is a non-empty match or we are not inside an optional
1565 subexpression. Accept this right away. */
1566 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1570 if (dfa->nodes[cur_node].opt_subexp
1571 && prev_idx_match[reg_num].rm_so != -1)
1572 /* We transited through an empty match for an optional
1573 subexpression, like (a?)*, and this is not the subexp's
1574 first match. Copy back the old content of the registers
1575 so that matches of an inner subexpression are undone as
1576 well, like in ((a?))*. */
1577 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1579 /* We completed a subexpression, but it may be part of
1580 an optional one, so do not update PREV_IDX_MATCH. */
1581 pmatch[reg_num].rm_eo = cur_idx;
1587 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1588 and sift the nodes in each states according to the following rules.
1589 Updated state_log will be wrote to STATE_LOG.
1591 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1592 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1593 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1594 the LAST_NODE, we throw away the node `a'.
1595 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1596 string `s' and transit to `b':
1597 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1599 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1600 thrown away, we throw away the node `a'.
1601 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1602 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1604 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1605 we throw away the node `a'. */
1607 #define STATE_NODE_CONTAINS(state,node) \
1608 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1610 static reg_errcode_t
1612 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1616 int str_idx = sctx->last_str_idx;
1617 re_node_set cur_dest;
1620 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1623 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1624 transit to the last_node and the last_node itself. */
1625 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1626 if (BE (err != REG_NOERROR, 0))
1628 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1629 if (BE (err != REG_NOERROR, 0))
1632 /* Then check each states in the state_log. */
1635 /* Update counters. */
1636 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1637 if (null_cnt > mctx->max_mb_elem_len)
1639 memset (sctx->sifted_states, '\0',
1640 sizeof (re_dfastate_t *) * str_idx);
1641 re_node_set_free (&cur_dest);
1644 re_node_set_empty (&cur_dest);
1647 if (mctx->state_log[str_idx])
1649 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1650 if (BE (err != REG_NOERROR, 0))
1654 /* Add all the nodes which satisfy the following conditions:
1655 - It can epsilon transit to a node in CUR_DEST.
1657 And update state_log. */
1658 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1659 if (BE (err != REG_NOERROR, 0))
1664 re_node_set_free (&cur_dest);
1668 static reg_errcode_t
1669 internal_function __attribute_warn_unused_result__
1670 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1671 int str_idx, re_node_set *cur_dest)
1673 const re_dfa_t *const dfa = mctx->dfa;
1674 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1677 /* Then build the next sifted state.
1678 We build the next sifted state on `cur_dest', and update
1679 `sifted_states[str_idx]' with `cur_dest'.
1681 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1682 `cur_src' points the node_set of the old `state_log[str_idx]'
1683 (with the epsilon nodes pre-filtered out). */
1684 for (i = 0; i < cur_src->nelem; i++)
1686 int prev_node = cur_src->elems[i];
1691 re_token_type_t type = dfa->nodes[prev_node].type;
1692 assert (!IS_EPSILON_NODE (type));
1694 #ifdef RE_ENABLE_I18N
1695 /* If the node may accept `multi byte'. */
1696 if (dfa->nodes[prev_node].accept_mb)
1697 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1698 str_idx, sctx->last_str_idx);
1699 #endif /* RE_ENABLE_I18N */
1701 /* We don't check backreferences here.
1702 See update_cur_sifted_state(). */
1704 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1705 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1706 dfa->nexts[prev_node]))
1712 if (sctx->limits.nelem)
1714 int to_idx = str_idx + naccepted;
1715 if (check_dst_limits (mctx, &sctx->limits,
1716 dfa->nexts[prev_node], to_idx,
1717 prev_node, str_idx))
1720 ret = re_node_set_insert (cur_dest, prev_node);
1721 if (BE (ret == -1, 0))
1728 /* Helper functions. */
1730 static reg_errcode_t
1732 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1734 int top = mctx->state_log_top;
1736 if ((next_state_log_idx >= mctx->input.bufs_len
1737 && mctx->input.bufs_len < mctx->input.len)
1738 || (next_state_log_idx >= mctx->input.valid_len
1739 && mctx->input.valid_len < mctx->input.len))
1742 err = extend_buffers (mctx);
1743 if (BE (err != REG_NOERROR, 0))
1747 if (top < next_state_log_idx)
1749 memset (mctx->state_log + top + 1, '\0',
1750 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1751 mctx->state_log_top = next_state_log_idx;
1756 static reg_errcode_t
1758 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1759 re_dfastate_t **src, int num)
1763 for (st_idx = 0; st_idx < num; ++st_idx)
1765 if (dst[st_idx] == NULL)
1766 dst[st_idx] = src[st_idx];
1767 else if (src[st_idx] != NULL)
1769 re_node_set merged_set;
1770 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1771 &src[st_idx]->nodes);
1772 if (BE (err != REG_NOERROR, 0))
1774 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1775 re_node_set_free (&merged_set);
1776 if (BE (err != REG_NOERROR, 0))
1783 static reg_errcode_t
1785 update_cur_sifted_state (const re_match_context_t *mctx,
1786 re_sift_context_t *sctx, int str_idx,
1787 re_node_set *dest_nodes)
1789 const re_dfa_t *const dfa = mctx->dfa;
1790 reg_errcode_t err = REG_NOERROR;
1791 const re_node_set *candidates;
1792 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1793 : &mctx->state_log[str_idx]->nodes);
1795 if (dest_nodes->nelem == 0)
1796 sctx->sifted_states[str_idx] = NULL;
1801 /* At first, add the nodes which can epsilon transit to a node in
1803 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1804 if (BE (err != REG_NOERROR, 0))
1807 /* Then, check the limitations in the current sift_context. */
1808 if (sctx->limits.nelem)
1810 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1811 mctx->bkref_ents, str_idx);
1812 if (BE (err != REG_NOERROR, 0))
1817 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1818 if (BE (err != REG_NOERROR, 0))
1822 if (candidates && mctx->state_log[str_idx]->has_backref)
1824 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1825 if (BE (err != REG_NOERROR, 0))
1831 static reg_errcode_t
1832 internal_function __attribute_warn_unused_result__
1833 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1834 const re_node_set *candidates)
1836 reg_errcode_t err = REG_NOERROR;
1839 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1840 if (BE (err != REG_NOERROR, 0))
1843 if (!state->inveclosure.alloc)
1845 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1846 if (BE (err != REG_NOERROR, 0))
1848 for (i = 0; i < dest_nodes->nelem; i++)
1850 err = re_node_set_merge (&state->inveclosure,
1851 dfa->inveclosures + dest_nodes->elems[i]);
1852 if (BE (err != REG_NOERROR, 0))
1856 return re_node_set_add_intersect (dest_nodes, candidates,
1857 &state->inveclosure);
1860 static reg_errcode_t
1862 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1863 const re_node_set *candidates)
1867 re_node_set *inv_eclosure = dfa->inveclosures + node;
1868 re_node_set except_nodes;
1869 re_node_set_init_empty (&except_nodes);
1870 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1872 int cur_node = inv_eclosure->elems[ecl_idx];
1873 if (cur_node == node)
1875 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1877 int edst1 = dfa->edests[cur_node].elems[0];
1878 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1879 ? dfa->edests[cur_node].elems[1] : -1);
1880 if ((!re_node_set_contains (inv_eclosure, edst1)
1881 && re_node_set_contains (dest_nodes, edst1))
1883 && !re_node_set_contains (inv_eclosure, edst2)
1884 && re_node_set_contains (dest_nodes, edst2)))
1886 err = re_node_set_add_intersect (&except_nodes, candidates,
1887 dfa->inveclosures + cur_node);
1888 if (BE (err != REG_NOERROR, 0))
1890 re_node_set_free (&except_nodes);
1896 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1898 int cur_node = inv_eclosure->elems[ecl_idx];
1899 if (!re_node_set_contains (&except_nodes, cur_node))
1901 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1902 re_node_set_remove_at (dest_nodes, idx);
1905 re_node_set_free (&except_nodes);
1911 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1912 int dst_node, int dst_idx, int src_node, int src_idx)
1914 const re_dfa_t *const dfa = mctx->dfa;
1915 int lim_idx, src_pos, dst_pos;
1917 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1918 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1919 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1922 struct re_backref_cache_entry *ent;
1923 ent = mctx->bkref_ents + limits->elems[lim_idx];
1924 subexp_idx = dfa->nodes[ent->node].opr.idx;
1926 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1927 subexp_idx, dst_node, dst_idx,
1929 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1930 subexp_idx, src_node, src_idx,
1934 <src> <dst> ( <subexp> )
1935 ( <subexp> ) <src> <dst>
1936 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1937 if (src_pos == dst_pos)
1938 continue; /* This is unrelated limitation. */
1947 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1948 int subexp_idx, int from_node, int bkref_idx)
1950 const re_dfa_t *const dfa = mctx->dfa;
1951 const re_node_set *eclosures = dfa->eclosures + from_node;
1954 /* Else, we are on the boundary: examine the nodes on the epsilon
1956 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1958 int node = eclosures->elems[node_idx];
1959 switch (dfa->nodes[node].type)
1962 if (bkref_idx != -1)
1964 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1969 if (ent->node != node)
1972 if (subexp_idx < BITSET_WORD_BITS
1973 && !(ent->eps_reachable_subexps_map
1974 & ((bitset_word_t) 1 << subexp_idx)))
1977 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1978 OP_CLOSE_SUBEXP cases below. But, if the
1979 destination node is the same node as the source
1980 node, don't recurse because it would cause an
1981 infinite loop: a regex that exhibits this behavior
1983 dst = dfa->edests[node].elems[0];
1984 if (dst == from_node)
1988 else /* if (boundaries & 2) */
1993 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1995 if (cpos == -1 /* && (boundaries & 1) */)
1997 if (cpos == 0 && (boundaries & 2))
2000 if (subexp_idx < BITSET_WORD_BITS)
2001 ent->eps_reachable_subexps_map
2002 &= ~((bitset_word_t) 1 << subexp_idx);
2004 while (ent++->more);
2008 case OP_OPEN_SUBEXP:
2009 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2013 case OP_CLOSE_SUBEXP:
2014 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2023 return (boundaries & 2) ? 1 : 0;
2028 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2029 int subexp_idx, int from_node, int str_idx,
2032 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2035 /* If we are outside the range of the subexpression, return -1 or 1. */
2036 if (str_idx < lim->subexp_from)
2039 if (lim->subexp_to < str_idx)
2042 /* If we are within the subexpression, return 0. */
2043 boundaries = (str_idx == lim->subexp_from);
2044 boundaries |= (str_idx == lim->subexp_to) << 1;
2045 if (boundaries == 0)
2048 /* Else, examine epsilon closure. */
2049 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2050 from_node, bkref_idx);
2053 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2054 which are against limitations from DEST_NODES. */
2056 static reg_errcode_t
2058 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2059 const re_node_set *candidates, re_node_set *limits,
2060 struct re_backref_cache_entry *bkref_ents, int str_idx)
2063 int node_idx, lim_idx;
2065 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2068 struct re_backref_cache_entry *ent;
2069 ent = bkref_ents + limits->elems[lim_idx];
2071 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2072 continue; /* This is unrelated limitation. */
2074 subexp_idx = dfa->nodes[ent->node].opr.idx;
2075 if (ent->subexp_to == str_idx)
2079 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2081 int node = dest_nodes->elems[node_idx];
2082 re_token_type_t type = dfa->nodes[node].type;
2083 if (type == OP_OPEN_SUBEXP
2084 && subexp_idx == dfa->nodes[node].opr.idx)
2086 else if (type == OP_CLOSE_SUBEXP
2087 && subexp_idx == dfa->nodes[node].opr.idx)
2091 /* Check the limitation of the open subexpression. */
2092 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2095 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2097 if (BE (err != REG_NOERROR, 0))
2101 /* Check the limitation of the close subexpression. */
2103 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2105 int node = dest_nodes->elems[node_idx];
2106 if (!re_node_set_contains (dfa->inveclosures + node,
2108 && !re_node_set_contains (dfa->eclosures + node,
2111 /* It is against this limitation.
2112 Remove it form the current sifted state. */
2113 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2115 if (BE (err != REG_NOERROR, 0))
2121 else /* (ent->subexp_to != str_idx) */
2123 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2125 int node = dest_nodes->elems[node_idx];
2126 re_token_type_t type = dfa->nodes[node].type;
2127 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2129 if (subexp_idx != dfa->nodes[node].opr.idx)
2131 /* It is against this limitation.
2132 Remove it form the current sifted state. */
2133 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2135 if (BE (err != REG_NOERROR, 0))
2144 static reg_errcode_t
2145 internal_function __attribute_warn_unused_result__
2146 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2147 int str_idx, const re_node_set *candidates)
2149 const re_dfa_t *const dfa = mctx->dfa;
2152 re_sift_context_t local_sctx;
2153 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2155 if (first_idx == -1)
2158 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2160 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2163 re_token_type_t type;
2164 struct re_backref_cache_entry *entry;
2165 node = candidates->elems[node_idx];
2166 type = dfa->nodes[node].type;
2167 /* Avoid infinite loop for the REs like "()\1+". */
2168 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2170 if (type != OP_BACK_REF)
2173 entry = mctx->bkref_ents + first_idx;
2174 enabled_idx = first_idx;
2181 re_dfastate_t *cur_state;
2183 if (entry->node != node)
2185 subexp_len = entry->subexp_to - entry->subexp_from;
2186 to_idx = str_idx + subexp_len;
2187 dst_node = (subexp_len ? dfa->nexts[node]
2188 : dfa->edests[node].elems[0]);
2190 if (to_idx > sctx->last_str_idx
2191 || sctx->sifted_states[to_idx] == NULL
2192 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2193 || check_dst_limits (mctx, &sctx->limits, node,
2194 str_idx, dst_node, to_idx))
2197 if (local_sctx.sifted_states == NULL)
2200 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2201 if (BE (err != REG_NOERROR, 0))
2204 local_sctx.last_node = node;
2205 local_sctx.last_str_idx = str_idx;
2206 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2207 if (BE (ret < 0, 0))
2212 cur_state = local_sctx.sifted_states[str_idx];
2213 err = sift_states_backward (mctx, &local_sctx);
2214 if (BE (err != REG_NOERROR, 0))
2216 if (sctx->limited_states != NULL)
2218 err = merge_state_array (dfa, sctx->limited_states,
2219 local_sctx.sifted_states,
2221 if (BE (err != REG_NOERROR, 0))
2224 local_sctx.sifted_states[str_idx] = cur_state;
2225 re_node_set_remove (&local_sctx.limits, enabled_idx);
2227 /* mctx->bkref_ents may have changed, reload the pointer. */
2228 entry = mctx->bkref_ents + enabled_idx;
2230 while (enabled_idx++, entry++->more);
2234 if (local_sctx.sifted_states != NULL)
2236 re_node_set_free (&local_sctx.limits);
2243 #ifdef RE_ENABLE_I18N
2246 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2247 int node_idx, int str_idx, int max_str_idx)
2249 const re_dfa_t *const dfa = mctx->dfa;
2251 /* Check the node can accept `multi byte'. */
2252 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2253 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2254 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2255 dfa->nexts[node_idx]))
2256 /* The node can't accept the `multi byte', or the
2257 destination was already thrown away, then the node
2258 could't accept the current input `multi byte'. */
2260 /* Otherwise, it is sure that the node could accept
2261 `naccepted' bytes input. */
2264 #endif /* RE_ENABLE_I18N */
2267 /* Functions for state transition. */
2269 /* Return the next state to which the current state STATE will transit by
2270 accepting the current input byte, and update STATE_LOG if necessary.
2271 If STATE can accept a multibyte char/collating element/back reference
2272 update the destination of STATE_LOG. */
2274 static re_dfastate_t *
2275 internal_function __attribute_warn_unused_result__
2276 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2277 re_dfastate_t *state)
2279 re_dfastate_t **trtable;
2282 #ifdef RE_ENABLE_I18N
2283 /* If the current state can accept multibyte. */
2284 if (BE (state->accept_mb, 0))
2286 *err = transit_state_mb (mctx, state);
2287 if (BE (*err != REG_NOERROR, 0))
2290 #endif /* RE_ENABLE_I18N */
2292 /* Then decide the next state with the single byte. */
2295 /* don't use transition table */
2296 return transit_state_sb (err, mctx, state);
2299 /* Use transition table */
2300 ch = re_string_fetch_byte (&mctx->input);
2303 trtable = state->trtable;
2304 if (BE (trtable != NULL, 1))
2307 trtable = state->word_trtable;
2308 if (BE (trtable != NULL, 1))
2310 unsigned int context;
2312 = re_string_context_at (&mctx->input,
2313 re_string_cur_idx (&mctx->input) - 1,
2315 if (IS_WORD_CONTEXT (context))
2316 return trtable[ch + SBC_MAX];
2321 if (!build_trtable (mctx->dfa, state))
2327 /* Retry, we now have a transition table. */
2331 /* Update the state_log if we need */
2334 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2335 re_dfastate_t *next_state)
2337 const re_dfa_t *const dfa = mctx->dfa;
2338 int cur_idx = re_string_cur_idx (&mctx->input);
2340 if (cur_idx > mctx->state_log_top)
2342 mctx->state_log[cur_idx] = next_state;
2343 mctx->state_log_top = cur_idx;
2345 else if (mctx->state_log[cur_idx] == 0)
2347 mctx->state_log[cur_idx] = next_state;
2351 re_dfastate_t *pstate;
2352 unsigned int context;
2353 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2354 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2355 the destination of a multibyte char/collating element/
2356 back reference. Then the next state is the union set of
2357 these destinations and the results of the transition table. */
2358 pstate = mctx->state_log[cur_idx];
2359 log_nodes = pstate->entrance_nodes;
2360 if (next_state != NULL)
2362 table_nodes = next_state->entrance_nodes;
2363 *err = re_node_set_init_union (&next_nodes, table_nodes,
2365 if (BE (*err != REG_NOERROR, 0))
2369 next_nodes = *log_nodes;
2370 /* Note: We already add the nodes of the initial state,
2371 then we don't need to add them here. */
2373 context = re_string_context_at (&mctx->input,
2374 re_string_cur_idx (&mctx->input) - 1,
2376 next_state = mctx->state_log[cur_idx]
2377 = re_acquire_state_context (err, dfa, &next_nodes, context);
2378 /* We don't need to check errors here, since the return value of
2379 this function is next_state and ERR is already set. */
2381 if (table_nodes != NULL)
2382 re_node_set_free (&next_nodes);
2385 if (BE (dfa->nbackref, 0) && next_state != NULL)
2387 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2388 later. We must check them here, since the back references in the
2389 next state might use them. */
2390 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2392 if (BE (*err != REG_NOERROR, 0))
2395 /* If the next state has back references. */
2396 if (next_state->has_backref)
2398 *err = transit_state_bkref (mctx, &next_state->nodes);
2399 if (BE (*err != REG_NOERROR, 0))
2401 next_state = mctx->state_log[cur_idx];
2408 /* Skip bytes in the input that correspond to part of a
2409 multi-byte match, then look in the log for a state
2410 from which to restart matching. */
2413 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2415 re_dfastate_t *cur_state;
2418 int max = mctx->state_log_top;
2419 int cur_str_idx = re_string_cur_idx (&mctx->input);
2423 if (++cur_str_idx > max)
2425 re_string_skip_bytes (&mctx->input, 1);
2427 while (mctx->state_log[cur_str_idx] == NULL);
2429 cur_state = merge_state_with_log (err, mctx, NULL);
2431 while (*err == REG_NOERROR && cur_state == NULL);
2435 /* Helper functions for transit_state. */
2437 /* From the node set CUR_NODES, pick up the nodes whose types are
2438 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2439 expression. And register them to use them later for evaluating the
2440 correspoding back references. */
2442 static reg_errcode_t
2444 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2447 const re_dfa_t *const dfa = mctx->dfa;
2451 /* TODO: This isn't efficient.
2452 Because there might be more than one nodes whose types are
2453 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2456 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2458 int node = cur_nodes->elems[node_idx];
2459 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2460 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2461 && (dfa->used_bkref_map
2462 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2464 err = match_ctx_add_subtop (mctx, node, str_idx);
2465 if (BE (err != REG_NOERROR, 0))
2473 /* Return the next state to which the current state STATE will transit by
2474 accepting the current input byte. */
2476 static re_dfastate_t *
2477 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2478 re_dfastate_t *state)
2480 const re_dfa_t *const dfa = mctx->dfa;
2481 re_node_set next_nodes;
2482 re_dfastate_t *next_state;
2483 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2484 unsigned int context;
2486 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2487 if (BE (*err != REG_NOERROR, 0))
2489 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2491 int cur_node = state->nodes.elems[node_cnt];
2492 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2494 *err = re_node_set_merge (&next_nodes,
2495 dfa->eclosures + dfa->nexts[cur_node]);
2496 if (BE (*err != REG_NOERROR, 0))
2498 re_node_set_free (&next_nodes);
2503 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2504 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2505 /* We don't need to check errors here, since the return value of
2506 this function is next_state and ERR is already set. */
2508 re_node_set_free (&next_nodes);
2509 re_string_skip_bytes (&mctx->input, 1);
2514 #ifdef RE_ENABLE_I18N
2515 static reg_errcode_t
2517 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2519 const re_dfa_t *const dfa = mctx->dfa;
2523 for (i = 0; i < pstate->nodes.nelem; ++i)
2525 re_node_set dest_nodes, *new_nodes;
2526 int cur_node_idx = pstate->nodes.elems[i];
2527 int naccepted, dest_idx;
2528 unsigned int context;
2529 re_dfastate_t *dest_state;
2531 if (!dfa->nodes[cur_node_idx].accept_mb)
2534 if (dfa->nodes[cur_node_idx].constraint)
2536 context = re_string_context_at (&mctx->input,
2537 re_string_cur_idx (&mctx->input),
2539 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2544 /* How many bytes the node can accept? */
2545 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2546 re_string_cur_idx (&mctx->input));
2550 /* The node can accepts `naccepted' bytes. */
2551 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2552 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2553 : mctx->max_mb_elem_len);
2554 err = clean_state_log_if_needed (mctx, dest_idx);
2555 if (BE (err != REG_NOERROR, 0))
2558 assert (dfa->nexts[cur_node_idx] != -1);
2560 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2562 dest_state = mctx->state_log[dest_idx];
2563 if (dest_state == NULL)
2564 dest_nodes = *new_nodes;
2567 err = re_node_set_init_union (&dest_nodes,
2568 dest_state->entrance_nodes, new_nodes);
2569 if (BE (err != REG_NOERROR, 0))
2572 context = re_string_context_at (&mctx->input, dest_idx - 1,
2574 mctx->state_log[dest_idx]
2575 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2576 if (dest_state != NULL)
2577 re_node_set_free (&dest_nodes);
2578 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2583 #endif /* RE_ENABLE_I18N */
2585 static reg_errcode_t
2587 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2589 const re_dfa_t *const dfa = mctx->dfa;
2592 int cur_str_idx = re_string_cur_idx (&mctx->input);
2594 for (i = 0; i < nodes->nelem; ++i)
2596 int dest_str_idx, prev_nelem, bkc_idx;
2597 int node_idx = nodes->elems[i];
2598 unsigned int context;
2599 const re_token_t *node = dfa->nodes + node_idx;
2600 re_node_set *new_dest_nodes;
2602 /* Check whether `node' is a backreference or not. */
2603 if (node->type != OP_BACK_REF)
2606 if (node->constraint)
2608 context = re_string_context_at (&mctx->input, cur_str_idx,
2610 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2614 /* `node' is a backreference.
2615 Check the substring which the substring matched. */
2616 bkc_idx = mctx->nbkref_ents;
2617 err = get_subexp (mctx, node_idx, cur_str_idx);
2618 if (BE (err != REG_NOERROR, 0))
2621 /* And add the epsilon closures (which is `new_dest_nodes') of
2622 the backreference to appropriate state_log. */
2624 assert (dfa->nexts[node_idx] != -1);
2626 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2629 re_dfastate_t *dest_state;
2630 struct re_backref_cache_entry *bkref_ent;
2631 bkref_ent = mctx->bkref_ents + bkc_idx;
2632 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2634 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2635 new_dest_nodes = (subexp_len == 0
2636 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2637 : dfa->eclosures + dfa->nexts[node_idx]);
2638 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2639 - bkref_ent->subexp_from);
2640 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2642 dest_state = mctx->state_log[dest_str_idx];
2643 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2644 : mctx->state_log[cur_str_idx]->nodes.nelem);
2645 /* Add `new_dest_node' to state_log. */
2646 if (dest_state == NULL)
2648 mctx->state_log[dest_str_idx]
2649 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2651 if (BE (mctx->state_log[dest_str_idx] == NULL
2652 && err != REG_NOERROR, 0))
2657 re_node_set dest_nodes;
2658 err = re_node_set_init_union (&dest_nodes,
2659 dest_state->entrance_nodes,
2661 if (BE (err != REG_NOERROR, 0))
2663 re_node_set_free (&dest_nodes);
2666 mctx->state_log[dest_str_idx]
2667 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2668 re_node_set_free (&dest_nodes);
2669 if (BE (mctx->state_log[dest_str_idx] == NULL
2670 && err != REG_NOERROR, 0))
2673 /* We need to check recursively if the backreference can epsilon
2676 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2678 err = check_subexp_matching_top (mctx, new_dest_nodes,
2680 if (BE (err != REG_NOERROR, 0))
2682 err = transit_state_bkref (mctx, new_dest_nodes);
2683 if (BE (err != REG_NOERROR, 0))
2693 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2694 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2695 Note that we might collect inappropriate candidates here.
2696 However, the cost of checking them strictly here is too high, then we
2697 delay these checking for prune_impossible_nodes(). */
2699 static reg_errcode_t
2700 internal_function __attribute_warn_unused_result__
2701 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2703 const re_dfa_t *const dfa = mctx->dfa;
2704 int subexp_num, sub_top_idx;
2705 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2706 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2707 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2708 if (cache_idx != -1)
2710 const struct re_backref_cache_entry *entry
2711 = mctx->bkref_ents + cache_idx;
2713 if (entry->node == bkref_node)
2714 return REG_NOERROR; /* We already checked it. */
2715 while (entry++->more);
2718 subexp_num = dfa->nodes[bkref_node].opr.idx;
2720 /* For each sub expression */
2721 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2724 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2725 re_sub_match_last_t *sub_last;
2726 int sub_last_idx, sl_str, bkref_str_off;
2728 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2729 continue; /* It isn't related. */
2731 sl_str = sub_top->str_idx;
2732 bkref_str_off = bkref_str_idx;
2733 /* At first, check the last node of sub expressions we already
2735 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2738 sub_last = sub_top->lasts[sub_last_idx];
2739 sl_str_diff = sub_last->str_idx - sl_str;
2740 /* The matched string by the sub expression match with the substring
2741 at the back reference? */
2742 if (sl_str_diff > 0)
2744 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2746 /* Not enough chars for a successful match. */
2747 if (bkref_str_off + sl_str_diff > mctx->input.len)
2750 err = clean_state_log_if_needed (mctx,
2753 if (BE (err != REG_NOERROR, 0))
2755 buf = (const char *) re_string_get_buffer (&mctx->input);
2757 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2758 /* We don't need to search this sub expression any more. */
2761 bkref_str_off += sl_str_diff;
2762 sl_str += sl_str_diff;
2763 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2766 /* Reload buf, since the preceding call might have reallocated
2768 buf = (const char *) re_string_get_buffer (&mctx->input);
2770 if (err == REG_NOMATCH)
2772 if (BE (err != REG_NOERROR, 0))
2776 if (sub_last_idx < sub_top->nlasts)
2778 if (sub_last_idx > 0)
2780 /* Then, search for the other last nodes of the sub expression. */
2781 for (; sl_str <= bkref_str_idx; ++sl_str)
2783 int cls_node, sl_str_off;
2784 const re_node_set *nodes;
2785 sl_str_off = sl_str - sub_top->str_idx;
2786 /* The matched string by the sub expression match with the substring
2787 at the back reference? */
2790 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2792 /* If we are at the end of the input, we cannot match. */
2793 if (bkref_str_off >= mctx->input.len)
2796 err = extend_buffers (mctx);
2797 if (BE (err != REG_NOERROR, 0))
2800 buf = (const char *) re_string_get_buffer (&mctx->input);
2802 if (buf [bkref_str_off++] != buf[sl_str - 1])
2803 break; /* We don't need to search this sub expression
2806 if (mctx->state_log[sl_str] == NULL)
2808 /* Does this state have a ')' of the sub expression? */
2809 nodes = &mctx->state_log[sl_str]->nodes;
2810 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2814 if (sub_top->path == NULL)
2816 sub_top->path = calloc (sizeof (state_array_t),
2817 sl_str - sub_top->str_idx + 1);
2818 if (sub_top->path == NULL)
2821 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2822 in the current context? */
2823 err = check_arrival (mctx, sub_top->path, sub_top->node,
2824 sub_top->str_idx, cls_node, sl_str,
2826 if (err == REG_NOMATCH)
2828 if (BE (err != REG_NOERROR, 0))
2830 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2831 if (BE (sub_last == NULL, 0))
2833 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2835 if (err == REG_NOMATCH)
2842 /* Helper functions for get_subexp(). */
2844 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2845 If it can arrive, register the sub expression expressed with SUB_TOP
2848 static reg_errcode_t
2850 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2851 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2855 /* Can the subexpression arrive the back reference? */
2856 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2857 sub_last->str_idx, bkref_node, bkref_str,
2859 if (err != REG_NOERROR)
2861 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2863 if (BE (err != REG_NOERROR, 0))
2865 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2866 return clean_state_log_if_needed (mctx, to_idx);
2869 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2870 Search '(' if FL_OPEN, or search ')' otherwise.
2871 TODO: This function isn't efficient...
2872 Because there might be more than one nodes whose types are
2873 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2879 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2880 int subexp_idx, int type)
2883 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2885 int cls_node = nodes->elems[cls_idx];
2886 const re_token_t *node = dfa->nodes + cls_node;
2887 if (node->type == type
2888 && node->opr.idx == subexp_idx)
2894 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2895 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2897 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2899 static reg_errcode_t
2900 internal_function __attribute_warn_unused_result__
2901 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2902 int top_str, int last_node, int last_str, int type)
2904 const re_dfa_t *const dfa = mctx->dfa;
2905 reg_errcode_t err = REG_NOERROR;
2906 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2907 re_dfastate_t *cur_state = NULL;
2908 re_node_set *cur_nodes, next_nodes;
2909 re_dfastate_t **backup_state_log;
2910 unsigned int context;
2912 subexp_num = dfa->nodes[top_node].opr.idx;
2913 /* Extend the buffer if we need. */
2914 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2916 re_dfastate_t **new_array;
2917 int old_alloc = path->alloc;
2918 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2919 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2920 if (BE (new_array == NULL, 0))
2922 path->alloc = old_alloc;
2925 path->array = new_array;
2926 memset (new_array + old_alloc, '\0',
2927 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2930 str_idx = path->next_idx ?: top_str;
2932 /* Temporary modify MCTX. */
2933 backup_state_log = mctx->state_log;
2934 backup_cur_idx = mctx->input.cur_idx;
2935 mctx->state_log = path->array;
2936 mctx->input.cur_idx = str_idx;
2938 /* Setup initial node set. */
2939 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2940 if (str_idx == top_str)
2942 err = re_node_set_init_1 (&next_nodes, top_node);
2943 if (BE (err != REG_NOERROR, 0))
2945 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2946 if (BE (err != REG_NOERROR, 0))
2948 re_node_set_free (&next_nodes);
2954 cur_state = mctx->state_log[str_idx];
2955 if (cur_state && cur_state->has_backref)
2957 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2958 if (BE (err != REG_NOERROR, 0))
2962 re_node_set_init_empty (&next_nodes);
2964 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2966 if (next_nodes.nelem)
2968 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2970 if (BE (err != REG_NOERROR, 0))
2972 re_node_set_free (&next_nodes);
2976 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2977 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2979 re_node_set_free (&next_nodes);
2982 mctx->state_log[str_idx] = cur_state;
2985 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2987 re_node_set_empty (&next_nodes);
2988 if (mctx->state_log[str_idx + 1])
2990 err = re_node_set_merge (&next_nodes,
2991 &mctx->state_log[str_idx + 1]->nodes);
2992 if (BE (err != REG_NOERROR, 0))
2994 re_node_set_free (&next_nodes);
3000 err = check_arrival_add_next_nodes (mctx, str_idx,
3001 &cur_state->non_eps_nodes,
3003 if (BE (err != REG_NOERROR, 0))
3005 re_node_set_free (&next_nodes);
3010 if (next_nodes.nelem)
3012 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3013 if (BE (err != REG_NOERROR, 0))
3015 re_node_set_free (&next_nodes);
3018 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3020 if (BE (err != REG_NOERROR, 0))
3022 re_node_set_free (&next_nodes);
3026 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3027 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3028 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3030 re_node_set_free (&next_nodes);
3033 mctx->state_log[str_idx] = cur_state;
3034 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3036 re_node_set_free (&next_nodes);
3037 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3038 : &mctx->state_log[last_str]->nodes);
3039 path->next_idx = str_idx;
3042 mctx->state_log = backup_state_log;
3043 mctx->input.cur_idx = backup_cur_idx;
3045 /* Then check the current node set has the node LAST_NODE. */
3046 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3052 /* Helper functions for check_arrival. */
3054 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3056 TODO: This function is similar to the functions transit_state*(),
3057 however this function has many additional works.
3058 Can't we unify them? */
3060 static reg_errcode_t
3061 internal_function __attribute_warn_unused_result__
3062 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3063 re_node_set *cur_nodes, re_node_set *next_nodes)
3065 const re_dfa_t *const dfa = mctx->dfa;
3068 reg_errcode_t err = REG_NOERROR;
3069 re_node_set union_set;
3070 re_node_set_init_empty (&union_set);
3071 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3074 int cur_node = cur_nodes->elems[cur_idx];
3076 re_token_type_t type = dfa->nodes[cur_node].type;
3077 assert (!IS_EPSILON_NODE (type));
3079 #ifdef RE_ENABLE_I18N
3080 /* If the node may accept `multi byte'. */
3081 if (dfa->nodes[cur_node].accept_mb)
3083 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3087 re_dfastate_t *dest_state;
3088 int next_node = dfa->nexts[cur_node];
3089 int next_idx = str_idx + naccepted;
3090 dest_state = mctx->state_log[next_idx];
3091 re_node_set_empty (&union_set);
3094 err = re_node_set_merge (&union_set, &dest_state->nodes);
3095 if (BE (err != REG_NOERROR, 0))
3097 re_node_set_free (&union_set);
3101 result = re_node_set_insert (&union_set, next_node);
3102 if (BE (result < 0, 0))
3104 re_node_set_free (&union_set);
3107 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3109 if (BE (mctx->state_log[next_idx] == NULL
3110 && err != REG_NOERROR, 0))
3112 re_node_set_free (&union_set);
3117 #endif /* RE_ENABLE_I18N */
3119 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3121 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3122 if (BE (result < 0, 0))
3124 re_node_set_free (&union_set);
3129 re_node_set_free (&union_set);
3133 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3134 CUR_NODES, however exclude the nodes which are:
3135 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3136 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3139 static reg_errcode_t
3141 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3142 int ex_subexp, int type)
3145 int idx, outside_node;
3146 re_node_set new_nodes;
3148 assert (cur_nodes->nelem);
3150 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3151 if (BE (err != REG_NOERROR, 0))
3153 /* Create a new node set NEW_NODES with the nodes which are epsilon
3154 closures of the node in CUR_NODES. */
3156 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3158 int cur_node = cur_nodes->elems[idx];
3159 const re_node_set *eclosure = dfa->eclosures + cur_node;
3160 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3161 if (outside_node == -1)
3163 /* There are no problematic nodes, just merge them. */
3164 err = re_node_set_merge (&new_nodes, eclosure);
3165 if (BE (err != REG_NOERROR, 0))
3167 re_node_set_free (&new_nodes);
3173 /* There are problematic nodes, re-calculate incrementally. */
3174 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3176 if (BE (err != REG_NOERROR, 0))
3178 re_node_set_free (&new_nodes);
3183 re_node_set_free (cur_nodes);
3184 *cur_nodes = new_nodes;
3188 /* Helper function for check_arrival_expand_ecl.
3189 Check incrementally the epsilon closure of TARGET, and if it isn't
3190 problematic append it to DST_NODES. */
3192 static reg_errcode_t
3193 internal_function __attribute_warn_unused_result__
3194 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3195 int target, int ex_subexp, int type)
3198 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3202 if (dfa->nodes[cur_node].type == type
3203 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3205 if (type == OP_CLOSE_SUBEXP)
3207 err = re_node_set_insert (dst_nodes, cur_node);
3208 if (BE (err == -1, 0))
3213 err = re_node_set_insert (dst_nodes, cur_node);
3214 if (BE (err == -1, 0))
3216 if (dfa->edests[cur_node].nelem == 0)
3218 if (dfa->edests[cur_node].nelem == 2)
3220 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3221 dfa->edests[cur_node].elems[1],
3223 if (BE (err != REG_NOERROR, 0))
3226 cur_node = dfa->edests[cur_node].elems[0];
3232 /* For all the back references in the current state, calculate the
3233 destination of the back references by the appropriate entry
3234 in MCTX->BKREF_ENTS. */
3236 static reg_errcode_t
3237 internal_function __attribute_warn_unused_result__
3238 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3239 int cur_str, int subexp_num, int type)
3241 const re_dfa_t *const dfa = mctx->dfa;
3243 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3244 struct re_backref_cache_entry *ent;
3246 if (cache_idx_start == -1)
3250 ent = mctx->bkref_ents + cache_idx_start;
3253 int to_idx, next_node;
3255 /* Is this entry ENT is appropriate? */
3256 if (!re_node_set_contains (cur_nodes, ent->node))
3259 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3260 /* Calculate the destination of the back reference, and append it
3261 to MCTX->STATE_LOG. */
3262 if (to_idx == cur_str)
3264 /* The backreference did epsilon transit, we must re-check all the
3265 node in the current state. */
3266 re_node_set new_dests;
3267 reg_errcode_t err2, err3;
3268 next_node = dfa->edests[ent->node].elems[0];
3269 if (re_node_set_contains (cur_nodes, next_node))
3271 err = re_node_set_init_1 (&new_dests, next_node);
3272 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3273 err3 = re_node_set_merge (cur_nodes, &new_dests);
3274 re_node_set_free (&new_dests);
3275 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3276 || err3 != REG_NOERROR, 0))
3278 err = (err != REG_NOERROR ? err
3279 : (err2 != REG_NOERROR ? err2 : err3));
3282 /* TODO: It is still inefficient... */
3287 re_node_set union_set;
3288 next_node = dfa->nexts[ent->node];
3289 if (mctx->state_log[to_idx])
3292 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3295 err = re_node_set_init_copy (&union_set,
3296 &mctx->state_log[to_idx]->nodes);
3297 ret = re_node_set_insert (&union_set, next_node);
3298 if (BE (err != REG_NOERROR || ret < 0, 0))
3300 re_node_set_free (&union_set);
3301 err = err != REG_NOERROR ? err : REG_ESPACE;
3307 err = re_node_set_init_1 (&union_set, next_node);
3308 if (BE (err != REG_NOERROR, 0))
3311 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3312 re_node_set_free (&union_set);
3313 if (BE (mctx->state_log[to_idx] == NULL
3314 && err != REG_NOERROR, 0))
3318 while (ent++->more);
3322 /* Build transition table for the state.
3323 Return 1 if succeeded, otherwise return NULL. */
3327 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3330 int i, j, ch, need_word_trtable = 0;
3331 bitset_word_t elem, mask;
3332 bool dests_node_malloced = false;
3333 bool dest_states_malloced = false;
3334 int ndests; /* Number of the destination states from `state'. */
3335 re_dfastate_t **trtable;
3336 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3337 re_node_set follows, *dests_node;
3339 bitset_t acceptable;
3343 re_node_set dests_node[SBC_MAX];
3344 bitset_t dests_ch[SBC_MAX];
3347 /* We build DFA states which corresponds to the destination nodes
3348 from `state'. `dests_node[i]' represents the nodes which i-th
3349 destination state contains, and `dests_ch[i]' represents the
3350 characters which i-th destination state accepts. */
3351 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3352 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3355 dests_alloc = re_malloc (struct dests_alloc, 1);
3356 if (BE (dests_alloc == NULL, 0))
3358 dests_node_malloced = true;
3360 dests_node = dests_alloc->dests_node;
3361 dests_ch = dests_alloc->dests_ch;
3363 /* Initialize transiton table. */
3364 state->word_trtable = state->trtable = NULL;
3366 /* At first, group all nodes belonging to `state' into several
3368 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3369 if (BE (ndests <= 0, 0))
3371 if (dests_node_malloced)
3373 /* Return 0 in case of an error, 1 otherwise. */
3376 state->trtable = (re_dfastate_t **)
3377 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3378 if (BE (state->trtable == NULL, 0))
3385 err = re_node_set_alloc (&follows, ndests + 1);
3386 if (BE (err != REG_NOERROR, 0))
3389 /* Avoid arithmetic overflow in size calculation. */
3390 if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3391 / (3 * sizeof (re_dfastate_t *)))
3396 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3397 + ndests * 3 * sizeof (re_dfastate_t *)))
3398 dest_states = (re_dfastate_t **)
3399 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3402 dest_states = (re_dfastate_t **)
3403 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3404 if (BE (dest_states == NULL, 0))
3407 if (dest_states_malloced)
3409 re_node_set_free (&follows);
3410 for (i = 0; i < ndests; ++i)
3411 re_node_set_free (dests_node + i);
3412 if (dests_node_malloced)
3416 dest_states_malloced = true;
3418 dest_states_word = dest_states + ndests;
3419 dest_states_nl = dest_states_word + ndests;
3420 bitset_empty (acceptable);
3422 /* Then build the states for all destinations. */
3423 for (i = 0; i < ndests; ++i)
3426 re_node_set_empty (&follows);
3427 /* Merge the follows of this destination states. */
3428 for (j = 0; j < dests_node[i].nelem; ++j)
3430 next_node = dfa->nexts[dests_node[i].elems[j]];
3431 if (next_node != -1)
3433 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3434 if (BE (err != REG_NOERROR, 0))
3438 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3439 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3441 /* If the new state has context constraint,
3442 build appropriate states for these contexts. */
3443 if (dest_states[i]->has_constraint)
3445 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3447 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3450 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3451 need_word_trtable = 1;
3453 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3455 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3460 dest_states_word[i] = dest_states[i];
3461 dest_states_nl[i] = dest_states[i];
3463 bitset_merge (acceptable, dests_ch[i]);
3466 if (!BE (need_word_trtable, 0))
3468 /* We don't care about whether the following character is a word
3469 character, or we are in a single-byte character set so we can
3470 discern by looking at the character code: allocate a
3471 256-entry transition table. */
3472 trtable = state->trtable =
3473 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3474 if (BE (trtable == NULL, 0))
3477 /* For all characters ch...: */
3478 for (i = 0; i < BITSET_WORDS; ++i)
3479 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3481 mask <<= 1, elem >>= 1, ++ch)
3482 if (BE (elem & 1, 0))
3484 /* There must be exactly one destination which accepts
3485 character ch. See group_nodes_into_DFAstates. */
3486 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3489 /* j-th destination accepts the word character ch. */
3490 if (dfa->word_char[i] & mask)
3491 trtable[ch] = dest_states_word[j];
3493 trtable[ch] = dest_states[j];
3498 /* We care about whether the following character is a word
3499 character, and we are in a multi-byte character set: discern
3500 by looking at the character code: build two 256-entry
3501 transition tables, one starting at trtable[0] and one
3502 starting at trtable[SBC_MAX]. */
3503 trtable = state->word_trtable =
3504 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3505 if (BE (trtable == NULL, 0))
3508 /* For all characters ch...: */
3509 for (i = 0; i < BITSET_WORDS; ++i)
3510 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3512 mask <<= 1, elem >>= 1, ++ch)
3513 if (BE (elem & 1, 0))
3515 /* There must be exactly one destination which accepts
3516 character ch. See group_nodes_into_DFAstates. */
3517 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3520 /* j-th destination accepts the word character ch. */
3521 trtable[ch] = dest_states[j];
3522 trtable[ch + SBC_MAX] = dest_states_word[j];
3527 if (bitset_contain (acceptable, NEWLINE_CHAR))
3529 /* The current state accepts newline character. */
3530 for (j = 0; j < ndests; ++j)
3531 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3533 /* k-th destination accepts newline character. */
3534 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3535 if (need_word_trtable)
3536 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3537 /* There must be only one destination which accepts
3538 newline. See group_nodes_into_DFAstates. */
3543 if (dest_states_malloced)
3546 re_node_set_free (&follows);
3547 for (i = 0; i < ndests; ++i)
3548 re_node_set_free (dests_node + i);
3550 if (dests_node_malloced)
3556 /* Group all nodes belonging to STATE into several destinations.
3557 Then for all destinations, set the nodes belonging to the destination
3558 to DESTS_NODE[i] and set the characters accepted by the destination
3559 to DEST_CH[i]. This function return the number of destinations. */
3563 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3564 re_node_set *dests_node, bitset_t *dests_ch)
3569 int ndests; /* Number of the destinations from `state'. */
3570 bitset_t accepts; /* Characters a node can accept. */
3571 const re_node_set *cur_nodes = &state->nodes;
3572 bitset_empty (accepts);
3575 /* For all the nodes belonging to `state', */
3576 for (i = 0; i < cur_nodes->nelem; ++i)
3578 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3579 re_token_type_t type = node->type;
3580 unsigned int constraint = node->constraint;
3582 /* Enumerate all single byte character this node can accept. */
3583 if (type == CHARACTER)
3584 bitset_set (accepts, node->opr.c);
3585 else if (type == SIMPLE_BRACKET)
3587 bitset_merge (accepts, node->opr.sbcset);
3589 else if (type == OP_PERIOD)
3591 #ifdef RE_ENABLE_I18N
3592 if (dfa->mb_cur_max > 1)
3593 bitset_merge (accepts, dfa->sb_char);
3596 bitset_set_all (accepts);
3597 if (!(dfa->syntax & RE_DOT_NEWLINE))
3598 bitset_clear (accepts, '\n');
3599 if (dfa->syntax & RE_DOT_NOT_NULL)
3600 bitset_clear (accepts, '\0');
3602 #ifdef RE_ENABLE_I18N
3603 else if (type == OP_UTF8_PERIOD)
3605 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3606 if (!(dfa->syntax & RE_DOT_NEWLINE))
3607 bitset_clear (accepts, '\n');
3608 if (dfa->syntax & RE_DOT_NOT_NULL)
3609 bitset_clear (accepts, '\0');
3615 /* Check the `accepts' and sift the characters which are not
3616 match it the context. */
3619 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3621 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3622 bitset_empty (accepts);
3623 if (accepts_newline)
3624 bitset_set (accepts, NEWLINE_CHAR);
3628 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3630 bitset_empty (accepts);
3634 if (constraint & NEXT_WORD_CONSTRAINT)
3636 bitset_word_t any_set = 0;
3637 if (type == CHARACTER && !node->word_char)
3639 bitset_empty (accepts);
3642 #ifdef RE_ENABLE_I18N
3643 if (dfa->mb_cur_max > 1)
3644 for (j = 0; j < BITSET_WORDS; ++j)
3645 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3648 for (j = 0; j < BITSET_WORDS; ++j)
3649 any_set |= (accepts[j] &= dfa->word_char[j]);
3653 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3655 bitset_word_t any_set = 0;
3656 if (type == CHARACTER && node->word_char)
3658 bitset_empty (accepts);
3661 #ifdef RE_ENABLE_I18N
3662 if (dfa->mb_cur_max > 1)
3663 for (j = 0; j < BITSET_WORDS; ++j)
3664 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3667 for (j = 0; j < BITSET_WORDS; ++j)
3668 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3674 /* Then divide `accepts' into DFA states, or create a new
3675 state. Above, we make sure that accepts is not empty. */
3676 for (j = 0; j < ndests; ++j)
3678 bitset_t intersec; /* Intersection sets, see below. */
3680 /* Flags, see below. */
3681 bitset_word_t has_intersec, not_subset, not_consumed;
3683 /* Optimization, skip if this state doesn't accept the character. */
3684 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3687 /* Enumerate the intersection set of this state and `accepts'. */
3689 for (k = 0; k < BITSET_WORDS; ++k)
3690 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3691 /* And skip if the intersection set is empty. */
3695 /* Then check if this state is a subset of `accepts'. */
3696 not_subset = not_consumed = 0;
3697 for (k = 0; k < BITSET_WORDS; ++k)
3699 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3700 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3703 /* If this state isn't a subset of `accepts', create a
3704 new group state, which has the `remains'. */
3707 bitset_copy (dests_ch[ndests], remains);
3708 bitset_copy (dests_ch[j], intersec);
3709 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3710 if (BE (err != REG_NOERROR, 0))
3715 /* Put the position in the current group. */
3716 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3717 if (BE (result < 0, 0))
3720 /* If all characters are consumed, go to next node. */
3724 /* Some characters remain, create a new group. */
3727 bitset_copy (dests_ch[ndests], accepts);
3728 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3729 if (BE (err != REG_NOERROR, 0))
3732 bitset_empty (accepts);
3737 for (j = 0; j < ndests; ++j)
3738 re_node_set_free (dests_node + j);
3742 #ifdef RE_ENABLE_I18N
3743 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3744 Return the number of the bytes the node accepts.
3745 STR_IDX is the current index of the input string.
3747 This function handles the nodes which can accept one character, or
3748 one collating element like '.', '[a-z]', opposite to the other nodes
3749 can only accept one byte. */
3753 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3754 const re_string_t *input, int str_idx)
3756 const re_token_t *node = dfa->nodes + node_idx;
3757 int char_len, elem_len;
3760 if (BE (node->type == OP_UTF8_PERIOD, 0))
3762 unsigned char c = re_string_byte_at (input, str_idx), d;
3763 if (BE (c < 0xc2, 1))
3766 if (str_idx + 2 > input->len)
3769 d = re_string_byte_at (input, str_idx + 1);
3771 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3775 if (c == 0xe0 && d < 0xa0)
3781 if (c == 0xf0 && d < 0x90)
3787 if (c == 0xf8 && d < 0x88)
3793 if (c == 0xfc && d < 0x84)
3799 if (str_idx + char_len > input->len)
3802 for (i = 1; i < char_len; ++i)
3804 d = re_string_byte_at (input, str_idx + i);
3805 if (d < 0x80 || d > 0xbf)
3811 char_len = re_string_char_size_at (input, str_idx);
3812 if (node->type == OP_PERIOD)
3816 /* FIXME: I don't think this if is needed, as both '\n'
3817 and '\0' are char_len == 1. */
3818 /* '.' accepts any one character except the following two cases. */
3819 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3820 re_string_byte_at (input, str_idx) == '\n') ||
3821 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3822 re_string_byte_at (input, str_idx) == '\0'))
3827 elem_len = re_string_elem_size_at (input, str_idx);
3828 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3831 if (node->type == COMPLEX_BRACKET)
3833 const re_charset_t *cset = node->opr.mbcset;
3835 const unsigned char *pin
3836 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3841 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3842 ? re_string_wchar_at (input, str_idx) : 0);
3844 /* match with multibyte character? */
3845 for (i = 0; i < cset->nmbchars; ++i)
3846 if (wc == cset->mbchars[i])
3848 match_len = char_len;
3849 goto check_node_accept_bytes_match;
3851 /* match with character_class? */
3852 for (i = 0; i < cset->nchar_classes; ++i)
3854 wctype_t wt = cset->char_classes[i];
3855 if (__iswctype (wc, wt))
3857 match_len = char_len;
3858 goto check_node_accept_bytes_match;
3863 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3866 unsigned int in_collseq = 0;
3867 const int32_t *table, *indirect;
3868 const unsigned char *weights, *extra;
3869 const char *collseqwc;
3870 /* This #include defines a local function! */
3871 # include <locale/weight.h>
3873 /* match with collating_symbol? */
3874 if (cset->ncoll_syms)
3875 extra = (const unsigned char *)
3876 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3877 for (i = 0; i < cset->ncoll_syms; ++i)
3879 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3880 /* Compare the length of input collating element and
3881 the length of current collating element. */
3882 if (*coll_sym != elem_len)
3884 /* Compare each bytes. */
3885 for (j = 0; j < *coll_sym; j++)
3886 if (pin[j] != coll_sym[1 + j])
3890 /* Match if every bytes is equal. */
3892 goto check_node_accept_bytes_match;
3898 if (elem_len <= char_len)
3900 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3901 in_collseq = __collseq_table_lookup (collseqwc, wc);
3904 in_collseq = find_collation_sequence_value (pin, elem_len);
3906 /* match with range expression? */
3907 for (i = 0; i < cset->nranges; ++i)
3908 if (cset->range_starts[i] <= in_collseq
3909 && in_collseq <= cset->range_ends[i])
3911 match_len = elem_len;
3912 goto check_node_accept_bytes_match;
3915 /* match with equivalence_class? */
3916 if (cset->nequiv_classes)
3918 const unsigned char *cp = pin;
3919 table = (const int32_t *)
3920 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3921 weights = (const unsigned char *)
3922 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3923 extra = (const unsigned char *)
3924 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3925 indirect = (const int32_t *)
3926 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3927 int32_t idx = findidx (&cp, elem_len);
3929 for (i = 0; i < cset->nequiv_classes; ++i)
3931 int32_t equiv_class_idx = cset->equiv_classes[i];
3932 size_t weight_len = weights[idx & 0xffffff];
3933 if (weight_len == weights[equiv_class_idx & 0xffffff]
3934 && (idx >> 24) == (equiv_class_idx >> 24))
3939 equiv_class_idx &= 0xffffff;
3941 while (cnt <= weight_len
3942 && (weights[equiv_class_idx + 1 + cnt]
3943 == weights[idx + 1 + cnt]))
3945 if (cnt > weight_len)
3947 match_len = elem_len;
3948 goto check_node_accept_bytes_match;
3957 /* match with range expression? */
3959 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3961 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3964 for (i = 0; i < cset->nranges; ++i)
3966 cmp_buf[0] = cset->range_starts[i];
3967 cmp_buf[4] = cset->range_ends[i];
3968 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3969 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3971 match_len = char_len;
3972 goto check_node_accept_bytes_match;
3976 check_node_accept_bytes_match:
3977 if (!cset->non_match)
3984 return (elem_len > char_len) ? elem_len : char_len;
3993 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3995 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4000 /* No valid character. Match it as a single byte character. */
4001 const unsigned char *collseq = (const unsigned char *)
4002 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4003 return collseq[mbs[0]];
4010 const unsigned char *extra = (const unsigned char *)
4011 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4012 int32_t extrasize = (const unsigned char *)
4013 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4015 for (idx = 0; idx < extrasize;)
4017 int mbs_cnt, found = 0;
4018 int32_t elem_mbs_len;
4019 /* Skip the name of collating element name. */
4020 idx = idx + extra[idx] + 1;
4021 elem_mbs_len = extra[idx++];
4022 if (mbs_len == elem_mbs_len)
4024 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4025 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4027 if (mbs_cnt == elem_mbs_len)
4028 /* Found the entry. */
4031 /* Skip the byte sequence of the collating element. */
4032 idx += elem_mbs_len;
4033 /* Adjust for the alignment. */
4034 idx = (idx + 3) & ~3;
4035 /* Skip the collation sequence value. */
4036 idx += sizeof (uint32_t);
4037 /* Skip the wide char sequence of the collating element. */
4038 idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4039 /* If we found the entry, return the sequence value. */
4041 return *(uint32_t *) (extra + idx);
4042 /* Skip the collation sequence value. */
4043 idx += sizeof (uint32_t);
4049 #endif /* RE_ENABLE_I18N */
4051 /* Check whether the node accepts the byte which is IDX-th
4052 byte of the INPUT. */
4056 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4060 ch = re_string_byte_at (&mctx->input, idx);
4064 if (node->opr.c != ch)
4068 case SIMPLE_BRACKET:
4069 if (!bitset_contain (node->opr.sbcset, ch))
4073 #ifdef RE_ENABLE_I18N
4074 case OP_UTF8_PERIOD:
4080 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4081 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4089 if (node->constraint)
4091 /* The node has constraints. Check whether the current context
4092 satisfies the constraints. */
4093 unsigned int context = re_string_context_at (&mctx->input, idx,
4095 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4102 /* Extend the buffers, if the buffers have run out. */
4104 static reg_errcode_t
4105 internal_function __attribute_warn_unused_result__
4106 extend_buffers (re_match_context_t *mctx)
4109 re_string_t *pstr = &mctx->input;
4111 /* Avoid overflow. */
4112 if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4115 /* Double the lengthes of the buffers. */
4116 ret = re_string_realloc_buffers (pstr, MIN (pstr->len, pstr->bufs_len * 2));
4117 if (BE (ret != REG_NOERROR, 0))
4120 if (mctx->state_log != NULL)
4122 /* And double the length of state_log. */
4123 /* XXX We have no indication of the size of this buffer. If this
4124 allocation fail we have no indication that the state_log array
4125 does not have the right size. */
4126 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4127 pstr->bufs_len + 1);
4128 if (BE (new_array == NULL, 0))
4130 mctx->state_log = new_array;
4133 /* Then reconstruct the buffers. */
4136 #ifdef RE_ENABLE_I18N
4137 if (pstr->mb_cur_max > 1)
4139 ret = build_wcs_upper_buffer (pstr);
4140 if (BE (ret != REG_NOERROR, 0))
4144 #endif /* RE_ENABLE_I18N */
4145 build_upper_buffer (pstr);
4149 #ifdef RE_ENABLE_I18N
4150 if (pstr->mb_cur_max > 1)
4151 build_wcs_buffer (pstr);
4153 #endif /* RE_ENABLE_I18N */
4155 if (pstr->trans != NULL)
4156 re_string_translate_buffer (pstr);
4163 /* Functions for matching context. */
4165 /* Initialize MCTX. */
4167 static reg_errcode_t
4168 internal_function __attribute_warn_unused_result__
4169 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4171 mctx->eflags = eflags;
4172 mctx->match_last = -1;
4175 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4176 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4177 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4180 /* Already zero-ed by the caller.
4182 mctx->bkref_ents = NULL;
4183 mctx->nbkref_ents = 0;
4184 mctx->nsub_tops = 0; */
4185 mctx->abkref_ents = n;
4186 mctx->max_mb_elem_len = 1;
4187 mctx->asub_tops = n;
4191 /* Clean the entries which depend on the current input in MCTX.
4192 This function must be invoked when the matcher changes the start index
4193 of the input, or changes the input string. */
4197 match_ctx_clean (re_match_context_t *mctx)
4200 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4203 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4204 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4206 re_sub_match_last_t *last = top->lasts[sl_idx];
4207 re_free (last->path.array);
4210 re_free (top->lasts);
4213 re_free (top->path->array);
4214 re_free (top->path);
4219 mctx->nsub_tops = 0;
4220 mctx->nbkref_ents = 0;
4223 /* Free all the memory associated with MCTX. */
4227 match_ctx_free (re_match_context_t *mctx)
4229 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4230 match_ctx_clean (mctx);
4231 re_free (mctx->sub_tops);
4232 re_free (mctx->bkref_ents);
4235 /* Add a new backreference entry to MCTX.
4236 Note that we assume that caller never call this function with duplicate
4237 entry, and call with STR_IDX which isn't smaller than any existing entry.
4240 static reg_errcode_t
4241 internal_function __attribute_warn_unused_result__
4242 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4245 if (mctx->nbkref_ents >= mctx->abkref_ents)
4247 struct re_backref_cache_entry* new_entry;
4248 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4249 mctx->abkref_ents * 2);
4250 if (BE (new_entry == NULL, 0))
4252 re_free (mctx->bkref_ents);
4255 mctx->bkref_ents = new_entry;
4256 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4257 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4258 mctx->abkref_ents *= 2;
4260 if (mctx->nbkref_ents > 0
4261 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4262 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4264 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4265 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4266 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4267 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4269 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4270 If bit N is clear, means that this entry won't epsilon-transition to
4271 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4272 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4275 A backreference does not epsilon-transition unless it is empty, so set
4276 to all zeros if FROM != TO. */
4277 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4278 = (from == to ? ~0 : 0);
4280 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4281 if (mctx->max_mb_elem_len < to - from)
4282 mctx->max_mb_elem_len = to - from;
4286 /* Search for the first entry which has the same str_idx, or -1 if none is
4287 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4291 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4293 int left, right, mid, last;
4294 last = right = mctx->nbkref_ents;
4295 for (left = 0; left < right;)
4297 mid = (left + right) / 2;
4298 if (mctx->bkref_ents[mid].str_idx < str_idx)
4303 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4309 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4312 static reg_errcode_t
4313 internal_function __attribute_warn_unused_result__
4314 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4317 assert (mctx->sub_tops != NULL);
4318 assert (mctx->asub_tops > 0);
4320 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4322 int new_asub_tops = mctx->asub_tops * 2;
4323 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4324 re_sub_match_top_t *,
4326 if (BE (new_array == NULL, 0))
4328 mctx->sub_tops = new_array;
4329 mctx->asub_tops = new_asub_tops;
4331 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4332 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4334 mctx->sub_tops[mctx->nsub_tops]->node = node;
4335 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4339 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4340 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4342 static re_sub_match_last_t *
4344 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4346 re_sub_match_last_t *new_entry;
4347 if (BE (subtop->nlasts == subtop->alasts, 0))
4349 int new_alasts = 2 * subtop->alasts + 1;
4350 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4351 re_sub_match_last_t *,
4353 if (BE (new_array == NULL, 0))
4355 subtop->lasts = new_array;
4356 subtop->alasts = new_alasts;
4358 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4359 if (BE (new_entry != NULL, 1))
4361 subtop->lasts[subtop->nlasts] = new_entry;
4362 new_entry->node = node;
4363 new_entry->str_idx = str_idx;
4371 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4372 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4374 sctx->sifted_states = sifted_sts;
4375 sctx->limited_states = limited_sts;
4376 sctx->last_node = last_node;
4377 sctx->last_str_idx = last_str_idx;
4378 re_node_set_init_empty (&sctx->limits);