1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static void re_string_construct_common (const char *str, int len,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
26 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
27 wint_t *last_wc) internal_function;
28 #endif /* RE_ENABLE_I18N */
29 static re_dfastate_t *create_newstate_common (re_dfa_t *dfa,
30 const re_node_set *nodes,
31 unsigned int hash) internal_function;
32 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
33 unsigned int hash) internal_function;
34 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
35 const re_node_set *nodes,
36 unsigned int hash) internal_function;
37 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
38 const re_node_set *nodes,
40 unsigned int hash) internal_function;
41 static unsigned int inline calc_state_hash (const re_node_set *nodes,
42 unsigned int context) internal_function;
44 /* Functions for string operation. */
46 /* This function allocate the buffers. It is necessary to call
47 re_string_reconstruct before using the object. */
50 re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
53 int len, init_len, icase;
54 RE_TRANSLATE_TYPE trans;
60 /* Ensure at least one character fits into the buffers. */
61 if (init_len < dfa->mb_cur_max)
62 init_len = dfa->mb_cur_max;
63 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
64 re_string_construct_common (str, len, pstr, trans, icase, dfa);
66 ret = re_string_realloc_buffers (pstr, init_buf_len);
67 if (BE (ret != REG_NOERROR, 0))
70 pstr->word_char = dfa->word_char;
71 pstr->word_ops_used = dfa->word_ops_used;
72 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
73 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
74 pstr->valid_raw_len = pstr->valid_len;
78 /* This function allocate the buffers, and initialize them. */
81 re_string_construct (pstr, str, len, trans, icase, dfa)
85 RE_TRANSLATE_TYPE trans;
89 memset (pstr, '\0', sizeof (re_string_t));
90 re_string_construct_common (str, len, pstr, trans, icase, dfa);
94 ret = re_string_realloc_buffers (pstr, len + 1);
95 if (BE (ret != REG_NOERROR, 0))
98 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
102 #ifdef RE_ENABLE_I18N
103 if (dfa->mb_cur_max > 1)
107 ret = build_wcs_upper_buffer (pstr);
108 if (BE (ret != REG_NOERROR, 0))
110 if (pstr->valid_raw_len >= len)
112 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
114 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
115 if (BE (ret != REG_NOERROR, 0))
120 #endif /* RE_ENABLE_I18N */
121 build_upper_buffer (pstr);
125 #ifdef RE_ENABLE_I18N
126 if (dfa->mb_cur_max > 1)
127 build_wcs_buffer (pstr);
129 #endif /* RE_ENABLE_I18N */
132 re_string_translate_buffer (pstr);
135 pstr->valid_len = pstr->bufs_len;
136 pstr->valid_raw_len = pstr->bufs_len;
144 /* Helper functions for re_string_allocate, and re_string_construct. */
147 re_string_realloc_buffers (pstr, new_buf_len)
151 #ifdef RE_ENABLE_I18N
152 if (pstr->mb_cur_max > 1)
154 wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
155 if (BE (new_array == NULL, 0))
157 pstr->wcs = new_array;
158 if (pstr->offsets != NULL)
160 int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
161 if (BE (new_array == NULL, 0))
163 pstr->offsets = new_array;
166 #endif /* RE_ENABLE_I18N */
167 if (pstr->mbs_allocated)
169 unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
171 if (BE (new_array == NULL, 0))
173 pstr->mbs = new_array;
175 pstr->bufs_len = new_buf_len;
181 re_string_construct_common (str, len, pstr, trans, icase, dfa)
185 RE_TRANSLATE_TYPE trans;
189 pstr->raw_mbs = (const unsigned char *) str;
192 pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
193 pstr->icase = icase ? 1 : 0;
194 pstr->mbs_allocated = (trans != NULL || icase);
195 pstr->mb_cur_max = dfa->mb_cur_max;
196 pstr->is_utf8 = dfa->is_utf8;
197 pstr->map_notascii = dfa->map_notascii;
198 pstr->stop = pstr->len;
199 pstr->raw_stop = pstr->stop;
202 #ifdef RE_ENABLE_I18N
204 /* Build wide character buffer PSTR->WCS.
205 If the byte sequence of the string are:
206 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
207 Then wide character buffer will be:
208 <wc1> , WEOF , <wc2> , WEOF , <wc3>
209 We use WEOF for padding, they indicate that the position isn't
210 a first byte of a multibyte character.
212 Note that this function assumes PSTR->VALID_LEN elements are already
213 built and starts from PSTR->VALID_LEN. */
216 build_wcs_buffer (pstr)
220 unsigned char buf[pstr->mb_cur_max];
222 unsigned char buf[64];
225 int byte_idx, end_idx, mbclen, remain_len;
227 /* Build the buffers from pstr->valid_len to either pstr->len or
229 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
230 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
235 remain_len = end_idx - byte_idx;
236 prev_st = pstr->cur_state;
237 /* Apply the translation if we need. */
238 if (BE (pstr->trans != NULL, 0))
242 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
244 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
245 buf[i] = pstr->trans[ch];
247 p = (const char *) buf;
250 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
251 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
252 if (BE (mbclen == (size_t) -2, 0))
254 /* The buffer doesn't have enough space, finish to build. */
255 pstr->cur_state = prev_st;
258 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
260 /* We treat these cases as a singlebyte character. */
262 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
263 if (BE (pstr->trans != NULL, 0))
264 wc = pstr->trans[wc];
265 pstr->cur_state = prev_st;
268 /* Write wide character and padding. */
269 pstr->wcs[byte_idx++] = wc;
270 /* Write paddings. */
271 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
272 pstr->wcs[byte_idx++] = WEOF;
274 pstr->valid_len = byte_idx;
275 pstr->valid_raw_len = byte_idx;
278 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
279 but for REG_ICASE. */
282 build_wcs_upper_buffer (pstr)
286 int src_idx, byte_idx, end_idx, mbclen, remain_len;
288 unsigned char buf[pstr->mb_cur_max];
290 unsigned char buf[64];
293 byte_idx = pstr->valid_len;
294 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
297 /* The following optimization assumes that the wchar_t encoding is
299 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
301 while (byte_idx < end_idx)
305 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
306 && mbsinit (&pstr->cur_state))
308 /* In case of a singlebyte character. */
310 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
311 /* The next step uses the assumption that wchar_t is encoded
312 with ISO 10646: all ASCII values can be converted like
314 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
319 remain_len = end_idx - byte_idx;
320 prev_st = pstr->cur_state;
321 mbclen = mbrtowc (&wc,
322 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
323 + byte_idx), remain_len, &pstr->cur_state);
324 if (BE (mbclen > 0, 1))
332 mbcdlen = wcrtomb (buf, wcu, &prev_st);
333 if (BE (mbclen == mbcdlen, 1))
334 memcpy (pstr->mbs + byte_idx, buf, mbclen);
342 memcpy (pstr->mbs + byte_idx,
343 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
344 pstr->wcs[byte_idx++] = wcu;
345 /* Write paddings. */
346 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
347 pstr->wcs[byte_idx++] = WEOF;
349 else if (mbclen == (size_t) -1 || mbclen == 0)
351 /* It is an invalid character or '\0'. Just use the byte. */
352 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
353 pstr->mbs[byte_idx] = ch;
354 /* And also cast it to wide char. */
355 pstr->wcs[byte_idx++] = (wchar_t) ch;
356 if (BE (mbclen == (size_t) -1, 0))
357 pstr->cur_state = prev_st;
361 /* The buffer doesn't have enough space, finish to build. */
362 pstr->cur_state = prev_st;
366 pstr->valid_len = byte_idx;
367 pstr->valid_raw_len = byte_idx;
372 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
379 remain_len = end_idx - byte_idx;
380 prev_st = pstr->cur_state;
381 if (BE (pstr->trans != NULL, 0))
385 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
387 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
388 buf[i] = pstr->trans[ch];
390 p = (const char *) buf;
393 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
394 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
395 if (BE (mbclen > 0, 1))
403 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
404 if (BE (mbclen == mbcdlen, 1))
405 memcpy (pstr->mbs + byte_idx, buf, mbclen);
410 if (byte_idx + mbcdlen > pstr->bufs_len)
412 pstr->cur_state = prev_st;
416 if (pstr->offsets == NULL)
418 pstr->offsets = re_malloc (int, pstr->bufs_len);
420 if (pstr->offsets == NULL)
423 if (!pstr->offsets_needed)
425 for (i = 0; i < byte_idx; ++i)
426 pstr->offsets[i] = i;
427 pstr->offsets_needed = 1;
430 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
431 pstr->wcs[byte_idx] = wcu;
432 pstr->offsets[byte_idx] = src_idx;
433 for (i = 1; i < mbcdlen; ++i)
435 pstr->offsets[byte_idx + i]
436 = src_idx + (i < mbclen ? i : mbclen - 1);
437 pstr->wcs[byte_idx + i] = WEOF;
439 pstr->len += mbcdlen - mbclen;
440 if (pstr->raw_stop > src_idx)
441 pstr->stop += mbcdlen - mbclen;
442 end_idx = (pstr->bufs_len > pstr->len)
443 ? pstr->len : pstr->bufs_len;
450 memcpy (pstr->mbs + byte_idx, p, mbclen);
452 if (BE (pstr->offsets_needed != 0, 0))
455 for (i = 0; i < mbclen; ++i)
456 pstr->offsets[byte_idx + i] = src_idx + i;
460 pstr->wcs[byte_idx++] = wcu;
461 /* Write paddings. */
462 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
463 pstr->wcs[byte_idx++] = WEOF;
465 else if (mbclen == (size_t) -1 || mbclen == 0)
467 /* It is an invalid character or '\0'. Just use the byte. */
468 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
470 if (BE (pstr->trans != NULL, 0))
471 ch = pstr->trans [ch];
472 pstr->mbs[byte_idx] = ch;
474 if (BE (pstr->offsets_needed != 0, 0))
475 pstr->offsets[byte_idx] = src_idx;
478 /* And also cast it to wide char. */
479 pstr->wcs[byte_idx++] = (wchar_t) ch;
480 if (BE (mbclen == (size_t) -1, 0))
481 pstr->cur_state = prev_st;
485 /* The buffer doesn't have enough space, finish to build. */
486 pstr->cur_state = prev_st;
490 pstr->valid_len = byte_idx;
491 pstr->valid_raw_len = src_idx;
495 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
499 re_string_skip_chars (pstr, new_raw_idx, last_wc)
505 int rawbuf_idx, mbclen;
508 /* Skip the characters which are not necessary to check. */
509 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
510 rawbuf_idx < new_raw_idx;)
513 remain_len = pstr->len - rawbuf_idx;
514 prev_st = pstr->cur_state;
515 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
516 remain_len, &pstr->cur_state);
517 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
519 /* We treat these cases as a singlebyte character. */
521 pstr->cur_state = prev_st;
523 /* Then proceed the next character. */
524 rawbuf_idx += mbclen;
526 *last_wc = (wint_t) wc;
529 #endif /* RE_ENABLE_I18N */
531 /* Build the buffer PSTR->MBS, and apply the translation if we need.
532 This function is used in case of REG_ICASE. */
535 build_upper_buffer (pstr)
538 int char_idx, end_idx;
539 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
541 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
543 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
544 if (BE (pstr->trans != NULL, 0))
545 ch = pstr->trans[ch];
547 pstr->mbs[char_idx] = toupper (ch);
549 pstr->mbs[char_idx] = ch;
551 pstr->valid_len = char_idx;
552 pstr->valid_raw_len = char_idx;
555 /* Apply TRANS to the buffer in PSTR. */
558 re_string_translate_buffer (pstr)
561 int buf_idx, end_idx;
562 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
564 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
566 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
567 pstr->mbs[buf_idx] = pstr->trans[ch];
570 pstr->valid_len = buf_idx;
571 pstr->valid_raw_len = buf_idx;
574 /* This function re-construct the buffers.
575 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
576 convert to upper case in case of REG_ICASE, apply translation. */
579 re_string_reconstruct (pstr, idx, eflags)
583 int offset = idx - pstr->raw_mbs_idx;
587 #ifdef RE_ENABLE_I18N
588 if (pstr->mb_cur_max > 1)
589 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
590 #endif /* RE_ENABLE_I18N */
591 pstr->len = pstr->raw_len;
592 pstr->stop = pstr->raw_stop;
594 pstr->raw_mbs_idx = 0;
595 pstr->valid_raw_len = 0;
596 pstr->offsets_needed = 0;
597 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
598 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
599 if (!pstr->mbs_allocated)
600 pstr->mbs = (unsigned char *) pstr->raw_mbs;
606 /* Are the characters which are already checked remain? */
607 if (offset < pstr->valid_raw_len
608 #ifdef RE_ENABLE_I18N
609 /* Handling this would enlarge the code too much.
610 Accept a slowdown in that case. */
611 && pstr->offsets_needed == 0
615 /* Yes, move them to the front of the buffer. */
616 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
617 #ifdef RE_ENABLE_I18N
618 if (pstr->mb_cur_max > 1)
619 memmove (pstr->wcs, pstr->wcs + offset,
620 (pstr->valid_len - offset) * sizeof (wint_t));
621 #endif /* RE_ENABLE_I18N */
622 if (pstr->mbs_allocated)
623 memmove (pstr->mbs, pstr->mbs + offset,
624 pstr->valid_len - offset);
625 pstr->valid_len -= offset;
626 pstr->valid_raw_len -= offset;
628 assert (pstr->valid_len > 0);
633 /* No, skip all characters until IDX. */
634 #ifdef RE_ENABLE_I18N
635 if (BE (pstr->offsets_needed, 0))
637 pstr->len = pstr->raw_len - idx + offset;
638 pstr->stop = pstr->raw_stop - idx + offset;
639 pstr->offsets_needed = 0;
643 pstr->valid_raw_len = 0;
644 #ifdef RE_ENABLE_I18N
645 if (pstr->mb_cur_max > 1)
653 const unsigned char *raw, *p, *q, *end;
655 /* Special case UTF-8. Multi-byte chars start with any
656 byte other than 0x80 - 0xbf. */
657 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
658 end = raw + (offset - pstr->mb_cur_max);
659 for (p = raw + offset - 1; p >= end; --p)
660 if ((*p & 0xc0) != 0x80)
664 int mlen = raw + pstr->len - p;
665 unsigned char buf[6];
668 if (BE (pstr->trans != NULL, 0))
670 int i = mlen < 6 ? mlen : 6;
672 buf[i] = pstr->trans[p[i]];
675 /* XXX Don't use mbrtowc, we know which conversion
676 to use (UTF-8 -> UCS4). */
677 memset (&cur_state, 0, sizeof (cur_state));
678 mlen = mbrtowc (&wc2, p, mlen, &cur_state)
679 - (raw + offset - p);
682 memset (&pstr->cur_state, '\0',
684 pstr->valid_len = mlen;
692 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
693 if (BE (pstr->valid_len, 0))
695 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
696 pstr->wcs[wcs_idx] = WEOF;
697 if (pstr->mbs_allocated)
698 memset (pstr->mbs, 255, pstr->valid_len);
700 pstr->valid_raw_len = pstr->valid_len;
701 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
702 && IS_WIDE_WORD_CHAR (wc))
704 : ((IS_WIDE_NEWLINE (wc)
705 && pstr->newline_anchor)
706 ? CONTEXT_NEWLINE : 0));
709 #endif /* RE_ENABLE_I18N */
711 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
714 pstr->tip_context = (bitset_contain (pstr->word_char, c)
716 : ((IS_NEWLINE (c) && pstr->newline_anchor)
717 ? CONTEXT_NEWLINE : 0));
720 if (!pstr->mbs_allocated)
723 pstr->raw_mbs_idx = idx;
725 pstr->stop -= offset;
727 /* Then build the buffers. */
728 #ifdef RE_ENABLE_I18N
729 if (pstr->mb_cur_max > 1)
733 int ret = build_wcs_upper_buffer (pstr);
734 if (BE (ret != REG_NOERROR, 0))
738 build_wcs_buffer (pstr);
741 #endif /* RE_ENABLE_I18N */
744 build_upper_buffer (pstr);
745 else if (pstr->trans != NULL)
746 re_string_translate_buffer (pstr);
748 pstr->valid_len = pstr->len;
756 re_string_peek_byte_case (pstr, idx)
757 const re_string_t *pstr;
762 /* Handle the common (easiest) cases first. */
763 if (BE (!pstr->mbs_allocated, 1))
764 return re_string_peek_byte (pstr, idx);
766 #ifdef RE_ENABLE_I18N
767 if (pstr->mb_cur_max > 1
768 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
769 return re_string_peek_byte (pstr, idx);
772 off = pstr->cur_idx + idx;
773 #ifdef RE_ENABLE_I18N
774 if (pstr->offsets_needed)
775 off = pstr->offsets[off];
778 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
780 #ifdef RE_ENABLE_I18N
781 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
782 this function returns CAPITAL LETTER I instead of first byte of
783 DOTLESS SMALL LETTER I. The latter would confuse the parser,
784 since peek_byte_case doesn't advance cur_idx in any way. */
785 if (pstr->offsets_needed && !isascii (ch))
786 return re_string_peek_byte (pstr, idx);
793 re_string_fetch_byte_case (pstr)
796 if (BE (!pstr->mbs_allocated, 1))
797 return re_string_fetch_byte (pstr);
799 #ifdef RE_ENABLE_I18N
800 if (pstr->offsets_needed)
804 /* For tr_TR.UTF-8 [[:islower:]] there is
805 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
806 in that case the whole multi-byte character and return
807 the original letter. On the other side, with
808 [[: DOTLESS SMALL LETTER I return [[:I, as doing
809 anything else would complicate things too much. */
811 if (!re_string_first_byte (pstr, pstr->cur_idx))
812 return re_string_fetch_byte (pstr);
814 off = pstr->offsets[pstr->cur_idx];
815 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
818 return re_string_fetch_byte (pstr);
820 re_string_skip_bytes (pstr,
821 re_string_char_size_at (pstr, pstr->cur_idx));
826 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
830 re_string_destruct (pstr)
833 #ifdef RE_ENABLE_I18N
835 re_free (pstr->offsets);
836 #endif /* RE_ENABLE_I18N */
837 if (pstr->mbs_allocated)
841 /* Return the context at IDX in INPUT. */
844 re_string_context_at (input, idx, eflags)
845 const re_string_t *input;
849 if (idx < 0 || idx == input->len)
852 /* In this case, we use the value stored in input->tip_context,
853 since we can't know the character in input->mbs[-1] here. */
854 return input->tip_context;
855 else /* (idx == input->len) */
856 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
857 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
859 #ifdef RE_ENABLE_I18N
860 if (input->mb_cur_max > 1)
864 while(input->wcs[wc_idx] == WEOF)
867 /* It must not happen. */
868 assert (wc_idx >= 0);
872 return input->tip_context;
874 wc = input->wcs[wc_idx];
875 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
877 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
878 ? CONTEXT_NEWLINE : 0);
883 c = re_string_byte_at (input, idx);
884 if (bitset_contain (input->word_char, c))
886 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
890 /* Functions for set operation. */
893 re_node_set_alloc (set, size)
899 set->elems = re_malloc (int, size);
900 if (BE (set->elems == NULL, 0))
906 re_node_set_init_1 (set, elem)
912 set->elems = re_malloc (int, 1);
913 if (BE (set->elems == NULL, 0))
915 set->alloc = set->nelem = 0;
918 set->elems[0] = elem;
923 re_node_set_init_2 (set, elem1, elem2)
928 set->elems = re_malloc (int, 2);
929 if (BE (set->elems == NULL, 0))
934 set->elems[0] = elem1;
941 set->elems[0] = elem1;
942 set->elems[1] = elem2;
946 set->elems[0] = elem2;
947 set->elems[1] = elem1;
954 re_node_set_init_copy (dest, src)
956 const re_node_set *src;
958 dest->nelem = src->nelem;
961 dest->alloc = dest->nelem;
962 dest->elems = re_malloc (int, dest->alloc);
963 if (BE (dest->elems == NULL, 0))
965 dest->alloc = dest->nelem = 0;
968 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
971 re_node_set_init_empty (dest);
975 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
976 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
977 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
980 re_node_set_add_intersect (dest, src1, src2)
982 const re_node_set *src1, *src2;
984 int i1, i2, is, id, delta, sbase;
985 if (src1->nelem == 0 || src2->nelem == 0)
988 /* We need dest->nelem + 2 * elems_in_intersection; this is a
989 conservative estimate. */
990 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
992 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
993 int *new_elems = re_realloc (dest->elems, int, new_alloc);
994 if (BE (new_elems == NULL, 0))
996 dest->elems = new_elems;
997 dest->alloc = new_alloc;
1000 /* Find the items in the intersection of SRC1 and SRC2, and copy
1001 into the top of DEST those that are not already in DEST itself. */
1002 sbase = dest->nelem + src1->nelem + src2->nelem;
1003 i1 = src1->nelem - 1;
1004 i2 = src2->nelem - 1;
1005 id = dest->nelem - 1;
1008 if (src1->elems[i1] == src2->elems[i2])
1010 /* Try to find the item in DEST. Maybe we could binary search? */
1011 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1014 if (id < 0 || dest->elems[id] != src1->elems[i1])
1015 dest->elems[--sbase] = src1->elems[i1];
1017 if (--i1 < 0 || --i2 < 0)
1021 /* Lower the highest of the two items. */
1022 else if (src1->elems[i1] < src2->elems[i2])
1034 id = dest->nelem - 1;
1035 is = dest->nelem + src1->nelem + src2->nelem - 1;
1036 delta = is - sbase + 1;
1038 /* Now copy. When DELTA becomes zero, the remaining
1039 DEST elements are already in place; this is more or
1040 less the same loop that is in re_node_set_merge. */
1041 dest->nelem += delta;
1042 if (delta > 0 && id >= 0)
1045 if (dest->elems[is] > dest->elems[id])
1047 /* Copy from the top. */
1048 dest->elems[id + delta--] = dest->elems[is--];
1054 /* Slide from the bottom. */
1055 dest->elems[id + delta] = dest->elems[id];
1061 /* Copy remaining SRC elements. */
1062 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1067 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1068 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1070 static reg_errcode_t
1071 re_node_set_init_union (dest, src1, src2)
1073 const re_node_set *src1, *src2;
1076 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1078 dest->alloc = src1->nelem + src2->nelem;
1079 dest->elems = re_malloc (int, dest->alloc);
1080 if (BE (dest->elems == NULL, 0))
1085 if (src1 != NULL && src1->nelem > 0)
1086 return re_node_set_init_copy (dest, src1);
1087 else if (src2 != NULL && src2->nelem > 0)
1088 return re_node_set_init_copy (dest, src2);
1090 re_node_set_init_empty (dest);
1093 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1095 if (src1->elems[i1] > src2->elems[i2])
1097 dest->elems[id++] = src2->elems[i2++];
1100 if (src1->elems[i1] == src2->elems[i2])
1102 dest->elems[id++] = src1->elems[i1++];
1104 if (i1 < src1->nelem)
1106 memcpy (dest->elems + id, src1->elems + i1,
1107 (src1->nelem - i1) * sizeof (int));
1108 id += src1->nelem - i1;
1110 else if (i2 < src2->nelem)
1112 memcpy (dest->elems + id, src2->elems + i2,
1113 (src2->nelem - i2) * sizeof (int));
1114 id += src2->nelem - i2;
1120 /* Calculate the union set of the sets DEST and SRC. And store it to
1121 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1123 static reg_errcode_t
1124 re_node_set_merge (dest, src)
1126 const re_node_set *src;
1128 int is, id, sbase, delta;
1129 if (src == NULL || src->nelem == 0)
1131 if (dest->alloc < 2 * src->nelem + dest->nelem)
1133 int new_alloc = 2 * (src->nelem + dest->alloc);
1134 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1135 if (BE (new_buffer == NULL, 0))
1137 dest->elems = new_buffer;
1138 dest->alloc = new_alloc;
1141 if (BE (dest->nelem == 0, 0))
1143 dest->nelem = src->nelem;
1144 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1148 /* Copy into the top of DEST the items of SRC that are not
1149 found in DEST. Maybe we could binary search in DEST? */
1150 for (sbase = dest->nelem + 2 * src->nelem,
1151 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1153 if (dest->elems[id] == src->elems[is])
1155 else if (dest->elems[id] < src->elems[is])
1156 dest->elems[--sbase] = src->elems[is--];
1157 else /* if (dest->elems[id] > src->elems[is]) */
1163 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1165 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1168 id = dest->nelem - 1;
1169 is = dest->nelem + 2 * src->nelem - 1;
1170 delta = is - sbase + 1;
1174 /* Now copy. When DELTA becomes zero, the remaining
1175 DEST elements are already in place. */
1176 dest->nelem += delta;
1179 if (dest->elems[is] > dest->elems[id])
1181 /* Copy from the top. */
1182 dest->elems[id + delta--] = dest->elems[is--];
1188 /* Slide from the bottom. */
1189 dest->elems[id + delta] = dest->elems[id];
1192 /* Copy remaining SRC elements. */
1193 memcpy (dest->elems, dest->elems + sbase,
1194 delta * sizeof (int));
1203 /* Insert the new element ELEM to the re_node_set* SET.
1204 SET should not already have ELEM.
1205 return -1 if an error is occured, return 1 otherwise. */
1208 re_node_set_insert (set, elem)
1213 /* In case the set is empty. */
1214 if (set->alloc == 0)
1216 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1222 if (BE (set->nelem, 0) == 0)
1224 /* We already guaranteed above that set->alloc != 0. */
1225 set->elems[0] = elem;
1230 /* Realloc if we need. */
1231 if (set->alloc == set->nelem)
1234 set->alloc = set->alloc * 2;
1235 new_array = re_realloc (set->elems, int, set->alloc);
1236 if (BE (new_array == NULL, 0))
1238 set->elems = new_array;
1241 /* Move the elements which follows the new element. Test the
1242 first element separately to skip a check in the inner loop. */
1243 if (elem < set->elems[0])
1246 for (idx = set->nelem; idx > 0; idx--)
1247 set->elems[idx] = set->elems[idx - 1];
1251 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1252 set->elems[idx] = set->elems[idx - 1];
1255 /* Insert the new element. */
1256 set->elems[idx] = elem;
1261 /* Compare two node sets SET1 and SET2.
1262 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1265 re_node_set_compare (set1, set2)
1266 const re_node_set *set1, *set2;
1269 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1271 for (i = set1->nelem ; --i >= 0 ; )
1272 if (set1->elems[i] != set2->elems[i])
1277 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1280 re_node_set_contains (set, elem)
1281 const re_node_set *set;
1284 int idx, right, mid;
1285 if (set->nelem <= 0)
1288 /* Binary search the element. */
1290 right = set->nelem - 1;
1293 mid = (idx + right) / 2;
1294 if (set->elems[mid] < elem)
1299 return set->elems[idx] == elem ? idx + 1 : 0;
1303 re_node_set_remove_at (set, idx)
1307 if (idx < 0 || idx >= set->nelem)
1310 for (; idx < set->nelem; idx++)
1311 set->elems[idx] = set->elems[idx + 1];
1315 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1316 Or return -1, if an error will be occured. */
1319 re_dfa_add_node (dfa, token, mode)
1324 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1326 int new_nodes_alloc = dfa->nodes_alloc * 2;
1327 re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
1329 if (BE (new_array == NULL, 0))
1331 dfa->nodes = new_array;
1334 int *new_nexts, *new_indices;
1335 re_node_set *new_edests, *new_eclosures, *new_inveclosures;
1337 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1338 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1339 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1340 new_eclosures = re_realloc (dfa->eclosures, re_node_set,
1342 new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
1344 if (BE (new_nexts == NULL || new_indices == NULL
1345 || new_edests == NULL || new_eclosures == NULL
1346 || new_inveclosures == NULL, 0))
1348 dfa->nexts = new_nexts;
1349 dfa->org_indices = new_indices;
1350 dfa->edests = new_edests;
1351 dfa->eclosures = new_eclosures;
1352 dfa->inveclosures = new_inveclosures;
1354 dfa->nodes_alloc = new_nodes_alloc;
1356 dfa->nodes[dfa->nodes_len] = token;
1357 dfa->nodes[dfa->nodes_len].opt_subexp = 0;
1358 dfa->nodes[dfa->nodes_len].duplicated = 0;
1359 dfa->nodes[dfa->nodes_len].constraint = 0;
1360 return dfa->nodes_len++;
1363 static unsigned int inline
1364 calc_state_hash (nodes, context)
1365 const re_node_set *nodes;
1366 unsigned int context;
1368 unsigned int hash = nodes->nelem + context;
1370 for (i = 0 ; i < nodes->nelem ; i++)
1371 hash += nodes->elems[i];
1375 /* Search for the state whose node_set is equivalent to NODES.
1376 Return the pointer to the state, if we found it in the DFA.
1377 Otherwise create the new one and return it. In case of an error
1378 return NULL and set the error code in ERR.
1379 Note: - We assume NULL as the invalid state, then it is possible that
1380 return value is NULL and ERR is REG_NOERROR.
1381 - We never return non-NULL value in case of any errors, it is for
1384 static re_dfastate_t*
1385 re_acquire_state (err, dfa, nodes)
1388 const re_node_set *nodes;
1391 re_dfastate_t *new_state;
1392 struct re_state_table_entry *spot;
1394 if (BE (nodes->nelem == 0, 0))
1399 hash = calc_state_hash (nodes, 0);
1400 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1402 for (i = 0 ; i < spot->num ; i++)
1404 re_dfastate_t *state = spot->array[i];
1405 if (hash != state->hash)
1407 if (re_node_set_compare (&state->nodes, nodes))
1411 /* There are no appropriate state in the dfa, create the new one. */
1412 new_state = create_ci_newstate (dfa, nodes, hash);
1413 if (BE (new_state != NULL, 1))
1422 /* Search for the state whose node_set is equivalent to NODES and
1423 whose context is equivalent to CONTEXT.
1424 Return the pointer to the state, if we found it in the DFA.
1425 Otherwise create the new one and return it. In case of an error
1426 return NULL and set the error code in ERR.
1427 Note: - We assume NULL as the invalid state, then it is possible that
1428 return value is NULL and ERR is REG_NOERROR.
1429 - We never return non-NULL value in case of any errors, it is for
1432 static re_dfastate_t*
1433 re_acquire_state_context (err, dfa, nodes, context)
1436 const re_node_set *nodes;
1437 unsigned int context;
1440 re_dfastate_t *new_state;
1441 struct re_state_table_entry *spot;
1443 if (nodes->nelem == 0)
1448 hash = calc_state_hash (nodes, context);
1449 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1451 for (i = 0 ; i < spot->num ; i++)
1453 re_dfastate_t *state = spot->array[i];
1454 if (state->hash == hash
1455 && state->context == context
1456 && re_node_set_compare (state->entrance_nodes, nodes))
1459 /* There are no appropriate state in `dfa', create the new one. */
1460 new_state = create_cd_newstate (dfa, nodes, context, hash);
1461 if (BE (new_state != NULL, 1))
1470 /* Allocate memory for DFA state and initialize common properties.
1471 Return the new state if succeeded, otherwise return NULL. */
1473 static re_dfastate_t *
1474 create_newstate_common (dfa, nodes, hash)
1476 const re_node_set *nodes;
1479 re_dfastate_t *newstate;
1481 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1482 if (BE (newstate == NULL, 0))
1484 err = re_node_set_init_copy (&newstate->nodes, nodes);
1485 if (BE (err != REG_NOERROR, 0))
1490 newstate->trtable = NULL;
1491 newstate->hash = hash;
1495 /* Store the new state NEWSTATE whose hash value is HASH in appropriate
1496 position. Return value indicate the error code if failed. */
1498 static reg_errcode_t
1499 register_state (dfa, newstate, hash)
1501 re_dfastate_t *newstate;
1504 struct re_state_table_entry *spot;
1505 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1507 if (BE (spot->alloc <= spot->num, 0))
1509 int new_alloc = 2 * spot->num + 2;
1510 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1512 if (BE (new_array == NULL, 0))
1514 spot->array = new_array;
1515 spot->alloc = new_alloc;
1517 spot->array[spot->num++] = newstate;
1521 /* Create the new state which is independ of contexts.
1522 Return the new state if succeeded, otherwise return NULL. */
1524 static re_dfastate_t *
1525 create_ci_newstate (dfa, nodes, hash)
1527 const re_node_set *nodes;
1532 re_dfastate_t *newstate;
1533 newstate = create_newstate_common (dfa, nodes, hash);
1534 if (BE (newstate == NULL, 0))
1536 newstate->entrance_nodes = &newstate->nodes;
1538 for (i = 0 ; i < nodes->nelem ; i++)
1540 re_token_t *node = dfa->nodes + nodes->elems[i];
1541 re_token_type_t type = node->type;
1542 if (type == CHARACTER && !node->constraint)
1545 /* If the state has the halt node, the state is a halt state. */
1546 else if (type == END_OF_RE)
1548 #ifdef RE_ENABLE_I18N
1549 else if (type == COMPLEX_BRACKET
1550 || type == OP_UTF8_PERIOD
1551 || (type == OP_PERIOD && dfa->mb_cur_max > 1))
1552 newstate->accept_mb = 1;
1553 #endif /* RE_ENABLE_I18N */
1554 else if (type == OP_BACK_REF)
1555 newstate->has_backref = 1;
1556 else if (type == ANCHOR || node->constraint)
1557 newstate->has_constraint = 1;
1559 err = register_state (dfa, newstate, hash);
1560 if (BE (err != REG_NOERROR, 0))
1562 free_state (newstate);
1568 /* Create the new state which is depend on the context CONTEXT.
1569 Return the new state if succeeded, otherwise return NULL. */
1571 static re_dfastate_t *
1572 create_cd_newstate (dfa, nodes, context, hash)
1574 const re_node_set *nodes;
1575 unsigned int context, hash;
1577 int i, nctx_nodes = 0;
1579 re_dfastate_t *newstate;
1581 newstate = create_newstate_common (dfa, nodes, hash);
1582 if (BE (newstate == NULL, 0))
1584 newstate->context = context;
1585 newstate->entrance_nodes = &newstate->nodes;
1587 for (i = 0 ; i < nodes->nelem ; i++)
1589 unsigned int constraint = 0;
1590 re_token_t *node = dfa->nodes + nodes->elems[i];
1591 re_token_type_t type = node->type;
1592 if (node->constraint)
1593 constraint = node->constraint;
1595 if (type == CHARACTER && !constraint)
1597 /* If the state has the halt node, the state is a halt state. */
1598 else if (type == END_OF_RE)
1600 #ifdef RE_ENABLE_I18N
1601 else if (type == COMPLEX_BRACKET
1602 || type == OP_UTF8_PERIOD
1603 || (type == OP_PERIOD && dfa->mb_cur_max > 1))
1604 newstate->accept_mb = 1;
1605 #endif /* RE_ENABLE_I18N */
1606 else if (type == OP_BACK_REF)
1607 newstate->has_backref = 1;
1608 else if (type == ANCHOR)
1609 constraint = node->opr.ctx_type;
1613 if (newstate->entrance_nodes == &newstate->nodes)
1615 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1616 if (BE (newstate->entrance_nodes == NULL, 0))
1618 free_state (newstate);
1621 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1623 newstate->has_constraint = 1;
1626 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1628 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1633 err = register_state (dfa, newstate, hash);
1634 if (BE (err != REG_NOERROR, 0))
1636 free_state (newstate);
1644 re_dfastate_t *state;
1646 if (state->entrance_nodes != &state->nodes)
1648 re_node_set_free (state->entrance_nodes);
1649 re_free (state->entrance_nodes);
1651 re_node_set_free (&state->nodes);
1652 re_free (state->trtable);