1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static void re_string_construct_common (const char *str, int len,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
26 const re_node_set *nodes,
27 unsigned int hash) internal_function;
28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
29 const re_node_set *nodes,
31 unsigned int hash) internal_function;
33 /* Functions for string operation. */
35 /* This function allocate the buffers. It is necessary to call
36 re_string_reconstruct before using the object. */
40 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
41 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
46 /* Ensure at least one character fits into the buffers. */
47 if (init_len < dfa->mb_cur_max)
48 init_len = dfa->mb_cur_max;
49 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
50 re_string_construct_common (str, len, pstr, trans, icase, dfa);
52 ret = re_string_realloc_buffers (pstr, init_buf_len);
53 if (BE (ret != REG_NOERROR, 0))
56 pstr->word_char = dfa->word_char;
57 pstr->word_ops_used = dfa->word_ops_used;
58 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
59 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
60 pstr->valid_raw_len = pstr->valid_len;
64 /* This function allocate the buffers, and initialize them. */
68 re_string_construct (re_string_t *pstr, const char *str, int len,
69 RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
72 memset (pstr, '\0', sizeof (re_string_t));
73 re_string_construct_common (str, len, pstr, trans, icase, dfa);
77 ret = re_string_realloc_buffers (pstr, len + 1);
78 if (BE (ret != REG_NOERROR, 0))
81 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
86 if (dfa->mb_cur_max > 1)
90 ret = build_wcs_upper_buffer (pstr);
91 if (BE (ret != REG_NOERROR, 0))
93 if (pstr->valid_raw_len >= len)
95 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
97 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
98 if (BE (ret != REG_NOERROR, 0))
103 #endif /* RE_ENABLE_I18N */
104 build_upper_buffer (pstr);
108 #ifdef RE_ENABLE_I18N
109 if (dfa->mb_cur_max > 1)
110 build_wcs_buffer (pstr);
112 #endif /* RE_ENABLE_I18N */
115 re_string_translate_buffer (pstr);
118 pstr->valid_len = pstr->bufs_len;
119 pstr->valid_raw_len = pstr->bufs_len;
127 /* Helper functions for re_string_allocate, and re_string_construct. */
131 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
133 #ifdef RE_ENABLE_I18N
134 if (pstr->mb_cur_max > 1)
136 wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
137 if (BE (new_wcs == NULL, 0))
140 if (pstr->offsets != NULL)
142 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
143 if (BE (new_offsets == NULL, 0))
145 pstr->offsets = new_offsets;
148 #endif /* RE_ENABLE_I18N */
149 if (pstr->mbs_allocated)
151 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
153 if (BE (new_mbs == NULL, 0))
157 pstr->bufs_len = new_buf_len;
164 re_string_construct_common (const char *str, int len, re_string_t *pstr,
165 RE_TRANSLATE_TYPE trans, int icase,
168 pstr->raw_mbs = (const unsigned char *) str;
172 pstr->icase = icase ? 1 : 0;
173 pstr->mbs_allocated = (trans != NULL || icase);
174 pstr->mb_cur_max = dfa->mb_cur_max;
175 pstr->is_utf8 = dfa->is_utf8;
176 pstr->map_notascii = dfa->map_notascii;
177 pstr->stop = pstr->len;
178 pstr->raw_stop = pstr->stop;
181 #ifdef RE_ENABLE_I18N
183 /* Build wide character buffer PSTR->WCS.
184 If the byte sequence of the string are:
185 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
186 Then wide character buffer will be:
187 <wc1> , WEOF , <wc2> , WEOF , <wc3>
188 We use WEOF for padding, they indicate that the position isn't
189 a first byte of a multibyte character.
191 Note that this function assumes PSTR->VALID_LEN elements are already
192 built and starts from PSTR->VALID_LEN. */
196 build_wcs_buffer (re_string_t *pstr)
199 unsigned char buf[MB_LEN_MAX];
200 assert (MB_LEN_MAX >= pstr->mb_cur_max);
202 unsigned char buf[64];
205 int byte_idx, end_idx, remain_len;
208 /* Build the buffers from pstr->valid_len to either pstr->len or
210 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
211 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
216 remain_len = end_idx - byte_idx;
217 prev_st = pstr->cur_state;
218 /* Apply the translation if we need. */
219 if (BE (pstr->trans != NULL, 0))
223 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
225 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
226 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
228 p = (const char *) buf;
231 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
232 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
233 if (BE (mbclen == (size_t) -2, 0))
235 /* The buffer doesn't have enough space, finish to build. */
236 pstr->cur_state = prev_st;
239 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
241 /* We treat these cases as a singlebyte character. */
243 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
244 if (BE (pstr->trans != NULL, 0))
245 wc = pstr->trans[wc];
246 pstr->cur_state = prev_st;
249 /* Write wide character and padding. */
250 pstr->wcs[byte_idx++] = wc;
251 /* Write paddings. */
252 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
253 pstr->wcs[byte_idx++] = WEOF;
255 pstr->valid_len = byte_idx;
256 pstr->valid_raw_len = byte_idx;
259 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
260 but for REG_ICASE. */
264 build_wcs_upper_buffer (re_string_t *pstr)
267 int src_idx, byte_idx, end_idx, remain_len;
270 char buf[MB_LEN_MAX];
271 assert (MB_LEN_MAX >= pstr->mb_cur_max);
276 byte_idx = pstr->valid_len;
277 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
279 /* The following optimization assumes that ASCII characters can be
280 mapped to wide characters with a simple cast. */
281 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
283 while (byte_idx < end_idx)
287 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
288 && mbsinit (&pstr->cur_state))
290 /* In case of a singlebyte character. */
292 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
293 /* The next step uses the assumption that wchar_t is encoded
294 ASCII-safe: all ASCII values can be converted like this. */
295 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
300 remain_len = end_idx - byte_idx;
301 prev_st = pstr->cur_state;
302 mbclen = mbrtowc (&wc,
303 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
304 + byte_idx), remain_len, &pstr->cur_state);
305 if (BE (mbclen + 2 > 2, 1))
313 mbcdlen = wcrtomb (buf, wcu, &prev_st);
314 if (BE (mbclen == mbcdlen, 1))
315 memcpy (pstr->mbs + byte_idx, buf, mbclen);
323 memcpy (pstr->mbs + byte_idx,
324 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
325 pstr->wcs[byte_idx++] = wcu;
326 /* Write paddings. */
327 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
328 pstr->wcs[byte_idx++] = WEOF;
330 else if (mbclen == (size_t) -1 || mbclen == 0)
332 /* It is an invalid character or '\0'. Just use the byte. */
333 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
334 pstr->mbs[byte_idx] = ch;
335 /* And also cast it to wide char. */
336 pstr->wcs[byte_idx++] = (wchar_t) ch;
337 if (BE (mbclen == (size_t) -1, 0))
338 pstr->cur_state = prev_st;
342 /* The buffer doesn't have enough space, finish to build. */
343 pstr->cur_state = prev_st;
347 pstr->valid_len = byte_idx;
348 pstr->valid_raw_len = byte_idx;
352 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
357 remain_len = end_idx - byte_idx;
358 prev_st = pstr->cur_state;
359 if (BE (pstr->trans != NULL, 0))
363 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
365 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
366 buf[i] = pstr->trans[ch];
368 p = (const char *) buf;
371 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
372 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
373 if (BE (mbclen + 2 > 2, 1))
381 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
382 if (BE (mbclen == mbcdlen, 1))
383 memcpy (pstr->mbs + byte_idx, buf, mbclen);
384 else if (mbcdlen != (size_t) -1)
388 if (byte_idx + mbcdlen > pstr->bufs_len)
390 pstr->cur_state = prev_st;
394 if (pstr->offsets == NULL)
396 pstr->offsets = re_malloc (int, pstr->bufs_len);
398 if (pstr->offsets == NULL)
401 if (!pstr->offsets_needed)
403 for (i = 0; i < (size_t) byte_idx; ++i)
404 pstr->offsets[i] = i;
405 pstr->offsets_needed = 1;
408 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
409 pstr->wcs[byte_idx] = wcu;
410 pstr->offsets[byte_idx] = src_idx;
411 for (i = 1; i < mbcdlen; ++i)
413 pstr->offsets[byte_idx + i]
414 = src_idx + (i < mbclen ? i : mbclen - 1);
415 pstr->wcs[byte_idx + i] = WEOF;
417 pstr->len += mbcdlen - mbclen;
418 if (pstr->raw_stop > src_idx)
419 pstr->stop += mbcdlen - mbclen;
420 end_idx = (pstr->bufs_len > pstr->len)
421 ? pstr->len : pstr->bufs_len;
427 memcpy (pstr->mbs + byte_idx, p, mbclen);
430 memcpy (pstr->mbs + byte_idx, p, mbclen);
432 if (BE (pstr->offsets_needed != 0, 0))
435 for (i = 0; i < mbclen; ++i)
436 pstr->offsets[byte_idx + i] = src_idx + i;
440 pstr->wcs[byte_idx++] = wcu;
441 /* Write paddings. */
442 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
443 pstr->wcs[byte_idx++] = WEOF;
445 else if (mbclen == (size_t) -1 || mbclen == 0)
447 /* It is an invalid character or '\0'. Just use the byte. */
448 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
450 if (BE (pstr->trans != NULL, 0))
451 ch = pstr->trans [ch];
452 pstr->mbs[byte_idx] = ch;
454 if (BE (pstr->offsets_needed != 0, 0))
455 pstr->offsets[byte_idx] = src_idx;
458 /* And also cast it to wide char. */
459 pstr->wcs[byte_idx++] = (wchar_t) ch;
460 if (BE (mbclen == (size_t) -1, 0))
461 pstr->cur_state = prev_st;
465 /* The buffer doesn't have enough space, finish to build. */
466 pstr->cur_state = prev_st;
470 pstr->valid_len = byte_idx;
471 pstr->valid_raw_len = src_idx;
475 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
480 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
487 /* Skip the characters which are not necessary to check. */
488 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
489 rawbuf_idx < new_raw_idx;)
492 remain_len = pstr->len - rawbuf_idx;
493 prev_st = pstr->cur_state;
494 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
495 remain_len, &pstr->cur_state);
496 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
498 /* We treat these cases as a single byte character. */
499 if (mbclen == 0 || remain_len == 0)
502 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
504 pstr->cur_state = prev_st;
506 /* Then proceed the next character. */
507 rawbuf_idx += mbclen;
509 *last_wc = (wint_t) wc;
512 #endif /* RE_ENABLE_I18N */
514 /* Build the buffer PSTR->MBS, and apply the translation if we need.
515 This function is used in case of REG_ICASE. */
519 build_upper_buffer (re_string_t *pstr)
521 int char_idx, end_idx;
522 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
524 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
526 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
527 if (BE (pstr->trans != NULL, 0))
528 ch = pstr->trans[ch];
530 pstr->mbs[char_idx] = toupper (ch);
532 pstr->mbs[char_idx] = ch;
534 pstr->valid_len = char_idx;
535 pstr->valid_raw_len = char_idx;
538 /* Apply TRANS to the buffer in PSTR. */
542 re_string_translate_buffer (re_string_t *pstr)
544 int buf_idx, end_idx;
545 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
547 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
549 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
550 pstr->mbs[buf_idx] = pstr->trans[ch];
553 pstr->valid_len = buf_idx;
554 pstr->valid_raw_len = buf_idx;
557 /* This function re-construct the buffers.
558 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
559 convert to upper case in case of REG_ICASE, apply translation. */
563 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
565 int offset = idx - pstr->raw_mbs_idx;
566 if (BE (offset < 0, 0))
569 #ifdef RE_ENABLE_I18N
570 if (pstr->mb_cur_max > 1)
571 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
572 #endif /* RE_ENABLE_I18N */
573 pstr->len = pstr->raw_len;
574 pstr->stop = pstr->raw_stop;
576 pstr->raw_mbs_idx = 0;
577 pstr->valid_raw_len = 0;
578 pstr->offsets_needed = 0;
579 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
580 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
581 if (!pstr->mbs_allocated)
582 pstr->mbs = (unsigned char *) pstr->raw_mbs;
586 if (BE (offset != 0, 1))
588 /* Should the already checked characters be kept? */
589 if (BE (offset < pstr->valid_raw_len, 1))
591 /* Yes, move them to the front of the buffer. */
592 #ifdef RE_ENABLE_I18N
593 if (BE (pstr->offsets_needed, 0))
595 int low = 0, high = pstr->valid_len, mid;
598 mid = (high + low) / 2;
599 if (pstr->offsets[mid] > offset)
601 else if (pstr->offsets[mid] < offset)
607 if (pstr->offsets[mid] < offset)
609 pstr->tip_context = re_string_context_at (pstr, mid - 1,
611 /* This can be quite complicated, so handle specially
612 only the common and easy case where the character with
613 different length representation of lower and upper
614 case is present at or after offset. */
615 if (pstr->valid_len > offset
616 && mid == offset && pstr->offsets[mid] == offset)
618 memmove (pstr->wcs, pstr->wcs + offset,
619 (pstr->valid_len - offset) * sizeof (wint_t));
620 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
621 pstr->valid_len -= offset;
622 pstr->valid_raw_len -= offset;
623 for (low = 0; low < pstr->valid_len; low++)
624 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
628 /* Otherwise, just find out how long the partial multibyte
629 character at offset is and fill it with WEOF/255. */
630 pstr->len = pstr->raw_len - idx + offset;
631 pstr->stop = pstr->raw_stop - idx + offset;
632 pstr->offsets_needed = 0;
633 while (mid > 0 && pstr->offsets[mid - 1] == offset)
635 while (mid < pstr->valid_len)
636 if (pstr->wcs[mid] != WEOF)
640 if (mid == pstr->valid_len)
644 pstr->valid_len = pstr->offsets[mid] - offset;
647 for (low = 0; low < pstr->valid_len; ++low)
648 pstr->wcs[low] = WEOF;
649 memset (pstr->mbs, 255, pstr->valid_len);
652 pstr->valid_raw_len = pstr->valid_len;
658 pstr->tip_context = re_string_context_at (pstr, offset - 1,
660 #ifdef RE_ENABLE_I18N
661 if (pstr->mb_cur_max > 1)
662 memmove (pstr->wcs, pstr->wcs + offset,
663 (pstr->valid_len - offset) * sizeof (wint_t));
664 #endif /* RE_ENABLE_I18N */
665 if (BE (pstr->mbs_allocated, 0))
666 memmove (pstr->mbs, pstr->mbs + offset,
667 pstr->valid_len - offset);
668 pstr->valid_len -= offset;
669 pstr->valid_raw_len -= offset;
671 assert (pstr->valid_len > 0);
677 /* No, skip all characters until IDX. */
678 int prev_valid_len = pstr->valid_len;
680 #ifdef RE_ENABLE_I18N
681 if (BE (pstr->offsets_needed, 0))
683 pstr->len = pstr->raw_len - idx + offset;
684 pstr->stop = pstr->raw_stop - idx + offset;
685 pstr->offsets_needed = 0;
689 #ifdef RE_ENABLE_I18N
690 if (pstr->mb_cur_max > 1)
697 const unsigned char *raw, *p, *q, *end;
699 /* Special case UTF-8. Multi-byte chars start with any
700 byte other than 0x80 - 0xbf. */
701 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
702 end = raw + (offset - pstr->mb_cur_max);
703 if (end < pstr->raw_mbs)
705 p = raw + offset - 1;
707 /* We know the wchar_t encoding is UCS4, so for the simple
708 case, ASCII characters, skip the conversion step. */
709 if (isascii (*p) && BE (pstr->trans == NULL, 1))
711 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
712 /* pstr->valid_len = 0; */
717 for (; p >= end; --p)
718 if ((*p & 0xc0) != 0x80)
722 int mlen = raw + pstr->len - p;
723 unsigned char buf[6];
727 if (BE (pstr->trans != NULL, 0))
729 int i = mlen < 6 ? mlen : 6;
731 buf[i] = pstr->trans[p[i]];
734 /* XXX Don't use mbrtowc, we know which conversion
735 to use (UTF-8 -> UCS4). */
736 memset (&cur_state, 0, sizeof (cur_state));
737 mbclen = mbrtowc (&wc2, (const char *) p, mlen,
739 if (raw + offset - p <= mbclen
740 && mbclen < (size_t) -2)
742 memset (&pstr->cur_state, '\0',
744 pstr->valid_len = mbclen - (raw + offset - p);
752 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
755 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
757 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
758 && IS_WIDE_WORD_CHAR (wc))
760 : ((IS_WIDE_NEWLINE (wc)
761 && pstr->newline_anchor)
762 ? CONTEXT_NEWLINE : 0));
763 if (BE (pstr->valid_len, 0))
765 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
766 pstr->wcs[wcs_idx] = WEOF;
767 if (pstr->mbs_allocated)
768 memset (pstr->mbs, 255, pstr->valid_len);
770 pstr->valid_raw_len = pstr->valid_len;
773 #endif /* RE_ENABLE_I18N */
775 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
776 pstr->valid_raw_len = 0;
779 pstr->tip_context = (bitset_contain (pstr->word_char, c)
781 : ((IS_NEWLINE (c) && pstr->newline_anchor)
782 ? CONTEXT_NEWLINE : 0));
785 if (!BE (pstr->mbs_allocated, 0))
788 pstr->raw_mbs_idx = idx;
790 pstr->stop -= offset;
792 /* Then build the buffers. */
793 #ifdef RE_ENABLE_I18N
794 if (pstr->mb_cur_max > 1)
798 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
799 if (BE (ret != REG_NOERROR, 0))
803 build_wcs_buffer (pstr);
806 #endif /* RE_ENABLE_I18N */
807 if (BE (pstr->mbs_allocated, 0))
810 build_upper_buffer (pstr);
811 else if (pstr->trans != NULL)
812 re_string_translate_buffer (pstr);
815 pstr->valid_len = pstr->len;
822 internal_function __attribute ((pure))
823 re_string_peek_byte_case (const re_string_t *pstr, int idx)
827 /* Handle the common (easiest) cases first. */
828 if (BE (!pstr->mbs_allocated, 1))
829 return re_string_peek_byte (pstr, idx);
831 #ifdef RE_ENABLE_I18N
832 if (pstr->mb_cur_max > 1
833 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
834 return re_string_peek_byte (pstr, idx);
837 off = pstr->cur_idx + idx;
838 #ifdef RE_ENABLE_I18N
839 if (pstr->offsets_needed)
840 off = pstr->offsets[off];
843 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
845 #ifdef RE_ENABLE_I18N
846 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
847 this function returns CAPITAL LETTER I instead of first byte of
848 DOTLESS SMALL LETTER I. The latter would confuse the parser,
849 since peek_byte_case doesn't advance cur_idx in any way. */
850 if (pstr->offsets_needed && !isascii (ch))
851 return re_string_peek_byte (pstr, idx);
858 internal_function __attribute ((pure))
859 re_string_fetch_byte_case (re_string_t *pstr)
861 if (BE (!pstr->mbs_allocated, 1))
862 return re_string_fetch_byte (pstr);
864 #ifdef RE_ENABLE_I18N
865 if (pstr->offsets_needed)
869 /* For tr_TR.UTF-8 [[:islower:]] there is
870 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
871 in that case the whole multi-byte character and return
872 the original letter. On the other side, with
873 [[: DOTLESS SMALL LETTER I return [[:I, as doing
874 anything else would complicate things too much. */
876 if (!re_string_first_byte (pstr, pstr->cur_idx))
877 return re_string_fetch_byte (pstr);
879 off = pstr->offsets[pstr->cur_idx];
880 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
883 return re_string_fetch_byte (pstr);
885 re_string_skip_bytes (pstr,
886 re_string_char_size_at (pstr, pstr->cur_idx));
891 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
896 re_string_destruct (re_string_t *pstr)
898 #ifdef RE_ENABLE_I18N
900 re_free (pstr->offsets);
901 #endif /* RE_ENABLE_I18N */
902 if (pstr->mbs_allocated)
906 /* Return the context at IDX in INPUT. */
910 re_string_context_at (const re_string_t *input, int idx, int eflags)
914 /* In this case, we use the value stored in input->tip_context,
915 since we can't know the character in input->mbs[-1] here. */
916 return input->tip_context;
917 if (BE (idx == input->len, 0))
918 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
919 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
920 #ifdef RE_ENABLE_I18N
921 if (input->mb_cur_max > 1)
925 while(input->wcs[wc_idx] == WEOF)
928 /* It must not happen. */
929 assert (wc_idx >= 0);
933 return input->tip_context;
935 wc = input->wcs[wc_idx];
936 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
938 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
939 ? CONTEXT_NEWLINE : 0);
944 c = re_string_byte_at (input, idx);
945 if (bitset_contain (input->word_char, c))
947 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
951 /* Functions for set operation. */
955 re_node_set_alloc (re_node_set *set, int size)
959 set->elems = re_malloc (int, size);
960 if (BE (set->elems == NULL, 0))
967 re_node_set_init_1 (re_node_set *set, int elem)
971 set->elems = re_malloc (int, 1);
972 if (BE (set->elems == NULL, 0))
974 set->alloc = set->nelem = 0;
977 set->elems[0] = elem;
983 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
986 set->elems = re_malloc (int, 2);
987 if (BE (set->elems == NULL, 0))
992 set->elems[0] = elem1;
999 set->elems[0] = elem1;
1000 set->elems[1] = elem2;
1004 set->elems[0] = elem2;
1005 set->elems[1] = elem1;
1011 static reg_errcode_t
1013 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1015 dest->nelem = src->nelem;
1018 dest->alloc = dest->nelem;
1019 dest->elems = re_malloc (int, dest->alloc);
1020 if (BE (dest->elems == NULL, 0))
1022 dest->alloc = dest->nelem = 0;
1025 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1028 re_node_set_init_empty (dest);
1032 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1033 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1034 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1036 static reg_errcode_t
1038 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1039 const re_node_set *src2)
1041 int i1, i2, is, id, delta, sbase;
1042 if (src1->nelem == 0 || src2->nelem == 0)
1045 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1046 conservative estimate. */
1047 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1049 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1050 int *new_elems = re_realloc (dest->elems, int, new_alloc);
1051 if (BE (new_elems == NULL, 0))
1053 dest->elems = new_elems;
1054 dest->alloc = new_alloc;
1057 /* Find the items in the intersection of SRC1 and SRC2, and copy
1058 into the top of DEST those that are not already in DEST itself. */
1059 sbase = dest->nelem + src1->nelem + src2->nelem;
1060 i1 = src1->nelem - 1;
1061 i2 = src2->nelem - 1;
1062 id = dest->nelem - 1;
1065 if (src1->elems[i1] == src2->elems[i2])
1067 /* Try to find the item in DEST. Maybe we could binary search? */
1068 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1071 if (id < 0 || dest->elems[id] != src1->elems[i1])
1072 dest->elems[--sbase] = src1->elems[i1];
1074 if (--i1 < 0 || --i2 < 0)
1078 /* Lower the highest of the two items. */
1079 else if (src1->elems[i1] < src2->elems[i2])
1091 id = dest->nelem - 1;
1092 is = dest->nelem + src1->nelem + src2->nelem - 1;
1093 delta = is - sbase + 1;
1095 /* Now copy. When DELTA becomes zero, the remaining
1096 DEST elements are already in place; this is more or
1097 less the same loop that is in re_node_set_merge. */
1098 dest->nelem += delta;
1099 if (delta > 0 && id >= 0)
1102 if (dest->elems[is] > dest->elems[id])
1104 /* Copy from the top. */
1105 dest->elems[id + delta--] = dest->elems[is--];
1111 /* Slide from the bottom. */
1112 dest->elems[id + delta] = dest->elems[id];
1118 /* Copy remaining SRC elements. */
1119 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1124 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1125 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1127 static reg_errcode_t
1129 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1130 const re_node_set *src2)
1133 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1135 dest->alloc = src1->nelem + src2->nelem;
1136 dest->elems = re_malloc (int, dest->alloc);
1137 if (BE (dest->elems == NULL, 0))
1142 if (src1 != NULL && src1->nelem > 0)
1143 return re_node_set_init_copy (dest, src1);
1144 else if (src2 != NULL && src2->nelem > 0)
1145 return re_node_set_init_copy (dest, src2);
1147 re_node_set_init_empty (dest);
1150 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1152 if (src1->elems[i1] > src2->elems[i2])
1154 dest->elems[id++] = src2->elems[i2++];
1157 if (src1->elems[i1] == src2->elems[i2])
1159 dest->elems[id++] = src1->elems[i1++];
1161 if (i1 < src1->nelem)
1163 memcpy (dest->elems + id, src1->elems + i1,
1164 (src1->nelem - i1) * sizeof (int));
1165 id += src1->nelem - i1;
1167 else if (i2 < src2->nelem)
1169 memcpy (dest->elems + id, src2->elems + i2,
1170 (src2->nelem - i2) * sizeof (int));
1171 id += src2->nelem - i2;
1177 /* Calculate the union set of the sets DEST and SRC. And store it to
1178 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1180 static reg_errcode_t
1182 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1184 int is, id, sbase, delta;
1185 if (src == NULL || src->nelem == 0)
1187 if (dest->alloc < 2 * src->nelem + dest->nelem)
1189 int new_alloc = 2 * (src->nelem + dest->alloc);
1190 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1191 if (BE (new_buffer == NULL, 0))
1193 dest->elems = new_buffer;
1194 dest->alloc = new_alloc;
1197 if (BE (dest->nelem == 0, 0))
1199 dest->nelem = src->nelem;
1200 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1204 /* Copy into the top of DEST the items of SRC that are not
1205 found in DEST. Maybe we could binary search in DEST? */
1206 for (sbase = dest->nelem + 2 * src->nelem,
1207 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1209 if (dest->elems[id] == src->elems[is])
1211 else if (dest->elems[id] < src->elems[is])
1212 dest->elems[--sbase] = src->elems[is--];
1213 else /* if (dest->elems[id] > src->elems[is]) */
1219 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1221 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1224 id = dest->nelem - 1;
1225 is = dest->nelem + 2 * src->nelem - 1;
1226 delta = is - sbase + 1;
1230 /* Now copy. When DELTA becomes zero, the remaining
1231 DEST elements are already in place. */
1232 dest->nelem += delta;
1235 if (dest->elems[is] > dest->elems[id])
1237 /* Copy from the top. */
1238 dest->elems[id + delta--] = dest->elems[is--];
1244 /* Slide from the bottom. */
1245 dest->elems[id + delta] = dest->elems[id];
1248 /* Copy remaining SRC elements. */
1249 memcpy (dest->elems, dest->elems + sbase,
1250 delta * sizeof (int));
1259 /* Insert the new element ELEM to the re_node_set* SET.
1260 SET should not already have ELEM.
1261 return -1 if an error is occured, return 1 otherwise. */
1265 re_node_set_insert (re_node_set *set, int elem)
1268 /* In case the set is empty. */
1269 if (set->alloc == 0)
1271 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1277 if (BE (set->nelem, 0) == 0)
1279 /* We already guaranteed above that set->alloc != 0. */
1280 set->elems[0] = elem;
1285 /* Realloc if we need. */
1286 if (set->alloc == set->nelem)
1289 set->alloc = set->alloc * 2;
1290 new_elems = re_realloc (set->elems, int, set->alloc);
1291 if (BE (new_elems == NULL, 0))
1293 set->elems = new_elems;
1296 /* Move the elements which follows the new element. Test the
1297 first element separately to skip a check in the inner loop. */
1298 if (elem < set->elems[0])
1301 for (idx = set->nelem; idx > 0; idx--)
1302 set->elems[idx] = set->elems[idx - 1];
1306 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1307 set->elems[idx] = set->elems[idx - 1];
1310 /* Insert the new element. */
1311 set->elems[idx] = elem;
1316 /* Insert the new element ELEM to the re_node_set* SET.
1317 SET should not already have any element greater than or equal to ELEM.
1318 Return -1 if an error is occured, return 1 otherwise. */
1322 re_node_set_insert_last (re_node_set *set, int elem)
1324 /* Realloc if we need. */
1325 if (set->alloc == set->nelem)
1328 set->alloc = (set->alloc + 1) * 2;
1329 new_elems = re_realloc (set->elems, int, set->alloc);
1330 if (BE (new_elems == NULL, 0))
1332 set->elems = new_elems;
1335 /* Insert the new element. */
1336 set->elems[set->nelem++] = elem;
1340 /* Compare two node sets SET1 and SET2.
1341 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1344 internal_function __attribute ((pure))
1345 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1348 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1350 for (i = set1->nelem ; --i >= 0 ; )
1351 if (set1->elems[i] != set2->elems[i])
1356 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1359 internal_function __attribute ((pure))
1360 re_node_set_contains (const re_node_set *set, int elem)
1362 unsigned int idx, right, mid;
1363 if (set->nelem <= 0)
1366 /* Binary search the element. */
1368 right = set->nelem - 1;
1371 mid = (idx + right) / 2;
1372 if (set->elems[mid] < elem)
1377 return set->elems[idx] == elem ? idx + 1 : 0;
1382 re_node_set_remove_at (re_node_set *set, int idx)
1384 if (idx < 0 || idx >= set->nelem)
1387 for (; idx < set->nelem; idx++)
1388 set->elems[idx] = set->elems[idx + 1];
1392 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1393 Or return -1, if an error will be occured. */
1397 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1399 int type = token.type;
1400 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1402 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1403 int *new_nexts, *new_indices;
1404 re_node_set *new_edests, *new_eclosures;
1405 re_token_t *new_nodes;
1407 /* Avoid overflows. */
1408 if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
1411 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1412 if (BE (new_nodes == NULL, 0))
1414 dfa->nodes = new_nodes;
1415 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1416 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1417 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1418 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1419 if (BE (new_nexts == NULL || new_indices == NULL
1420 || new_edests == NULL || new_eclosures == NULL, 0))
1422 dfa->nexts = new_nexts;
1423 dfa->org_indices = new_indices;
1424 dfa->edests = new_edests;
1425 dfa->eclosures = new_eclosures;
1426 dfa->nodes_alloc = new_nodes_alloc;
1428 dfa->nodes[dfa->nodes_len] = token;
1429 dfa->nodes[dfa->nodes_len].constraint = 0;
1430 #ifdef RE_ENABLE_I18N
1431 dfa->nodes[dfa->nodes_len].accept_mb =
1432 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1434 dfa->nexts[dfa->nodes_len] = -1;
1435 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1436 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1437 return dfa->nodes_len++;
1440 static inline unsigned int
1442 calc_state_hash (const re_node_set *nodes, unsigned int context)
1444 unsigned int hash = nodes->nelem + context;
1446 for (i = 0 ; i < nodes->nelem ; i++)
1447 hash += nodes->elems[i];
1451 /* Search for the state whose node_set is equivalent to NODES.
1452 Return the pointer to the state, if we found it in the DFA.
1453 Otherwise create the new one and return it. In case of an error
1454 return NULL and set the error code in ERR.
1455 Note: - We assume NULL as the invalid state, then it is possible that
1456 return value is NULL and ERR is REG_NOERROR.
1457 - We never return non-NULL value in case of any errors, it is for
1460 static re_dfastate_t *
1462 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1463 const re_node_set *nodes)
1466 re_dfastate_t *new_state;
1467 struct re_state_table_entry *spot;
1469 if (BE (nodes->nelem == 0, 0))
1474 hash = calc_state_hash (nodes, 0);
1475 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1477 for (i = 0 ; i < spot->num ; i++)
1479 re_dfastate_t *state = spot->array[i];
1480 if (hash != state->hash)
1482 if (re_node_set_compare (&state->nodes, nodes))
1486 /* There are no appropriate state in the dfa, create the new one. */
1487 new_state = create_ci_newstate (dfa, nodes, hash);
1488 if (BE (new_state == NULL, 0))
1494 /* Search for the state whose node_set is equivalent to NODES and
1495 whose context is equivalent to CONTEXT.
1496 Return the pointer to the state, if we found it in the DFA.
1497 Otherwise create the new one and return it. In case of an error
1498 return NULL and set the error code in ERR.
1499 Note: - We assume NULL as the invalid state, then it is possible that
1500 return value is NULL and ERR is REG_NOERROR.
1501 - We never return non-NULL value in case of any errors, it is for
1504 static re_dfastate_t *
1506 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1507 const re_node_set *nodes, unsigned int context)
1510 re_dfastate_t *new_state;
1511 struct re_state_table_entry *spot;
1513 if (nodes->nelem == 0)
1518 hash = calc_state_hash (nodes, context);
1519 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1521 for (i = 0 ; i < spot->num ; i++)
1523 re_dfastate_t *state = spot->array[i];
1524 if (state->hash == hash
1525 && state->context == context
1526 && re_node_set_compare (state->entrance_nodes, nodes))
1529 /* There are no appropriate state in `dfa', create the new one. */
1530 new_state = create_cd_newstate (dfa, nodes, context, hash);
1531 if (BE (new_state == NULL, 0))
1537 /* Finish initialization of the new state NEWSTATE, and using its hash value
1538 HASH put in the appropriate bucket of DFA's state table. Return value
1539 indicates the error code if failed. */
1541 static reg_errcode_t
1542 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1545 struct re_state_table_entry *spot;
1549 newstate->hash = hash;
1550 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1551 if (BE (err != REG_NOERROR, 0))
1553 for (i = 0; i < newstate->nodes.nelem; i++)
1555 int elem = newstate->nodes.elems[i];
1556 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1557 re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1560 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1561 if (BE (spot->alloc <= spot->num, 0))
1563 int new_alloc = 2 * spot->num + 2;
1564 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1566 if (BE (new_array == NULL, 0))
1568 spot->array = new_array;
1569 spot->alloc = new_alloc;
1571 spot->array[spot->num++] = newstate;
1576 free_state (re_dfastate_t *state)
1578 re_node_set_free (&state->non_eps_nodes);
1579 re_node_set_free (&state->inveclosure);
1580 if (state->entrance_nodes != &state->nodes)
1582 re_node_set_free (state->entrance_nodes);
1583 re_free (state->entrance_nodes);
1585 re_node_set_free (&state->nodes);
1586 re_free (state->word_trtable);
1587 re_free (state->trtable);
1591 /* Create the new state which is independ of contexts.
1592 Return the new state if succeeded, otherwise return NULL. */
1594 static re_dfastate_t *
1596 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1601 re_dfastate_t *newstate;
1603 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1604 if (BE (newstate == NULL, 0))
1606 err = re_node_set_init_copy (&newstate->nodes, nodes);
1607 if (BE (err != REG_NOERROR, 0))
1613 newstate->entrance_nodes = &newstate->nodes;
1614 for (i = 0 ; i < nodes->nelem ; i++)
1616 re_token_t *node = dfa->nodes + nodes->elems[i];
1617 re_token_type_t type = node->type;
1618 if (type == CHARACTER && !node->constraint)
1620 #ifdef RE_ENABLE_I18N
1621 newstate->accept_mb |= node->accept_mb;
1622 #endif /* RE_ENABLE_I18N */
1624 /* If the state has the halt node, the state is a halt state. */
1625 if (type == END_OF_RE)
1627 else if (type == OP_BACK_REF)
1628 newstate->has_backref = 1;
1629 else if (type == ANCHOR || node->constraint)
1630 newstate->has_constraint = 1;
1632 err = register_state (dfa, newstate, hash);
1633 if (BE (err != REG_NOERROR, 0))
1635 free_state (newstate);
1641 /* Create the new state which is depend on the context CONTEXT.
1642 Return the new state if succeeded, otherwise return NULL. */
1644 static re_dfastate_t *
1646 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1647 unsigned int context, unsigned int hash)
1649 int i, nctx_nodes = 0;
1651 re_dfastate_t *newstate;
1653 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1654 if (BE (newstate == NULL, 0))
1656 err = re_node_set_init_copy (&newstate->nodes, nodes);
1657 if (BE (err != REG_NOERROR, 0))
1663 newstate->context = context;
1664 newstate->entrance_nodes = &newstate->nodes;
1666 for (i = 0 ; i < nodes->nelem ; i++)
1668 re_token_t *node = dfa->nodes + nodes->elems[i];
1669 re_token_type_t type = node->type;
1670 unsigned int constraint = node->constraint;
1672 if (type == CHARACTER && !constraint)
1674 #ifdef RE_ENABLE_I18N
1675 newstate->accept_mb |= node->accept_mb;
1676 #endif /* RE_ENABLE_I18N */
1678 /* If the state has the halt node, the state is a halt state. */
1679 if (type == END_OF_RE)
1681 else if (type == OP_BACK_REF)
1682 newstate->has_backref = 1;
1686 if (newstate->entrance_nodes == &newstate->nodes)
1688 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1689 if (BE (newstate->entrance_nodes == NULL, 0))
1691 free_state (newstate);
1694 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1696 newstate->has_constraint = 1;
1699 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1701 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1706 err = register_state (dfa, newstate, hash);
1707 if (BE (err != REG_NOERROR, 0))
1709 free_state (newstate);