+2003-11-24 Jakub Jelinek <jakub@redhat.com>
+
+ * posix/regex_internal.h (re_token_t): Add word_char bit. Add
+ comment.
+ (re_dfa_t): Add sb_char field.
+ (bitset_mask): New function.
+ * posix/regcomp.c (free_dfa_content): Free sb_char.
+ (init_dfa): Don't initialize word_char unnecessarily.
+ Initialize sb_char.
+ (duplicate_node): Don't duplicate !word_char CHARACTERs with
+ NEXT_WORD_CONSTRAINT constraint or word_char CHARACTERs with
+ NEXT_NOTWORD_CONSTRAINT. Return -1 in *new_idx instead.
+ (duplicate_node_closure): Handle clone_dest == -1 from
+ duplicate_node.
+ (peek_token): Initialize word_char bit.
+ (parse_expression, parse_dup_op): Add comments.
+ (parse_bracket_exp): Don't set bitmask bits for multi-byte char
+ starting bytes here at the beginning. Mask off the bits right
+ before creating SIMPLE_BRACKET.
+ (build_charclass_op): Likewise.
+ * posix/regexec.c (group_nodes_into_DFAstates) <case OP_PERIOD>: Only
+ set accept bits for single-byte characters.
+ (group_nodes_into_DFAstates): Don't rely on characters 0 .. 127
+ being single byte encoded and the rest multi-byte.
+ * posix/bug-regex19.c (tests): Add new tests.
+ (do_mb_tests): Initialize t to *test.
+ (main): Fail even on do_mb_tests errors.
+
2003-11-23 Ulrich Drepper <drepper@redhat.com>
* posix/regexec.c: Correct several memory allocation problems.
{ERE, ".\\b.", "=A=", 0, 0},
{ERE, ".\\b.", "==", 0, -1},
{ERE, ".\\b.", "ABA", 0, -1},
+ {ERE, "[^k]\\b[^k]", "AA~", 0, 1},
+ {ERE, "[^k]\\b[^k]", "=A=", 0, 0},
+ {ERE, "[^k]\\b[^k]", "Ak~kA~", 0, 4},
+ {ERE, "[^k]\\b[^k]", "==", 0, -1},
+ {ERE, "[^k]\\b[^k]", "ABA", 0, -1},
+ {ERE, "[^k]\\b[^k]", "Ak~", 0, -1},
+ {ERE, "[^k]\\b[^k]", "k=k", 0, -1},
+ {ERE, "[^C]\\b[^C]", "AA~", 0, 1},
+ {ERE, "[^C]\\b[^C]", "=A=", 0, 0},
+ {ERE, "[^C]\\b[^C]", "AC~CA~", 0, 4},
+ {ERE, "[^C]\\b[^C]", "==", 0, -1},
+ {ERE, "[^C]\\b[^C]", "ABA", 0, -1},
+ {ERE, "[^C]\\b[^C]", "AC~", 0, -1},
+ {ERE, "[^C]\\b[^C]", "C=C", 0, -1},
{ERE, "\\<(A|!|.B)", "A=AC", 0, 0},
{ERE, "\\<(A|!|.B)", "=AC", 0, 1},
{ERE, "\\<(A|!|.B)", "!AC", 0, 1},
{ERE, ".\\<.", "AA~", 0, -1},
{ERE, ".\\<.", "==", 0, -1},
{ERE, ".\\<.", "ABA", 0, -1},
+ {ERE, "[^k]\\<[^k]", "=k=A=", 0, 2},
+ {ERE, "[^k]\\<[^k]", "kk~", 0, -1},
+ {ERE, "[^k]\\<[^k]", "==", 0, -1},
+ {ERE, "[^k]\\<[^k]", "ABA", 0, -1},
+ {ERE, "[^k]\\<[^k]", "=k=", 0, -1},
+ {ERE, "[^C]\\<[^C]", "=C=A=", 0, 2},
+ {ERE, "[^C]\\<[^C]", "CC~", 0, -1},
+ {ERE, "[^C]\\<[^C]", "==", 0, -1},
+ {ERE, "[^C]\\<[^C]", "ABA", 0, -1},
+ {ERE, "[^C]\\<[^C]", "=C=", 0, -1},
{ERE, ".\\B.", "ABA", 0, 0},
{ERE, ".\\B.", "=BDC", 0, 1},
+ {ERE, "[^k]\\B[^k]", "kkkABA", 0, 3},
+ {ERE, "[^k]\\B[^k]", "kBk", 0, -1},
+ {ERE, "[^C]\\B[^C]", "CCCABA", 0, 3},
+ {ERE, "[^C]\\B[^C]", "CBC", 0, -1},
{ERE, ".(\\b|\\B).", "=~AB", 0, 1},
{ERE, ".(\\b|\\B).", "A=C", 0, 0},
{ERE, ".(\\b|\\B).", "ABC", 0, 0},
{ERE, ".(\\b|\\B).", "=~\\!", 0, -1},
+ {ERE, "[^k](\\b|\\B)[^k]", "=~AB", 0, 1},
+ {ERE, "[^k](\\b|\\B)[^k]", "A=C", 0, 0},
+ {ERE, "[^k](\\b|\\B)[^k]", "ABC", 0, 0},
+ {ERE, "[^k](\\b|\\B)[^k]", "=~kBD", 0, 3},
+ {ERE, "[^k](\\b|\\B)[^k]", "=~\\!", 0, -1},
+ {ERE, "[^k](\\b|\\B)[^k]", "=~kB", 0, -1},
+ {ERE, "[^C](\\b|\\B)[^C]", "=~AB", 0, 1},
+ {ERE, "[^C](\\b|\\B)[^C]", "A=C", 0, 0},
+ {ERE, "[^C](\\b|\\B)[^C]", "ABC", 0, 0},
+ {ERE, "[^C](\\b|\\B)[^C]", "=~CBD", 0, 3},
+ {ERE, "[^C](\\b|\\B)[^C]", "=~\\!", 0, -1},
+ {ERE, "[^C](\\b|\\B)[^C]", "=~CB", 0, -1},
{ERE, "\\b([A]|[!]|.B)", "A=AC", 0, 0},
{ERE, "\\b([A]|[!]|.B)", "=AC", 0, 1},
{ERE, "\\b([A]|[!]|.B)", "!AC", 0, 1},
char string[strlen (test->string) * 4 + 1];
char fail[8 + sizeof ("UTF-8 ")];
+ t = *test;
t.pattern = pattern;
t.string = string;
strcpy (fail, "UTF-8 ");
ret = 1;
}
ret |= do_one_test (&tests[i], "UTF-8 ");
- // Until the implementation is fixed, ignore the results of the
- // MB tests.
- /* ret |= */do_mb_tests (&tests[i]);
+ ret |= do_mb_tests (&tests[i]);
}
return ret;
}
re_free (dfa->state_table);
re_free (dfa->word_char);
+#ifdef RE_ENABLE_I18N
+ re_free (dfa->sb_char);
+#endif
#ifdef DEBUG
re_free (dfa->re_str);
#endif
dfa->subexps_alloc = 1;
dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
- dfa->word_char = NULL;
+ /* dfa->word_char = NULL; */
dfa->mb_cur_max = MB_CUR_MAX;
#ifdef _LIBC
dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
!= 0);
#endif
+#ifdef RE_ENABLE_I18N
+ if (dfa->mb_cur_max > 1)
+ {
+ int i, j, ch;
+
+ dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
+ if (BE (dfa->sb_char == NULL, 0))
+ return REG_ESPACE;
+#ifdef _LIBC
+ if (dfa->is_utf8)
+ memset (dfa->sb_char, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
+ else
+#endif
+ for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
+ for (j = 0; j < UINT_BITS; ++j, ++ch)
+ if (btowc (ch) != WEOF)
+ dfa->sb_char[i] |= 1 << j;
+ }
+#endif
if (BE (dfa->nodes == NULL || dfa->state_table == NULL
|| dfa->subexps == NULL, 0))
if (BE (err != REG_NOERROR, 0))
return err;
dfa->nexts[clone_node] = dfa->nexts[org_node];
+ if (clone_dest == -1)
+ break;
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
if (BE (ret < 0, 0))
return REG_ESPACE;
err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
if (BE (err != REG_NOERROR, 0))
return err;
+ if (clone_dest == -1)
+ break;
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
if (BE (ret < 0, 0))
return REG_ESPACE;
err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
if (BE (err != REG_NOERROR, 0))
return err;
- ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
- if (BE (ret < 0, 0))
- return REG_ESPACE;
- err = duplicate_node_closure (dfa, org_dest, clone_dest,
- root_node, constraint);
- if (BE (err != REG_NOERROR, 0))
- return err;
+ if (clone_dest != -1)
+ {
+ ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
+ if (BE (ret < 0, 0))
+ return REG_ESPACE;
+ err = duplicate_node_closure (dfa, org_dest, clone_dest,
+ root_node, constraint);
+ if (BE (err != REG_NOERROR, 0))
+ return err;
+ }
}
else
{
err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
if (BE (err != REG_NOERROR, 0))
return err;
+ if (clone_dest == -1)
+ break;
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
if (BE (ret < 0, 0))
return REG_ESPACE;
int *new_idx, org_idx;
unsigned int constraint;
{
- int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1);
+ int dup_idx;
+
+ if (dfa->nodes[org_idx].type == CHARACTER
+ && (((constraint & NEXT_WORD_CONSTRAINT)
+ && !dfa->nodes[org_idx].word_char)
+ || ((constraint & NEXT_NOTWORD_CONSTRAINT)
+ && dfa->nodes[org_idx].word_char)))
+ {
+ /* \<!, \>W etc. can never match. Don't duplicate them, instead
+ tell the caller they shouldn't be added to edests. */
+ *new_idx = -1;
+ return REG_NOERROR;
+ }
+
+ dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1);
if (BE (dup_idx == -1, 0))
return REG_ESPACE;
dfa->nodes[dup_idx].constraint = constraint;
c = re_string_peek_byte (input, 0);
token->opr.c = c;
+ token->word_char = 0;
#ifdef RE_ENABLE_I18N
token->mb_partial = 0;
if (input->mb_cur_max > 1 &&
c2 = re_string_peek_byte_case (input, 1);
token->opr.c = c2;
token->type = CHARACTER;
+#ifdef RE_ENABLE_I18N
+ if (input->mb_cur_max > 1)
+ {
+ wint_t wc = re_string_wchar_at (input,
+ re_string_cur_idx (input) + 1);
+ token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
+ }
+ else
+#endif
+ token->word_char = IS_WORD_CHAR (c2) != 0;
+
switch (c2)
{
case '|':
}
token->type = CHARACTER;
+#ifdef RE_ENABLE_I18N
+ if (input->mb_cur_max > 1)
+ {
+ wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
+ token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
+ }
+ else
+#endif
+ token->word_char = IS_WORD_CHAR (token->opr.c);
+
switch (c)
{
case '\n':
/* Then we can these characters as normal characters. */
token->type = CHARACTER;
+ /* mb_partial and word_char bits should be initialized already
+ by peek_token. */
tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
if (BE (tree == NULL, 0))
{
re_string_set_index (regexp, start_idx);
*token = start_token;
token->type = CHARACTER;
+ /* mb_partial and word_char bits should be already initialized by
+ peek_token. */
return dup_elem;
}
if (token->type == OP_NON_MATCH_LIST)
{
#ifdef RE_ENABLE_I18N
- int i;
mbcset->non_match = 1;
#else /* not RE_ENABLE_I18N */
non_match = 1;
*err = REG_BADPAT;
goto parse_bracket_exp_free_return;
}
-#ifdef RE_ENABLE_I18N
- if (dfa->mb_cur_max > 1)
- for (i = 0; i < SBC_MAX; ++i)
- if (__btowc (i) == WEOF)
- bitset_set (sbcset, i);
-#endif /* RE_ENABLE_I18N */
}
/* We treat the first ']' as a normal character. */
if (non_match)
#endif /* not RE_ENABLE_I18N */
bitset_not (sbcset);
+#ifdef RE_ENABLE_I18N
+ /* Ensure only single byte characters are set. */
+ if (dfa->mb_cur_max > 1)
+ bitset_mask (sbcset, dfa->sb_char);
+#endif /* RE_ENABLE_I18N */
/* Build a tree for simple bracket. */
br_token.type = SIMPLE_BRACKET;
if (not)
{
#ifdef RE_ENABLE_I18N
- int i;
/*
if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
bitset_set(cset->sbcset, '\0');
*/
mbcset->non_match = 1;
- if (dfa->mb_cur_max > 1)
- for (i = 0; i < SBC_MAX; ++i)
- if (__btowc (i) == WEOF)
- bitset_set (sbcset, i);
#else /* not RE_ENABLE_I18N */
non_match = 1;
#endif /* not RE_ENABLE_I18N */
#endif /* not RE_ENABLE_I18N */
bitset_not (sbcset);
+#ifdef RE_ENABLE_I18N
+ /* Ensure only single byte characters are set. */
+ if (dfa->mb_cur_max > 1)
+ bitset_mask (sbcset, dfa->sb_char);
+#endif
+
/* Build a tree for simple bracket. */
br_token.type = SIMPLE_BRACKET;
br_token.opr.sbcset = sbcset;
static inline void bitset_not (bitset set);
static inline void bitset_merge (bitset dest, const bitset src);
static inline void bitset_not_merge (bitset dest, const bitset src);
+static inline void bitset_mask (bitset dest, const bitset src);
#define PREV_WORD_CONSTRAINT 0x0001
#define PREV_NOTWORD_CONSTRAINT 0x0002
unsigned int constraint : 10; /* context constraint */
unsigned int duplicated : 1;
#ifdef RE_ENABLE_I18N
+ /* These 2 bits can be moved into the union if needed (e.g. if running out
+ of bits; move opr.c to opr.c.c and move the flags to opr.c.flags). */
unsigned int mb_partial : 1;
#endif
+ unsigned int word_char : 1;
} re_token_t;
#define IS_EPSILON_NODE(type) ((type) & EPSILON_BIT)
re_dfastate_t *init_state_begbuf;
bin_tree_t *str_tree;
bin_tree_storage_t *str_tree_storage;
+ re_bitset_ptr_t sb_char;
int str_tree_storage_idx;
/* number of subexpressions `re_nsub' is in regex_t. */
dest[i] |= ~src[i];
}
+static inline void
+bitset_mask (dest, src)
+ bitset dest;
+ const bitset src;
+{
+ int bitset_i;
+ for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i)
+ dest[bitset_i] &= src[bitset_i];
+}
+
#if defined RE_ENABLE_I18N && !defined RE_NO_INTERNAL_PROTOTYPES
/* Inline functions for re_string. */
static inline int
}
else if (type == OP_PERIOD)
{
- bitset_set_all (accepts);
+#ifdef RE_ENABLE_I18N
+ if (dfa->mb_cur_max > 1)
+ bitset_merge (accepts, dfa->sb_char);
+ else
+#endif
+ bitset_set_all (accepts);
if (!(preg->syntax & RE_DOT_NEWLINE))
bitset_clear (accepts, '\n');
if (preg->syntax & RE_DOT_NOT_NULL)
match it the context. */
if (constraint)
{
- int word_char_max;
-
if (constraint & NEXT_NEWLINE_CONSTRAINT)
{
int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
continue;
}
- /* This assumes ASCII compatible locale. We cannot say
- anything about the non-ascii chars. */
- word_char_max
- = dfa->mb_cur_max > 1 ? BITSET_UINTS / 2 : BITSET_UINTS;
if (constraint & NEXT_WORD_CONSTRAINT)
- for (j = 0; j < word_char_max; ++j)
- accepts[j] &= dfa->word_char[j];
+ {
+#ifdef RE_ENABLE_I18N
+ if (dfa->mb_cur_max > 1)
+ for (j = 0; j < BITSET_UINTS; ++j)
+ accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]);
+ else
+#endif
+ for (j = 0; j < BITSET_UINTS; ++j)
+ accepts[j] &= dfa->word_char[j];
+ }
if (constraint & NEXT_NOTWORD_CONSTRAINT)
- for (j = 0; j < word_char_max; ++j)
- accepts[j] &= ~dfa->word_char[j];
+ {
+#ifdef RE_ENABLE_I18N
+ if (dfa->mb_cur_max > 1)
+ for (j = 0; j < BITSET_UINTS; ++j)
+ accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]);
+ else
+#endif
+ for (j = 0; j < BITSET_UINTS; ++j)
+ accepts[j] &= ~dfa->word_char[j];
+ }
}
/* Then divide `accepts' into DFA states, or create a new