\xc3\x96 LATIN CAPITAL LETTER O WITH DIAERESIS
\xe2\x80\x94 EM DASH */
/* Should not match. */
+ {RE_SYNTAX_POSIX_BASIC, "\\<A", "aOAA", 0, -1},
{RE_SYNTAX_POSIX_BASIC, "\\<A", "aOAA", 2, -1},
{RE_SYNTAX_POSIX_BASIC, "A\\>", "aAAO", 1, -1},
+ {RE_SYNTAX_POSIX_BASIC, "\\bA", "aOAA", 0, -1},
{RE_SYNTAX_POSIX_BASIC, "\\bA", "aOAA", 2, -1},
{RE_SYNTAX_POSIX_BASIC, "A\\b", "aAAO", 1, -1},
+ {RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 0, -1},
{RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 3, -1},
{RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\>", "a\xc3\x84\xc3\x84\xc3\x96", 1, -1},
#if 0
- /* XXX Not used since they fail so far. */
+ /* XXX these 2 tests still fail. */
+ {RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 0, -1},
{RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 3, -1},
- {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84\xc3\x96", 1, -1},
#endif
+ {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84\xc3\x96", 1, -1},
/* Should match. */
{RE_SYNTAX_POSIX_BASIC, "\\<A", "AA", 0, 0},
{RE_SYNTAX_POSIX_BASIC, "\\<A", "a-AA", 2, 2},
{RE_SYNTAX_POSIX_BASIC, "\\bA", "a-AA", 2, 2},
{RE_SYNTAX_POSIX_BASIC, "A\\b", "aAA-", 1, 2},
{RE_SYNTAX_POSIX_BASIC, "A\\b", "aAA", 1, 2},
-#if 0
- /* XXX Not used since they fail so far. */
{RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "\xc3\x84\xc3\x84", 0, 0},
{RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "a\xe2\x80\x94\xc3\x84\xc3\x84", 4, 4},
{RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\>", "a\xc3\x84\xc3\x84\xe2\x80\x94", 1, 3},
{RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "a\xe2\x80\x94\xc3\x84\xc3\x84", 4, 4},
{RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84\xe2\x80\x94", 1, 3},
{RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84", 1, 3}
-#endif
};
int
static reg_errcode_t prune_impossible_nodes (const regex_t *preg,
re_match_context_t *mctx);
static int check_matching (const regex_t *preg, re_match_context_t *mctx,
- int fl_search, int fl_longest_match);
+ int fl_longest_match);
static int check_halt_node_context (const re_dfa_t *dfa, int node,
unsigned int context);
static int check_halt_state_context (const regex_t *preg,
re_dfastate_t **src, int num);
static re_dfastate_t *transit_state (reg_errcode_t *err, const regex_t *preg,
re_match_context_t *mctx,
- re_dfastate_t *state, int fl_search);
+ re_dfastate_t *state);
static reg_errcode_t check_subexp_matching_top (re_dfa_t *dfa,
re_match_context_t *mctx,
re_node_set *cur_nodes,
int str_idx);
+#if 0
static re_dfastate_t *transit_state_sb (reg_errcode_t *err, const regex_t *preg,
re_dfastate_t *pstate,
- int fl_search,
re_match_context_t *mctx);
+#endif
#ifdef RE_ENABLE_I18N
static reg_errcode_t transit_state_mb (const regex_t *preg,
re_dfastate_t *pstate,
int last_str, int subexp_num,
int fl_open);
static re_dfastate_t **build_trtable (const regex_t *dfa,
- const re_dfastate_t *state,
- int fl_search);
+ re_dfastate_t *state);
#ifdef RE_ENABLE_I18N
static int check_node_accept_bytes (const regex_t *preg, int node_idx,
const re_string_t *input, int idx);
/* It seems to be appropriate one, then use the matcher. */
/* We assume that the matching starts from 0. */
mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
- match_last = check_matching (preg, &mctx, 0, fl_longest_match);
+ match_last = check_matching (preg, &mctx, fl_longest_match);
if (match_last != -1)
{
if (BE (match_last == -2, 0))
if (dfa->init_state->has_constraint)
{
unsigned int context;
- context = re_string_context_at (mctx->input, idx - 1, mctx->eflags,
- preg->newline_anchor);
+ context = re_string_context_at (mctx->input, idx - 1, mctx->eflags,
+ preg->newline_anchor);
if (IS_WORD_CONTEXT (context))
return dfa->init_state_word;
else if (IS_ORDINARY_CONTEXT (context))
/* Check whether the regular expression match input string INPUT or not,
and return the index where the matching end, return -1 if not match,
or return -2 in case of an error.
- FL_SEARCH means we must search where the matching starts,
FL_LONGEST_MATCH means we want the POSIX longest matching.
Note that the matcher assume that the maching starts from the current
index of the buffer. */
static int
-check_matching (preg, mctx, fl_search, fl_longest_match)
+check_matching (preg, mctx, fl_longest_match)
const regex_t *preg;
re_match_context_t *mctx;
- int fl_search, fl_longest_match;
+ int fl_longest_match;
{
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
reg_errcode_t err;
while (!re_string_eoi (mctx->input))
{
- cur_state = transit_state (&err, preg, mctx, cur_state,
- fl_search && !match);
+ cur_state = transit_state (&err, preg, mctx, cur_state);
if (cur_state == NULL) /* Reached at the invalid state or an error. */
{
cur_str_idx = re_string_cur_idx (mctx->input);
if (BE (err != REG_NOERROR, 0))
return -2;
- if (fl_search && !match)
- {
- /* Restart from initial state, since we are searching
- the point from where matching start. */
-#ifdef RE_ENABLE_I18N
- if (dfa->mb_cur_max == 1
- || re_string_first_byte (mctx->input, cur_str_idx))
-#endif /* RE_ENABLE_I18N */
- cur_state = acquire_init_state_context (&err, preg, mctx,
- cur_str_idx);
- if (BE (cur_state == NULL && err != REG_NOERROR, 0))
- return -2;
- if (mctx->state_log != NULL)
- mctx->state_log[cur_str_idx] = cur_state;
- }
- else if (!fl_longest_match && match)
+ if (!fl_longest_match && match)
break;
- else /* (fl_longest_match && match) || (!fl_search && !match) */
+ else
{
if (mctx->state_log == NULL)
break;
update the destination of STATE_LOG. */
static re_dfastate_t *
-transit_state (err, preg, mctx, state, fl_search)
+transit_state (err, preg, mctx, state)
reg_errcode_t *err;
const regex_t *preg;
re_match_context_t *mctx;
re_dfastate_t *state;
- int fl_search;
{
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
re_dfastate_t **trtable, *next_state;
{
/* Use transition table */
ch = re_string_fetch_byte (mctx->input);
- trtable = fl_search ? state->trtable_search : state->trtable;
+ trtable = state->trtable;
if (trtable == NULL)
{
- trtable = build_trtable (preg, state, fl_search);
- if (fl_search)
- state->trtable_search = trtable;
+ trtable = build_trtable (preg, state);
+ if (trtable == NULL)
+ {
+ *err = REG_ESPACE;
+ return NULL;
+ }
+ }
+ if (BE (state->word_trtable, 0))
+ {
+ unsigned int context;
+ context
+ = re_string_context_at (mctx->input,
+ re_string_cur_idx (mctx->input) - 1,
+ mctx->eflags, preg->newline_anchor);
+ if (IS_WORD_CONTEXT (context))
+ next_state = trtable[ch + SBC_MAX];
else
- state->trtable = trtable;
+ next_state = trtable[ch];
}
- next_state = trtable[ch];
+ else
+ next_state = trtable[ch];
}
+#if 0
else
{
/* don't use transition table */
- next_state = transit_state_sb (err, preg, state, fl_search, mctx);
+ next_state = transit_state_sb (err, preg, state, mctx);
if (BE (next_state == NULL && err != REG_NOERROR, 0))
return NULL;
}
+#endif
}
cur_idx = re_string_cur_idx (mctx->input);
return REG_NOERROR;
}
+#if 0
/* Return the next state to which the current state STATE will transit by
accepting the current input byte. */
static re_dfastate_t *
-transit_state_sb (err, preg, state, fl_search, mctx)
+transit_state_sb (err, preg, state, mctx)
reg_errcode_t *err;
const regex_t *preg;
re_dfastate_t *state;
- int fl_search;
re_match_context_t *mctx;
{
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
}
}
}
- if (fl_search)
- {
-#ifdef RE_ENABLE_I18N
- int not_initial = 0;
- if (dfa->mb_cur_max > 1)
- for (node_cnt = 0; node_cnt < next_nodes.nelem; ++node_cnt)
- if (dfa->nodes[next_nodes.elems[node_cnt]].type == CHARACTER)
- {
- not_initial = dfa->nodes[next_nodes.elems[node_cnt]].mb_partial;
- break;
- }
- if (!not_initial)
-#endif
- {
- *err = re_node_set_merge (&next_nodes,
- dfa->init_state->entrance_nodes);
- if (BE (*err != REG_NOERROR, 0))
- {
- re_node_set_free (&next_nodes);
- return NULL;
- }
- }
- }
context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags,
preg->newline_anchor);
next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
re_string_skip_bytes (mctx->input, 1);
return next_state;
}
+#endif
#ifdef RE_ENABLE_I18N
static reg_errcode_t
Return the new table if succeeded, otherwise return NULL. */
static re_dfastate_t **
-build_trtable (preg, state, fl_search)
+build_trtable (preg, state)
const regex_t *preg;
- const re_dfastate_t *state;
- int fl_search;
+ re_dfastate_t *state;
{
reg_errcode_t err;
re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
/* Initialize transiton table. */
trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
+ state->word_trtable = 0;
if (BE (trtable == NULL, 0))
{
if (dests_node_malloced)
free (dests_node);
/* Return NULL in case of an error, trtable otherwise. */
if (ndests == 0)
- return trtable;
+ {
+ state->trtable = trtable;
+ return trtable;
+ }
free (trtable);
return NULL;
}
goto out_free;
}
}
- /* If search flag is set, merge the initial state. */
- if (fl_search)
- {
-#ifdef RE_ENABLE_I18N
- int not_initial = 0;
- for (j = 0; j < follows.nelem; ++j)
- if (dfa->nodes[follows.elems[j]].type == CHARACTER)
- {
- not_initial = dfa->nodes[follows.elems[j]].mb_partial;
- break;
- }
- if (!not_initial)
-#endif
- {
- err = re_node_set_merge (&follows,
- dfa->init_state->entrance_nodes);
- if (BE (err != REG_NOERROR, 0))
- goto out_free;
- }
- }
dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
goto out_free;
for (j = 0; j < UINT_BITS; ++j, ++ch)
if ((acceptable[i] >> j) & 1)
{
- /* The current state accepts the character ch. */
- if (IS_WORD_CHAR (ch))
- {
- for (k = 0; k < ndests; ++k)
- if ((dests_ch[k][i] >> j) & 1)
+ for (k = 0; k < ndests; ++k)
+ if ((dests_ch[k][i] >> j) & 1)
+ {
+ /* k-th destination accepts the word character ch. */
+ if (state->word_trtable)
{
- /* k-th destination accepts the word character ch. */
- trtable[ch] = dest_states_word[k];
- /* There must be only one destination which accepts
- character ch. See group_nodes_into_DFAstates. */
- break;
+ trtable[ch] = dest_states[k];
+ trtable[ch + SBC_MAX] = dest_states_word[k];
}
- }
- else /* not WORD_CHAR */
- {
- for (k = 0; k < ndests; ++k)
- if ((dests_ch[k][i] >> j) & 1)
+ else if (dfa->mb_cur_max > 1
+ && dest_states[k] != dest_states_word[k])
{
- /* k-th destination accepts the non-word character ch. */
+ re_dfastate_t **new_trtable;
+
+ new_trtable = (re_dfastate_t **)
+ realloc (trtable,
+ sizeof (re_dfastate_t *)
+ * 2 * SBC_MAX);
+ if (BE (new_trtable == NULL, 0))
+ goto out_free;
+ memcpy (new_trtable + SBC_MAX, new_trtable,
+ sizeof (re_dfastate_t *) * SBC_MAX);
+ trtable = new_trtable;
+ state->word_trtable = 1;
trtable[ch] = dest_states[k];
- /* There must be only one destination which accepts
- character ch. See group_nodes_into_DFAstates. */
- break;
+ trtable[ch + SBC_MAX] = dest_states_word[k];
}
- }
+ else if (IS_WORD_CHAR (ch))
+ trtable[ch] = dest_states_word[k];
+ else
+ trtable[ch] = dest_states[k];
+ /* There must be only one destination which accepts
+ character ch. See group_nodes_into_DFAstates. */
+ break;
+ }
}
/* new line */
if (bitset_contain (acceptable, NEWLINE_CHAR))
{
/* k-th destination accepts newline character. */
trtable[NEWLINE_CHAR] = dest_states_nl[k];
+ if (state->word_trtable)
+ trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[k];
/* There must be only one destination which accepts
newline. See group_nodes_into_DFAstates. */
break;
if (dests_node_malloced)
free (dests_node);
+ state->trtable = trtable;
return trtable;
}
match it the context. */
if (constraint)
{
+ int word_char_max;
+
if (constraint & NEXT_NEWLINE_CONSTRAINT)
{
int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
bitset_empty (accepts);
continue;
}
+
+ /* This assumes an ASCII-compatible locale. We cannot say
+ anything about the non-ASCII chars. */
+ word_char_max
+ = dfa->mb_cur_max > 1 ? BITSET_UINTS / 2 : BITSET_UINTS;
if (constraint & NEXT_WORD_CONSTRAINT)
- for (j = 0; j < BITSET_UINTS; ++j)
+ for (j = 0; j < word_char_max; ++j)
accepts[j] &= dfa->word_char[j];
if (constraint & NEXT_NOTWORD_CONSTRAINT)
- for (j = 0; j < BITSET_UINTS; ++j)
+ for (j = 0; j < word_char_max; ++j)
accepts[j] &= ~dfa->word_char[j];
}
if (strstr (pattern, "[:xdigit:]"))
return 0;
+ /* XXX: regex currently handles only single-byte equivalence classes. */
+ if (strstr (pattern, "[[=b=]]"))
+ return 0;
+
for (i = 1; i < 16; ++i)
{
char *p = letters;
- if (i & 1)
+ if ((i & 1)
+ && (strchr (pattern, 'a') || strchr (string, 'a')
+ || strchr (pattern, 'A') || strchr (string, 'A')))
*p++ = 'a', *p++ = 'A';
- if (i & 2)
+ if ((i & 2)
+ && (strchr (pattern, 'b') || strchr (string, 'b')
+ || strchr (pattern, 'B') || strchr (string, 'B')))
*p++ = 'b', *p++ = 'B';
- if (i & 4)
+ if ((i & 4)
+ && (strchr (pattern, 'c') || strchr (string, 'c')
+ || strchr (pattern, 'C') || strchr (string, 'C')))
*p++ = 'c', *p++ = 'C';
- if (i & 8)
+ if ((i & 8)
+ && (strchr (pattern, 'd') || strchr (string, 'd')
+ || strchr (pattern, 'D') || strchr (string, 'D')))
*p++ = 'd', *p++ = 'D';
*p++ = '\0';
sprintf (fail, "UTF-8 %s FAIL", letters);
replace_special_chars (matches);
}
- setlocale (LC_ALL, "C");
+ if (setlocale (LC_ALL, "C") == NULL)
+ {
+ puts ("setlocale C failed");
+ ret = 1;
+ }
if (test (pattern, cflags, string, eflags, expect, matches, "FAIL")
|| (try_bre_ere
&& test (pattern, cflags & ~REG_EXTENDED, string, eflags,
ret = 1;
else if (test_utf8)
{
- setlocale (LC_ALL, "cs_CZ.UTF-8");
- if (test (pattern, cflags, string, eflags, expect, matches,
- "UTF-8 FAIL")
- || (try_bre_ere
- && test (pattern, cflags & ~REG_EXTENDED, string, eflags,
- expect, matches, "UTF-8 FAIL")))
+ if (setlocale (LC_ALL, "cs_CZ.UTF-8") == NULL)
+ {
+ puts ("setlocale cs_CZ.UTF-8 failed");
+ ret = 1;
+ }
+ else if (test (pattern, cflags, string, eflags, expect, matches,
+ "UTF-8 FAIL")
+ || (try_bre_ere
+ && test (pattern, cflags & ~REG_EXTENDED, string,
+ eflags, expect, matches, "UTF-8 FAIL")))
ret = 1;
else if (mb_tests (pattern, cflags, string, eflags, expect, matches)
|| (try_bre_ere