From 01ed6ceb7c440f0695726463ee9ee307921ea97e Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 7 Sep 2005 01:15:33 +0000 Subject: [PATCH] * posix/regex_internal.c (re_string_reconstruct): Avoid calling mbrtowc for very simple UTF-8 case. 2005-09-01 Paul Eggert * posix/regex_internal.c (build_wcs_upper_buffer): Fix portability bugs in int versus size_t comparisons. 2005-09-06 Ulrich Drepper * posix/regex_internal.c (re_acquire_state): Make DFA pointer arg a pointer-to-const. (re_acquire_state_context): Likewise. * posix/regex_internal.h: Adjust prototypes. 2005-08-31 Jim Meyering * posix/regcomp.c (search_duplicated_node): Make first pointer arg a pointer-to-const. * posix/regex_internal.c (create_ci_newstate, create_cd_newstate, register_state): Likewise. * posix/regexec.c (search_cur_bkref_entry, check_dst_limits): (check_dst_limits_calc_pos_1, check_dst_limits_calc_pos): (group_nodes_into_DFAstates): Likewise. * posix/regexec.c (re_search_internal): Simplify update of rm_so and rm_eo by replacing "if (A == B) A += C - B;" with the equivalent of "if (A == B) A = C;". 2005-09-06 Ulrich Drepper * posix/regcomp.c (re_compile_internal): Change third parameter type to size_t. (init_dfa): Likewise. Make sure that arithmetic on pat_len doesn't overflow. * posix/regex_internal.h (struct re_dfa_t): Change type of nodes_alloc and nodes_len to size_t. * posix/regex_internal.c (re_dfa_add_node): Use size_t as type for new_nodes_alloc. Check for overflow. 2005-08-31 Paul Eggert * posix/regcomp.c (re_compile_fastmap_iter, init_dfa, init_word_char): (optimize_subexps, lower_subexp): Don't assume 1<<31 has defined behavior on hosts with 32-bit int, since the signed shift might overflow. Use 1u<<31 instead. * posix/regex_internal.h (bitset_set, bitset_clear, bitset_contain): Likewise. * posix/regexec.c (check_dst_limits_calc_pos_1): Likewise. (check_subexp_matching_top): Likewise. * posix/regcomp.c (optimize_subexps, lower_subexp): Use CHAR_BIT rather than 8, for clarity. * posix/regexec.c (check_dst_limits_calc_pos_1): (check_subexp_matching_top): Likewise. * posix/regcomp.c (init_dfa): Make table_size unsigned, so that we don't have to worry about portability issues when shifting it left. Remove no-longer-needed test for table_size > 0. * posix/regcomp.c (parse_sub_exp): Do not shift more bits than there are in a word, as the resulting behavior is undefined. * posix/regexec.c (check_dst_limits_calc_pos_1): Likewise; in one case, a <= should have been an <, and in another case the whole test was missing. * posix/regex_internal.h (BYTE_BITS): Remove. All uses changed to the standard name CHAR_BIT. --- ChangeLog | 67 +++++++++++++++++++++++++++ posix/regcomp.c | 39 +++++++++------- posix/regex_internal.c | 120 ++++++++++++++++++++++++++++--------------------- posix/regex_internal.h | 22 ++++----- posix/regexec.c | 74 +++++++++++++++++------------- 5 files changed, 212 insertions(+), 110 deletions(-) diff --git a/ChangeLog b/ChangeLog index 23e8cc9..31d39e8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,72 @@ +2005-09-06 Ulrich Drepper + + * posix/regex_internal.c (re_string_reconstruct): Avoid calling + mbrtowc for very simple UTF-8 case. + +2005-09-01 Paul Eggert + + * posix/regex_internal.c (build_wcs_upper_buffer): Fix portability + bugs in int versus size_t comparisons. + +2005-09-06 Ulrich Drepper + + * posix/regex_internal.c (re_acquire_state): Make DFA pointer arg + a pointer-to-const. + (re_acquire_state_context): Likewise. + * posix/regex_internal.h: Adjust prototypes. + +2005-08-31 Jim Meyering + + * posix/regcomp.c (search_duplicated_node): Make first pointer arg + a pointer-to-const. + * posix/regex_internal.c (create_ci_newstate, create_cd_newstate, + register_state): Likewise. + * posix/regexec.c (search_cur_bkref_entry, check_dst_limits): + (check_dst_limits_calc_pos_1, check_dst_limits_calc_pos): + (group_nodes_into_DFAstates): Likewise. + 2005-08-31 Paul Eggert + * posix/regexec.c (re_search_internal): Simplify update of + rm_so and rm_eo by replacing "if (A == B) A += C - B;" + with the equivalent of "if (A == B) A = C;". + +2005-09-06 Ulrich Drepper + + * posix/regcomp.c (re_compile_internal): Change third parameter type + to size_t. + (init_dfa): Likewise. Make sure that arithmetic on pat_len doesn't + overflow. + * posix/regex_internal.h (struct re_dfa_t): Change type of nodes_alloc + and nodes_len to size_t. + * posix/regex_internal.c (re_dfa_add_node): Use size_t as type for + new_nodes_alloc. Check for overflow. + +2005-08-31 Paul Eggert + + * posix/regcomp.c (re_compile_fastmap_iter, init_dfa, init_word_char): + (optimize_subexps, lower_subexp): + Don't assume 1<<31 has defined behavior on hosts with 32-bit int, + since the signed shift might overflow. Use 1u<<31 instead. + * posix/regex_internal.h (bitset_set, bitset_clear, bitset_contain): + Likewise. + * posix/regexec.c (check_dst_limits_calc_pos_1): Likewise. + (check_subexp_matching_top): Likewise. + * posix/regcomp.c (optimize_subexps, lower_subexp): + Use CHAR_BIT rather than 8, for clarity. + * posix/regexec.c (check_dst_limits_calc_pos_1): + (check_subexp_matching_top): Likewise. + * posix/regcomp.c (init_dfa): Make table_size unsigned, so that we + don't have to worry about portability issues when shifting it left. + Remove no-longer-needed test for table_size > 0. + * posix/regcomp.c (parse_sub_exp): Do not shift more bits than there + are in a word, as the resulting behavior is undefined. + * posix/regexec.c (check_dst_limits_calc_pos_1): Likewise; + in one case, a <= should have been an <, and in another case the + whole test was missing. + * posix/regex_internal.h (BYTE_BITS): Remove. All uses changed to + the standard name CHAR_BIT. + * posix/regex_internal.h (re_sub_match_top_t): Remove unused member next_last_offset. (struct re_dfa_t): Remove unused member states_alloc. diff --git a/posix/regcomp.c b/posix/regcomp.c index d820533..c93f79e 100644 --- a/posix/regcomp.c +++ b/posix/regcomp.c @@ -19,11 +19,11 @@ 02111-1307 USA. */ static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, - int length, reg_syntax_t syntax); + size_t length, reg_syntax_t syntax); static void re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, char *fastmap); -static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len); +static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); static void init_word_char (re_dfa_t *dfa); #ifdef RE_ENABLE_I18N static void free_charset (re_charset_t *cset); @@ -51,7 +51,7 @@ static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node, int root_node, unsigned int constraint); static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint); -static int search_duplicated_node (re_dfa_t *dfa, int org_node, +static int search_duplicated_node (const re_dfa_t *dfa, int org_node, unsigned int constraint); static reg_errcode_t calc_eclosure (re_dfa_t *dfa); static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, @@ -368,7 +368,7 @@ re_compile_fastmap_iter (bufp, init_state, fastmap) int i, j, ch; for (i = 0, ch = 0; i < BITSET_UINTS; ++i) for (j = 0; j < UINT_BITS; ++j, ++ch) - if (dfa->nodes[node].opr.sbcset[i] & (1 << j)) + if (dfa->nodes[node].opr.sbcset[i] & (1u << j)) re_set_fastmap (fastmap, icase, ch); } #ifdef RE_ENABLE_I18N @@ -740,7 +740,7 @@ static reg_errcode_t re_compile_internal (preg, pattern, length, syntax) regex_t *preg; const char * pattern; - int length; + size_t length; reg_syntax_t syntax; { reg_errcode_t err = REG_NOERROR; @@ -781,6 +781,7 @@ re_compile_internal (preg, pattern, length, syntax) return err; } #ifdef DEBUG + /* Note: length+1 will not overflow since it is checked in init_dfa. */ dfa->re_str = re_malloc (char, length + 1); strncpy (dfa->re_str, pattern, length + 1); #endif @@ -840,9 +841,9 @@ re_compile_internal (preg, pattern, length, syntax) static reg_errcode_t init_dfa (dfa, pat_len) re_dfa_t *dfa; - int pat_len; + size_t pat_len; { - int table_size; + unsigned int table_size; #ifndef _LIBC char *codeset_name; #endif @@ -852,11 +853,15 @@ init_dfa (dfa, pat_len) /* Force allocation of str_tree_storage the first time. */ dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; + /* Avoid overflows. */ + if (pat_len == SIZE_MAX) + return REG_ESPACE; + dfa->nodes_alloc = pat_len + 1; dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); /* table_size = 2 ^ ceil(log pat_len) */ - for (table_size = 1; table_size > 0; table_size <<= 1) + for (table_size = 1; ; table_size <<= 1) if (table_size > pat_len) break; @@ -916,7 +921,7 @@ init_dfa (dfa, pat_len) { wint_t wch = __btowc (ch); if (wch != WEOF) - dfa->sb_char[i] |= 1 << j; + dfa->sb_char[i] |= 1u << j; # ifndef _LIBC if (isascii (ch) && wch != ch) dfa->map_notascii = 1; @@ -944,7 +949,7 @@ init_word_char (dfa) for (i = 0, ch = 0; i < BITSET_UINTS; ++i) for (j = 0; j < UINT_BITS; ++j, ++ch) if (isalnum (ch) || ch == '_') - dfa->word_char[i] |= 1 << j; + dfa->word_char[i] |= 1u << j; } /* Free the work area which are only used while compiling. */ @@ -1277,8 +1282,8 @@ optimize_subexps (extra, node) node->left->parent = node; dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; - if (other_idx < 8 * sizeof (dfa->used_bkref_map)) - dfa->used_bkref_map &= ~(1 << other_idx); + if (other_idx < CHAR_BIT * sizeof dfa->used_bkref_map) + dfa->used_bkref_map &= ~(1u << other_idx); } return REG_NOERROR; @@ -1326,8 +1331,8 @@ lower_subexp (err, preg, node) very common, so we do not lose much. An example that triggers this case is the sed "script" /\(\)/x. */ && node->left != NULL - && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map) - || !(dfa->used_bkref_map & (1 << node->token.opr.idx)))) + && (node->token.opr.idx >= CHAR_BIT * sizeof dfa->used_bkref_map + || !(dfa->used_bkref_map & (1u << node->token.opr.idx)))) return node->left; /* Convert the SUBEXP node to the concatenation of an @@ -1574,7 +1579,7 @@ duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, static int search_duplicated_node (dfa, org_node, constraint) - re_dfa_t *dfa; + const re_dfa_t *dfa; int org_node; unsigned int constraint; { @@ -2492,7 +2497,9 @@ parse_sub_exp (regexp, preg, token, syntax, nest, err) if (BE (*err != REG_NOERROR, 0)) return NULL; } - dfa->completed_bkref_map |= 1 << cur_nsub; + + if (cur_nsub <= '9' - '1') + dfa->completed_bkref_map |= 1 << cur_nsub; tree = create_tree (dfa, tree, NULL, SUBEXP); if (BE (tree == NULL, 0)) diff --git a/posix/regex_internal.c b/posix/regex_internal.c index 821ed7f..240e887 100644 --- a/posix/regex_internal.c +++ b/posix/regex_internal.c @@ -26,12 +26,13 @@ static void re_string_construct_common (const char *str, int len, static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc) internal_function; #endif /* RE_ENABLE_I18N */ -static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate, +static reg_errcode_t register_state (const re_dfa_t *dfa, + re_dfastate_t *newstate, unsigned int hash) internal_function; -static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa, +static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, unsigned int hash) internal_function; -static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa, +static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, unsigned int context, unsigned int hash) internal_function; @@ -654,37 +655,50 @@ re_string_reconstruct (pstr, idx, eflags) byte other than 0x80 - 0xbf. */ raw = pstr->raw_mbs + pstr->raw_mbs_idx; end = raw + (offset - pstr->mb_cur_max); - for (p = raw + offset - 1; p >= end; --p) - if ((*p & 0xc0) != 0x80) - { - mbstate_t cur_state; - wchar_t wc2; - int mlen = raw + pstr->len - p; - unsigned char buf[6]; - - q = p; - if (BE (pstr->trans != NULL, 0)) - { - int i = mlen < 6 ? mlen : 6; - while (--i >= 0) - buf[i] = pstr->trans[p[i]]; - q = buf; - } - /* XXX Don't use mbrtowc, we know which conversion - to use (UTF-8 -> UCS4). */ - memset (&cur_state, 0, sizeof (cur_state)); - mlen = (mbrtowc (&wc2, (const char *) p, mlen, - &cur_state) - - (raw + offset - p)); - if (mlen >= 0) - { - memset (&pstr->cur_state, '\0', - sizeof (mbstate_t)); - pstr->valid_len = mlen; - wc = wc2; - } - break; - } + p = raw + offset - 1; +#ifdef _LIBC + /* We know the wchar_t encoding is UCS4, so for the simple + case, ASCII characters, skip the conversion step. */ + if (isascii (*p) && BE (pstr->trans == NULL, 1)) + { + memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); + pstr->valid_len = 0; + wc = (wchar_t) *p; + } + else +#endif + for (; p >= end; --p) + if ((*p & 0xc0) != 0x80) + { + mbstate_t cur_state; + wchar_t wc2; + int mlen = raw + pstr->len - p; + unsigned char buf[6]; + size_t mbclen; + + q = p; + if (BE (pstr->trans != NULL, 0)) + { + int i = mlen < 6 ? mlen : 6; + while (--i >= 0) + buf[i] = pstr->trans[p[i]]; + q = buf; + } + /* XXX Don't use mbrtowc, we know which conversion + to use (UTF-8 -> UCS4). */ + memset (&cur_state, 0, sizeof (cur_state)); + mbclen = mbrtowc (&wc2, (const char *) p, mlen, + &cur_state); + if (raw + offset - p <= mbclen + && mbclen < (size_t) -2) + { + memset (&pstr->cur_state, '\0', + sizeof (mbstate_t)); + pstr->valid_len = mbclen - (raw + offset - p); + wc = wc2; + } + break; + } } if (wc == WEOF) @@ -738,15 +752,15 @@ re_string_reconstruct (pstr, idx, eflags) } else #endif /* RE_ENABLE_I18N */ - if (BE (pstr->mbs_allocated, 0)) - { - if (pstr->icase) - build_upper_buffer (pstr); - else if (pstr->trans != NULL) - re_string_translate_buffer (pstr); - } - else - pstr->valid_len = pstr->len; + if (BE (pstr->mbs_allocated, 0)) + { + if (pstr->icase) + build_upper_buffer (pstr); + else if (pstr->trans != NULL) + re_string_translate_buffer (pstr); + } + else + pstr->valid_len = pstr->len; pstr->cur_idx = 0; return REG_NOERROR; @@ -1345,12 +1359,16 @@ re_dfa_add_node (dfa, token) int type = token.type; if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0)) { - int new_nodes_alloc = dfa->nodes_alloc * 2; + size_t new_nodes_alloc = dfa->nodes_alloc * 2; int *new_nexts, *new_indices; re_node_set *new_edests, *new_eclosures; + re_token_t *new_nodes; + + /* Avoid overflows. */ + if (BE (new_nodes_alloc < dfa->nodes_alloc, 0)) + return -1; - re_token_t *new_nodes = re_realloc (dfa->nodes, re_token_t, - new_nodes_alloc); + new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc); if (BE (new_nodes == NULL, 0)) return -1; dfa->nodes = new_nodes; @@ -1403,7 +1421,7 @@ calc_state_hash (nodes, context) static re_dfastate_t* re_acquire_state (err, dfa, nodes) reg_errcode_t *err; - re_dfa_t *dfa; + const re_dfa_t *dfa; const re_node_set *nodes; { unsigned int hash; @@ -1448,7 +1466,7 @@ re_acquire_state (err, dfa, nodes) static re_dfastate_t* re_acquire_state_context (err, dfa, nodes, context) reg_errcode_t *err; - re_dfa_t *dfa; + const re_dfa_t *dfa; const re_node_set *nodes; unsigned int context; { @@ -1486,7 +1504,7 @@ re_acquire_state_context (err, dfa, nodes, context) static reg_errcode_t register_state (dfa, newstate, hash) - re_dfa_t *dfa; + const re_dfa_t *dfa; re_dfastate_t *newstate; unsigned int hash; { @@ -1525,7 +1543,7 @@ register_state (dfa, newstate, hash) static re_dfastate_t * create_ci_newstate (dfa, nodes, hash) - re_dfa_t *dfa; + const re_dfa_t *dfa; const re_node_set *nodes; unsigned int hash; { @@ -1576,7 +1594,7 @@ create_ci_newstate (dfa, nodes, hash) static re_dfastate_t * create_cd_newstate (dfa, nodes, context, hash) - re_dfa_t *dfa; + const re_dfa_t *dfa; const re_node_set *nodes; unsigned int context, hash; { diff --git a/posix/regex_internal.h b/posix/regex_internal.h index cdaa8fc..4d6a7a8 100644 --- a/posix/regex_internal.h +++ b/posix/regex_internal.h @@ -91,8 +91,6 @@ # define inline #endif -/* Number of bits in a byte. */ -#define BYTE_BITS 8 /* Number of single byte character. */ #define SBC_MAX 256 @@ -123,16 +121,16 @@ extern const char __re_error_msgid[] attribute_hidden; extern const size_t __re_error_msgid_idx[] attribute_hidden; /* Number of bits in an unsinged int. */ -#define UINT_BITS (sizeof (unsigned int) * BYTE_BITS) +#define UINT_BITS (sizeof (unsigned int) * CHAR_BIT) /* Number of unsigned int in an bit_set. */ #define BITSET_UINTS ((SBC_MAX + UINT_BITS - 1) / UINT_BITS) typedef unsigned int bitset[BITSET_UINTS]; typedef unsigned int *re_bitset_ptr_t; typedef const unsigned int *re_const_bitset_ptr_t; -#define bitset_set(set,i) (set[i / UINT_BITS] |= 1 << i % UINT_BITS) -#define bitset_clear(set,i) (set[i / UINT_BITS] &= ~(1 << i % UINT_BITS)) -#define bitset_contain(set,i) (set[i / UINT_BITS] & (1 << i % UINT_BITS)) +#define bitset_set(set,i) (set[i / UINT_BITS] |= 1u << i % UINT_BITS) +#define bitset_clear(set,i) (set[i / UINT_BITS] &= ~(1u << i % UINT_BITS)) +#define bitset_contain(set,i) (set[i / UINT_BITS] & (1u << i % UINT_BITS)) #define bitset_empty(set) memset (set, 0, sizeof (unsigned int) * BITSET_UINTS) #define bitset_set_all(set) \ memset (set, 255, sizeof (unsigned int) * BITSET_UINTS) @@ -627,8 +625,8 @@ struct re_fail_stack_t struct re_dfa_t { re_token_t *nodes; - int nodes_alloc; - int nodes_len; + size_t nodes_alloc; + size_t nodes_len; int *nexts; int *org_indices; re_node_set *edests; @@ -701,10 +699,12 @@ static void re_node_set_remove_at (re_node_set *set, int idx) internal_function; #define re_node_set_empty(p) ((p)->nelem = 0) #define re_node_set_free(set) re_free ((set)->elems) static int re_dfa_add_node (re_dfa_t *dfa, re_token_t token) internal_function; -static re_dfastate_t *re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, - const re_node_set *nodes) internal_function; +static re_dfastate_t *re_acquire_state (reg_errcode_t *err, const + re_dfa_t *dfa, + const re_node_set *nodes) + internal_function; static re_dfastate_t *re_acquire_state_context (reg_errcode_t *err, - re_dfa_t *dfa, + const re_dfa_t *dfa, const re_node_set *nodes, unsigned int context) internal_function; static void free_state (re_dfastate_t *state) internal_function; diff --git a/posix/regexec.c b/posix/regexec.c index bdb2c4c..2322f14 100644 --- a/posix/regexec.c +++ b/posix/regexec.c @@ -25,7 +25,7 @@ static void match_ctx_free (re_match_context_t *cache) internal_function; static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node, int str_idx, int from, int to) internal_function; -static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx) +static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx) internal_function; static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx) internal_function; @@ -104,13 +104,14 @@ static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa, static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node, re_node_set *dest_nodes, const re_node_set *and_nodes) internal_function; -static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits, +static int check_dst_limits (const re_match_context_t *mctx, + re_node_set *limits, int dst_node, int dst_idx, int src_node, int src_idx) internal_function; -static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx, +static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries, int subexp_idx, int from_node, int bkref_idx) internal_function; -static int check_dst_limits_calc_pos (re_match_context_t *mctx, +static int check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit, int subexp_idx, int node, int str_idx, int bkref_idx) internal_function; @@ -185,7 +186,7 @@ static unsigned int find_collation_sequence_value (const unsigned char *mbs, size_t name_len) internal_function; # endif /* _LIBC */ #endif /* RE_ENABLE_I18N */ -static int group_nodes_into_DFAstates (re_dfa_t *dfa, +static int group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state, re_node_set *states_node, bitset *states_ch) internal_function; @@ -883,14 +884,14 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, #ifdef RE_ENABLE_I18N if (BE (mctx.input.offsets_needed != 0, 0)) { - if (pmatch[reg_idx].rm_so == mctx.input.valid_len) - pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len; - else - pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so]; - if (pmatch[reg_idx].rm_eo == mctx.input.valid_len) - pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len; - else - pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo]; + pmatch[reg_idx].rm_so = + (pmatch[reg_idx].rm_so == mctx.input.valid_len + ? mctx.input.valid_raw_len + : mctx.input.offsets[pmatch[reg_idx].rm_so]); + pmatch[reg_idx].rm_eo = + (pmatch[reg_idx].rm_eo == mctx.input.valid_len + ? mctx.input.valid_raw_len + : mctx.input.offsets[pmatch[reg_idx].rm_eo]); } #else assert (mctx.input.offsets_needed == 0); @@ -1887,7 +1888,7 @@ sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates) static int check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx) - re_match_context_t *mctx; + const re_match_context_t *mctx; re_node_set *limits; int dst_node, dst_idx, src_node, src_idx; { @@ -1924,7 +1925,7 @@ check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx) static int check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx) - re_match_context_t *mctx; + const re_match_context_t *mctx; int boundaries, subexp_idx, from_node, bkref_idx; { re_dfa_t *const dfa = mctx->dfa; @@ -1949,8 +1950,9 @@ check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx) if (ent->node != node) continue; - if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map) - && !(ent->eps_reachable_subexps_map & (1 << subexp_idx))) + if (subexp_idx + < CHAR_BIT * sizeof ent->eps_reachable_subexps_map + && !(ent->eps_reachable_subexps_map & (1u << subexp_idx))) continue; /* Recurse trying to reach the OP_OPEN_SUBEXP and @@ -1976,7 +1978,9 @@ check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx) if (cpos == 0 && (boundaries & 2)) return 0; - ent->eps_reachable_subexps_map &= ~(1 << subexp_idx); + if (subexp_idx + < CHAR_BIT * sizeof ent->eps_reachable_subexps_map) + ent->eps_reachable_subexps_map &= ~(1u << subexp_idx); } while (ent++->more); } @@ -2002,7 +2006,7 @@ check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx) static int check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx) - re_match_context_t *mctx; + const re_match_context_t *mctx; int limit, subexp_idx, from_node, str_idx, bkref_idx; { struct re_backref_cache_entry *lim = mctx->bkref_ents + limit; @@ -2443,8 +2447,8 @@ check_subexp_matching_top (mctx, cur_nodes, str_idx) { int node = cur_nodes->elems[node_idx]; if (dfa->nodes[node].type == OP_OPEN_SUBEXP - && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map)) - && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx)) + && dfa->nodes[node].opr.idx < CHAR_BIT * sizeof dfa->used_bkref_map + && dfa->used_bkref_map & (1u << dfa->nodes[node].opr.idx)) { err = match_ctx_add_subtop (mctx, node, str_idx); if (BE (err != REG_NOERROR, 0)) @@ -2557,7 +2561,8 @@ transit_state_mb (mctx, pstate) if (BE (err != REG_NOERROR, 0)) return err; } - context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags); + context = re_string_context_at (&mctx->input, dest_idx - 1, + mctx->eflags); mctx->state_log[dest_idx] = re_acquire_state_context (&err, dfa, &dest_nodes, context); if (dest_state != NULL) @@ -2696,7 +2701,8 @@ get_subexp (mctx, bkref_node, bkref_str_idx) int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx); if (cache_idx != -1) { - const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx; + const struct re_backref_cache_entry *entry + = mctx->bkref_ents + cache_idx; do if (entry->node == bkref_node) return REG_NOERROR; /* We already checked it. */ @@ -2743,7 +2749,8 @@ get_subexp (mctx, bkref_node, bkref_str_idx) buf = (const char *) re_string_get_buffer (&mctx->input); } if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0) - break; /* We don't need to search this sub expression any more. */ + /* We don't need to search this sub expression any more. */ + break; } bkref_str_off += sl_str_diff; sl_str += sl_str_diff; @@ -2794,7 +2801,8 @@ get_subexp (mctx, bkref_node, bkref_str_idx) continue; /* Does this state have a ')' of the sub expression? */ nodes = &mctx->state_log[sl_str]->nodes; - cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP); + cls_node = find_subexp_node (dfa, nodes, subexp_num, + OP_CLOSE_SUBEXP); if (cls_node == -1) continue; /* No. */ if (sub_top->path == NULL) @@ -2807,7 +2815,8 @@ get_subexp (mctx, bkref_node, bkref_str_idx) /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node in the current context? */ err = check_arrival (mctx, sub_top->path, sub_top->node, - sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP); + sub_top->str_idx, cls_node, sl_str, + OP_CLOSE_SUBEXP); if (err == REG_NOMATCH) continue; if (BE (err != REG_NOERROR, 0)) @@ -2841,7 +2850,8 @@ get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str) int to_idx; /* Can the subexpression arrive the back reference? */ err = check_arrival (mctx, &sub_last->path, sub_last->node, - sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP); + sub_last->str_idx, bkref_node, bkref_str, + OP_OPEN_SUBEXP); if (err != REG_NOERROR) return err; err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx, @@ -3539,10 +3549,10 @@ out_free: static int group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch) - re_dfa_t *dfa; - const re_dfastate_t *state; - re_node_set *dests_node; - bitset *dests_ch; + const re_dfa_t *dfa; + const re_dfastate_t *state; + re_node_set *dests_node; + bitset *dests_ch; { reg_errcode_t err; int result; @@ -4265,7 +4275,7 @@ match_ctx_add_entry (mctx, node, str_idx, from, to) static int search_cur_bkref_entry (mctx, str_idx) - re_match_context_t *mctx; + const re_match_context_t *mctx; int str_idx; { int left, right, mid, last; -- 2.7.4