From aed7b151d4e452f2288e88e25caf094ac07def29 Mon Sep 17 00:00:00 2001 From: David Mitchell Date: Mon, 20 May 2013 15:04:11 +0100 Subject: [PATCH] regex engine: simplify is_utf8_pat handling Since this value is actually just always equal to cBOOL(RX_UTF8(rx)), there's no need to save the old value of the local boolean (as u.eval.saved_utf8_pat) when switching back and forwards between regexes with (??{}); instead, just re-calculate it whenever we switch, and update reginfo->is_utf8_pat and its cached value in the is_utf8_pat local var accordingly. Also, pass reginfo as an arg to S_setup_EXACTISH_ST_c1_c2() rather than is_utf8_pat directly; this will allow us to eliminate PL_reg_match_utf8 shortly. A new test is included that detects a mistake I made while working up this change: I recalculated is_utf8_pat, but forgot to update reginfo->is_utf8_pat too. --- regexec.c | 24 ++++++++++-------------- regexp.h | 1 - t/re/re_tests | 4 ++++ 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/regexec.c b/regexec.c index 1537d35..59a3dab 100644 --- a/regexec.c +++ b/regexec.c @@ -628,7 +628,6 @@ Perl_re_intuit_start(pTHX_ char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/ const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; RXi_GET_DECL(prog,progi); - bool is_utf8_pat; regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ regmatch_info *const reginfo = ®info_buf; #ifdef DEBUGGING @@ -643,8 +642,6 @@ Perl_re_intuit_start(pTHX_ RX_MATCH_UTF8_set(rx,utf8_target); PL_reg_match_utf8 = cBOOL(utf8_target); - is_utf8_pat = cBOOL(RX_UTF8(rx)); - /* CHR_DIST() would be more correct here but it makes things slow. */ if (prog->minlen > strend - strpos) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, @@ -655,7 +652,7 @@ Perl_re_intuit_start(pTHX_ reginfo->eval_state = NULL; reginfo->strbeg = strbeg; reginfo->strend = strend; - reginfo->is_utf8_pat = is_utf8_pat; + reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); reginfo->intuit = 1; if (utf8_target) { @@ -3195,7 +3192,7 @@ S_clear_backtrack_stack(pTHX_ void *p) } static bool S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, - U8* c1_utf8, int *c2p, U8* c2_utf8, bool is_utf8_pat) + U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo) { /* This function determines if there are one or two characters that match * the first character of the passed-in EXACTish node , and if @@ -3252,6 +3249,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, UV c1 = CHRTEST_NOT_A_CP_1; UV c2 = CHRTEST_NOT_A_CP_2; bool use_chrtest_void = FALSE; + const bool is_utf8_pat = reginfo->is_utf8_pat; /* Used when we have both utf8 input and utf8 output, to avoid converting * to/from code points */ @@ -5077,8 +5075,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* XXXX This is too dramatic a measure... */ PL_reg_maxiter = 0; - ST.saved_utf8_pat = is_utf8_pat; - is_utf8_pat = cBOOL(RX_UTF8(re_sv)); + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); ST.prev_rex = rex_sv; ST.prev_curlyx = cur_curlyx; @@ -5097,8 +5094,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case EVAL_AB: /* cleanup after a successful (??{A})B */ /* note: this is called twice; first after popping B, then A */ - is_utf8_pat = ST.saved_utf8_pat; rex_sv = ST.prev_rex; + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); SET_reg_curpm(rex_sv); rex = ReANY(rex_sv); rexi = RXi_GET(rex); @@ -5115,8 +5112,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */ /* note: this is called twice; first after popping B, then A */ - is_utf8_pat = ST.saved_utf8_pat; rex_sv = ST.prev_rex; + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); SET_reg_curpm(rex_sv); rex = ReANY(rex_sv); rexi = RXi_GET(rex); @@ -5743,7 +5740,7 @@ NULL if (PL_regkind[OP(text_node)] == EXACT) { if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, - is_utf8_pat)) + reginfo)) { sayNO; } @@ -5920,7 +5917,7 @@ NULL friends need to change. */ if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, - is_utf8_pat)) + reginfo)) { sayNO; } @@ -6154,14 +6151,13 @@ NULL fake_end: if (cur_eval) { /* we've just finished A in /(??{A})B/; now continue with B */ - st->u.eval.saved_utf8_pat = is_utf8_pat; - is_utf8_pat = cur_eval->u.eval.saved_utf8_pat; st->u.eval.prev_rex = rex_sv; /* inner */ /* Save *all* the positions. */ st->u.eval.cp = regcppush(rex, 0, maxopenparen); rex_sv = cur_eval->u.eval.prev_rex; + is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); SET_reg_curpm(rex_sv); rex = ReANY(rex_sv); rexi = RXi_GET(rex); @@ -6776,7 +6772,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8, - reginfo->is_utf8_pat)) + reginfo)) { if (c1 == CHRTEST_VOID) { /* Use full Unicode fold matching */ diff --git a/regexp.h b/regexp.h index cd2bad8..844ec6b 100644 --- a/regexp.h +++ b/regexp.h @@ -685,7 +685,6 @@ typedef struct regmatch_state { struct regmatch_state *prev_eval; struct regmatch_state *prev_curlyx; REGEXP *prev_rex; - bool saved_utf8_pat; /* saved copy of is_utf8_pat */ CHECKPOINT cp; /* remember current savestack indexes */ CHECKPOINT lastcp; U32 close_paren; /* which close bracket is our end */ diff --git a/t/re/re_tests b/t/re/re_tests index 7e7fc85..a317904 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1736,4 +1736,8 @@ ab[c\\\](??{"x"})]{3}d ab\\](d y - - /(?l:a?\w)/ b y $& b m?^xy\?$? xy? y $& xy? +# check we have the right utf8ness as we switch back and forth between +# patterns +^(\x{100}|a)(??{ qr/.?\xF7/d}) a_\xF7 y - - + # vim: softtabstop=0 noexpandtab -- 2.7.4