From d1b2014af5da7b0714bfc8832c3ce55eb9aaa94a Mon Sep 17 00:00:00 2001 From: Yves Orton Date: Sun, 23 Feb 2014 20:53:51 +0100 Subject: [PATCH] fix RT #121299 - Inconsistent behavior with backreferences nested inside subpattern references Match variables should be dynamically scoped during GOSUB and GOSTART. The callers state should be inherited by the callee, but once the callee returns, the callers state should be restored. This is different from EVAL, where the callers and callees state are expected to not be the same (although might be the same), and where the "reasonable" match semantics differ. Currently the following two one liners will produce different results: $ ./perl -Ilib -le'"<>>" =~/ < (?: \1 | [ab]+ ) (>) (?0)? /x and print $&;' <>> $ ./perl -Ilib -le'$qr= qr/ < (?: \1 | [ab]+ ) (>) (??{ $qr })? /x; "<>>" =~ m/$qr/ and print $&;' While I think reasonable people could argue that we should special case things when we know that the return from (??{ ... }) is the same as the currently executing pattern I think explaining the difference would be harder than necessary. On the contrary making GOSUB/GOSTART exactly the same as EVAL, so that the match vars were totally independent seems to throw away an opportunity for much more powerful semantics than can be offered by EVAL. --- pod/perlre.pod | 15 +++++++++++---- regexec.c | 34 +++++++++++++++++++++------------- t/re/re_tests | 9 ++++++++- 3 files changed, 40 insertions(+), 18 deletions(-) diff --git a/pod/perlre.pod b/pod/perlre.pod index c2026d8..544c4a7 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -1486,14 +1486,21 @@ into perl, so changing it requires a custom build. =item C<(?I)> C<(?-I)> C<(?+I)> C<(?R)> C<(?0)> X<(?PARNO)> X<(?1)> X<(?R)> X<(?0)> X<(?-1)> X<(?+1)> X<(?-PARNO)> X<(?+PARNO)> X X X -X +X X X + +Recursive subpattern. Treat the contents of a given capture buffer in the +current pattern as an independent subpattern and attempt to match it at +the current position in the string. Information about capture state from +the caller for things like backreferences is available to the subpattern, +but capture buffers set by the subpattern are not visible to the caller. Similar to C<(??{ code })> except that it does not involve executing any code or potentially compiling a returned pattern string; instead it treats the part of the current pattern contained within a specified capture group -as an independent pattern that must match at the current position. -Capture groups contained by the pattern will have the value as determined -by the outermost recursion. +as an independent pattern that must match at the current position. Also +different is the treatment of capture buffers, unlike C<(??{ code })> +recursive patterns have access to their callers match state, so one can +use backreferences safely. I is a sequence of digits (not starting with 0) whose value reflects the paren-number of the capture group to recurse to. C<(?R)> recurses to diff --git a/regexec.c b/regexec.c index 9ceb4e3..6dd0297 100644 --- a/regexec.c +++ b/regexec.c @@ -291,8 +291,8 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) PERL_ARGS_ASSERT_REGCPPUSH; if (paren_elems_to_push < 0) - Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0", - paren_elems_to_push); + Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %i", + paren_elems_to_push, maxopenparen, parenfloor, REGCP_PAREN_ELEMS); if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf @@ -5129,7 +5129,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) startpoint = rei->program+1; ST.close_paren = 0; } + + /* Save all the positions seen so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + + /* and then jump to the code we share with EVAL */ goto eval_recurse_doit; + assert(0); /* NOTREACHED */ case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ @@ -5366,6 +5373,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) re->sublen = rex->sublen; re->suboffset = rex->suboffset; re->subcoffset = rex->subcoffset; + re->lastparen = 0; + re->lastcloseparen = 0; rei = RXi_GET(re); DEBUG_EXECUTE_r( debug_start_match(re_sv, utf8_target, locinput, @@ -5373,18 +5382,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) ); startpoint = rei->program + 1; ST.close_paren = 0; /* only used for GOSUB */ + /* Save all the seen positions so far. */ + ST.cp = regcppush(rex, 0, maxopenparen); + REGCP_SET(ST.lastcp); + /* and set maxopenparen to 0, since we are starting a "fresh" match */ + maxopenparen = 0; + /* run the pattern returned from (??{...}) */ - eval_recurse_doit: /* Share code with GOSUB below this line */ - /* run the pattern returned from (??{...}) */ - - /* Save *all* the positions. */ - ST.cp = regcppush(rex, 0, maxopenparen); - REGCP_SET(ST.lastcp); - - re->lastparen = 0; - re->lastcloseparen = 0; - - maxopenparen = 0; + eval_recurse_doit: /* Share code with GOSUB below this line + * At this point we expect the stack context to be + * set up correctly */ /* invalidate the S-L poscache. We're now executing a * different set of WHILEM ops (and their associated @@ -5396,6 +5403,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * pattern again */ reginfo->poscache_maxiter = 0; + /* the new regexp might have a different is_utf8_pat than we do */ is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv)); ST.prev_rex = rex_sv; diff --git a/t/re/re_tests b/t/re/re_tests index 22c0c74..ab867f5 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1857,8 +1857,15 @@ A+(*PRUNE)BC(?{}) AAABC y $& AAABC '(?-m:^abc)'m x\nabcde n - - # RT #121248 - segfault - /^[+]([^(]+)$/mi li n - - +# RT #121299 - Inconsistent behavior with backreferences nested inside subpattern references +/(.)((o)\1)(?2)/ fofof y $2 of +/(.)(?2)((\1)(?4)(\1))/ fffffff y $1 f +/(.)(?2)((.)(?4)(\1))/ foffoff y $2 off +/^(.\2?)(.)(?1)$/ abcb y $2 b +/^(.\1?)(?1)$/ aba y $1 a +/^ (\3(?2)\3)? ((.)) (?1) $/x aaba y $2 a +/^ (a|\3(?1)\2|(?2)) ((b|c)(?4)?) (?1) (d(?1)) $/x abbcdcabbda y $2 b # Keep these lines at the end of the file # vim: softtabstop=0 noexpandtab -- 2.7.4