From 58e23c8d7d24dd08c87b5d56819ad45527176c15 Mon Sep 17 00:00:00 2001 From: Yves Orton Date: Wed, 22 Nov 2006 18:11:02 +0100 Subject: [PATCH] \G with /g results in infinite loop in 5.6 and later Message-ID: <9b18b3110611220811k1a54f650t1bd7c6a9450b0a7e@mail.gmail.com> p4raw-id: //depot/perl@29354 --- pod/perldiag.pod | 10 ++++----- pod/perlre.pod | 22 +++++++++++++++----- pp_hot.c | 24 +++++++++++++-------- regcomp.c | 63 +++++++++++++++++++++++++++++++++++--------------------- regcomp.h | 8 +++++++ regcomp.sym | 1 - regexec.c | 19 +++++++++++------ regexp.h | 2 ++ t/op/pat.t | 27 ++++++++++++++++++++---- t/op/regmesg.t | 4 ++-- 10 files changed, 124 insertions(+), 56 deletions(-) diff --git a/pod/perldiag.pod b/pod/perldiag.pod index e6a8b0f..26c2bf5 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -2208,11 +2208,10 @@ an undefined value for the length. See L. to check the return value of your socket() call? See L. -=item Lookbehind longer than %d not implemented in regex; marked by <-- HERE in m/%s/ +=item Lookbehind longer than %d not implemented in regex m/%s/ (F) There is currently a limit on the length of string which lookbehind can -handle. This restriction may be eased in a future release. The <-- HERE -shows in the regular expression about where the problem was discovered. +handle. This restriction may be eased in a future release. =item lstat() on filehandle %s @@ -4786,11 +4785,10 @@ something else of the same name (usually a subroutine) is exported by that module. It usually means you put the wrong funny character on the front of your variable. -=item Variable length lookbehind not implemented in regex; marked by <-- HERE in m/%s/ +=item Variable length lookbehind not implemented in m/%s/ (F) Lookbehind is allowed only for subexpressions whose length is fixed and -known at compile time. The <-- HERE shows in the regular expression about -where the problem was discovered. See L. +known at compile time. See L. =item Variable length character upgraded in print diff --git a/pod/perlre.pod b/pod/perlre.pod index 7df5647..c1cc75d 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -443,13 +443,25 @@ It is also useful when writing C-like scanners, when you have several patterns that you want to match against consequent substrings of your string, see the previous reference. The actual location where C<\G> will match can also be influenced by using C as -an lvalue: see L. Currently C<\G> is only fully -supported when anchored to the start of the pattern; while it -is permitted to use it elsewhere, as in C, some -such uses (C, for example) currently cause problems, and -it is recommended that you avoid such usage for now. +an lvalue: see L. Note that the rule for zero-length +matches is modified somewhat, in that contents to the left of C<\G> is +not counted when determining the length of the match. Thus the following +will not match forever: X<\G> + $str = 'ABC'; + pos($str) = 1; + while (/.\G/g) { + print $&; + } + +It will print 'A' and then terminate, as it considers the match to +be zero-width, and thus will not match at the same position twice in a +row. + +It is worth noting that C<\G> improperly used can result in an infinite +loop. Take care when using patterns that include C<\G> in an alternation. + =head3 Capture buffers The bracketing construct C<( ... )> creates capture buffers. To diff --git a/pp_hot.c b/pp_hot.c index 14bfd2c..8420757 100644 --- a/pp_hot.c +++ b/pp_hot.c @@ -1304,6 +1304,7 @@ PP(pp_match) const I32 oldsave = PL_savestack_ix; I32 update_minmatch = 1; I32 had_zerolen = 0; + U32 gpos = 0; if (PL_op->op_flags & OPf_STACKED) TARG = POPs; @@ -1355,13 +1356,18 @@ PP(pp_match) else if (rx->reganch & ROPT_ANCH_GPOS) { r_flags |= REXEC_IGNOREPOS; rx->endp[0] = rx->startp[0] = mg->mg_len; - } - minmatch = (mg->mg_flags & MGf_MINMATCH); + } else if (rx->reganch & ROPT_GPOS_FLOAT) + gpos = mg->mg_len; + else + rx->endp[0] = rx->startp[0] = mg->mg_len; + minmatch = (mg->mg_flags & MGf_MINMATCH) ? rx->gofs + 1 : 0; update_minmatch = 0; } } } - if ((!global && rx->nparens) + /* remove comment to get faster /g but possibly unsafe $1 vars after a + match. Test for the unsafe vars will fail as well*/ + if (( /* !global && */ rx->nparens) || SvTEMP(TARG) || PL_sawampersand || (pm->op_pmflags & PMf_EVAL)) r_flags |= REXEC_COPY_STR; if (SvSCREAM(TARG)) @@ -1369,8 +1375,8 @@ PP(pp_match) play_it_again: if (global && rx->startp[0] != -1) { - t = s = rx->endp[0] + truebase; - if ((s + rx->minlen) > strend) + t = s = rx->endp[0] + truebase - rx->gofs; + if ((s + rx->minlen) > strend || s < truebase) goto nope; if (update_minmatch++) minmatch = had_zerolen; @@ -1391,7 +1397,7 @@ play_it_again: && !SvROK(TARG)) /* Cannot trust since INTUIT cannot guess ^ */ goto yup; } - if (CALLREGEXEC(rx, (char*)s, (char *)strend, (char*)truebase, minmatch, TARG, NULL, r_flags)) + if (CALLREGEXEC(rx, (char*)s, (char *)strend, (char*)truebase, minmatch, TARG, (void*)gpos, r_flags)) { PL_curpm = pm; if (dynpm->op_pmflags & PMf_ONCE) @@ -1441,14 +1447,14 @@ play_it_again: } if (rx->startp[0] != -1) { mg->mg_len = rx->endp[0]; - if (rx->startp[0] == rx->endp[0]) + if (rx->startp[0] + rx->gofs == rx->endp[0]) mg->mg_flags |= MGf_MINMATCH; else mg->mg_flags &= ~MGf_MINMATCH; } } had_zerolen = (rx->startp[0] != -1 - && rx->startp[0] == rx->endp[0]); + && rx->startp[0] + rx->gofs == rx->endp[0]); PUTBACK; /* EVAL blocks may use stack */ r_flags |= REXEC_IGNOREPOS | REXEC_NOT_FIRST; goto play_it_again; @@ -1475,7 +1481,7 @@ play_it_again: } if (rx->startp[0] != -1) { mg->mg_len = rx->endp[0]; - if (rx->startp[0] == rx->endp[0]) + if (rx->startp[0] + rx->gofs == rx->endp[0]) mg->mg_flags |= MGf_MINMATCH; else mg->mg_flags &= ~MGf_MINMATCH; diff --git a/regcomp.c b/regcomp.c index 15f1feb..520f2fd 100644 --- a/regcomp.c +++ b/regcomp.c @@ -370,7 +370,7 @@ static const scan_data_t zero_scan_data = * arg. Show regex, up to a maximum length. If it's too long, chop and add * "...". */ -#define FAIL(msg) STMT_START { \ +#define _FAIL(code) STMT_START { \ const char *ellipses = ""; \ IV len = RExC_end - RExC_precomp; \ \ @@ -381,10 +381,17 @@ static const scan_data_t zero_scan_data = len = RegexLengthToShowInErrorMessages - 10; \ ellipses = "..."; \ } \ - Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ - msg, (int)len, RExC_precomp, ellipses); \ + code; \ } STMT_END +#define FAIL(msg) _FAIL( \ + Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ + msg, (int)len, RExC_precomp, ellipses)) + +#define FAIL2(msg,arg) _FAIL( \ + Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \ + arg, (int)len, RExC_precomp, ellipses)) + /* * Simple_vFAIL -- like FAIL, but marks the current location in the scan */ @@ -2426,6 +2433,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else data_fake.last_closep = &fake; + + data_fake.pos_delta = delta; next = regnext(scan); scan = NEXTOPER(scan); if (code != BRANCH) @@ -2434,7 +2443,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, cl_init(pRExC_state, &this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; - } + } if (flags & SCF_WHILEM_VISITED_POS) f |= SCF_WHILEM_VISITED_POS; @@ -3475,6 +3484,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else data_fake.last_closep = &fake; + data_fake.pos_delta = delta; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ cl_init(pRExC_state, &intrnl); @@ -3489,10 +3499,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, last, &data_fake, stopparen, recursed, NULL, f, depth+1); if (scan->flags) { if (deltanext) { - vFAIL("Variable length lookbehind not implemented"); + FAIL("Variable length lookbehind not implemented"); } else if (minnext > (I32)U8_MAX) { - vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); } scan->flags = (U8)minnext; } @@ -3546,6 +3556,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else data_fake.last_closep = &fake; data_fake.flags = 0; + data_fake.pos_delta = delta; if (is_inf) data_fake.flags |= SF_IS_INF; if ( flags & SCF_DO_STCLASS && !scan->flags @@ -3563,10 +3574,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, last, &data_fake, stopparen, recursed, NULL, f,depth+1); if (scan->flags) { if (deltanext) { - vFAIL("Variable length lookbehind not implemented"); + FAIL("Variable length lookbehind not implemented"); } else if (*minnextp > (I32)U8_MAX) { - vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); } scan->flags = (U8)*minnextp; } @@ -3655,6 +3666,19 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, cl_anything(pRExC_state, data->start_class); flags &= ~SCF_DO_STCLASS; } + else if (OP(scan) == GPOS) { + if (!(RExC_rx->reganch & ROPT_GPOS_FLOAT) && + !(delta || is_inf || (data && data->pos_delta))) + { + if (!(RExC_rx->reganch & ROPT_ANCH) && (flags & SCF_DO_SUBSTR)) + RExC_rx->reganch |= ROPT_ANCH_GPOS; + if (RExC_rx->gofs < (U32)min) + RExC_rx->gofs = min; + } else { + RExC_rx->reganch |= ROPT_GPOS_FLOAT; + RExC_rx->gofs = 0; + } + } #ifdef TRIE_STUDY_OPT #ifdef FULL_TRIE_STUDY else if (PL_regkind[OP(scan)] == TRIE) { @@ -3691,7 +3715,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else data_fake.last_closep = &fake; - + data_fake.pos_delta = delta; if (flags & SCF_DO_STCLASS) { cl_init(pRExC_state, &this_class); data_fake.start_class = &this_class; @@ -4042,25 +4066,18 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) #ifdef DEBUGGING /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char); +#else + /* bulk initialize fields with 0. */ + Zero(r, sizeof(regexp), char); #endif - /* initialization begins here */ + + /* non-zero initialization begins here */ r->engine= RE_ENGINE_PTR; r->refcnt = 1; r->prelen = xend - exp; r->precomp = savepvn(RExC_precomp, r->prelen); - r->subbeg = NULL; -#ifdef PERL_OLD_COPY_ON_WRITE - r->saved_copy = NULL; -#endif r->reganch = pm->op_pmflags & PMf_COMPILETIME; r->nparens = RExC_npar - 1; /* set early to validate backrefs */ - r->lastparen = 0; /* mg.c reads this. */ - - r->substrs = 0; /* Useful during FAIL. */ - r->startp = 0; /* Useful during FAIL. */ - r->endp = 0; - r->swap = NULL; - r->paren_names = 0; if (RExC_seen & REG_SEEN_RECURSE) { Newxz(RExC_open_parens, RExC_npar,regnode *); @@ -4235,7 +4252,7 @@ reStudy: else if ((!sawopen || !RExC_sawback) && (OP(first) == STAR && PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && - !(r->reganch & ROPT_ANCH) ) + !(r->reganch & ROPT_ANCH) && !(RExC_seen & REG_SEEN_EVAL)) { /* turn .* into ^.* with an implied $*=1 */ const int type = @@ -8135,7 +8152,7 @@ Perl_regdump(pTHX_ const regexp *r) PerlIO_putc(Perl_debug_log, ' '); } if (r->reganch & ROPT_GPOS_SEEN) - PerlIO_printf(Perl_debug_log, "GPOS "); + PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", r->gofs); if (r->reganch & ROPT_SKIP) PerlIO_printf(Perl_debug_log, "plus "); if (r->reganch & ROPT_IMPLICIT) diff --git a/regcomp.h b/regcomp.h index f64168a..e8fd39f 100644 --- a/regcomp.h +++ b/regcomp.h @@ -11,10 +11,18 @@ typedef OP OP_4tree; /* Will be redefined later. */ +/* Convert branch sequences to more efficient trie ops? */ #define PERL_ENABLE_TRIE_OPTIMISATION 1 + +/* Be really agressive about optimising patterns with trie sequences? */ #define PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 1 + +/* Should the optimiser take positive assertions into account? */ #define PERL_ENABLE_POSITIVE_ASSERTION_STUDY 1 + +/* Not for production use: */ #define PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 0 + /* Unless the next line is uncommented it is illegal to combine lazy matching with possessive matching. Frankly it doesn't make much sense to allow it as X*?+ matches nothing, X+?+ matches a single char only, diff --git a/regcomp.sym b/regcomp.sym index d6b97d5..656988e 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -182,7 +182,6 @@ SKIP VERB, no-sv 1 On failure skip forward (to the mark) before retrying COMMIT VERB, no-sv 1 Pattern fails outright if backtracking through this CUTGROUP VERB, no-sv 1 On failure go to the next alternation in the group - # NEW STUFF ABOVE THIS LINE -- Please update counts below. ################################################################################ diff --git a/regexec.c b/regexec.c index e505fb4..8da6166 100644 --- a/regexec.c +++ b/regexec.c @@ -1635,7 +1635,9 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * /* strend: pointer to null at end of string */ /* strbeg: real beginning of string */ /* minend: end of match must be >=minend after stringarg. */ -/* data: May be used for some additional optimizations. */ +/* data: May be used for some additional optimizations. + Currently its only used, with a U32 cast, for transmitting + the ganch offset when doing a /g match. This will change */ /* nosave: For optimizations. */ { dVAR; @@ -1711,7 +1713,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * MAGIC *mg; if (flags & REXEC_IGNOREPOS) /* Means: check only at start */ - reginfo.ganch = startpos; + reginfo.ganch = startpos + prog->gofs; else if (sv && SvTYPE(sv) >= SVt_PVMG && SvMAGIC(sv) && (mg = mg_find(sv, PERL_MAGIC_regex_global)) @@ -1720,10 +1722,12 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * if (prog->reganch & ROPT_ANCH_GPOS) { if (s > reginfo.ganch) goto phooey; - s = reginfo.ganch; + s = reginfo.ganch - prog->gofs; } } - else /* pos() not defined */ + else if (data) { + reginfo.ganch = strbeg + (UV)data; + } else /* pos() not defined */ reginfo.ganch = strbeg; } if (PL_curpm && (PM_GETRE(PL_curpm) == prog)) { @@ -1810,7 +1814,8 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * /* the warning about reginfo.ganch being used without intialization is bogus -- we set it above, when prog->reganch & ROPT_GPOS_SEEN and we only enter this block when the same bit is set. */ - if (regtry(®info, ®info.ganch)) + char *tmp_s = reginfo.ganch - prog->gofs; + if (regtry(®info, &tmp_s)) goto got_it; goto phooey; } @@ -2623,6 +2628,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) during a successfull match */ U32 lastopen = 0; /* last open we saw */ bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; + /* these three flags are set by various ops to signal information to * the very next op. They have a useful lifetime of exactly one loop @@ -2643,7 +2649,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) GET_RE_DEBUG_FLAGS_DECL; #endif - DEBUG_STACK_r( { + DEBUG_OPTIMISE_r( { PerlIO_printf(Perl_debug_log,"regmatch start\n"); }); /* on first ever call to regmatch, allocate first slab */ @@ -4688,6 +4694,7 @@ NULL (long)(locinput - PL_reg_starttry), (long)(reginfo->till - PL_reg_starttry), PL_colors[5])); + sayNO_SILENT; /* Cannot match: too short. */ } PL_reginput = locinput; /* put where regtry can find it */ diff --git a/regexp.h b/regexp.h index 49d6cd1..7fa1884 100644 --- a/regexp.h +++ b/regexp.h @@ -54,6 +54,7 @@ typedef struct regexp { I32 refcnt; I32 minlen; /* mininum possible length of string to match */ I32 minlenret; /* mininum possible length of $& */ + U32 gofs; /* chars left of pos that we search from */ I32 prelen; /* length of precomp */ U32 nparens; /* number of parentheses */ U32 lastparen; /* last paren matched */ @@ -114,6 +115,7 @@ typedef struct regexp_engine { #define ROPT_MATCH_UTF8 0x10000000 /* subbeg is utf-8 */ #define ROPT_VERBARG_SEEN 0x20000000 #define ROPT_CUTGROUP_SEEN 0x40000000 +#define ROPT_GPOS_FLOAT 0x80000000 #define RE_USE_INTUIT_NOML 0x00100000 /* Best to intuit before matching */ #define RE_USE_INTUIT_ML 0x00200000 diff --git a/t/op/pat.t b/t/op/pat.t index cf6d54f..21db20c 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -4053,10 +4053,10 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) { { local $Message="RT#22395"; our $count; - for my $l (1,10,100,1000) { + for my $l (10,100,1000) { $count=0; ('a' x $l) =~ /(.*)(?{$count++})[bc]/; - iseq($l+1,$count,"Should be L+1 not L*(L+3)/2 (L=$l)"); + iseq( $count, $l + 1, "# TODO Should be L+1 not L*(L+3)/2 (L=$l)"); } } { @@ -4083,6 +4083,17 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) { iseq($count,3); iseq($text,' word2 word4 word6 '); } +{ + # RT#6893 + local $_= qq(A\nB\nC\n); + my @res; + while (m#(\G|\n)([^\n]*)\n#gsx) + { + push @res,"$2"; + last if @res>3; + } + iseq("@res","A B C","RT#6893: /g pattern shouldn't infinite loop"); +} { # From Message-ID: <877ixs6oa6.fsf@k75.linux.bogus> @@ -4094,6 +4105,13 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) { iseq($dow_name,$time_string,"UTF8 trie common prefix extraction"); } +{ + my $v; + ($v='bar')=~/(\w+)/g; + $v='foo'; + iseq("$1",'bar','# TODO $1 is safe after /g - may fail due to specialized config in pp_hot.c') +} + # Test counter is at bottom of file. Put new tests above here. #------------------------------------------------------------------- # Keep the following tests last -- they may crash perl @@ -4137,9 +4155,10 @@ ok((q(a)x 100) =~ /^(??{'(.)'x 100})/, } # Put new tests above the dotted line about a page above this comment - +iseq(0+$::test,$::TestCount,"Got the right number of tests!"); # Don't forget to update this! BEGIN { - $::TestCount = 1367; + $::TestCount = 1369; print "1..$::TestCount\n"; } + diff --git a/t/op/regmesg.t b/t/op/regmesg.t index fbfb6b2..d53a1f8 100644 --- a/t/op/regmesg.t +++ b/t/op/regmesg.t @@ -28,9 +28,9 @@ my @death = ( '/[[=foo=]]/' => 'POSIX syntax [= =] is reserved for future extensions in regex; marked by {#} in m/[[=foo=]{#}]/', - '/(?<= .*)/' => 'Variable length lookbehind not implemented in regex; marked by {#} in m/(?<= .*){#}/', + '/(?<= .*)/' => 'Variable length lookbehind not implemented in regex m/(?<= .*)/', - '/(?<= x{1000})/' => 'Lookbehind longer than 255 not implemented in regex; marked by {#} in m/(?<= x{1000}){#}/', + '/(?<= x{1000})/' => 'Lookbehind longer than 255 not implemented in regex m/(?<= x{1000})/', '/(?@)/' => 'Sequence (?@...) not implemented in regex; marked by {#} in m/(?@{#})/', -- 2.7.4