From e1901655935137420b3a46ad23c873753fcbbbc7 Mon Sep 17 00:00:00 2001 From: Ilya Zakharevich Date: Mon, 20 Nov 2000 13:30:52 -0500 Subject: [PATCH] Overeager visited-positions optimizations Message-ID: <20001120183051.A15228@monk.mps.ohio-state.edu> p4raw-id: //depot/perl@7815 --- pod/perlre.pod | 12 ++++++++---- regcomp.c | 34 +++++++++++++++++++++++++++------- t/op/re_tests | 2 ++ 3 files changed, 37 insertions(+), 11 deletions(-) diff --git a/pod/perlre.pod b/pod/perlre.pod index 8a85241..182f5bd 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -910,10 +910,14 @@ ways they can use backtracking to try match. For example, without internal optimizations done by the regular expression engine, this will take a painfully long time to run: - 'aaaaaaaaaaaa' =~ /((a{0,5}){0,5}){0,5}[c]/ - -And if you used C<*>'s instead of limiting it to 0 through 5 matches, -then it would take forever--or until you ran out of stack space. + 'aaaaaaaaaaaa' =~ /((a{0,5}){0,5})*[c]/ + +And if you used C<*>'s in the internal groups instead of limiting them +to 0 through 5 matches, then it would take forever--or until you ran +out of stack space. Moreover, these internal optimizations are not +always applicable. For example, if you put C<{0,5}> instead of C<*> +on the external group, no current optimization is applicable, and the +match takes a long time to finish. A powerful tool for optimizing such beasts is what is known as an "independent group", diff --git a/regcomp.c b/regcomp.c index 82fb3c1..3b474b2 100644 --- a/regcomp.c +++ b/regcomp.c @@ -227,6 +227,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define SCF_DO_STCLASS_AND 0x0800 #define SCF_DO_STCLASS_OR 0x1000 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) +#define SCF_WHILEM_VISITED_POS 0x2000 #define RF_utf8 8 #define UTF (PL_reg_flags & RF_utf8) @@ -723,6 +724,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; /* we suppose the run is continuous, last=next...*/ minnext = study_chunk(pRExC_state, &scan, &deltanext, next, &data_fake, f); @@ -953,6 +956,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg f |= SCF_DO_STCLASS_AND; f &= ~SCF_DO_STCLASS_OR; } + /* These are the cases when once a subexpression + fails at a particular position, it cannot succeed + even after backtracking at the enclosing scope. + + XXXX what if minimal match and we are at the + initial run of {n,m}? */ + if ((mincount != maxcount - 1) && (maxcount != REG_INFTY)) + f &= ~SCF_WHILEM_VISITED_POS; /* This will finish on WHILEM, setting scan, or on NULL: */ minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data, @@ -1083,13 +1094,20 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg } #endif /* Optimize again: */ - study_chunk(pRExC_state, &nxt1, &deltanext, nxt, NULL, 0); + study_chunk(pRExC_state, &nxt1, &deltanext, nxt, + NULL, 0); } else oscan->flags = 0; } - else if (OP(oscan) == CURLYX && data && ++data->whilem_c < 16) { - /* This stays as CURLYX, and can put the count/of pair. */ + else if ((OP(oscan) == CURLYX) + && (flags & SCF_WHILEM_VISITED_POS) + /* See the comment on a similar expression above. + However, this time it not a subexpression + we care about, but the expression itself. */ + && (maxcount == REG_INFTY) + && data && ++data->whilem_c < 16) { + /* This stays as CURLYX, we can put the count/of pair. */ /* Find WHILEM (as in regexec.c) */ regnode *nxt = oscan + NEXT_OFF(oscan); @@ -1419,8 +1437,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg && OP(scan) == IFMATCH ) { /* Lookahead */ cl_init(pRExC_state, &intrnl); data_fake.start_class = &intrnl; - f = SCF_DO_STCLASS_AND; + f |= SCF_DO_STCLASS_AND; } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f); @@ -1439,7 +1459,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg data->flags |= SF_HAS_EVAL; if (data) data->whilem_c = data_fake.whilem_c; - if (f) { + if (f & SCF_DO_STCLASS_AND) { int was = (data->start_class->flags & ANYOF_EOS); cl_and(data->start_class, &intrnl); @@ -1779,7 +1799,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) data.last_closep = &last_close; minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */ - &data, SCF_DO_SUBSTR | stclass_flag); + &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag); if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) && data.last_start_min == 0 && data.last_end > 0 && !RExC_seen_zerolen @@ -1888,7 +1908,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) cl_init(pRExC_state, &ch_class); data.start_class = &ch_class; data.last_closep = &last_close; - minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND); + minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS); r->check_substr = r->anchored_substr = r->float_substr = Nullsv; if (!(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { diff --git a/t/op/re_tests b/t/op/re_tests index 102157c..8aa6933 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -779,3 +779,5 @@ tt+$ xxxtt y - - ^(a\1?){4}$ aaaaaa y $1 aa ^(0+)?(?:x(1))? x1 y - - ^([0-9a-fA-F]+)(?:x([0-9a-fA-F]+)?)(?:x([0-9a-fA-F]+))? 012cxx0190 y - - +^(b+?|a){1,2}c bbbac y $1 a +^(b+?|a){1,2}c bbbbac y $1 a -- 2.7.4