From d6ef167873919ed43a86136ba20f5a410a05e7ca Mon Sep 17 00:00:00 2001 From: David Mitchell Date: Fri, 13 Dec 2013 16:35:14 +0000 Subject: [PATCH] RT#120692 slow intuit with long utf8 strings Some code in re_intuit_start() that tries to find the range of chars to which the BM substr find can be applied, uses logic that is very inefficient once utf8 was enabled. Basically the code tries to find the maximum end-point where the substr could be found, by taking the minimum of: * start + prog->check_offset_max + length(substr) * end - prog->check_end_shift Except that these values are in char lengths and need to be converted to bytes before calling fbm_instr(). The code formerly involved scanning the whole of the remaining string to determine how many chars it had. By doing the calculation a different way, we can avoid this. This makes the following two regexps each take milliseconds rather than 10s of seconds: my $s = 'ab' x 1_000_000; utf8::upgrade($s); 1 while $s =~ m/\Ga+ba+b/g; $s=~ /^a{1,2}x/ for 1..10_000; --- regexec.c | 27 +++++++++++++++++++-------- t/re/pat.t | 16 +++++++++++++++- t/re/re_tests | 1 + 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/regexec.c b/regexec.c index 374f413..08f1066 100644 --- a/regexec.c +++ b/regexec.c @@ -628,6 +628,7 @@ Perl_re_intuit_start(pTHX_ SSize_t start_shift = 0; /* Should be nonnegative! */ SSize_t end_shift = 0; + SSize_t max_shift = -1; /* max char start position of floating substr */ char *s; SV *check; char *t; @@ -747,14 +748,8 @@ Perl_re_intuit_start(pTHX_ start_shift = prog->check_offset_min; /* okay to underestimate on CC */ end_shift = prog->check_end_shift; - if (!ml_anch) { - const SSize_t end = prog->check_offset_max + CHR_SVLEN(check) - - (SvTAIL(check) != 0); - const SSize_t eshift = CHR_DIST((U8*)strend, (U8*)s) - end; - - if (end_shift < eshift) - end_shift = eshift; - } + if (!ml_anch && prog->check_offset_max != SSize_t_MAX) + max_shift = prog->check_offset_max; } else { /* Can match at random position */ ml_anch = 0; @@ -795,6 +790,7 @@ Perl_re_intuit_start(pTHX_ }); if (prog->intflags & PREGf_CANY_SEEN) { + start_point= (U8*)(s + srch_start_shift); end_point= (U8*)(strend - srch_end_shift); } else { @@ -802,6 +798,21 @@ Perl_re_intuit_start(pTHX_ end_point= HOP3(strend, -srch_end_shift, strbeg); } + if (max_shift != -1) { + U8 *p = (U8*)s; + + assert(max_shift >= 0); + if (srch_start_shift > 0) + p = start_point; /* don't HOP over chars already HOPed */ + if (p < end_point) + p = HOP3(p, + (max_shift - (srch_end_shift > 0 ? srch_start_shift : 0) + + CHR_SVLEN(check) - (SvTAIL(check) != 0)), + end_point); + if (p < end_point) + end_point = p; + } + DEBUG_OPTIMISE_MORE_r({ PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n", (int)(end_point - start_point), diff --git a/t/re/pat.t b/t/re/pat.t index 91274e6..a052ee7 100644 --- a/t/re/pat.t +++ b/t/re/pat.t @@ -20,7 +20,7 @@ BEGIN { require './test.pl'; } -plan tests => 712; # Update this when adding/deleting tests. +plan tests => 714; # Update this when adding/deleting tests. run_tests() unless caller; @@ -1513,6 +1513,20 @@ EOP is($i, 0, "RT 120446: mustn't run slowly"); } + { + # [perl #120692] + # these tests should be virtually instantaneous. If they take 10s of + # seconds, there's a bug in intuit_start. + + my $s = 'ab' x 1_000_000; + utf8::upgrade($s); + 1 while $s =~ m/\Ga+ba+b/g; + pass("RT#120692 \\G mustn't run slowly"); + + $s=~ /^a{1,2}x/ for 1..10_000; + pass("RT#120692 a{1,2} mustn't run slowly"); + } + # These are based on looking at the code in regcomp.c # We don't look for specific code, just the existence of an SSC foreach my $re (qw( qr/a?c/ diff --git a/t/re/re_tests b/t/re/re_tests index c6ff296..525118d 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1851,6 +1851,7 @@ A+(*PRUNE)BC(?{}) AAABC y $& AAABC # utf8 cache length panics \x{100}[xy]\x{100}{2} \x{100}y\x{100}\x{100} y $& \x{100}y\x{100}\x{100} \x{100}a{2,3} \x{100}aaa y $& \x{100}aaa +^x?abc?de abcde y $& abcde # Keep these lines at the end of the file # vim: softtabstop=0 noexpandtab -- 2.7.4