From 1a08ba3a6099186231e66bc4524910ad4ae3c51d Mon Sep 17 00:00:00 2001 From: David Mitchell Date: Sun, 2 Feb 2014 14:47:34 +0000 Subject: [PATCH] re_intuit_start(): calc fbm_instr() end in bytes When calculating the end limit of the string to pass to fbm_instr(), we usually have a pointer to the latest point where the substr could start, whereas fbm_instr() expects a pointer to the latest point where the substr could end. Since fbm_instr() purely matches bytes (it cares not whether those bytes are part of a utf8 stream or not), the value of the latest end point will always be: (latest start point) + SvCUR(sv) - !!SvTAIL(sv) i.e. work in bytes, even if we have utf8 values. In some of the places where fbm_instr() is used, the calculation is being done partially or fully in chars rather than bytes. This is not incorrect, and indeed may in theory calculate a slightly lower end limit sometimes and thus stop the fbm earlier. But this comes at the cost of having to do utf8 length calculations and HOPs back from the end of the string. So we're trading off not having to do utf8 skips on the last few chars against the fbm not uselessly searching the last few chars. These roughly cancel each other out. But since we no longer do HOPs before starting the fbm, we win every time the fbm doesn't get near the end of the string. So in conclusion, simpler code and better than or equal performance. 
--- regexec.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/regexec.c b/regexec.c index 5b0303c..b49b8eb 100644 --- a/regexec.c +++ b/regexec.c @@ -845,9 +845,11 @@ Perl_re_intuit_start(pTHX_ && prog->check_offset_max != SSize_t_MAX && start_shift < prog->check_offset_max) { - SSize_t off = prog->check_offset_max - start_shift - + CHR_SVLEN(check) - !!SvTAIL(check); - end_point = HOP3lim(start_point, off, end_point); + SSize_t len = SvCUR(check) - !!SvTAIL(check); + end_point = HOP3lim(start_point, + prog->check_offset_max - start_shift, + end_point -len) + + len; } DEBUG_OPTIMISE_MORE_r({ @@ -948,7 +950,7 @@ Perl_re_intuit_start(pTHX_ assert(SvPOK(must)); s = fbm_instr( (unsigned char*)s, - HOP3(last + SvCUR(must), -(SvTAIL(must)!=0), strbeg), + (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0), must, multiline ? FBMrf_MULTILINE : 0 ); -- 2.7.4