From: David Mitchell Date: Sat, 18 Jan 2014 23:46:49 +0000 (+0000) Subject: re_intuit_start(): bias last* vars; revive reghop4 X-Git-Tag: upstream/5.20.0~464^2~50 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=2974eaec4289b97f9c1cee4ae413d4864a578ed0;p=platform%2Fupstream%2Fperl.git re_intuit_start(): bias last* vars; revive reghop4 In the "just matched float substr, now match fixed substr" branch, initially add an extra prog->anchored_offset to the last and last2 vars; since a lot of the later calculations involve adding anchored_offset, doing this early to the last* vars means less work in some cases. In particular, last is calculated from s by a single HOP4(s, prog->anchored_offset-start_shift,...) rather than two separate HOP3(s, -start_shift,...); HOP3(..., prog->anchored_offset,...); which may mostly cancel each other out. Similarly with last2. Later, we can skip adding prog->anchored_offset to last1, since its antecedents already have the bias added. In the case of failure, calculating a new start position involves an extra HOP to s, but removes a HOP from other_last, so the two cancel out. To make this work, I revived the reghop4() function which had been commented out, and added a HOP4c() wrapper macro. This is like HOP3c(), but allows you to specify both lower and upper limits. Useful when you don't know the sign of the offset in advance. (Yves had earlier added this function, but had commented it out until such time as it was actually used.) I also added some extra comments to this block and removed the comment about it being maybe broken under utf8, since I'm auditing the code for utf8-safeness. --- diff --git a/embed.fnc b/embed.fnc index 64aa735..6f743e4 100644 --- a/embed.fnc +++ b/embed.fnc @@ -2169,10 +2169,8 @@ ERsn |U8* |reghop3 |NN U8 *s|SSize_t off|NN const U8 *lim ERsM |SV* |core_regclass_swash|NULLOK const regexp *prog \ |NN const struct regnode *node|bool doinit \ |NULLOK SV **listsvp -#ifdef XXX_dmq ERsn |U8* |reghop4 |NN U8 *s|SSize_t off|NN const U8 *llim \ |NN const U8 *rlim -#endif ERsn |U8* |reghopmaybe3 |NN U8 *s|SSize_t off|NN const U8 *lim ERs |char* |find_byclass |NN regexp * prog|NN const regnode *c \ |NN char *s|NN const char *strend \ diff --git a/embed.h b/embed.h index 2a6064c..d1224eb 100644 --- a/embed.h +++ b/embed.h @@ -1004,6 +1004,7 @@ #define regcppop(a,b) S_regcppop(aTHX_ a,b) #define regcppush(a,b,c) S_regcppush(aTHX_ a,b,c) #define reghop3 S_reghop3 +#define reghop4 S_reghop4 #define reghopmaybe3 S_reghopmaybe3 #define reginclass(a,b,c,d,e) S_reginclass(aTHX_ a,b,c,d,e) #define regmatch(a,b,c) S_regmatch(aTHX_ a,b,c) @@ -1011,9 +1012,6 @@ #define regtry(a,b) S_regtry(aTHX_ a,b) #define to_byte_substr(a) S_to_byte_substr(aTHX_ a) #define to_utf8_substr(a) S_to_utf8_substr(aTHX_ a) -# if defined(XXX_dmq) -#define reghop4 S_reghop4 -# endif # endif # if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) #define _to_fold_latin1(a,b,c,d) Perl__to_fold_latin1(aTHX_ a,b,c,d) diff --git a/proto.h b/proto.h index 0115eab..88e246a 100644 --- a/proto.h +++ b/proto.h @@ -7175,6 +7175,14 @@ STATIC U8* S_reghop3(U8 *s, SSize_t off, const U8 *lim) #define PERL_ARGS_ASSERT_REGHOP3 \ assert(s); assert(lim) +STATIC U8* S_reghop4(U8 *s, SSize_t off, const U8 *llim, const U8 *rlim) + __attribute__warn_unused_result__ + __attribute__nonnull__(1) + __attribute__nonnull__(3) + __attribute__nonnull__(4); +#define PERL_ARGS_ASSERT_REGHOP4 \ + assert(s); assert(llim); assert(rlim) + STATIC U8* S_reghopmaybe3(U8 *s, SSize_t off, const U8 *lim) __attribute__warn_unused_result__ __attribute__nonnull__(1) @@ -7224,16 +7232,6 @@ STATIC void S_to_utf8_substr(pTHX_ regexp * prog) #define PERL_ARGS_ASSERT_TO_UTF8_SUBSTR \ assert(prog) -# if defined(XXX_dmq) -STATIC U8* S_reghop4(U8 *s, SSize_t off, const U8 *llim, const U8 *rlim) - __attribute__warn_unused_result__ - __attribute__nonnull__(1) - __attribute__nonnull__(3) - __attribute__nonnull__(4); -#define PERL_ARGS_ASSERT_REGHOP4 \ - assert(s); assert(llim); assert(rlim) - -# endif #endif #if defined(PERL_IN_SCOPE_C) STATIC void S_save_pushptri32ptr(pTHX_ void *const ptr1, const I32 i, void *const ptr2, const int type); diff --git a/regexec.c b/regexec.c index f28ef7d..8b2d7c5 100644 --- a/regexec.c +++ b/regexec.c @@ -135,6 +135,10 @@ static const char* const non_utf8_target_but_utf8_required ? reghop3((U8*)(pos), off, (U8*)(lim)) \ : (U8*)((pos + off) > lim ? lim : (pos + off))) +#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \ + ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \ + : (U8*)(pos + off)) +#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim)) #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ #define NEXTCHR_IS_EOS (nextchr < 0) @@ -906,11 +910,24 @@ Perl_re_intuit_start(pTHX_ if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) { do_other_anchored: { - char * const last = HOP3c(s, -start_shift, strbeg); + char * last; char *last1, *last2; char * const saved_s = s; SV* must; + /* we've previously found a floating substr starting at s. + * This means that the regex origin must lie somewhere + * between min: HOP3(s, -check_offset_max) + * between max: HOP3(s, -check_offset_min) + * (except that min will be >= strpos) + * So the fixed substr must lie somewhere between + * HOP3(min, anchored_offset) + * HOP3(max, anchored_offset) + SvCUR(substr) + */ + assert(strpos + start_shift <= s); + last = HOP4c(s, prog->anchored_offset-start_shift, + strbeg, strend); + t = s - prog->check_offset_max; if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ && (!utf8_target @@ -922,14 +939,12 @@ Perl_re_intuit_start(pTHX_ t = HOP3c(t, prog->anchored_offset, strend); if (t < other_last) /* These positions already checked */ t = other_last; - last2 = last1 = HOP3c(strend, -prog->minlen, strbeg); + + assert(prog->minlen > prog->anchored_offset); + last2 = last1 = HOP3c(strend, + prog->anchored_offset-prog->minlen, strbeg); if (last < last1) last1 = last; - /* XXXX It is not documented what units *_offsets are in. - We assume bytes, but this is clearly wrong. - Meaning this code needs to be carefully reviewed for errors. - dmq. - */ /* On end-of-str: see comment below. */ must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr; @@ -940,8 +955,7 @@ Perl_re_intuit_start(pTHX_ else s = fbm_instr( (unsigned char*)t, - HOP3(HOP3(last1, prog->anchored_offset, strend) - + SvCUR(must), -(SvTAIL(must)!=0), strbeg), + HOP3(last1 + SvCUR(must), -(SvTAIL(must)!=0), strbeg), must, multiline ? FBMrf_MULTILINE : 0 ); @@ -963,8 +977,8 @@ Perl_re_intuit_start(pTHX_ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, ", trying floating at offset %ld...\n", (long)(HOP3c(saved_s, 1, strend) - i_strpos))); - other_last = HOP3c(last1, prog->anchored_offset+1, strend); - s = HOP3c(last, 1, strend); + other_last = HOP3c(last1, 1, strend); + s = HOP4c(last, 1 - prog->anchored_offset, strbeg, strend); goto restart; } else { @@ -7797,11 +7811,6 @@ S_reghop3(U8 *s, SSize_t off, const U8* lim) return s; } -#ifdef XXX_dmq -/* there are a bunch of places where we use two reghop3's that should - be replaced with this routine. but since thats not done yet - we ifdef it out - dmq -*/ STATIC U8 * S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) { @@ -7827,7 +7836,6 @@ S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) } return s; } -#endif STATIC U8 * S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)