From 49f55535e0b402f8cbdf839b5f2c88306c91a31d Mon Sep 17 00:00:00 2001 From: Father Chrysostomos Date: Tue, 30 Jul 2013 23:49:58 -0700 Subject: [PATCH] Stop substr re optimisation from rejecting long strs Using I32 for the fields that record information about the location of a fixed string that must be found for a regular expression to match can result in match failures, because I32 is not large enough to store offsets >= 2**31. SSize_t is appropriate, since it is 64 bits on 64-bit platforms and 32 bits on 32-bit platforms. This commit changes enough instances of I32 to SSize_t to get the added test passing and suppress compiler warnings. A later commit will change many more. --- embed.fnc | 4 ++-- proto.h | 2 +- regcomp.c | 39 ++++++++++++++++++++------------------- regexec.c | 4 ++-- regexp.h | 4 ++-- t/bigmem/regexp.t | 6 +++++- 6 files changed, 32 insertions(+), 27 deletions(-) diff --git a/embed.fnc b/embed.fnc index 39b3bb4..ecf3e23 100644 --- a/embed.fnc +++ b/embed.fnc @@ -2045,9 +2045,9 @@ Esn |void |cl_and |NN struct regnode_charclass_class *cl \ Esn |void |cl_or |NN const struct RExC_state_t *pRExC_state \ |NN struct regnode_charclass_class *cl \ |NN const struct regnode_charclass_class *or_with -Es |I32 |study_chunk |NN struct RExC_state_t *pRExC_state \ +Es |SSize_t|study_chunk |NN struct RExC_state_t *pRExC_state \ |NN regnode **scanp|NN I32 *minlenp \ - |NN I32 *deltap|NN regnode *last \ + |NN SSize_t *deltap|NN regnode *last \ |NULLOK struct scan_data_t *data \ |I32 stopparen|NULLOK U8* recursed \ |NULLOK struct regnode_charclass_class *and_withp \ diff --git a/proto.h b/proto.h index d7c9f32..26b52a3 100644 --- a/proto.h +++ b/proto.h @@ -6773,7 +6773,7 @@ STATIC void S_scan_commit(pTHX_ const struct RExC_state_t *pRExC_state, struct s #define PERL_ARGS_ASSERT_SCAN_COMMIT \ assert(pRExC_state); assert(data); assert(minlenp) -STATIC I32 S_study_chunk(pTHX_ struct RExC_state_t *pRExC_state, regnode **scanp, I32 *minlenp, I32 *deltap, regnode *last, struct scan_data_t *data, I32 stopparen, U8* recursed, struct regnode_charclass_class *and_withp, U32 flags, U32 depth) +STATIC SSize_t S_study_chunk(pTHX_ struct RExC_state_t *pRExC_state, regnode **scanp, I32 *minlenp, SSize_t *deltap, regnode *last, struct scan_data_t *data, I32 stopparen, U8* recursed, struct regnode_charclass_class *and_withp, U32 flags, U32 depth) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2) __attribute__nonnull__(pTHX_3) diff --git a/regcomp.c b/regcomp.c index 5a55756..1f89754 100644 --- a/regcomp.c +++ b/regcomp.c @@ -345,25 +345,25 @@ typedef struct RExC_state_t { typedef struct scan_data_t { /*I32 len_min; unused */ /*I32 len_delta; unused */ - I32 pos_min; + SSize_t pos_min; I32 pos_delta; SV *last_found; I32 last_end; /* min value, <0 unless valid. */ - I32 last_start_min; + SSize_t last_start_min; I32 last_start_max; SV **longest; /* Either &l_fixed, or &l_float. */ SV *longest_fixed; /* longest fixed string found in pattern */ - I32 offset_fixed; /* offset where it starts */ + SSize_t offset_fixed; /* offset where it starts */ I32 *minlen_fixed; /* pointer to the minlen relevant to the string */ I32 lookbehind_fixed; /* is the position of the string modfied by LB */ SV *longest_float; /* longest floating string found in pattern */ - I32 offset_float_min; /* earliest point in string it can appear */ + SSize_t offset_float_min; /* earliest point in string it can appear */ I32 offset_float_max; /* latest point in string it can appear */ I32 *minlen_float; /* pointer to the minlen relevant to the string */ - I32 lookbehind_float; /* is the position of the string modified by LB */ + SSize_t lookbehind_float; /* is the pos of the string modified by LB */ I32 flags; I32 whilem_c; - I32 *last_closep; + SSize_t *last_closep; struct regnode_charclass_class *start_class; } scan_data_t; @@ -3028,9 +3028,9 @@ typedef struct scan_frame { #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) -STATIC I32 +STATIC SSize_t S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, - I32 *minlenp, I32 *deltap, + I32 *minlenp, SSize_t *deltap, regnode *last, scan_data_t *data, I32 stopparen, @@ -3124,7 +3124,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* NOTE - There is similar code to this block below for handling TRIE nodes on a re-study. If you change stuff here check there too. */ - I32 max1 = 0, min1 = I32_MAX, num = 0; + SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0; struct regnode_charclass_class accum; regnode * const startbranch=scan; @@ -3134,7 +3134,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, cl_init_zero(pRExC_state, &accum); while (OP(scan) == code) { - I32 deltanext, minnext, f = 0, fake; + SSize_t deltanext, minnext, fake; + I32 f = 0; struct regnode_charclass_class this_class; num++; @@ -3678,7 +3679,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, flags &= ~SCF_DO_STCLASS; } else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */ - I32 l = STR_LEN(scan); + SSize_t l = STR_LEN(scan); UV uc = *((U8*)STRING(scan)); /* Search for fixed substrings supports EXACT only. */ @@ -3795,8 +3796,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, flags &= ~SCF_DO_STCLASS; } else if (REGNODE_VARIES(OP(scan))) { - I32 mincount, maxcount, minnext, deltanext, fl = 0; - I32 f = flags, pos_before = 0; + SSize_t mincount, maxcount, minnext, deltanext; + I32 fl = 0, f = flags, pos_before = 0; regnode * const oscan = scan; struct regnode_charclass_class this_class; struct regnode_charclass_class *oclass = NULL; @@ -4373,7 +4374,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext In this case we can't do fixed string optimisation. */ - I32 deltanext, minnext, fake = 0; + SSize_t deltanext, minnext, fake = 0; regnode *nscan; struct regnode_charclass_class intrnl; int f = 0; @@ -4615,7 +4616,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext for ( word=1 ; word <= trie->wordcount ; word++) { - I32 deltanext=0, minnext=0, f = 0, fake; + SSize_t deltanext=0, minnext=0, f = 0, fake; struct regnode_charclass_class this_class; data_fake.flags = 0; @@ -6075,11 +6076,11 @@ reStudy: /* testing for BRANCH here tells us whether there is "must appear" data in the pattern. If there is then we can use it for optimisations */ if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */ - I32 fake; + SSize_t fake; STRLEN longest_float_length, longest_fixed_length; struct regnode_charclass_class ch_class; /* pointed to by data */ int stclass_flag; - I32 last_close = 0; /* pointed to by data */ + SSize_t last_close = 0; /* pointed to by data */ regnode *first= scan; regnode *first_next= regnext(first); /* @@ -6361,9 +6362,9 @@ reStudy: } else { /* Several toplevels. Best we can is to set minlen. */ - I32 fake; + SSize_t fake; struct regnode_charclass_class ch_class; - I32 last_close = 0; + SSize_t last_close = 0; DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); diff --git a/regexec.c b/regexec.c index 29991b5..eaabe8c 100644 --- a/regexec.c +++ b/regexec.c @@ -615,7 +615,7 @@ Perl_re_intuit_start(pTHX_ { dVAR; struct regexp *const prog = ReANY(rx); - I32 start_shift = 0; + SSize_t start_shift = 0; /* Should be nonnegative! */ I32 end_shift = 0; char *s; @@ -751,7 +751,7 @@ Perl_re_intuit_start(pTHX_ the "check" substring in the region corrected by start/end_shift. */ { - I32 srch_start_shift = start_shift; + SSize_t srch_start_shift = start_shift; I32 srch_end_shift = end_shift; U8* start_point; U8* end_point; diff --git a/regexp.h b/regexp.h index ebbbde1..8542cb1 100644 --- a/regexp.h +++ b/regexp.h @@ -36,8 +36,8 @@ struct regexp_engine; struct regexp; struct reg_substr_datum { - I32 min_offset; - I32 max_offset; + SSize_t min_offset; + SSize_t max_offset; SV *substr; /* non-utf8 variant */ SV *utf8_substr; /* utf8 variant */ I32 end_shift; diff --git a/t/bigmem/regexp.t b/t/bigmem/regexp.t index 9404f2c..d5496f1 100644 --- a/t/bigmem/regexp.t +++ b/t/bigmem/regexp.t @@ -12,7 +12,7 @@ $ENV{PERL_TEST_MEMORY} >= 2 $Config{ptrsize} >= 8 or skip_all("Need 64-bit pointers for this test"); -plan(5); +plan(6); # [perl #116907] # ${\2} to defeat constant folding, which in this case actually slows @@ -33,3 +33,7 @@ $x =~ /./g; is "$'", 'efg', q "$' after match against long string"; is "$-[0],$+[0]", '2147483651,2147483652', '@- and @+ after matches past 2**31'; + +# Substring optimisations +is $x =~ /(?:(?:.{32766}){32766}){2}(?:.{32766}){8}.{8}ef/, 1, + 'anchored substr past 2**31'; -- 2.7.4