From bcdf74043c1fd4b60b7764f5cd7d87525cf77e74 Mon Sep 17 00:00:00 2001 From: Yves Orton Date: Wed, 31 Jan 2007 00:58:31 +0100 Subject: [PATCH] Improve regex stringification code Message-ID: <9b18b3110701301458k2f6a8254hea6c6db28489c38b@mail.gmail.com> p4raw-id: //depot/perl@30084 --- embed.fnc | 5 +- embed.h | 2 + ext/re/re.xs | 7 +- ext/re/t/re_funcs.t | 17 ++-- op.c | 5 +- perl.h | 25 +----- pod/perlapi.pod | 29 +++++++ proto.h | 7 +- regcomp.c | 222 ++++++++++++++++++++++++++++++---------------------- regcomp.h | 1 + regexec.c | 2 +- regexp.h | 32 +++++++- 12 files changed, 217 insertions(+), 137 deletions(-) diff --git a/embed.fnc b/embed.fnc index 2b3112d..5564a8c 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1339,8 +1339,9 @@ Es |void |reginsert |NN struct RExC_state_t *state|U8 op|NN regnode *opnd|U32 de Es |void |regtail |NN struct RExC_state_t *state|NN regnode *p|NN const regnode *val|U32 depth Es |SV * |reg_scan_name |NN struct RExC_state_t *state|U32 flags Es |U32 |join_exact |NN struct RExC_state_t *state|NN regnode *scan|NN I32 *min|U32 flags|NULLOK regnode *val|U32 depth -EsRn |char* |regwhite |NN char *p|NN const char *e -Es |char* |nextchar |NN struct RExC_state_t *state +EsRn |char * |regwhite |NN struct RExC_state_t *state|NN char *p +Es |char * |nextchar |NN struct RExC_state_t *state +Es |bool |reg_skipcomment|NN struct RExC_state_t *state Es |void |scan_commit |NN const struct RExC_state_t* state|NN struct scan_data_t *data|NN I32 *minlenp|int is_inf Esn |void |cl_anything |NN const struct RExC_state_t* state|NN struct regnode_charclass_class *cl EsRn |int |cl_is_anything |NN const struct regnode_charclass_class *cl diff --git a/embed.h b/embed.h index aa6b1dd..64a3c3d 100644 --- a/embed.h +++ b/embed.h @@ -1333,6 +1333,7 @@ #define join_exact S_join_exact #define regwhite S_regwhite #define nextchar S_nextchar +#define reg_skipcomment S_reg_skipcomment #define scan_commit S_scan_commit #define cl_anything S_cl_anything #define cl_is_anything S_cl_is_anything @@ -3542,6 +3543,7 @@ #define join_exact(a,b,c,d,e,f) S_join_exact(aTHX_ a,b,c,d,e,f) #define regwhite S_regwhite #define nextchar(a) S_nextchar(aTHX_ a) +#define reg_skipcomment(a) S_reg_skipcomment(aTHX_ a) #define scan_commit(a,b,c,d) S_scan_commit(aTHX_ a,b,c,d) #define cl_anything S_cl_anything #define cl_is_anything S_cl_is_anything diff --git a/ext/re/re.xs b/ext/re/re.xs index aa601cf..1bc20fc 100644 --- a/ext/re/re.xs +++ b/ext/re/re.xs @@ -120,7 +120,7 @@ PPCODE: modifiers" in this scenario. */ - char *fptr = "msix"; + char *fptr = INT_PAT_MODS; char ch; U16 match_flags = (U16)((re->extflags & PMf_COMPILETIME) >> 12); @@ -140,11 +140,8 @@ PPCODE: XSRETURN(2); } else { /* Scalar, so use the string that Perl would return */ - if (!mg->mg_ptr) - CALLREG_STRINGIFY(mg,0,0); - /* return the pattern in (?msix:..) format */ - pattern = sv_2mortal(newSVpvn(mg->mg_ptr,mg->mg_len)); + pattern = sv_2mortal(newSVpvn(re->wrapped,re->wraplen)); if (re->extflags & RXf_UTF8) SvUTF8_on(pattern); XPUSHs(pattern); diff --git a/ext/re/t/re_funcs.t b/ext/re/t/re_funcs.t index 736829c..bf8202a 100644 --- a/ext/re/t/re_funcs.t +++ b/ext/re/t/re_funcs.t @@ -16,14 +16,15 @@ use Test::More; # test count at bottom of file use re qw(is_regexp regexp_pattern regmust regname regnames regnames_count regnames_iterinit regnames_iternext); -my $qr=qr/foo/i; - -ok(is_regexp($qr),'is_regexp($qr)'); -ok(!is_regexp(''),'is_regexp("")'); -is((regexp_pattern($qr))[0],'foo','regexp_pattern[0]'); -is((regexp_pattern($qr))[1],'i','regexp_pattern[1]'); -is(regexp_pattern($qr),'(?i-xsm:foo)','scalar regexp_pattern'); -ok(!regexp_pattern(''),'!regexp_pattern("")'); +{ + my $qr=qr/foo/ki; + ok(is_regexp($qr),'is_regexp($qr)'); + ok(!is_regexp(''),'is_regexp("")'); + is((regexp_pattern($qr))[0],'foo','regexp_pattern[0]'); + is((regexp_pattern($qr))[1],'ik','regexp_pattern[1]'); + is(regexp_pattern($qr),'(?ki-xsm:foo)','scalar regexp_pattern'); + ok(!regexp_pattern(''),'!regexp_pattern("")'); +} { my $qr=qr/here .* there/x; my ($anchored,$floating)=regmust($qr); diff --git a/op.c b/op.c index 77d183b..ce3261e 100644 --- a/op.c +++ b/op.c @@ -7295,9 +7295,10 @@ Perl_ck_join(pTHX_ OP *o) if (ckWARN(WARN_SYNTAX)) { const REGEXP *re = PM_GETRE(kPMOP); const char *pmstr = re ? re->precomp : "STRING"; + const STRLEN len = re ? re->prelen : 6; Perl_warner(aTHX_ packWARN(WARN_SYNTAX), - "/%s/ should probably be written as \"%s\"", - pmstr, pmstr); + "/%.*s/ should probably be written as \"%.*s\"", + len, pmstr, len, pmstr); } } return ck_fun(o); diff --git a/perl.h b/perl.h index fd77c0e..e2d4253 100644 --- a/perl.h +++ b/perl.h @@ -229,30 +229,7 @@ #endif -/* chars and strings used as regex pattern modifiers - * Singlular is a 'c'har, plural is a "string" - */ -#define EXEC_PAT_MOD 'e' -#define KEEPCOPY_PAT_MOD 'k' -#define ONCE_PAT_MOD 'o' -#define GLOBAL_PAT_MOD 'g' -#define CONTINUE_PAT_MOD 'c' -#define MULTILINE_PAT_MOD 'm' -#define SINGLE_PAT_MOD 's' -#define IGNORE_PAT_MOD 'i' -#define XTENDED_PAT_MOD 'x' - -#define ONCE_PAT_MODS "o" -#define KEEPCOPY_PAT_MODS "k" -#define EXEC_PAT_MODS "e" -#define LOOP_PAT_MODS "gc" - -#define STD_PAT_MODS "msix" - -#define EXT_PAT_MODS ONCE_PAT_MODS KEEPCOPY_PAT_MODS -#define QR_PAT_MODS STD_PAT_MODS EXT_PAT_MODS -#define M_PAT_MODS QR_PAT_MODS LOOP_PAT_MODS -#define S_PAT_MODS M_PAT_MODS EXEC_PAT_MODS + /* diff --git a/pod/perlapi.pod b/pod/perlapi.pod index 1ce3684..18b87a1 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -134,6 +134,35 @@ array itself. =for hackers Found in file av.c +=item av_create_and_push +X + +Push an SV onto the end of the array, creating the array if necessary. +A small internal helper function to remove a commonly duplicated idiom. + +NOTE: this function is experimental and may change or be +removed without notice. + + void av_create_and_push(AV **const avp, SV *const val) + +=for hackers +Found in file av.c + +=item av_create_and_unshift_one +X + +Unshifts an SV onto the beginning of the array, creating the array if +necessary. +A small internal helper function to remove a commonly duplicated idiom. + +NOTE: this function is experimental and may change or be +removed without notice. + + SV** av_create_and_unshift_one(AV **const avp, SV *const val) + +=for hackers +Found in file av.c + =item av_delete X diff --git a/proto.h b/proto.h index 5eed24f..8110b29 100644 --- a/proto.h +++ b/proto.h @@ -3635,12 +3635,15 @@ STATIC U32 S_join_exact(pTHX_ struct RExC_state_t *state, regnode *scan, I32 *mi __attribute__nonnull__(pTHX_2) __attribute__nonnull__(pTHX_3); -STATIC char* S_regwhite(char *p, const char *e) +STATIC char * S_regwhite(struct RExC_state_t *state, char *p) __attribute__warn_unused_result__ __attribute__nonnull__(1) __attribute__nonnull__(2); -STATIC char* S_nextchar(pTHX_ struct RExC_state_t *state) +STATIC char * S_nextchar(pTHX_ struct RExC_state_t *state) + __attribute__nonnull__(pTHX_1); + +STATIC bool S_reg_skipcomment(pTHX_ struct RExC_state_t *state) __attribute__nonnull__(pTHX_1); STATIC void S_scan_commit(pTHX_ const struct RExC_state_t* state, struct scan_data_t *data, I32 *minlenp, int is_inf) diff --git a/regcomp.c b/regcomp.c index ec79ced..7a74cfc 100644 --- a/regcomp.c +++ b/regcomp.c @@ -662,7 +662,7 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min } data->last_end = -1; data->flags &= ~SF_BEFORE_EOL; - DEBUG_STUDYDATA("cl_anything: ",data,0); + DEBUG_STUDYDATA("commit: ",data,0); } /* Can match anything (initialization) */ @@ -2050,6 +2050,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* needed for dumping*/ DEBUG_r(if (optimize) { regnode *opt = convert; + while ( ++opt < optimize) { Set_Node_Offset_Length(opt,0,0); } @@ -4139,8 +4140,64 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) r->engine= RE_ENGINE_PTR; r->refcnt = 1; r->prelen = xend - exp; - r->precomp = savepvn(RExC_precomp, r->prelen); r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; + { + bool has_k = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); + bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); + bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); + U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12); + const char *fptr = STD_PAT_MODS; /*"msix"*/ + char *p; + r->wraplen = r->prelen + has_minus + has_k + has_runon + + (sizeof(STD_PAT_MODS) - 1) + + (sizeof("(?:)") - 1); + + Newx(r->wrapped, r->wraplen, char ); + p = r->wrapped; + *p++='('; *p++='?'; + if (has_k) + *p++ = KEEPCOPY_PAT_MOD; /*'k'*/ + { + char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1; + char *colon = r + 1; + char ch; + + while((ch = *fptr++)) { + if(reganch & 1) + *p++ = ch; + else + *r-- = ch; + reganch >>= 1; + } + if(has_minus) { + *r = '-'; + p = colon; + } + } + + *p++=':'; + Copy(RExC_precomp, p, r->prelen, char); + r->precomp = p; + p += r->prelen; + if (has_runon) + *p++='\n'; + *p=')'; + + + if (0) + PerlIO_printf(Perl_debug_log, + "RExC_precomp: %.*s\nr->precomp: %.*s\nr->wrapped:%.*s\n", + r->prelen, + RExC_precomp, + r->prelen, + r->precomp, + r->wraplen, + r->wrapped + ); + + + } + r->intflags = 0; r->nparens = RExC_npar - 1; /* set early to validate backrefs */ @@ -6654,9 +6711,7 @@ tryagain: case '#': if (RExC_flags & RXf_PMf_EXTENDED) { - while (RExC_parse < RExC_end && *RExC_parse != '\n') - RExC_parse++; - if (RExC_parse < RExC_end) + if ( reg_skipcomment( pRExC_state ) ) goto tryagain; } /* FALL THROUGH */ @@ -6685,7 +6740,7 @@ tryagain: char * const oldp = p; if (RExC_flags & RXf_PMf_EXTENDED) - p = regwhite(p, RExC_end); + p = regwhite( pRExC_state, p ); switch (*p) { case '^': case '$': @@ -6833,13 +6888,13 @@ tryagain: ender = *p++; break; } - if (RExC_flags & RXf_PMf_EXTENDED) - p = regwhite(p, RExC_end); + if ( RExC_flags & RXf_PMf_EXTENDED) + p = regwhite( pRExC_state, p ); if (UTF && FOLD) { /* Prime the casefolded buffer. */ ender = toFOLD_uni(ender, tmpbuf, &foldlen); } - if (ISMULT2(p)) { /* Back off on ?+*. */ + if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; else if (UTF) { @@ -6941,15 +6996,22 @@ tryagain: } STATIC char * -S_regwhite(char *p, const char *e) +S_regwhite( RExC_state_t *pRExC_state, char *p ) { + const char *e = RExC_end; while (p < e) { if (isSPACE(*p)) ++p; else if (*p == '#') { + bool ended = 0; do { - p++; - } while (p < e && *p != '\n'); + if (*p++ == '\n') { + ended = 1; + break; + } + } while (p < e); + if (!ended) + RExC_seen |= REG_SEEN_RUN_ON_COMMENT; } else break; @@ -7731,6 +7793,49 @@ parseit: #undef _C_C_T_ +/* reg_skipcomment() + + Absorbs an /x style # comments from the input stream. + Returns true if there is more text remaining in the stream. + Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment + terminates the pattern without including a newline. + + Note its the callers responsibility to ensure that we are + actually in /x mode + +*/ + +STATIC bool +S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) +{ + bool ended = 0; + while (RExC_parse < RExC_end) + if (*RExC_parse++ == '\n') { + ended = 1; + break; + } + if (!ended) { + /* we ran off the end of the pattern without ending + the comment, so we have to add an \n when wrapping */ + RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + return 0; + } else + return 1; +} + +/* nextchar() + + Advance that parse position, and optionall absorbs + "whitespace" from the inputstream. + + Without /x "whitespace" means (?#...) style comments only, + with /x this means (?#...) and # comments and whitespace proper. + + Returns the RExC_parse point from BEFORE the scan occurs. + + This is the /x friendly way of saying RExC_parse++. +*/ + STATIC char* S_nextchar(pTHX_ RExC_state_t *pRExC_state) { @@ -7753,9 +7858,8 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) continue; } else if (*RExC_parse == '#') { - while (RExC_parse < RExC_end) - if (*RExC_parse++ == '\n') break; - continue; + if ( reg_skipcomment( pRExC_state ) ) + continue; } } return retval; @@ -8524,10 +8628,6 @@ Perl_pregfree(pTHX_ struct regexp *r) return; CALLREGFREE_PVT(r); /* free the private data */ - - /* gcov results gave these as non-null 100% of the time, so there's no - optimisation in checking them before calling Safefree */ - Safefree(r->precomp); RX_MATCH_COPY_FREE(r); #ifdef PERL_OLD_COPY_ON_WRITE if (r->saved_copy) @@ -8545,8 +8645,8 @@ Perl_pregfree(pTHX_ struct regexp *r) Safefree(r->substrs); } if (r->paren_names) - SvREFCNT_dec(r->paren_names); - + SvREFCNT_dec(r->paren_names); + Safefree(r->wrapped); Safefree(r->startp); Safefree(r->endp); Safefree(r); @@ -8738,11 +8838,14 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) } else ret->substrs = NULL; - ret->precomp = SAVEPVN(r->precomp, r->prelen); + ret->wrapped = SAVEPVN(r->wrapped, r->wraplen); + ret->precomp = ret->wrapped + (r->precomp - r->wrapped); + ret->prelen = r->prelen; + ret->wraplen = r->wraplen; + ret->refcnt = r->refcnt; ret->minlen = r->minlen; ret->minlenret = r->minlenret; - ret->prelen = r->prelen; ret->nparens = r->nparens; ret->lastparen = r->lastparen; ret->lastcloseparen = r->lastcloseparen; @@ -8809,8 +8912,8 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) reti->swap = NULL; } - reti->regstclass = NULL; + if (ri->data) { struct reg_data *d; const int count = ri->data->count; @@ -8915,83 +9018,18 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) */ #ifndef PERL_IN_XSUB_RE + char * Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) { dVAR; const regexp * const re = (regexp *)mg->mg_obj; - - if (!mg->mg_ptr) { - const char *fptr = STD_PAT_MODS; /*"msix"*/ - char reflags[7]; - char ch; - bool hask = ((re->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); - bool hasm = ((re->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); - U16 reganch = (U16)((re->extflags & RXf_PMf_STD_PMMOD) >> 12); - bool need_newline = 0; - int left = 0; - int right = 4 + hask; - if (hask) - reflags[left++] = KEEPCOPY_PAT_MOD; /*'k'*/ - while((ch = *fptr++)) { - if(reganch & 1) { - reflags[left++] = ch; - } - else { - reflags[right--] = ch; - } - reganch >>= 1; - } - if(hasm) { - reflags[left] = '-'; - left = 5 + hask; - } - /* printf("[%*.7s]\n",left,reflags); */ - mg->mg_len = re->prelen + 4 + left; - /* - * If /x was used, we have to worry about a regex ending with a - * comment later being embedded within another regex. If so, we don't - * want this regex's "commentization" to leak out to the right part of - * the enclosing regex, we must cap it with a newline. - * - * So, if /x was used, we scan backwards from the end of the regex. If - * we find a '#' before we find a newline, we need to add a newline - * ourself. If we find a '\n' first (or if we don't find '#' or '\n'), - * we don't need to add anything. -jfriedl - */ - if (PMf_EXTENDED & re->extflags) { - const char *endptr = re->precomp + re->prelen; - while (endptr >= re->precomp) { - const char c = *(endptr--); - if (c == '\n') - break; /* don't need another */ - if (c == '#') { - /* we end while in a comment, so we need a newline */ - mg->mg_len++; /* save space for it */ - need_newline = 1; /* note to add it */ - break; - } - } - } - - Newx(mg->mg_ptr, mg->mg_len + 1 + left, char); - mg->mg_ptr[0] = '('; - mg->mg_ptr[1] = '?'; - Copy(reflags, mg->mg_ptr+2, left, char); - *(mg->mg_ptr+left+2) = ':'; - Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char); - if (need_newline) - mg->mg_ptr[mg->mg_len - 2] = '\n'; - mg->mg_ptr[mg->mg_len - 1] = ')'; - mg->mg_ptr[mg->mg_len] = 0; - } if (haseval) *haseval = re->seen_evals; if (flags) *flags = ((re->extflags & RXf_UTF8) ? 1 : 0); - if (lp) - *lp = mg->mg_len; - return mg->mg_ptr; + *lp = re->wraplen; + return re->wrapped; } /* diff --git a/regcomp.h b/regcomp.h index 51b14b7..3d08ac8 100644 --- a/regcomp.h +++ b/regcomp.h @@ -414,6 +414,7 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define REG_TOP_LEVEL_BRANCHES 0x00000040 #define REG_SEEN_VERBARG 0x00000080 #define REG_SEEN_CUTGROUP 0x00000100 +#define REG_SEEN_RUN_ON_COMMENT 0x00000200 START_EXTERN_C diff --git a/regexec.c b/regexec.c index cad8f61..6c82ba7 100644 --- a/regexec.c +++ b/regexec.c @@ -498,7 +498,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, /* end shift should be non negative here */ } -#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ +#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */ if (end_shift < 0) Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ", (IV)end_shift, prog->precomp); diff --git a/regexp.h b/regexp.h index a2e0fc1..0bf886b 100644 --- a/regexp.h +++ b/regexp.h @@ -82,6 +82,8 @@ typedef struct regexp { /* Information about the match that isn't often used */ char *precomp; /* pre-compilation regular expression */ I32 prelen; /* length of precomp */ + char *wrapped; /* wrapped version of the pattern */ + I32 wraplen; /* length of wrapped */ I32 seen_evals; /* number of eval groups in the pattern - for security checks */ HV *paren_names; /* Optional hash of paren names */ @@ -138,7 +140,7 @@ typedef struct regexp_engine { #define RXf_START_ONLY 0x00000200 /* Pattern is /^/ */ #define RXf_WHITE 0x00000400 /* Pattern is /\s+/ */ -/* 0xF800 of extflags is used by (RXf_)PMf_COMPILETIME */ +/* 0x1F800 of extflags is used by (RXf_)PMf_COMPILETIME */ #define RXf_PMf_LOCALE 0x00000800 /* use locale */ #define RXf_PMf_MULTILINE 0x00001000 /* /m */ #define RXf_PMf_SINGLELINE 0x00002000 /* /s */ @@ -155,6 +157,34 @@ typedef struct regexp_engine { case SINGLE_PAT_MOD: *(pmfl) |= RXf_PMf_SINGLELINE; break; \ case XTENDED_PAT_MOD: *(pmfl) |= RXf_PMf_EXTENDED; break +/* chars and strings used as regex pattern modifiers + * Singlular is a 'c'har, plural is a "string" + */ +#define EXEC_PAT_MOD 'e' +#define KEEPCOPY_PAT_MOD 'k' +#define ONCE_PAT_MOD 'o' +#define GLOBAL_PAT_MOD 'g' +#define CONTINUE_PAT_MOD 'c' +#define MULTILINE_PAT_MOD 'm' +#define SINGLE_PAT_MOD 's' +#define IGNORE_PAT_MOD 'i' +#define XTENDED_PAT_MOD 'x' + +#define ONCE_PAT_MODS "o" +#define KEEPCOPY_PAT_MODS "k" +#define EXEC_PAT_MODS "e" +#define LOOP_PAT_MODS "gc" + +#define STD_PAT_MODS "msix" + +#define INT_PAT_MODS STD_PAT_MODS KEEPCOPY_PAT_MODS + +#define EXT_PAT_MODS ONCE_PAT_MODS KEEPCOPY_PAT_MODS +#define QR_PAT_MODS STD_PAT_MODS EXT_PAT_MODS +#define M_PAT_MODS QR_PAT_MODS LOOP_PAT_MODS +#define S_PAT_MODS M_PAT_MODS EXEC_PAT_MODS + + /* What we have seen */ #define RXf_LOOKBEHIND_SEEN 0x00020000 #define RXf_EVAL_SEEN 0x00040000 -- 2.7.4