From 894be9b73be0493c898492f5cfad130c681ee44d Mon Sep 17 00:00:00 2001 From: Yves Orton Date: Mon, 9 Oct 2006 22:36:20 +0200 Subject: [PATCH] Regexp Recurse by name. Message-ID: <9b18b3110610091136g48e5b154tf16d00d38e80a6dc@mail.gmail.com> (with doc nits) p4raw-id: //depot/perl@28981 --- embed.fnc | 1 + embed.h | 2 + ext/re/re.pm | 30 ++++++++----- pod/perlre.pod | 22 +++++++--- proto.h | 3 ++ regcomp.c | 131 ++++++++++++++++++++++++++++++++++++++------------------- t/op/re_tests | 3 ++ 7 files changed, 132 insertions(+), 60 deletions(-) diff --git a/embed.fnc b/embed.fnc index bccc933..9be1e37 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1315,6 +1315,7 @@ Es |regnode*|regpiece |NN struct RExC_state_t *state|NN I32 *flagp|U32 depth Es |regnode*|reg_namedseq |NN struct RExC_state_t *state|NULLOK UV *valuep Es |void |reginsert |NN struct RExC_state_t *state|U8 op|NN regnode *opnd|U32 depth Es |void |regtail |NN struct RExC_state_t *state|NN regnode *p|NN const regnode *val|U32 depth +Es |SV * |reg_scan_name |NN struct RExC_state_t *state|U32 flags Es |U32 |join_exact |NN struct RExC_state_t *state|NN regnode *scan|NN I32 *min|U32 flags|NULLOK regnode *val|U32 depth EsRn |char* |regwhite |NN char *p|NN const char *e Es |char* |nextchar |NN struct RExC_state_t *state diff --git a/embed.h b/embed.h index a3e8f70..dc5efad 100644 --- a/embed.h +++ b/embed.h @@ -1319,6 +1319,7 @@ #define reg_namedseq S_reg_namedseq #define reginsert S_reginsert #define regtail S_regtail +#define reg_scan_name S_reg_scan_name #define join_exact S_join_exact #define regwhite S_regwhite #define nextchar S_nextchar @@ -3518,6 +3519,7 @@ #define reg_namedseq(a,b) S_reg_namedseq(aTHX_ a,b) #define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d) #define regtail(a,b,c,d) S_regtail(aTHX_ a,b,c,d) +#define reg_scan_name(a,b) S_reg_scan_name(aTHX_ a,b) #define join_exact(a,b,c,d,e,f) S_join_exact(aTHX_ a,b,c,d,e,f) #define regwhite S_regwhite #define nextchar(a) S_nextchar(aTHX_ a) diff --git a/ext/re/re.pm b/ext/re/re.pm index 87a450d..b763fef 100644 --- a/ext/re/re.pm +++ b/ext/re/re.pm @@ -218,7 +218,7 @@ sub setcolor { if ($@) { $ENV{PERL_RE_COLORS}||=qq'\t\t> <\t> <\t\t' } - + } my %flags = ( @@ -242,23 +242,33 @@ my %flags = ( ); $flags{ALL} = -1; $flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE}; +$flags{Extra} = $flags{EXECUTE} | $flags{COMPILE}; $flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE}; $flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE}; $flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC}; -my $installed =eval { - require XSLoader; - XSLoader::load('re'); - install(); -}; +my $installed; sub _load_unload { my ($on)= @_; if ($on) { - die "'re' not installed!?" unless $installed; - #warn "installed: $installed\n"; - install(); # allow for changes in colors - $^H{regcomp}= $installed; + if ( ! defined($installed) ) { + require XSLoader; + XSLoader::load('re'); + $installed = install() || 0; + } + if ( ! $installed ) { + die "'re' not installed!?"; + } else { + # We could just say = $installed; but then we wouldn't + # "see" any changes to the color environment var. + + # install() returns an integer, which if casted properly + # in C resolves to a structure containing the regex + # hooks. Setting it to a random integer will guarantee + # segfaults. + $^H{regcomp} = install(); + } } else { delete $^H{regcomp}; } diff --git a/pod/perlre.pod b/pod/perlre.pod index 7cc5dec..a22344f 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -820,9 +820,8 @@ Recursing deeper than 50 times without consuming any input string will result in a fatal error. The maximum depth is compiled into perl, so changing it requires a custom build. -=item C<(?PARNO)> C<(?R)> - -X<(?PARNO)> X<(?1)> +=item C<(?PARNO)> C<(?R)> C<(?0)> +X<(?PARNO)> X<(?1)> X<(?R)> X<(?0)> X X X B: This extended regular expression feature is considered @@ -834,9 +833,10 @@ pattern that must match at the current position. Capture buffers contained by the pattern will have the value as determined by the outermost recursion. -PARNO is a sequence of digits not starting with 0 whose value -reflects the paren-number of the capture buffer to recurse to. -C<(?R)> curses to the beginning of the pattern. +PARNO is a sequence of digits (not starting with 0) whose value reflects +the paren-number of the capture buffer to recurse to. C<(?R)> recurses to +the beginning of the whole pattern. C<(?0)> is an alternate syntax for +C<(?R)>. The following pattern matches a function foo() which may contain balanced parenthesis as the argument. @@ -881,6 +881,16 @@ a recursed group, in PCRE and Python the recursed into group is treated as atomic. Also, constructs like (?i:(?1)) or (?:(?i)(?1)) do not affect the pattern being recursed into. +=item C<(?&NAME)> +X<(?&NAME)> + +Recurse to a named subpattern. Identical to (?PARNO) except that the +parenthesis to recurse to is determined by name. If multiple parens have +the same name, then it recurses to the leftmost. + +It is an error to refer to a name that is not declared somewhere in the +pattern. + =item C<< (?>pattern) >> X X X X diff --git a/proto.h b/proto.h index dc740cb..0e51ab4 100644 --- a/proto.h +++ b/proto.h @@ -3586,6 +3586,9 @@ STATIC void S_regtail(pTHX_ struct RExC_state_t *state, regnode *p, const regnod __attribute__nonnull__(pTHX_2) __attribute__nonnull__(pTHX_3); +STATIC SV * S_reg_scan_name(pTHX_ struct RExC_state_t *state, U32 flags) + __attribute__nonnull__(pTHX_1); + STATIC U32 S_join_exact(pTHX_ struct RExC_state_t *state, regnode *scan, I32 *min, U32 flags, regnode *val, U32 depth) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2) diff --git a/regcomp.c b/regcomp.c index 71c9133..64e6c8d 100644 --- a/regcomp.c +++ b/regcomp.c @@ -4323,6 +4323,35 @@ Perl_reg_named_buff_sv(pTHX_ SV* namesv) } } +/* Scans the name of a named buffer from the pattern. + * If flags is true then returns an SV containing the name. + */ +STATIC SV* +S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { + char *name_start = RExC_parse; + if (UTF) { + STRLEN numlen; + while (isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, UTF8_ALLOW_DEFAULT))) + RExC_parse += numlen; + } + else { + while (isIDFIRST(*RExC_parse)) + RExC_parse++; + } + if (flags) { + SV* svname = sv_2mortal(Perl_newSVpvn(aTHX_ name_start, + (int)(RExC_parse - name_start))); + if (UTF) + SvUTF8_on(svname); + return svname; + } + else { + return NULL; + } +} + #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ int rem=(int)(RExC_end - RExC_parse); \ int cut; \ @@ -4430,37 +4459,28 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) paren = *RExC_parse++; ret = NULL; /* For look-ahead/behind. */ switch (paren) { - + case '<': /* (?<...) */ if (*RExC_parse == '!') paren = ','; - else if (*RExC_parse != '=') - { /* (?<...>) */ + else if (*RExC_parse != '=') { /* (?<...>) */ char *name_start; + SV *svname; paren= '>'; case '\'': /* (?'...') */ name_start= RExC_parse; - if (UTF) { - STRLEN numlen; - while(isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse, - RExC_end - RExC_parse, - &numlen, UTF8_ALLOW_DEFAULT))) - RExC_parse += numlen; - } else { - while(isIDFIRST(*RExC_parse)) - RExC_parse++; - } + svname = reg_scan_name(pRExC_state,SIZE_ONLY); if (RExC_parse == name_start) goto unknown; if (*RExC_parse != paren) vFAIL2("Sequence (?%c... not terminated", paren=='>' ? '<' : paren); if (SIZE_ONLY) { - SV *svname= Perl_newSVpvf(aTHX_ "%.*s", - (int)(RExC_parse - name_start), name_start); HE *he_str; SV *sv_dat = NULL; - + if (!svname) /* shouldnt happen */ + Perl_croak(aTHX_ + "panic: reg_scan_name returned NULL"); if (!RExC_paren_names) { RExC_paren_names= newHV(); sv_2mortal((SV*)RExC_paren_names); @@ -4511,22 +4531,53 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) nextchar(pRExC_state); *flagp = TRYAGAIN; return NULL; - case '0' : - case 'R' : - if (*RExC_parse != ')') + case '0' : /* (?0) */ + case 'R' : /* (?R) */ + if (*RExC_parse != ')') FAIL("Sequence (?R) not terminated"); reg_node(pRExC_state, SRECURSE); - break; + break; /* (?PARNO) */ + { /* named and numeric backreferences */ + I32 num; + char * parse_start; + case '&': /* (?&NAME) */ + parse_start = RExC_parse - 1; + { + char *name_start = RExC_parse; + SV *svname = reg_scan_name(pRExC_state, !SIZE_ONLY); + if (RExC_parse == name_start) + goto unknown; + if (*RExC_parse != ')') + vFAIL("Expecting close bracket"); + if (!SIZE_ONLY) { + HE *he_str = NULL; + SV *sv_dat; + if (!svname) /* shouldn't happen*/ + Perl_croak(aTHX_ "panic: reg_scan_name returned NULL"); + if (RExC_paren_names) + he_str = hv_fetch_ent( RExC_paren_names, svname, 0, 0 ); + if (he_str) + sv_dat = HeVAL(he_str); + else + vFAIL("Reference to nonexistent group"); + num = *((I32 *)SvPVX(sv_dat)); + } else { + num = 0; + } + } + goto gen_recurse_regop; + /* NOT REACHED */ case '1': case '2': case '3': case '4': /* (?1) */ case '5': case '6': case '7': case '8': case '9': RExC_parse--; - { - const I32 num = atoi(RExC_parse); - char * const parse_start = RExC_parse - 1; /* MJD */ + num = atoi(RExC_parse); + parse_start = RExC_parse - 1; /* MJD */ while (isDIGIT(*RExC_parse)) RExC_parse++; if (*RExC_parse!=')') vFAIL("Expecting close bracket"); + + gen_recurse_regop: ret = reganode(pRExC_state, RECURSE, num); if (!SIZE_ONLY) { if (num > (I32)RExC_rx->nparens) { @@ -4537,7 +4588,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_emit++; DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, "Recurse #%"UVuf" to %"IVdf"\n", ARG(ret), ARG2L(ret))); - } else{ + } else { RExC_size++; RExC_seen|=REG_SEEN_RECURSE; } @@ -4546,7 +4597,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) nextchar(pRExC_state); return ret; - } + } /* named and numeric backreferences */ + /* NOT REACHED */ + case 'p': /* (?p...) */ if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP)) vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})"); @@ -5680,18 +5733,9 @@ tryagain: } else { char* name_start = (RExC_parse += 2); I32 num = 0; - ch= (ch == '<') ? '>' : '\''; - - if (UTF) { - STRLEN numlen; - while(isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse, - RExC_end - RExC_parse, - &numlen, UTF8_ALLOW_DEFAULT))) - RExC_parse += numlen; - } else { - while(isIDFIRST(*RExC_parse)) - RExC_parse++; - } + SV *svname = reg_scan_name(pRExC_state,!SIZE_ONLY); + ch= (ch == '<') ? '>' : '\''; + if (RExC_parse == name_start || *RExC_parse != ch) vFAIL2("Sequence \\k%c... not terminated", (ch == '>' ? '<' : ch)); @@ -5704,14 +5748,13 @@ tryagain: if (!SIZE_ONLY) { - SV *svname = Perl_newSVpvf(aTHX_ "%.*s", - (int)(RExC_parse - name_start), name_start); - HE *he_str; + HE *he_str = NULL; SV *sv_dat; - if (UTF) - SvUTF8_on(svname); - he_str = hv_fetch_ent( RExC_paren_names, svname, 0, 0 ); - SvREFCNT_dec(svname); + if (!svname) + Perl_croak(aTHX_ + "panic: reg_scan_name returned NULL"); + if (RExC_paren_names) + he_str = hv_fetch_ent( RExC_paren_names, svname, 0, 0 ); if ( he_str ) { sv_dat = HeVAL(he_str); } else { diff --git a/t/op/re_tests b/t/op/re_tests index 08d45b2..83de44a 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -1037,3 +1037,6 @@ X(?<=foo.)[YZ] ..XfooXY.. y pos 8 /(?foo)\k'n'/ ..foofoo.. y $1 foo /(?foo)\k'n'/ ..foofoo.. y $+{n} foo /(?:(?foo)|(?bar))\k/ ..barbar.. y $+{n} bar +/^(?'main'<(?:[^<>]+|(?&crap)|(?&main))*>)(?'empty')(?'crap'!>!>!>)$/ <!>!>><>>!>!>!> y $+{main} <!>!>><>> +/^(?'main'<(?:[^<>]+|(?&main))*>)$/ <<><<<><>>>> y $1 <<><<<><>>>> +/(?'first'(?&second)*)(?'second'[fF]o+)/ fooFoFoo y $+{first}-$+{second} fooFo-Foo -- 2.7.4