From 81714fb9c03d91d66b66cab6e899e81bf64a2ca7 Mon Sep 17 00:00:00 2001 From: Yves Orton Date: Fri, 6 Oct 2006 21:16:01 +0200 Subject: [PATCH] Re: [PATCH] Initial attempt at named captures for perls regexp engine Message-ID: <9b18b3110610061016x5ddce965u30d9a821f632d450@mail.gmail.com> p4raw-id: //depot/perl@28957 --- XSUB.h | 1 + doop.c | 5 +- dump.c | 1 + embed.fnc | 1 + embed.h | 2 + gv.c | 4 ++ hv.c | 109 +++++++++++++++++++++++++++++-- perl.h | 14 ++++ pod/perlre.pod | 142 +++++++++++++++++++++++++++++----------- pod/perltodo.pod | 10 --- pod/perlvar.pod | 13 +++- pp.c | 2 +- proto.h | 3 + regcomp.c | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++---- regcomp.h | 1 + regcomp.sym | 16 +++-- regexec.c | 49 ++++++++++---- regexp.h | 3 +- regnodes.h | 147 ++++++++++++++++++++++------------------- sv.c | 3 + t/op/pat.t | 27 +++++++- t/op/re_tests | 17 +++++ t/op/regexp.t | 2 +- toke.c | 2 +- 24 files changed, 612 insertions(+), 156 deletions(-) diff --git a/XSUB.h b/XSUB.h index de5d33b..e4cc816 100644 --- a/XSUB.h +++ b/XSUB.h @@ -391,6 +391,7 @@ Rethrows a previously caught exception. See L. # define VTBL_uvar &PL_vtbl_uvar # define VTBL_defelem &PL_vtbl_defelem # define VTBL_regexp &PL_vtbl_regexp +# define VTBL_regdata_names &PL_vtbl_regdata_names # define VTBL_regdata &PL_vtbl_regdata # define VTBL_regdatum &PL_vtbl_regdatum # ifdef USE_LOCALE_COLLATE diff --git a/doop.c b/doop.c index 5d1fc7a..1620465 100644 --- a/doop.c +++ b/doop.c @@ -1425,8 +1425,11 @@ Perl_do_kv(pTHX) RETURN; } - if (! SvTIED_mg((SV*)keys, PERL_MAGIC_tied)) + if (! SvTIED_mg((SV*)keys, PERL_MAGIC_tied) + && ! SvTIED_mg((SV*)keys, PERL_MAGIC_regdata_names)) + { i = HvKEYS(keys); + } else { i = 0; while (hv_iternext(keys)) i++; diff --git a/dump.c b/dump.c index ce2c7ca..c61516b 100644 --- a/dump.c +++ b/dump.c @@ -1127,6 +1127,7 @@ static const struct { const char type; const char *name; } magic_names[] = { { PERL_MAGIC_sv, "sv(\\0)" }, { PERL_MAGIC_arylen, "arylen(#)" }, { PERL_MAGIC_rhash, "rhash(%)" }, + { PERL_MAGIC_regdata_names, "regdata_names(+)" }, { PERL_MAGIC_pos, "pos(.)" }, { PERL_MAGIC_symtab, "symtab(:)" }, { PERL_MAGIC_backref, "backref(<)" }, diff --git a/embed.fnc b/embed.fnc index 6723d92..bccc933 100644 --- a/embed.fnc +++ b/embed.fnc @@ -680,6 +680,7 @@ Ap |I32 |regexec_flags |NN regexp* prog|NN char* stringarg \ |NN char* strend|NN char* strbeg|I32 minend \ |NN SV* screamer|NULLOK void* data|U32 flags ApR |regnode*|regnext |NN regnode* p +Ep |SV*|reg_named_buff_sv |NN SV* namesv Ep |void |regprop |NULLOK const regexp *prog|NN SV* sv|NN const regnode* o Ap |void |repeatcpy |NN char* to|NN const char* from|I32 len|I32 count ApP |char* |rninstr |NN const char* big|NN const char* bigend \ diff --git a/embed.h b/embed.h index 0e06d49..a3e8f70 100644 --- a/embed.h +++ b/embed.h @@ -689,6 +689,7 @@ #define regexec_flags Perl_regexec_flags #define regnext Perl_regnext #if defined(PERL_CORE) || defined(PERL_EXT) +#define reg_named_buff_sv Perl_reg_named_buff_sv #define regprop Perl_regprop #endif #define repeatcpy Perl_repeatcpy @@ -2894,6 +2895,7 @@ #define regexec_flags(a,b,c,d,e,f,g,h) Perl_regexec_flags(aTHX_ a,b,c,d,e,f,g,h) #define regnext(a) Perl_regnext(aTHX_ a) #if defined(PERL_CORE) || defined(PERL_EXT) +#define reg_named_buff_sv(a) Perl_reg_named_buff_sv(aTHX_ a) #define regprop(a,b,c) Perl_regprop(aTHX_ a,b,c) #endif #define repeatcpy(a,b,c,d) Perl_repeatcpy(aTHX_ a,b,c,d) diff --git a/gv.c b/gv.c index 637e82f..9ad4434 100644 --- a/gv.c +++ b/gv.c @@ -1188,10 +1188,14 @@ Perl_gv_fetchpvn_flags(pTHX_ const char *nambeg, STRLEN full_len, I32 flags, goto magicalize; case '+': + GvMULTI_on(gv); { AV* const av = GvAVn(gv); + HV* const hv = GvHVn(gv); sv_magic((SV*)av, (SV*)av, PERL_MAGIC_regdata, NULL, 0); SvREADONLY_on(av); + hv_magic(hv, NULL, PERL_MAGIC_regdata_names); + SvREADONLY_on(hv); /* FALL THROUGH */ } case '\023': /* $^S */ diff --git a/hv.c b/hv.c index d1835b2..8552cd2 100644 --- a/hv.c +++ b/hv.c @@ -450,12 +450,12 @@ S_hv_fetch_common(pTHX_ HV *hv, SV *keysv, const char *key, STRLEN klen, xhv = (XPVHV*)SvANY(hv); if (SvMAGICAL(hv)) { if (SvRMAGICAL(hv) && !(action & (HV_FETCH_ISSTORE|HV_FETCH_ISEXISTS))) { - if (mg_find((SV*)hv, PERL_MAGIC_tied) || SvGMAGICAL((SV*)hv)) { - sv = sv_newmortal(); + MAGIC *regdata = NULL; + if (mg_find((SV*)hv, PERL_MAGIC_tied) || SvGMAGICAL((SV*)hv) + || (regdata = mg_find((SV*)hv, PERL_MAGIC_regdata_names))) { /* XXX should be able to skimp on the HE/HEK here when HV_FETCH_JUST_SV is true. */ - if (!keysv) { keysv = newSVpvn(key, klen); if (is_utf8) { @@ -464,7 +464,16 @@ S_hv_fetch_common(pTHX_ HV *hv, SV *keysv, const char *key, STRLEN klen, } else { keysv = newSVsv(keysv); } - mg_copy((SV*)hv, sv, (char *)keysv, HEf_SVKEY); + if (regdata) { + sv = Perl_reg_named_buff_sv(aTHX_ keysv); + if (!sv) { + SvREFCNT_dec(keysv); + return 0; + } + } else { + sv = sv_newmortal(); + mg_copy((SV*)hv, sv, (char *)keysv, HEf_SVKEY); + } /* grab a fake HE/HEK pair from the pool or make a new one */ entry = PL_hv_fetch_ent_mh; @@ -1923,7 +1932,17 @@ Perl_hv_iterinit(pTHX_ HV *hv) } else { hv_auxinit(hv); } - + if (SvMAGICAL(hv) && SvRMAGICAL(hv)) { + MAGIC *mg = mg_find((SV*)hv, PERL_MAGIC_regdata_names); + if ( mg ) { + if (PL_curpm) { + const REGEXP * const rx = PM_GETRE(PL_curpm); + if (rx && rx->paren_names) { + (void)hv_iterinit(rx->paren_names); + } + } + } + } /* used to be xhv->xhv_fill before 5.004_65 */ return HvTOTALKEYS(hv); } @@ -2078,6 +2097,7 @@ Perl_hv_iternext_flags(pTHX_ HV *hv, I32 flags) if (!hv) Perl_croak(aTHX_ "Bad hash"); + xhv = (XPVHV*)SvANY(hv); if (!SvOOK(hv)) { @@ -2089,8 +2109,85 @@ Perl_hv_iternext_flags(pTHX_ HV *hv, I32 flags) iter = HvAUX(hv); oldentry = entry = iter->xhv_eiter; /* HvEITER(hv) */ + if (SvMAGICAL(hv) && SvRMAGICAL(hv) && + (mg = mg_find((SV*)hv, PERL_MAGIC_regdata_names))) + { + SV * key; + SV *val = NULL; + REGEXP * rx; + if (!PL_curpm) + return NULL; + rx = PM_GETRE(PL_curpm); + if (rx && rx->paren_names) { + hv = rx->paren_names; + } else { + return NULL; + } - if ((mg = SvTIED_mg((SV*)hv, PERL_MAGIC_tied))) { + key = sv_newmortal(); + if (entry) { + sv_setsv(key, HeSVKEY_force(entry)); + SvREFCNT_dec(HeSVKEY(entry)); /* get rid of previous key */ + } + else { + char *k; + HEK *hek; + + /* one HE per MAGICAL hash */ + iter->xhv_eiter = entry = new_HE(); /* HvEITER(hv) = new_HE() */ + Zero(entry, 1, HE); + Newxz(k, HEK_BASESIZE + sizeof(SV*), char); + hek = (HEK*)k; + HeKEY_hek(entry) = hek; + HeKLEN(entry) = HEf_SVKEY; + } + { + while (!val) { + HE *temphe = hv_iternext_flags(hv,flags); + if (temphe) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastcloseparen) >= nums[i] && + rx->startp[nums[i]] != -1 && + rx->endp[nums[i]] != -1) + { + parno = nums[i]; + break; + } + } + if (parno) { + GV *gv_paren; + STRLEN len; + SV *sv = sv_newmortal(); + const char* pvkey = HePV(temphe, len); + + Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno); + gv_paren = Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV); + Perl_sv_setpvn(aTHX_ key, pvkey, len); + val = GvSVn(gv_paren); + } + } else { + break; + } + } + } + if (val && SvOK(key)) { + /* force key to stay around until next time */ + HeSVKEY_set(entry, SvREFCNT_inc_simple_NN(key)); + HeVAL(entry) = SvREFCNT_inc_simple_NN(val); + return entry; /* beware, hent_val is not set */ + } + if (HeVAL(entry)) + SvREFCNT_dec(HeVAL(entry)); + Safefree(HeKEY_hek(entry)); + del_HE(entry); + iter->xhv_eiter = NULL; /* HvEITER(hv) = NULL */ + return NULL; + + } else if ((mg = SvTIED_mg((SV*)hv, PERL_MAGIC_tied))) { SV * const key = sv_newmortal(); if (entry) { sv_setsv(key, HeSVKEY_force(entry)); diff --git a/perl.h b/perl.h index 93b4d62..3338ea2 100644 --- a/perl.h +++ b/perl.h @@ -3615,6 +3615,8 @@ Gid_t getegid (void); #define PERL_MAGIC_overload_elem 'a' /* %OVERLOAD hash element */ #define PERL_MAGIC_overload_table 'c' /* Holds overload table (AMT) on stash */ #define PERL_MAGIC_bm 'B' /* Boyer-Moore (fast string search) */ +#define PERL_MAGIC_regdata_names '+' /* Regex named capture buffer hash + (%+ support) */ #define PERL_MAGIC_regdata 'D' /* Regex match position data (@+ and @- vars) */ #define PERL_MAGIC_regdatum 'd' /* Regex match position data element */ @@ -4830,6 +4832,18 @@ MGVTBL_SET( ); MGVTBL_SET( + PL_vtbl_regdata_names, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL +); + +MGVTBL_SET( PL_vtbl_regdata, NULL, NULL, diff --git a/pod/perlre.pod b/pod/perlre.pod index c4dd7c5..7cc5dec 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -191,20 +191,26 @@ X X<\w> X<\W> X<\s> X<\S> X<\d> X<\D> X<\X> X<\p> X<\P> X<\C> X X - \w Match a "word" character (alphanumeric plus "_") - \W Match a non-"word" character - \s Match a whitespace character - \S Match a non-whitespace character - \d Match a digit character - \D Match a non-digit character - \pP Match P, named property. Use \p{Prop} for longer names. - \PP Match non-P - \X Match eXtended Unicode "combining character sequence", - equivalent to (?:\PM\pM*) - \C Match a single C char (octet) even under Unicode. - NOTE: breaks up characters into their UTF-8 bytes, - so you may end up with malformed pieces of UTF-8. - Unsupported in lookbehind. + \w Match a "word" character (alphanumeric plus "_") + \W Match a non-"word" character + \s Match a whitespace character + \S Match a non-whitespace character + \d Match a digit character + \D Match a non-digit character + \pP Match P, named property. Use \p{Prop} for longer names. + \PP Match non-P + \X Match eXtended Unicode "combining character sequence", + equivalent to (?:\PM\pM*) + \C Match a single C char (octet) even under Unicode. + NOTE: breaks up characters into their UTF-8 bytes, + so you may end up with malformed pieces of UTF-8. + Unsupported in lookbehind. + \1 Backreference to a a specific group. + '1' may actually be any positive integer + \k Named backreference + \N{name} Named unicode character, or unicode escape. + \x12 Hexadecimal escape sequence + \x{1234} Long hexadecimal escape sequence A C<\w> matches a single alphanumeric character (an alphabetic character, or a decimal digit) or C<_>, not a whole word. Use C<\w+> @@ -403,7 +409,7 @@ X<\G> The bracketing construct C<( ... )> creates capture buffers. To refer to the digit'th buffer use \ within the match. Outside the match use "$" instead of "\". (The -\ notation works in certain circumstances outside +\ notation works in certain circumstances outside the match. See the warning below about \1 vs $1 for details.) Referring back to another part of the match is called a I. @@ -414,20 +420,38 @@ There is no limit to the number of captured substrings that you may use. However Perl also uses \10, \11, etc. as aliases for \010, \011, etc. (Recall that 0 means octal, so \011 is the character at number 9 in your coded character set; which would be the 10th character, -a horizontal tab under ASCII.) Perl resolves this -ambiguity by interpreting \10 as a backreference only if at least 10 -left parentheses have opened before it. Likewise \11 is a -backreference only if at least 11 left parentheses have opened -before it. And so on. \1 through \9 are always interpreted as +a horizontal tab under ASCII.) Perl resolves this +ambiguity by interpreting \10 as a backreference only if at least 10 +left parentheses have opened before it. Likewise \11 is a +backreference only if at least 11 left parentheses have opened +before it. And so on. \1 through \9 are always interpreted as backreferences. +Additionally, as of Perl 5.10 you may use named capture buffers and named +backreferences. The notation is C<< (?...) >> and C<< \k >> +(you may also use single quotes instead of angle brackets to quote the +name). The only difference with named capture buffers and unnamed ones is +that multiple buffers may have the same name and that the contents of +named capture buffers is available via the C<%+> hash. When multiple +groups share the same name C<$+{name}> and C<< \k >> refer to the +leftmost defined group, thus it's possible to do things with named capture +buffers that would otherwise require C<(??{})> code to accomplish. Named +capture buffers are numbered just as normal capture buffers are and may be +referenced via the magic numeric variables or via numeric backreferences +as well as by name. + Examples: s/^([^ ]*) *([^ ]*)/$2 $1/; # swap first two words - if (/(.)\1/) { # find first doubled char - print "'$1' is the first doubled character\n"; - } + /(.)\1/ # find first doubled char + and print "'$1' is the first doubled character\n"; + + /(?.)\k/ # ... a different way + and print "'$+{char}' is the first doubled character\n"; + + /(?.)\1/ # ... mix and match + and print "'$1' is the first doubled character\n"; if (/Time: (..):(..):(..)/) { # parse out values $hours = $1; @@ -443,7 +467,7 @@ everything before the matched string. C<$'> returns everything after the matched string. And C<$^N> contains whatever was matched by the most-recently closed group (submatch). C<$^N> can be used in extended patterns (see below), for example to assign a submatch to a -variable. +variable. X<$+> X<$^N> X<$&> X<$`> X<$'> The numbered match variables ($1, $2, $3, etc.) and the related punctuation @@ -620,6 +644,48 @@ A zero-width negative look-behind assertion. For example C matches any occurrence of "foo" that does not follow "bar". Works only for fixed-width look-behind. +=item C<(?'NAME'pattern)> + +=item C<< (?pattern) >> +X<< (?) >> X<(?'NAME')> X X + +A named capture buffer. Identical in every respect to normal capturing +parens C<()> but for the additional fact that C<%+> may be used after +a succesful match to refer to a named buffer. See C for more +details on the C<%+> hash. + +If multiple distinct capture buffers have the same name then the +$+{NAME} will refer to the leftmost defined buffer in the match. + +The forms C<(?'NAME'pattern)> and C<(?pattern)> are equivalent. + +B While the notation of this construct is the same as the similar +function in .NET regexes, the behavior is not, in Perl the buffers are +numbered sequentially regardless of being named or not. Thus in the +pattern + + /(x)(?y)(z)/ + +$+{foo} will be the same as $2, and $3 will contain 'z' instead of +the opposite which is what a .NET regex hacker might expect. + +Currently NAME is restricted to word chars only. In other words, it +must match C. + +=item C<< \k >> + +=item C<< \k'name' >> + +Named backreference. Similar to numeric backreferences, except that +the group is designated by name and not number. If multiple groups +have the same name then it refers to the leftmost defined group in +the current match. + +It is an error to refer to a name not defined by a C<(?)> +earlier in the pattern. + +Both forms are equivalent. + =item C<(?{ code })> X<(?{})> X X X @@ -726,7 +792,7 @@ Thus, ('a' x 100)=~/(??{'(.)' x 100})/ -B match, it will B set $1. +B match, it will B set $1. The C is not interpolated. As before, the rules to determine where the C ends are currently somewhat convoluted. @@ -762,21 +828,21 @@ X X X B: This extended regular expression feature is considered highly experimental, and may be changed or deleted without notice. -Similar to C<(??{ code })> except it does not involve compiling any code, -instead it treats the contents of a capture buffer as an independent +Similar to C<(??{ code })> except it does not involve compiling any code, +instead it treats the contents of a capture buffer as an independent pattern that must match at the current position. Capture buffers -contained by the pattern will have the value as determined by the +contained by the pattern will have the value as determined by the outermost recursion. PARNO is a sequence of digits not starting with 0 whose value -reflects the paren-number of the capture buffer to recurse to. +reflects the paren-number of the capture buffer to recurse to. C<(?R)> curses to the beginning of the pattern. -The following pattern matches a function foo() which may contain -balanced parenthesis as the argument. +The following pattern matches a function foo() which may contain +balanced parenthesis as the argument. $re = qr{ ( # paren group 1 (full function) - foo + foo ( # paren group 2 (parens) \( ( # paren group 3 (contents of parens) @@ -802,18 +868,18 @@ the output produced should be the following: $1 = foo(bar(baz)+baz(bop)) $2 = (bar(baz)+baz(bop)) - $3 = bar(baz)+baz(bop) + $3 = bar(baz)+baz(bop) -If there is no corresponding capture buffer defined, then it is a +If there is no corresponding capture buffer defined, then it is a fatal error. Recursing deeper than 50 times without consuming any input -string will also result in a fatal error. The maximum depth is compiled +string will also result in a fatal error. The maximum depth is compiled into perl, so changing it requires a custom build. -B that this pattern does not behave the same way as the equivalent +B that this pattern does not behave the same way as the equivalent PCRE or Python construct of the same form. In perl you can backtrack into a recursed group, in PCRE and Python the recursed into group is treated -as atomic. Also, constructs like (?i:(?1)) or (?:(?i)(?1)) do not affect -the pattern being recursed into. +as atomic. Also, constructs like (?i:(?1)) or (?:(?i)(?1)) do not affect +the pattern being recursed into. =item C<< (?>pattern) >> X X X X diff --git a/pod/perltodo.pod b/pod/perltodo.pod index 50a79d9..4a54bcd 100644 --- a/pod/perltodo.pod +++ b/pod/perltodo.pod @@ -629,16 +629,6 @@ Fix (or rewrite) the implementation of the C closures. This will allow the use of a regex from inside (?{ }), (??{ }) and (?(?{ })|) constructs. -=head2 Add named capture to regexp engine - -Named capture is supported by .NET, PCRE and Python. Its embarrassing -Perl doesn't support it yet. - -Jeffrey Friedl notes that "the most glaring omission [in perl's regexp -engine] offered by other implementations is named capture". - -demerphq is working on this. - =head2 Add possessive quantifiers to regexp engine Possessive quantifiers are a syntactic sugar that affords a more diff --git a/pod/perlvar.pod b/pod/perlvar.pod index 4d8c17e..8a486b2 100644 --- a/pod/perlvar.pod +++ b/pod/perlvar.pod @@ -313,6 +313,17 @@ past where $2 ends, and so on. You can use C<$#+> to determine how many subgroups were in the last successful match. See the examples given for the C<@-> variable. +=item %+ +X<%+> + +Similar to C<@+>, the C<%+> hash allows access to the named capture +buffers, should they exist, in the last successful match in the +currently active dynamic scope. + +C<$+{foo}> is equivalent to C<$1> after the following match: + + 'foo'=~/(?foo)/; + =item HANDLE->input_line_number(EXPR) =item $INPUT_LINE_NUMBER @@ -322,7 +333,7 @@ examples given for the C<@-> variable. =item $. X<$.> X<$NR> X<$INPUT_LINE_NUMBER> X -Current line number for the last filehandle accessed. +Current line number for the last filehandle accessed. Each filehandle in Perl counts the number of lines that have been read from it. (Depending on the value of C<$/>, Perl's idea of what diff --git a/pp.c b/pp.c index 25279a3..f04b55d 100644 --- a/pp.c +++ b/pp.c @@ -3862,7 +3862,7 @@ PP(pp_each) { dVAR; dSP; - HV * const hash = (HV*)POPs; + HV * hash = (HV*)POPs; HE *entry; const I32 gimme = GIMME_V; diff --git a/proto.h b/proto.h index e10c8eb..dc740cb 100644 --- a/proto.h +++ b/proto.h @@ -1863,6 +1863,9 @@ PERL_CALLCONV regnode* Perl_regnext(pTHX_ regnode* p) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1); +PERL_CALLCONV SV* Perl_reg_named_buff_sv(pTHX_ SV* namesv) + __attribute__nonnull__(pTHX_1); + PERL_CALLCONV void Perl_regprop(pTHX_ const regexp *prog, SV* sv, const regnode* o) __attribute__nonnull__(pTHX_2) __attribute__nonnull__(pTHX_3); diff --git a/regcomp.c b/regcomp.c index 4895ea4..ca5830f 100644 --- a/regcomp.c +++ b/regcomp.c @@ -120,6 +120,7 @@ typedef struct RExC_state_t { regnode **parens; /* offsets of each paren */ I32 utf8; HV *charnames; /* cache of named sequences */ + HV *paren_names; /* Paren names */ #if ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) @@ -153,6 +154,7 @@ typedef struct RExC_state_t { #define RExC_utf8 (pRExC_state->utf8) #define RExC_charnames (pRExC_state->charnames) #define RExC_parens (pRExC_state->parens) +#define RExC_paren_names (pRExC_state->paren_names) #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ @@ -3771,8 +3773,9 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) RExC_emit = &PL_regdummy; RExC_whilem_seen = 0; RExC_charnames = NULL; - RExC_parens= NULL; - + RExC_parens = NULL; + RExC_paren_names = NULL; + #if 0 /* REGC() is (currently) a NOP at the first pass. * Clever compilers notice this and complain. --jhi */ REGC((U8)REG_MAGIC, (char*)RExC_emit); @@ -3782,15 +3785,14 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) RExC_precomp = NULL; return(NULL); } - DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required ")); - DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size)); - DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n")); DEBUG_PARSE_r({ + PerlIO_printf(Perl_debug_log, + "Required size %"IVdf" nodes\n" + "Starting second pass (creation)\n", + (IV)RExC_size); RExC_lastnum=0; RExC_lastparse=NULL; }); - - /* Small enough for pointer-storage convention? If extralen==0, this means that we will not need long jumps. */ if (RExC_size >= 0x10000L && RExC_extralen) @@ -3826,8 +3828,9 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->substrs = 0; /* Useful during FAIL. */ r->startp = 0; /* Useful during FAIL. */ - r->endp = 0; + r->paren_names = 0; + if (RExC_seen & REG_SEEN_RECURSE) { Newx(RExC_parens, RExC_npar,regnode *); SAVEFREEPV(RExC_parens); @@ -3997,13 +4000,13 @@ reStudy: /* Scan is after the zeroth branch, first is atomic matcher. */ #ifdef TRIE_STUDY_OPT - DEBUG_COMPILE_r( + DEBUG_PARSE_r( if (!restudied) PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", (IV)(first - scan + 1)) ); #else - DEBUG_COMPILE_r( + DEBUG_PARSE_r( PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", (IV)(first - scan + 1)) ); @@ -4252,6 +4255,11 @@ reStudy: r->reganch |= ROPT_EVAL_SEEN; if (RExC_seen & REG_SEEN_CANY) r->reganch |= ROPT_CANY_SEEN; + if (RExC_paren_names) + r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names); + else + r->paren_names = NULL; + Newxz(r->startp, RExC_npar, I32); Newxz(r->endp, RExC_npar, I32); @@ -4280,6 +4288,41 @@ reStudy: #undef END_BLOCK #undef RE_ENGINE_PTR +SV* +Perl_reg_named_buff_sv(pTHX_ SV* namesv) +{ + I32 parno = 0; /* no match */ + if (PL_curpm) { + const REGEXP * const rx = PM_GETRE(PL_curpm); + if (rx && rx->paren_names) { + HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 ); + if (he_str) { + IV i; + SV* sv_dat=HeVAL(he_str); + I32 *nums=(I32*)SvPVX(sv_dat); + for ( i=0; ilastcloseparen) >= nums[i] && + rx->startp[nums[i]] != -1 && + rx->endp[nums[i]] != -1) + { + parno = nums[i]; + break; + } + } + } + } + } + if ( !parno ) { + return 0; + } else { + GV *gv_paren; + SV *sv= sv_newmortal(); + Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno); + gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV); + return GvSVn(gv_paren); + } +} + #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ int rem=(int)(RExC_end - RExC_parse); \ int cut; \ @@ -4387,12 +4430,66 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) paren = *RExC_parse++; ret = NULL; /* For look-ahead/behind. */ switch (paren) { + case '<': /* (?<...) */ - RExC_seen |= REG_SEEN_LOOKBEHIND; if (*RExC_parse == '!') paren = ','; - if (*RExC_parse != '=' && *RExC_parse != '!') - goto unknown; + else if (*RExC_parse != '=') + { /* (?<...>) */ + char *name_start; + paren= '>'; + case '\'': /* (?'...') */ + name_start= RExC_parse; + if (UTF) { + STRLEN numlen; + while(isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, UTF8_ALLOW_DEFAULT))) + RExC_parse += numlen; + } else { + while(isIDFIRST(*RExC_parse)) + RExC_parse++; + } + if (RExC_parse == name_start) + goto unknown; + if (*RExC_parse != paren) + vFAIL2("Sequence (?%c... not terminated", + paren=='>' ? '<' : paren); + if (SIZE_ONLY) { + SV *svname= Perl_newSVpvf(aTHX_ "%.*s", + (int)(RExC_parse - name_start), name_start); + HE *he_str; + SV *sv_dat; + + if (!RExC_paren_names) { + RExC_paren_names= newHV(); + sv_2mortal((SV*)RExC_paren_names); + } + he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 ); + if ( he_str ) { + sv_dat = HeVAL(he_str); + } else { + /* croak baby croak */ + } + if (SvPOK(sv_dat)) { + IV count=SvIV(sv_dat); + I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1); + SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32)); + pv[count]=RExC_npar; + SvIVX(sv_dat)++; + } else { + (void)SvUPGRADE(sv_dat,SVt_PVNV); + sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); + SvIOK_on(sv_dat); + SvIVX(sv_dat)= 1; + } + /*sv_dump(sv_dat);*/ + } + nextchar(pRExC_state); + paren = 1; + goto capturing_parens; + } + RExC_seen |= REG_SEEN_LOOKBEHIND; RExC_parse++; case '=': /* (?=...) */ case '!': /* (?!...) */ @@ -4412,6 +4509,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) nextchar(pRExC_state); *flagp = TRYAGAIN; return NULL; + case '0' : case 'R' : if (*RExC_parse != ')') FAIL("Sequence (?R) not terminated"); @@ -4657,6 +4755,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } } else { /* (...) */ + capturing_parens: parno = RExC_npar; RExC_npar++; ret = reganode(pRExC_state, OPEN, parno); @@ -5567,6 +5666,68 @@ tryagain: ++RExC_parse; ret= reg_namedseq(pRExC_state, NULL); break; + case 'k': + { + char ch= RExC_parse[1]; + if (ch != '<' && ch != '\'') { + if (SIZE_ONLY) + vWARN( RExC_parse + 1, + "Possible broken named back reference treated as literal k"); + parse_start--; + goto defchar; + } else { + char* name_start = (RExC_parse += 2); + I32 num = 0; + ch= (ch == '<') ? '>' : '\''; + + if (UTF) { + STRLEN numlen; + while(isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, UTF8_ALLOW_DEFAULT))) + RExC_parse += numlen; + } else { + while(isIDFIRST(*RExC_parse)) + RExC_parse++; + } + if (RExC_parse == name_start || *RExC_parse != ch) + vFAIL2("Sequence \\k%c... not terminated", + (ch == '>' ? '<' : ch)); + + RExC_sawback = 1; + ret = reganode(pRExC_state, + (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF), + num); + *flagp |= HASWIDTH; + + + if (!SIZE_ONLY) { + SV *svname = Perl_newSVpvf(aTHX_ "%.*s", + (int)(RExC_parse - name_start), name_start); + HE *he_str; + SV *sv_dat; + if (UTF) + SvUTF8_on(svname); + he_str = hv_fetch_ent( RExC_paren_names, svname, 0, 0 ); + SvREFCNT_dec(svname); + if ( he_str ) { + sv_dat = HeVAL(he_str); + } else { + vFAIL("Reference to nonexistent group"); + } + num = add_data( pRExC_state, 1, "S" ); + ARG_SET(ret,num); + RExC_rx->data->data[num]=(void*)sv_dat; + SvREFCNT_inc(sv_dat); + } + /* override incorrect value set in reganode MJD */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret); /* MJD */ + nextchar(pRExC_state); + + } + break; + } case 'n': case 'r': case 't': @@ -7690,6 +7851,8 @@ Perl_pregfree(pTHX_ struct regexp *r) SvREFCNT_dec(r->float_utf8); Safefree(r->substrs); } + if (r->paren_names) + SvREFCNT_dec(r->paren_names); if (r->data) { int n = r->data->count; PAD* new_comppad = NULL; @@ -7700,6 +7863,7 @@ Perl_pregfree(pTHX_ struct regexp *r) /* If you add a ->what type here, update the comment in regcomp.h */ switch (r->data->what[n]) { case 's': + case 'S': SvREFCNT_dec((SV*)r->data->data[n]); break; case 'f': @@ -7793,6 +7957,7 @@ Perl_pregfree(pTHX_ struct regexp *r) #define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t)) #define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t)) +#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t)) #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) /* @@ -7856,6 +8021,7 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param) /* legal options are one of: sfpont see also regcomp.h and pregfree() */ case 's': + case 'S': d->data[i] = sv_dup_inc((SV *)r->data->data[i], param); break; case 'p': @@ -7920,6 +8086,8 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param) ret->sublen = r->sublen; ret->engine = r->engine; + + ret->paren_names = hv_dup_inc(r->paren_names, param); if (RX_MATCH_COPIED(ret)) ret->subbeg = SAVEPVN(r->subbeg, r->sublen); diff --git a/regcomp.h b/regcomp.h index 166be14..e7b5a2c 100644 --- a/regcomp.h +++ b/regcomp.h @@ -413,6 +413,7 @@ END_EXTERN_C * in the character class * t - trie struct * T - aho-trie struct + * S - sv for named capture lookup * 20010712 mjd@plover.com * (Remember to update re_dup() and pregfree() if you add any items.) */ diff --git a/regcomp.sym b/regcomp.sym index f3f7164..21904e1 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -77,9 +77,9 @@ BACK BACK, no Match "", "next" ptr points backward. #*Literals (33..35) -EXACT EXACT, sv Match this string (preceded by length). -EXACTF EXACT, sv Match this string, folded (prec. by length). -EXACTFL EXACT, sv Match this string, folded in locale (w/len). +EXACT EXACT, str Match this string (preceded by length). +EXACTF EXACT, str Match this string, folded (prec. by length). +EXACTFL EXACT, str Match this string, folded in locale (w/len). #*Do nothing types (36..37) @@ -154,15 +154,21 @@ TRIEC TRIE, trie charclass Same as TRIE, but with embedded charclass data AHOCORASICK TRIE, trie 1 Aho Corasick stclass. flags==type AHOCORASICKC TRIE, trie charclass Same as AHOCORASICK, but with embedded charclass data -#*Recursion (65) +#*Recursion (65..66) RECURSE RECURSE, num/ofs 2L recurse to paren arg1 at (signed) ofs arg2 SRECURSE RECURSE, no recurse to start of pattern +#*Named references (67..69) +NREF NREF, no-sv 1 Match some already matched string +NREFF NREF, no-sv 1 Match already matched string, folded +NREFFL NREF, no-sv 1 Match already matched string, folded in loc. + + # NEW STUFF ABOVE THIS LINE -- Please update counts below. ################################################################################ -#*SPECIAL REGOPS (65, 66) +#*SPECIAL REGOPS (70, 71) # This is not really a node, but an optimized away piece of a "long" node. # To simplify debugging output, we mark it as if it were a node diff --git a/regexec.c b/regexec.c index bd061fd..2743c53 100644 --- a/regexec.c +++ b/regexec.c @@ -2028,6 +2028,8 @@ got_it: the same. */ restore_pos(aTHX_ prog); } + if (prog->paren_names) + (void)hv_iterinit(prog->paren_names); /* make sure $`, $&, $', and $digit will work later */ if ( !(flags & REXEC_NOT_FIRST) ) { @@ -3288,13 +3290,39 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) locinput++; nextchr = UCHARAT(locinput); break; + + case NREFFL: + { + char *s; + char type = OP(scan); + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NREF: + case NREFF: + { + SV *sv_dat=(SV*)rex->data->data[ ARG( scan ) ]; + I32 *nums=(I32*)SvPVX(sv_dat); + for ( n=0; n= nums[n] && + PL_regstartp[nums[n]] != -1 && + PL_regendp[nums[n]] != -1) + { + n = nums[n]; + type = REF + ( type - NREF ); + goto do_ref; + } + } + sayNO; + /* unreached */ + } case REFFL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case REF: - case REFF: { - char *s; + case REFF: n = ARG(scan); /* which paren pair */ + type = OP(scan); + do_ref: ln = PL_regstartp[n]; PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ if ((I32)*PL_reglastparen < n || ln == -1) @@ -3303,7 +3331,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) break; s = PL_bostr + ln; - if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */ + if (do_utf8 && type != REF) { /* REF can do byte comparison */ char *l = locinput; const char *e = PL_bostr + PL_regendp[n]; /* @@ -3311,7 +3339,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) * in the 8-bit case (no pun intended) because in Unicode we * have to map both upper and title case to lower case. */ - if (OP(scan) == REFF) { + if (type == REFF) { while (s < e) { STRLEN ulen1, ulen2; U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; @@ -3334,24 +3362,23 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog) /* Inline the first character, for speed. */ if (UCHARAT(s) != nextchr && - (OP(scan) == REF || - (UCHARAT(s) != ((OP(scan) == REFF - ? PL_fold : PL_fold_locale)[nextchr])))) + (type == REF || + (UCHARAT(s) != (type == REFF + ? PL_fold : PL_fold_locale)[nextchr]))) sayNO; ln = PL_regendp[n] - ln; if (locinput + ln > PL_regeol) sayNO; - if (ln > 1 && (OP(scan) == REF + if (ln > 1 && (type == REF ? memNE(s, locinput, ln) - : (OP(scan) == REFF + : (type == REFF ? ibcmp(s, locinput, ln) : ibcmp_locale(s, locinput, ln)))) sayNO; locinput += ln; nextchr = UCHARAT(locinput); break; - } - + } case NOTHING: case TAIL: break; diff --git a/regexp.h b/regexp.h index 4048669..faed0ee 100644 --- a/regexp.h +++ b/regexp.h @@ -54,7 +54,8 @@ typedef struct regexp { U32 lastcloseparen; /* last paren matched */ U32 reganch; /* Internal use only + Tainted information used by regexec? */ - const struct regexp_engine* engine; + HV *paren_names; /* Paren names */ + const struct regexp_engine* engine; regnode program[1]; /* Unwarranted chumminess with compiler. */ } regexp; diff --git a/regnodes.h b/regnodes.h index 78db033..3030e04 100644 --- a/regnodes.h +++ b/regnodes.h @@ -6,8 +6,8 @@ /* Regops and State definitions */ -#define REGNODE_MAX 68 -#define REGMATCH_STATE_MAX 98 +#define REGNODE_MAX 71 +#define REGMATCH_STATE_MAX 101 #define END 0 /* 0000 End of program. */ #define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */ @@ -76,41 +76,44 @@ #define AHOCORASICKC 64 /* 0x40 Same as AHOCORASICK, but with embedded charclass data */ #define RECURSE 65 /* 0x41 recurse to paren arg1 at (signed) ofs arg2 */ #define SRECURSE 66 /* 0x42 recurse to start of pattern */ -#define OPTIMIZED 67 /* 0x43 Placeholder for dump. */ -#define PSEUDO 68 /* 0x44 Pseudo opcode for internal use. */ +#define NREF 67 /* 0x43 Match some already matched string */ +#define NREFF 68 /* 0x44 Match already matched string, folded */ +#define NREFFL 69 /* 0x45 Match already matched string, folded in loc. */ +#define OPTIMIZED 70 /* 0x46 Placeholder for dump. */ +#define PSEUDO 71 /* 0x47 Pseudo opcode for internal use. */ /* ------------ States ------------- */ -#define TRIE_next 69 /* 0x45 Regmatch state for TRIE */ -#define TRIE_next_fail 70 /* 0x46 Regmatch state for TRIE */ -#define EVAL_AB 71 /* 0x47 Regmatch state for EVAL */ -#define EVAL_AB_fail 72 /* 0x48 Regmatch state for EVAL */ -#define CURLYX_end 73 /* 0x49 Regmatch state for CURLYX */ -#define CURLYX_end_fail 74 /* 0x4a Regmatch state for CURLYX */ -#define WHILEM_A_pre 75 /* 0x4b Regmatch state for WHILEM */ -#define WHILEM_A_pre_fail 76 /* 0x4c Regmatch state for WHILEM */ -#define WHILEM_A_min 77 /* 0x4d Regmatch state for WHILEM */ -#define WHILEM_A_min_fail 78 /* 0x4e Regmatch state for WHILEM */ -#define WHILEM_A_max 79 /* 0x4f Regmatch state for WHILEM */ -#define WHILEM_A_max_fail 80 /* 0x50 Regmatch state for WHILEM */ -#define WHILEM_B_min 81 /* 0x51 Regmatch state for WHILEM */ -#define WHILEM_B_min_fail 82 /* 0x52 Regmatch state for WHILEM */ -#define WHILEM_B_max 83 /* 0x53 Regmatch state for WHILEM */ -#define WHILEM_B_max_fail 84 /* 0x54 Regmatch state for WHILEM */ -#define BRANCH_next 85 /* 0x55 Regmatch state for BRANCH */ -#define BRANCH_next_fail 86 /* 0x56 Regmatch state for BRANCH */ -#define CURLYM_A 87 /* 0x57 Regmatch state for CURLYM */ -#define CURLYM_A_fail 88 /* 0x58 Regmatch state for CURLYM */ -#define CURLYM_B 89 /* 0x59 Regmatch state for CURLYM */ -#define CURLYM_B_fail 90 /* 0x5a Regmatch state for CURLYM */ -#define IFMATCH_A 91 /* 0x5b Regmatch state for IFMATCH */ -#define IFMATCH_A_fail 92 /* 0x5c Regmatch state for IFMATCH */ -#define CURLY_B_min_known 93 /* 0x5d Regmatch state for CURLY */ -#define CURLY_B_min_known_fail 94 /* 0x5e Regmatch state for CURLY */ -#define CURLY_B_min 95 /* 0x5f Regmatch state for CURLY */ -#define CURLY_B_min_fail 96 /* 0x60 Regmatch state for CURLY */ -#define CURLY_B_max 97 /* 0x61 Regmatch state for CURLY */ -#define CURLY_B_max_fail 98 /* 0x62 Regmatch state for CURLY */ +#define TRIE_next 72 /* 0x48 Regmatch state for TRIE */ +#define TRIE_next_fail 73 /* 0x49 Regmatch state for TRIE */ +#define EVAL_AB 74 /* 0x4a Regmatch state for EVAL */ +#define EVAL_AB_fail 75 /* 0x4b Regmatch state for EVAL */ +#define CURLYX_end 76 /* 0x4c Regmatch state for CURLYX */ +#define CURLYX_end_fail 77 /* 0x4d Regmatch state for CURLYX */ +#define WHILEM_A_pre 78 /* 0x4e Regmatch state for WHILEM */ +#define WHILEM_A_pre_fail 79 /* 0x4f Regmatch state for WHILEM */ +#define WHILEM_A_min 80 /* 0x50 Regmatch state for WHILEM */ +#define WHILEM_A_min_fail 81 /* 0x51 Regmatch state for WHILEM */ +#define WHILEM_A_max 82 /* 0x52 Regmatch state for WHILEM */ +#define WHILEM_A_max_fail 83 /* 0x53 Regmatch state for WHILEM */ +#define WHILEM_B_min 84 /* 0x54 Regmatch state for WHILEM */ +#define WHILEM_B_min_fail 85 /* 0x55 Regmatch state for WHILEM */ +#define WHILEM_B_max 86 /* 0x56 Regmatch state for WHILEM */ +#define WHILEM_B_max_fail 87 /* 0x57 Regmatch state for WHILEM */ +#define BRANCH_next 88 /* 0x58 Regmatch state for BRANCH */ +#define BRANCH_next_fail 89 /* 0x59 Regmatch state for BRANCH */ +#define CURLYM_A 90 /* 0x5a Regmatch state for CURLYM */ +#define CURLYM_A_fail 91 /* 0x5b Regmatch state for CURLYM */ +#define CURLYM_B 92 /* 0x5c Regmatch state for CURLYM */ +#define CURLYM_B_fail 93 /* 0x5d Regmatch state for CURLYM */ +#define IFMATCH_A 94 /* 0x5e Regmatch state for IFMATCH */ +#define IFMATCH_A_fail 95 /* 0x5f Regmatch state for IFMATCH */ +#define CURLY_B_min_known 96 /* 0x60 Regmatch state for CURLY */ +#define CURLY_B_min_known_fail 97 /* 0x61 Regmatch state for CURLY */ +#define CURLY_B_min 98 /* 0x62 Regmatch state for CURLY */ +#define CURLY_B_min_fail 99 /* 0x63 Regmatch state for CURLY */ +#define CURLY_B_max 100 /* 0x64 Regmatch state for CURLY */ +#define CURLY_B_max_fail 101 /* 0x65 Regmatch state for CURLY */ /* PL_regkind[] What type of regop or state is this. */ @@ -185,6 +188,9 @@ EXTCONST U8 PL_regkind[] = { TRIE, /* AHOCORASICKC */ RECURSE, /* RECURSE */ RECURSE, /* SRECURSE */ + NREF, /* NREF */ + NREF, /* NREFF */ + NREF, /* NREFFL */ NOTHING, /* OPTIMIZED */ PSEUDO, /* PSEUDO */ /* ------------ States ------------- */ @@ -292,6 +298,9 @@ static const U8 regarglen[] = { EXTRA_SIZE(struct regnode_charclass), /* AHOCORASICKC */ EXTRA_SIZE(struct regnode_2L), /* RECURSE */ 0, /* SRECURSE */ + EXTRA_SIZE(struct regnode_1), /* NREF */ + EXTRA_SIZE(struct regnode_1), /* NREFF */ + EXTRA_SIZE(struct regnode_1), /* NREFFL */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -366,6 +375,9 @@ static const char reg_off_by_arg[] = { 0, /* AHOCORASICKC */ 0, /* RECURSE */ 0, /* SRECURSE */ + 0, /* NREF */ + 0, /* NREFF */ + 0, /* NREFFL */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -441,39 +453,42 @@ const char * reg_name[] = { "AHOCORASICKC", /* 0x40 */ "RECURSE", /* 0x41 */ "SRECURSE", /* 0x42 */ - "OPTIMIZED", /* 0x43 */ - "PSEUDO", /* 0x44 */ + "NREF", /* 0x43 */ + "NREFF", /* 0x44 */ + "NREFFL", /* 0x45 */ + "OPTIMIZED", /* 0x46 */ + "PSEUDO", /* 0x47 */ /* ------------ States ------------- */ - "TRIE_next", /* 0x45 */ - "TRIE_next_fail", /* 0x46 */ - "EVAL_AB", /* 0x47 */ - "EVAL_AB_fail", /* 0x48 */ - "CURLYX_end", /* 0x49 */ - "CURLYX_end_fail", /* 0x4a */ - "WHILEM_A_pre", /* 0x4b */ - "WHILEM_A_pre_fail", /* 0x4c */ - "WHILEM_A_min", /* 0x4d */ - "WHILEM_A_min_fail", /* 0x4e */ - "WHILEM_A_max", /* 0x4f */ - "WHILEM_A_max_fail", /* 0x50 */ - "WHILEM_B_min", /* 0x51 */ - "WHILEM_B_min_fail", /* 0x52 */ - "WHILEM_B_max", /* 0x53 */ - "WHILEM_B_max_fail", /* 0x54 */ - "BRANCH_next", /* 0x55 */ - "BRANCH_next_fail", /* 0x56 */ - "CURLYM_A", /* 0x57 */ - "CURLYM_A_fail", /* 0x58 */ - "CURLYM_B", /* 0x59 */ - "CURLYM_B_fail", /* 0x5a */ - "IFMATCH_A", /* 0x5b */ - "IFMATCH_A_fail", /* 0x5c */ - "CURLY_B_min_known", /* 0x5d */ - "CURLY_B_min_known_fail", /* 0x5e */ - "CURLY_B_min", /* 0x5f */ - "CURLY_B_min_fail", /* 0x60 */ - "CURLY_B_max", /* 0x61 */ - "CURLY_B_max_fail", /* 0x62 */ + "TRIE_next", /* 0x48 */ + "TRIE_next_fail", /* 0x49 */ + "EVAL_AB", /* 0x4a */ + "EVAL_AB_fail", /* 0x4b */ + "CURLYX_end", /* 0x4c */ + "CURLYX_end_fail", /* 0x4d */ + "WHILEM_A_pre", /* 0x4e */ + "WHILEM_A_pre_fail", /* 0x4f */ + "WHILEM_A_min", /* 0x50 */ + "WHILEM_A_min_fail", /* 0x51 */ + "WHILEM_A_max", /* 0x52 */ + "WHILEM_A_max_fail", /* 0x53 */ + "WHILEM_B_min", /* 0x54 */ + "WHILEM_B_min_fail", /* 0x55 */ + "WHILEM_B_max", /* 0x56 */ + "WHILEM_B_max_fail", /* 0x57 */ + "BRANCH_next", /* 0x58 */ + "BRANCH_next_fail", /* 0x59 */ + "CURLYM_A", /* 0x5a */ + "CURLYM_A_fail", /* 0x5b */ + "CURLYM_B", /* 0x5c */ + "CURLYM_B_fail", /* 0x5d */ + "IFMATCH_A", /* 0x5e */ + "IFMATCH_A_fail", /* 0x5f */ + "CURLY_B_min_known", /* 0x60 */ + "CURLY_B_min_known_fail", /* 0x61 */ + "CURLY_B_min", /* 0x62 */ + "CURLY_B_min_fail", /* 0x63 */ + "CURLY_B_max", /* 0x64 */ + "CURLY_B_max_fail", /* 0x65 */ }; #endif /* DEBUGGING */ #else diff --git a/sv.c b/sv.c index 16c6523..19e1d26 100644 --- a/sv.c +++ b/sv.c @@ -4484,6 +4484,9 @@ Perl_sv_magic(pTHX_ register SV *sv, SV *obj, int how, const char *name, I32 nam case PERL_MAGIC_regdata: vtable = &PL_vtbl_regdata; break; + case PERL_MAGIC_regdata_names: + vtable = &PL_vtbl_regdata_names; + break; case PERL_MAGIC_regdatum: vtable = &PL_vtbl_regdatum; break; diff --git a/t/op/pat.t b/t/op/pat.t index e20a6f7..e1ac167 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -3657,6 +3657,31 @@ SKIP:{ } } +{ + my $s='123453456'; + $s=~s/(?\d+)\k/$+{digits}/; + ok($s eq '123456','Named capture (angle brackets) s///'); + $s='123453456'; + $s=~s/(?'digits'\d+)\k'digits'/$+{digits}/; + ok($s eq '123456','Named capture (single quotes) s///'); +} +{ + my $s='foo bar baz'; + my (@k,@v,$count); + if ($s=~/(?foo)\s+(?bar)?\s+(?baz)/) { + while (my ($k,$v)=each(%+)) { + $count++; + } + @k=sort keys(%+); + @v=sort values(%+); + } + ok($count==3,"Got 3 keys in %+ via each ($count)"); + ok(@k == 3, 'Got 3 keys in %+ via keys'); + ok("@k" eq "A B C", "Got expected keys"); + ok("@v" eq "bar baz foo", "Got expected values"); +} + + # stress test CURLYX/WHILEM. # # This test includes varying levels of nesting, and according to @@ -3771,5 +3796,5 @@ ok((q(a)x 100) =~ /^(??{'(.)'x 100})/, or print "# Unexpected outcome: should pass or crash perl\n"; # Don't forget to update this! -BEGIN{print "1..1264\n"}; +BEGIN{print "1..1270\n"}; diff --git a/t/op/re_tests b/t/op/re_tests index 6759f34..08d45b2 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -1020,3 +1020,20 @@ X(?<=foo.)[YZ] ..XfooXY.. y pos 8 ^(<(?:[^<>]+|(?1))*>)$ <<><<<><>>>> y $1 <<><<<><>>>> ((?2)*)([fF]o+) fooFoFoo y $1-$2 fooFo-Foo (<(?:[^<>]+|(?R))*>) <<><<<><>>>> y $1 <<><<<><>>>> +(?foo|bar|baz) snofooewa y $1 foo +(?foo|bar|baz) snofooewa y $+{n} foo +(?foo|bar|baz)(?[ew]+) snofooewa y $+{n} foo +(?foo|bar|baz)(?[ew]+) snofooewa y $+{m} ew +(?foo)|(?bar)|(?baz) snofooewa y $+{n} foo +(?foo)(??{ $+{n} }) snofooefoofoowaa y $+{n} foo +/(?'n'foo|bar|baz)/ snofooewa y $1 foo +/(?'n'foo|bar|baz)/ snofooewa y $+{n} foo +/(?'n'foo|bar|baz)(?'m'[ew]+)/ snofooewa y $+{n} foo +/(?'n'foo|bar|baz)(?'m'[ew]+)/ snofooewa y $+{m} ew +/(?'n'foo)|(?'n'bar)|(?baz)/ snobazewa y $+{n} baz +/(?'n'foo)(??{ $+{n} })/ snofooefoofoowaa y $+{n} foo +/(?'n'foo)\k/ ..foofoo.. y $1 foo +/(?'n'foo)\k/ ..foofoo.. y $+{n} foo +/(?foo)\k'n'/ ..foofoo.. y $1 foo +/(?foo)\k'n'/ ..foofoo.. y $+{n} foo +/(?:(?foo)|(?bar))\k/ ..barbar.. y $+{n} bar diff --git a/t/op/regexp.t b/t/op/regexp.t index 2b21766..6a469b7 100755 --- a/t/op/regexp.t +++ b/t/op/regexp.t @@ -64,7 +64,7 @@ while () { $input = join(':',$pat,$subject,$result,$repl,$expect); infty_subst(\$pat); infty_subst(\$expect); - $pat = "'$pat'" unless $pat =~ /^[:']/; + $pat = "'$pat'" unless $pat =~ /^[:'\/]/; $pat =~ s/(\$\{\w+\})/$1/eeg; $pat =~ s/\\n/\n/g; $subject = eval qq("$subject"); diff --git a/toke.c b/toke.c index 1cce947..5c24cca 100644 --- a/toke.c +++ b/toke.c @@ -1793,7 +1793,7 @@ S_scan_const(pTHX_ char *start) const char * const leaveit = /* set of acceptably-backslashed characters */ (const char *) (PL_lex_inpat - ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-Nnrtfeaxcz0123456789[{]} \t\n\r\f\v#" + ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-Nnrktfeaxcz0123456789[{]} \t\n\r\f\v#" : ""); if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { -- 2.7.4