Regexp Recurse by name.
authorYves Orton <demerphq@gmail.com>
Mon, 9 Oct 2006 20:36:20 +0000 (22:36 +0200)
committerRafael Garcia-Suarez <rgarciasuarez@gmail.com>
Tue, 10 Oct 2006 13:13:18 +0000 (13:13 +0000)
Message-ID: <9b18b3110610091136g48e5b154tf16d00d38e80a6dc@mail.gmail.com>

(with doc nits)

p4raw-id: //depot/perl@28981

embed.fnc
embed.h
ext/re/re.pm
pod/perlre.pod
proto.h
regcomp.c
t/op/re_tests

index bccc933..9be1e37 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1315,6 +1315,7 @@ Es        |regnode*|regpiece      |NN struct RExC_state_t *state|NN I32 *flagp|U32 depth
 Es     |regnode*|reg_namedseq  |NN struct RExC_state_t *state|NULLOK UV *valuep
 Es     |void   |reginsert      |NN struct RExC_state_t *state|U8 op|NN regnode *opnd|U32 depth
 Es     |void   |regtail        |NN struct RExC_state_t *state|NN regnode *p|NN const regnode *val|U32 depth
+Es     |SV *   |reg_scan_name  |NN struct RExC_state_t *state|U32 flags
 Es     |U32    |join_exact     |NN struct RExC_state_t *state|NN regnode *scan|NN I32 *min|U32 flags|NULLOK regnode *val|U32 depth
 EsRn   |char*  |regwhite       |NN char *p|NN const char *e
 Es     |char*  |nextchar       |NN struct RExC_state_t *state
diff --git a/embed.h b/embed.h
index a3e8f70..dc5efad 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define reg_namedseq           S_reg_namedseq
 #define reginsert              S_reginsert
 #define regtail                        S_regtail
+#define reg_scan_name          S_reg_scan_name
 #define join_exact             S_join_exact
 #define regwhite               S_regwhite
 #define nextchar               S_nextchar
 #define reg_namedseq(a,b)      S_reg_namedseq(aTHX_ a,b)
 #define reginsert(a,b,c,d)     S_reginsert(aTHX_ a,b,c,d)
 #define regtail(a,b,c,d)       S_regtail(aTHX_ a,b,c,d)
+#define reg_scan_name(a,b)     S_reg_scan_name(aTHX_ a,b)
 #define join_exact(a,b,c,d,e,f)        S_join_exact(aTHX_ a,b,c,d,e,f)
 #define regwhite               S_regwhite
 #define nextchar(a)            S_nextchar(aTHX_ a)
index 87a450d..b763fef 100644 (file)
@@ -218,7 +218,7 @@ sub setcolor {
  if ($@) {
     $ENV{PERL_RE_COLORS}||=qq'\t\t> <\t> <\t\t'
  }
-                
+
 }
 
 my %flags = (
@@ -242,23 +242,33 @@ my %flags = (
 );
 $flags{ALL} = -1;
 $flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE};
+$flags{Extra} = $flags{EXECUTE} | $flags{COMPILE};
 $flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE};
 $flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE};
 $flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC};
 
-my $installed =eval {
-    require XSLoader;
-    XSLoader::load('re');
-    install();
-};
+my $installed;
 
 sub _load_unload {
     my ($on)= @_;
     if ($on) {
-        die "'re' not installed!?" unless $installed;
-        #warn "installed: $installed\n";
-        install();  # allow for changes in colors
-        $^H{regcomp}= $installed;
+        if ( ! defined($installed) ) {
+            require XSLoader;
+            XSLoader::load('re');
+            $installed = install() || 0;
+        }
+        if ( ! $installed ) {
+            die "'re' not installed!?";
+        }  else {
+            # We could just say = $installed; but then we wouldn't
+            # "see" any changes to the color environment var.
+
+            # install() returns an integer, which if casted properly
+            # in C resolves to a structure containing the regex
+            # hooks. Setting it to a random integer will guarantee
+            # segfaults.
+            $^H{regcomp} = install();
+        }
     } else {
         delete $^H{regcomp};
     }
index 7cc5dec..a22344f 100644 (file)
@@ -820,9 +820,8 @@ Recursing deeper than 50 times without consuming any input string will
 result in a fatal error.  The maximum depth is compiled into perl, so 
 changing it requires a custom build.
 
-=item C<(?PARNO)> C<(?R)>
-
-X<(?PARNO)> X<(?1)>
+=item C<(?PARNO)> C<(?R)> C<(?0)>
+X<(?PARNO)> X<(?1)> X<(?R)> X<(?0)>
 X<regex, recursive> X<regexp, recursive> X<regular expression, recursive>
 
 B<WARNING>:  This extended regular expression feature is considered
@@ -834,9 +833,10 @@ pattern that must match at the current position.  Capture buffers
 contained by the pattern will have the value as determined by the
 outermost recursion.
 
-PARNO is a sequence of digits not starting with 0 whose value
-reflects the paren-number of the capture buffer to recurse to.
-C<(?R)> curses to the beginning of the pattern.
+PARNO is a sequence of digits (not starting with 0) whose value reflects
+the paren-number of the capture buffer to recurse to. C<(?R)> recurses to
+the beginning of the whole pattern. C<(?0)> is an alternate syntax for
+C<(?R)>.
 
 The following pattern matches a function foo() which may contain
 balanced parenthesis as the argument.
@@ -881,6 +881,16 @@ a recursed group, in PCRE and Python the recursed into group is treated
 as atomic. Also, constructs like (?i:(?1)) or (?:(?i)(?1)) do not affect
 the pattern being recursed into.
 
+=item C<(?&NAME)>
+X<(?&NAME)>
+
+Recurse to a named subpattern. Identical to (?PARNO) except that the
+parenthesis to recurse to is determined by name. If multiple parens have
+the same name, then it recurses to the leftmost.
+
+It is an error to refer to a name that is not declared somewhere in the
+pattern.
+
 =item C<< (?>pattern) >>
 X<backtrack> X<backtracking> X<atomic> X<possessive>
 
diff --git a/proto.h b/proto.h
index dc740cb..0e51ab4 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -3586,6 +3586,9 @@ STATIC void       S_regtail(pTHX_ struct RExC_state_t *state, regnode *p, const regnod
                        __attribute__nonnull__(pTHX_2)
                        __attribute__nonnull__(pTHX_3);
 
+STATIC SV *    S_reg_scan_name(pTHX_ struct RExC_state_t *state, U32 flags)
+                       __attribute__nonnull__(pTHX_1);
+
 STATIC U32     S_join_exact(pTHX_ struct RExC_state_t *state, regnode *scan, I32 *min, U32 flags, regnode *val, U32 depth)
                        __attribute__nonnull__(pTHX_1)
                        __attribute__nonnull__(pTHX_2)
index 71c9133..64e6c8d 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -4323,6 +4323,35 @@ Perl_reg_named_buff_sv(pTHX_ SV* namesv)
     }
 }
 
+/* Scans the name of a named buffer from the pattern.
+ * If flags is true then returns an SV containing the name.
+ */
+STATIC SV*
+S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
+    char *name_start = RExC_parse;
+    if (UTF) {
+       STRLEN numlen;
+       while (isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
+                       RExC_end - RExC_parse,
+                       &numlen, UTF8_ALLOW_DEFAULT)))
+           RExC_parse += numlen;
+    }
+    else {
+       while (isIDFIRST(*RExC_parse))
+           RExC_parse++;
+    }
+    if (flags) {
+       SV* svname = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
+                   (int)(RExC_parse - name_start)));
+       if (UTF)
+           SvUTF8_on(svname);
+       return svname;
+    }
+    else {
+       return NULL;
+    }
+}
+
 #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
     int rem=(int)(RExC_end - RExC_parse);                       \
     int cut;                                                    \
@@ -4430,37 +4459,28 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
            paren = *RExC_parse++;
            ret = NULL;                 /* For look-ahead/behind. */
            switch (paren) {
-               
+
            case '<':           /* (?<...) */
                if (*RExC_parse == '!')
                    paren = ',';
-               else if (*RExC_parse != '=') 
-               {               /* (?<...>) */
+               else if (*RExC_parse != '=') { /* (?<...>) */
                    char *name_start;
+                   SV *svname;
                    paren= '>';
             case '\'':          /* (?'...') */
                    name_start= RExC_parse;
-                   if (UTF) {
-                       STRLEN numlen;
-                       while(isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse, 
-                           RExC_end - RExC_parse, 
-                           &numlen, UTF8_ALLOW_DEFAULT)))
-                               RExC_parse += numlen;
-                   } else {
-                       while(isIDFIRST(*RExC_parse))
-                           RExC_parse++;
-                   }
+                   svname = reg_scan_name(pRExC_state,SIZE_ONLY);
                    if (RExC_parse == name_start)
                        goto unknown;
                    if (*RExC_parse != paren)
                        vFAIL2("Sequence (?%c... not terminated",
                            paren=='>' ? '<' : paren);
                    if (SIZE_ONLY) {
-                       SV *svname= Perl_newSVpvf(aTHX_ "%.*s",
-                               (int)(RExC_parse - name_start), name_start);
                        HE *he_str;
                        SV *sv_dat = NULL;
-
+                        if (!svname) /* shouldnt happen */
+                            Perl_croak(aTHX_
+                                "panic: reg_scan_name returned NULL");
                         if (!RExC_paren_names) {
                             RExC_paren_names= newHV();
                             sv_2mortal((SV*)RExC_paren_names);
@@ -4511,22 +4531,53 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                nextchar(pRExC_state);
                *flagp = TRYAGAIN;
                return NULL;
-           case '0' :
-            case 'R' :
-                if (*RExC_parse != ')')
+           case '0' :           /* (?0) */
+           case 'R' :           /* (?R) */
+               if (*RExC_parse != ')')
                    FAIL("Sequence (?R) not terminated");
                reg_node(pRExC_state, SRECURSE);
-               break;
+               break;           /* (?PARNO) */
+            { /* named and numeric backreferences */
+                I32 num;
+                char * parse_start;
+            case '&':            /* (?&NAME) */
+                parse_start = RExC_parse - 1;
+                {
+                   char *name_start = RExC_parse;
+                   SV *svname = reg_scan_name(pRExC_state, !SIZE_ONLY);
+                   if (RExC_parse == name_start)
+                       goto unknown;
+                   if (*RExC_parse != ')')
+                       vFAIL("Expecting close bracket");
+                   if (!SIZE_ONLY) {
+                       HE *he_str = NULL;
+                        SV *sv_dat;
+                        if (!svname) /* shouldn't happen*/
+                            Perl_croak(aTHX_ "panic: reg_scan_name returned NULL");
+                        if (RExC_paren_names)
+                            he_str = hv_fetch_ent( RExC_paren_names, svname, 0, 0 );
+                        if (he_str)
+                            sv_dat = HeVAL(he_str);
+                        else
+                            vFAIL("Reference to nonexistent group");
+                        num = *((I32 *)SvPVX(sv_dat));
+                    } else {
+                        num = 0;
+                    }
+                }
+                goto gen_recurse_regop;
+                /* NOT REACHED */
             case '1': case '2': case '3': case '4': /* (?1) */
            case '5': case '6': case '7': case '8': case '9':
                RExC_parse--;
-           {
-               const I32 num = atoi(RExC_parse);
-               char * const parse_start = RExC_parse - 1; /* MJD */
+               num = atoi(RExC_parse);
+               parse_start = RExC_parse - 1; /* MJD */
                while (isDIGIT(*RExC_parse))
                        RExC_parse++;
                if (*RExC_parse!=')') 
                    vFAIL("Expecting close bracket");
+                       
+              gen_recurse_regop:
                 ret = reganode(pRExC_state, RECURSE, num);
                 if (!SIZE_ONLY) {
                    if (num > (I32)RExC_rx->nparens) {
@@ -4537,7 +4588,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     RExC_emit++;
                    DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
                        "Recurse #%"UVuf" to %"IVdf"\n", ARG(ret), ARG2L(ret)));
-               } else{
+               } else {
                    RExC_size++;
                    RExC_seen|=REG_SEEN_RECURSE;
                }
@@ -4546,7 +4597,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 
                 nextchar(pRExC_state);
                 return ret;
-            }
+            } /* named and numeric backreferences */
+            /* NOT REACHED */
+
            case 'p':           /* (?p...) */
                if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
                    vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
@@ -5680,18 +5733,9 @@ tryagain:
            } else {
                char* name_start = (RExC_parse += 2);
                I32 num = 0;
-               ch= (ch == '<') ? '>' : '\'';
-
-                if (UTF) {
-                    STRLEN numlen;
-                    while(isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse, 
-                        RExC_end - RExC_parse, 
-                        &numlen, UTF8_ALLOW_DEFAULT)))
-                            RExC_parse += numlen;
-               } else {
-                   while(isIDFIRST(*RExC_parse))
-                       RExC_parse++;
-               }
+                SV *svname = reg_scan_name(pRExC_state,!SIZE_ONLY);
+                ch= (ch == '<') ? '>' : '\'';
+                    
                 if (RExC_parse == name_start || *RExC_parse != ch)
                     vFAIL2("Sequence \\k%c... not terminated",
                         (ch == '>' ? '<' : ch));
@@ -5704,14 +5748,13 @@ tryagain:
                 
                
                 if (!SIZE_ONLY) {
-                    SV *svname = Perl_newSVpvf(aTHX_ "%.*s", 
-                            (int)(RExC_parse - name_start), name_start);
-                    HE *he_str;
+                    HE *he_str = NULL;
                     SV *sv_dat;
-                    if (UTF) 
-                        SvUTF8_on(svname);
-                    he_str = hv_fetch_ent( RExC_paren_names, svname, 0, 0 );
-                    SvREFCNT_dec(svname);
+                    if (!svname)
+                        Perl_croak(aTHX_
+                                "panic: reg_scan_name returned NULL");
+                    if (RExC_paren_names)                                
+                        he_str = hv_fetch_ent( RExC_paren_names, svname, 0, 0 );
                     if ( he_str ) {
                         sv_dat = HeVAL(he_str);
                     } else {
index 08d45b2..83de44a 100644 (file)
@@ -1037,3 +1037,6 @@ X(?<=foo.)[YZ]    ..XfooXY..      y       pos     8
 /(?<n>foo)\k'n'/       ..foofoo..      y       $1      foo
 /(?<n>foo)\k'n'/       ..foofoo..      y       $+{n}   foo
 /(?:(?<n>foo)|(?<n>bar))\k<n>/ ..barbar..      y       $+{n}   bar
+/^(?'main'<(?:[^<>]+|(?&crap)|(?&main))*>)(?'empty')(?'crap'!>!>!>)$/  <<!>!>!>><>>!>!>!>      y       $+{main}        <<!>!>!>><>>
+/^(?'main'<(?:[^<>]+|(?&main))*>)$/    <<><<<><>>>>    y       $1      <<><<<><>>>>
+/(?'first'(?&second)*)(?'second'[fF]o+)/       fooFoFoo        y       $+{first}-$+{second}    fooFo-Foo