Re: [PATCH] Initial attempt at named captures for perls regexp engine

author Yves Orton <demerphq@gmail.com>

Fri, 6 Oct 2006 19:16:01 +0000 (21:16 +0200)

committer Rafael Garcia-Suarez <rgarciasuarez@gmail.com>

Sat, 7 Oct 2006 14:30:32 +0000 (14:30 +0000)
author Yves Orton <demerphq@gmail.com>
Fri, 6 Oct 2006 19:16:01 +0000 (21:16 +0200)
committer Rafael Garcia-Suarez <rgarciasuarez@gmail.com>
Sat, 7 Oct 2006 14:30:32 +0000 (14:30 +0000)
diff --git a/XSUB.h b/XSUB.h

index de5d33b..e4cc816 100644 (file)
--- a/XSUB.h
+++ b/XSUB.h
@@ -391,6 +391,7 @@ Rethrows a previously caught exception.  See L<perlguts/"Exception Handling">.
  #  define VTBL_uvar            &PL_vtbl_uvar
  #  define VTBL_defelem         &PL_vtbl_defelem
  #  define VTBL_regexp          &PL_vtbl_regexp
+#  define VTBL_regdata_names   &PL_vtbl_regdata_names
  #  define VTBL_regdata         &PL_vtbl_regdata
  #  define VTBL_regdatum                &PL_vtbl_regdatum
  #  ifdef USE_LOCALE_COLLATE
diff --git a/doop.c b/doop.c

index 5d1fc7a..1620465 100644 (file)
--- a/doop.c
+++ b/doop.c
@@ -1425,8 +1425,11 @@ Perl_do_kv(pTHX)
             RETURN;
         }
  
-       if (! SvTIED_mg((SV*)keys, PERL_MAGIC_tied))
+       if (! SvTIED_mg((SV*)keys, PERL_MAGIC_tied) 
+           && ! SvTIED_mg((SV*)keys, PERL_MAGIC_regdata_names))
+       {
             i = HvKEYS(keys);
+       }
         else {
             i = 0;
             while (hv_iternext(keys)) i++;
diff --git a/dump.c b/dump.c

index ce2c7ca..c61516b 100644 (file)
--- a/dump.c
+++ b/dump.c
@@ -1127,6 +1127,7 @@ static const struct { const char type; const char *name; } magic_names[] = {
         { PERL_MAGIC_sv,             "sv(\\0)" },
         { PERL_MAGIC_arylen,         "arylen(#)" },
         { PERL_MAGIC_rhash,          "rhash(%)" },
+       { PERL_MAGIC_regdata_names,  "regdata_names(+)" },
         { PERL_MAGIC_pos,            "pos(.)" },
         { PERL_MAGIC_symtab,         "symtab(:)" },
         { PERL_MAGIC_backref,        "backref(<)" },
diff --git a/embed.fnc b/embed.fnc

index 6723d92..bccc933 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -680,6 +680,7 @@ Ap  |I32    |regexec_flags  |NN regexp* prog|NN char* stringarg \
                                 |NN char* strend|NN char* strbeg|I32 minend \
                                 |NN SV* screamer|NULLOK void* data|U32 flags
  ApR    |regnode*|regnext       |NN regnode* p
+Ep     |SV*|reg_named_buff_sv  |NN SV* namesv
  Ep     |void   |regprop        |NULLOK const regexp *prog|NN SV* sv|NN const regnode* o
  Ap     |void   |repeatcpy      |NN char* to|NN const char* from|I32 len|I32 count
  ApP    |char*  |rninstr        |NN const char* big|NN const char* bigend \
diff --git a/embed.h b/embed.h

index 0e06d49..a3e8f70 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -689,6 +689,7 @@
  #define regexec_flags          Perl_regexec_flags
  #define regnext                        Perl_regnext
  #if defined(PERL_CORE) || defined(PERL_EXT)
+#define reg_named_buff_sv      Perl_reg_named_buff_sv
  #define regprop                        Perl_regprop
  #endif
  #define repeatcpy              Perl_repeatcpy
@@ -2894,6 +2895,7 @@
  #define regexec_flags(a,b,c,d,e,f,g,h) Perl_regexec_flags(aTHX_ a,b,c,d,e,f,g,h)
  #define regnext(a)             Perl_regnext(aTHX_ a)
  #if defined(PERL_CORE) || defined(PERL_EXT)
+#define reg_named_buff_sv(a)   Perl_reg_named_buff_sv(aTHX_ a)
  #define regprop(a,b,c)         Perl_regprop(aTHX_ a,b,c)
  #endif
  #define repeatcpy(a,b,c,d)     Perl_repeatcpy(aTHX_ a,b,c,d)
diff --git a/gv.c b/gv.c

index 637e82f..9ad4434 100644 (file)
--- a/gv.c
+++ b/gv.c
@@ -1188,10 +1188,14 @@ Perl_gv_fetchpvn_flags(pTHX_ const char *nambeg, STRLEN full_len, I32 flags,
             goto magicalize;
  
         case '+':
+       GvMULTI_on(gv);
         {
             AV* const av = GvAVn(gv);
+           HV* const hv = GvHVn(gv);
              sv_magic((SV*)av, (SV*)av, PERL_MAGIC_regdata, NULL, 0);
             SvREADONLY_on(av);
+           hv_magic(hv, NULL, PERL_MAGIC_regdata_names);
+           SvREADONLY_on(hv);
             /* FALL THROUGH */
         }
         case '\023':    /* $^S */
diff --git a/hv.c b/hv.c

index d1835b2..8552cd2 100644 (file)
--- a/hv.c
+++ b/hv.c
@@ -450,12 +450,12 @@ S_hv_fetch_common(pTHX_ HV *hv, SV *keysv, const char *key, STRLEN klen,
      xhv = (XPVHV*)SvANY(hv);
      if (SvMAGICAL(hv)) {
         if (SvRMAGICAL(hv) && !(action & (HV_FETCH_ISSTORE|HV_FETCH_ISEXISTS))) {
-           if (mg_find((SV*)hv, PERL_MAGIC_tied) || SvGMAGICAL((SV*)hv)) {
-               sv = sv_newmortal();
+           MAGIC *regdata = NULL;
+           if (mg_find((SV*)hv, PERL_MAGIC_tied) || SvGMAGICAL((SV*)hv)
+               || (regdata = mg_find((SV*)hv, PERL_MAGIC_regdata_names))) {
  
                 /* XXX should be able to skimp on the HE/HEK here when
                    HV_FETCH_JUST_SV is true.  */
-
                 if (!keysv) {
                     keysv = newSVpvn(key, klen);
                     if (is_utf8) {
@@ -464,7 +464,16 @@ S_hv_fetch_common(pTHX_ HV *hv, SV *keysv, const char *key, STRLEN klen,
                 } else {
                     keysv = newSVsv(keysv);
                 }
-               mg_copy((SV*)hv, sv, (char *)keysv, HEf_SVKEY);
+               if (regdata) {
+                   sv = Perl_reg_named_buff_sv(aTHX_ keysv);
+                   if (!sv) {
+                       SvREFCNT_dec(keysv);
+                       return 0;
+                   }
+               } else {
+                   sv = sv_newmortal();
+                   mg_copy((SV*)hv, sv, (char *)keysv, HEf_SVKEY);
+               }
  
                 /* grab a fake HE/HEK pair from the pool or make a new one */
                 entry = PL_hv_fetch_ent_mh;
@@ -1923,7 +1932,17 @@ Perl_hv_iterinit(pTHX_ HV *hv)
      } else {
         hv_auxinit(hv);
      }
-
+    if (SvMAGICAL(hv) && SvRMAGICAL(hv)) {
+        MAGIC *mg = mg_find((SV*)hv, PERL_MAGIC_regdata_names);
+        if ( mg ) {
+             if (PL_curpm) {
+                const REGEXP * const rx = PM_GETRE(PL_curpm);
+                if (rx && rx->paren_names) {
+                    (void)hv_iterinit(rx->paren_names);
+                } 
+            } 
+        }
+    }
      /* used to be xhv->xhv_fill before 5.004_65 */
      return HvTOTALKEYS(hv);
  }
@@ -2078,6 +2097,7 @@ Perl_hv_iternext_flags(pTHX_ HV *hv, I32 flags)
  
      if (!hv)
         Perl_croak(aTHX_ "Bad hash");
+
      xhv = (XPVHV*)SvANY(hv);
  
      if (!SvOOK(hv)) {
@@ -2089,8 +2109,85 @@ Perl_hv_iternext_flags(pTHX_ HV *hv, I32 flags)
      iter = HvAUX(hv);
  
      oldentry = entry = iter->xhv_eiter; /* HvEITER(hv) */
+    if (SvMAGICAL(hv) && SvRMAGICAL(hv) &&
+           (mg = mg_find((SV*)hv, PERL_MAGIC_regdata_names)))
+    {
+       SV * key;
+       SV *val = NULL;
+       REGEXP * rx;
+       if (!PL_curpm)
+           return NULL;
+       rx = PM_GETRE(PL_curpm);
+       if (rx && rx->paren_names) {
+           hv = rx->paren_names;
+       } else {
+           return NULL;
+       }
  
-    if ((mg = SvTIED_mg((SV*)hv, PERL_MAGIC_tied))) {
+        key =  sv_newmortal();
+        if (entry) {
+            sv_setsv(key, HeSVKEY_force(entry));
+            SvREFCNT_dec(HeSVKEY(entry));       /* get rid of previous key */
+        }
+        else {
+            char *k;
+            HEK *hek;
+
+            /* one HE per MAGICAL hash */
+            iter->xhv_eiter = entry = new_HE(); /* HvEITER(hv) = new_HE() */
+            Zero(entry, 1, HE);
+            Newxz(k, HEK_BASESIZE + sizeof(SV*), char);
+            hek = (HEK*)k;
+            HeKEY_hek(entry) = hek;
+            HeKLEN(entry) = HEf_SVKEY;
+        }
+        {
+            while (!val) {
+                HE *temphe = hv_iternext_flags(hv,flags);
+                if (temphe) {
+                    IV i;
+                    IV parno = 0;
+                    SV* sv_dat = HeVAL(temphe);
+                    I32 *nums = (I32*)SvPVX(sv_dat);
+                    for ( i = 0; i < SvIVX(sv_dat); i++ ) {
+                        if ((I32)(rx->lastcloseparen) >= nums[i] &&
+                            rx->startp[nums[i]] != -1 &&
+                            rx->endp[nums[i]] != -1) 
+                        {
+                            parno = nums[i];
+                            break;
+                        }
+                    }
+                    if (parno) {
+                        GV *gv_paren;
+                        STRLEN len;
+                        SV *sv = sv_newmortal();
+                        const char* pvkey = HePV(temphe, len);
+                        
+                        Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
+                        gv_paren = Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
+                        Perl_sv_setpvn(aTHX_ key, pvkey, len);
+                        val = GvSVn(gv_paren);
+                    } 
+                } else {
+                    break;
+                }
+            }
+        }
+        if (val && SvOK(key)) {
+            /* force key to stay around until next time */
+            HeSVKEY_set(entry, SvREFCNT_inc_simple_NN(key));
+            HeVAL(entry) = SvREFCNT_inc_simple_NN(val); 
+            return entry;               /* beware, hent_val is not set */
+        }
+        if (HeVAL(entry))
+            SvREFCNT_dec(HeVAL(entry));
+        Safefree(HeKEY_hek(entry));
+        del_HE(entry);
+        iter->xhv_eiter = NULL; /* HvEITER(hv) = NULL */
+        return NULL;
+    
+    } else if ((mg = SvTIED_mg((SV*)hv, PERL_MAGIC_tied))) {
         SV * const key = sv_newmortal();
         if (entry) {
             sv_setsv(key, HeSVKEY_force(entry));
diff --git a/perl.h b/perl.h

index 93b4d62..3338ea2 100644 (file)
--- a/perl.h
+++ b/perl.h
@@ -3615,6 +3615,8 @@ Gid_t getegid (void);
  #define PERL_MAGIC_overload_elem  'a' /* %OVERLOAD hash element */
  #define PERL_MAGIC_overload_table 'c' /* Holds overload table (AMT) on stash */
  #define PERL_MAGIC_bm            'B' /* Boyer-Moore (fast string search) */
+#define PERL_MAGIC_regdata_names  '+' /* Regex named capture buffer hash 
+                                       (%+ support) */
  #define PERL_MAGIC_regdata       'D' /* Regex match position data
                                         (@+ and @- vars) */
  #define PERL_MAGIC_regdatum      'd' /* Regex match position data element */
@@ -4830,6 +4832,18 @@ MGVTBL_SET(
  );
  
  MGVTBL_SET(
+    PL_vtbl_regdata_names,
+    NULL,
+    NULL,
+    NULL,
+    NULL,
+    NULL,
+    NULL,
+    NULL,
+    NULL
+);
+
+MGVTBL_SET(
      PL_vtbl_regdata,
      NULL,
      NULL,
diff --git a/pod/perlre.pod b/pod/perlre.pod

index c4dd7c5..7cc5dec 100644 (file)
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -191,20 +191,26 @@ X<metacharacter>
  X<\w> X<\W> X<\s> X<\S> X<\d> X<\D> X<\X> X<\p> X<\P> X<\C>
  X<word> X<whitespace>
  
-    \w Match a "word" character (alphanumeric plus "_")
-    \W Match a non-"word" character
-    \s Match a whitespace character
-    \S Match a non-whitespace character
-    \d Match a digit character
-    \D Match a non-digit character
-    \pP        Match P, named property.  Use \p{Prop} for longer names.
-    \PP        Match non-P
-    \X Match eXtended Unicode "combining character sequence",
-        equivalent to (?:\PM\pM*)
-    \C Match a single C char (octet) even under Unicode.
-       NOTE: breaks up characters into their UTF-8 bytes,
-       so you may end up with malformed pieces of UTF-8.
-       Unsupported in lookbehind.
+    \w      Match a "word" character (alphanumeric plus "_")
+    \W      Match a non-"word" character
+    \s      Match a whitespace character
+    \S      Match a non-whitespace character
+    \d      Match a digit character
+    \D      Match a non-digit character
+    \pP             Match P, named property.  Use \p{Prop} for longer names.
+    \PP             Match non-P
+    \X      Match eXtended Unicode "combining character sequence",
+             equivalent to (?:\PM\pM*)
+    \C      Match a single C char (octet) even under Unicode.
+            NOTE: breaks up characters into their UTF-8 bytes,
+            so you may end up with malformed pieces of UTF-8.
+            Unsupported in lookbehind.
+    \1       Backreference to a a specific group. 
+             '1' may actually be any positive integer
+    \k<name> Named backreference
+    \N{name} Named unicode character, or unicode escape.
+    \x12     Hexadecimal escape sequence
+    \x{1234} Long hexadecimal escape sequence
  
  A C<\w> matches a single alphanumeric character (an alphabetic
  character, or a decimal digit) or C<_>, not a whole word.  Use C<\w+>
@@ -403,7 +409,7 @@ X<\G>
  The bracketing construct C<( ... )> creates capture buffers.  To
  refer to the digit'th buffer use \<digit> within the
  match.  Outside the match use "$" instead of "\".  (The
-\<digit> notation works in certain circumstances outside 
+\<digit> notation works in certain circumstances outside
  the match.  See the warning below about \1 vs $1 for details.)
  Referring back to another part of the match is called a
  I<backreference>.
@@ -414,20 +420,38 @@ There is no limit to the number of captured substrings that you may
  use.  However Perl also uses \10, \11, etc. as aliases for \010,
  \011, etc.  (Recall that 0 means octal, so \011 is the character at
  number 9 in your coded character set; which would be the 10th character,
-a horizontal tab under ASCII.)  Perl resolves this 
-ambiguity by interpreting \10 as a backreference only if at least 10 
-left parentheses have opened before it.  Likewise \11 is a 
-backreference only if at least 11 left parentheses have opened 
-before it.  And so on.  \1 through \9 are always interpreted as 
+a horizontal tab under ASCII.)  Perl resolves this
+ambiguity by interpreting \10 as a backreference only if at least 10
+left parentheses have opened before it.  Likewise \11 is a
+backreference only if at least 11 left parentheses have opened
+before it.  And so on.  \1 through \9 are always interpreted as
  backreferences.
  
+Additionally, as of Perl 5.10 you may use named capture buffers and named
+backreferences. The notation is C<< (?<name>...) >> and C<< \k<name> >>
+(you may also use single quotes instead of angle brackets to quote the
+name). The only difference with named capture buffers and unnamed ones is
+that multiple buffers may have the same name and that the contents of
+named capture buffers is available via the C<%+> hash. When multiple
+groups share the same name C<$+{name}> and C<< \k<name> >> refer to the
+leftmost defined group, thus it's possible to do things with named capture
+buffers that would otherwise require C<(??{})> code to accomplish. Named
+capture buffers are numbered just as normal capture buffers are and may be
+referenced via the magic numeric variables or via numeric backreferences
+as well as by name.
+
  Examples:
  
      s/^([^ ]*) *([^ ]*)/$2 $1/;     # swap first two words
  
-     if (/(.)\1/) {                 # find first doubled char
-         print "'$1' is the first doubled character\n";
-     }
+    /(.)\1/                         # find first doubled char
+         and print "'$1' is the first doubled character\n";
+
+    /(?<char>.)\k<char>/            # ... a different way
+         and print "'$+{char}' is the first doubled character\n";
+
+    /(?<char>.)\1/                  # ... mix and match
+         and print "'$1' is the first doubled character\n";
  
      if (/Time: (..):(..):(..)/) {   # parse out values
         $hours = $1;
@@ -443,7 +467,7 @@ everything before the matched string.  C<$'> returns everything
  after the matched string. And C<$^N> contains whatever was matched by
  the most-recently closed group (submatch). C<$^N> can be used in
  extended patterns (see below), for example to assign a submatch to a
-variable. 
+variable.
  X<$+> X<$^N> X<$&> X<$`> X<$'>
  
  The numbered match variables ($1, $2, $3, etc.) and the related punctuation
@@ -620,6 +644,48 @@ A zero-width negative look-behind assertion.  For example C</(?<!bar)foo/>
  matches any occurrence of "foo" that does not follow "bar".  Works
  only for fixed-width look-behind.
  
+=item C<(?'NAME'pattern)>
+
+=item C<< (?<NAME>pattern) >>
+X<< (?<NAME>) >> X<(?'NAME')> X<named capture> X<capture>
+
+A named capture buffer. Identical in every respect to normal capturing
+parens C<()> but for the additional fact that C<%+> may be used after
+a succesful match to refer to a named buffer. See C<perlvar> for more
+details on the C<%+> hash.
+
+If multiple distinct capture buffers have the same name then the
+$+{NAME} will refer to the leftmost defined buffer in the match.
+
+The forms C<(?'NAME'pattern)> and C<(?<NAME>pattern)> are equivalent.
+
+B<NOTE:> While the notation of this construct is the same as the similar
+function in .NET regexes, the behavior is not, in Perl the buffers are
+numbered sequentially regardless of being named or not. Thus in the
+pattern
+
+  /(x)(?<foo>y)(z)/
+
+$+{foo} will be the same as $2, and $3 will contain 'z' instead of
+the opposite which is what a .NET regex hacker might expect.
+
+Currently NAME is restricted to word chars only. In other words, it
+must match C</^\w+$/>.
+
+=item C<< \k<name> >>
+
+=item C<< \k'name' >>
+
+Named backreference. Similar to numeric backreferences, except that
+the group is designated by name and not number. If multiple groups
+have the same name then it refers to the leftmost defined group in
+the current match.
+
+It is an error to refer to a name not defined by a C<(?<NAME>)>
+earlier in the pattern.
+
+Both forms are equivalent.
+
  =item C<(?{ code })>
  X<(?{})> X<regex, code in> X<regexp, code in> X<regular expression, code in>
  
@@ -726,7 +792,7 @@ Thus,
  
      ('a' x 100)=~/(??{'(.)' x 100})/
  
-B<will> match, it will B<not> set $1. 
+B<will> match, it will B<not> set $1.
  
  The C<code> is not interpolated.  As before, the rules to determine
  where the C<code> ends are currently somewhat convoluted.
@@ -762,21 +828,21 @@ X<regex, recursive> X<regexp, recursive> X<regular expression, recursive>
  B<WARNING>:  This extended regular expression feature is considered
  highly experimental, and may be changed or deleted without notice.
  
-Similar to C<(??{ code })> except it does not involve compiling any code, 
-instead it treats the contents of a capture buffer as an independent 
+Similar to C<(??{ code })> except it does not involve compiling any code,
+instead it treats the contents of a capture buffer as an independent
  pattern that must match at the current position.  Capture buffers
-contained by the pattern will have the value as determined by the 
+contained by the pattern will have the value as determined by the
  outermost recursion.
  
  PARNO is a sequence of digits not starting with 0 whose value
-reflects the paren-number of the capture buffer to recurse to. 
+reflects the paren-number of the capture buffer to recurse to.
  C<(?R)> curses to the beginning of the pattern.
  
-The following pattern matches a function foo() which may contain 
-balanced parenthesis as the argument. 
+The following pattern matches a function foo() which may contain
+balanced parenthesis as the argument.
  
    $re = qr{ (                    # paren group 1 (full function)
-              foo              
+              foo
                (                  # paren group 2 (parens)
                  \(
                    (              # paren group 3 (contents of parens)
@@ -802,18 +868,18 @@ the output produced should be the following:
  
      $1 = foo(bar(baz)+baz(bop))
      $2 = (bar(baz)+baz(bop))
-    $3 = bar(baz)+baz(bop)      
+    $3 = bar(baz)+baz(bop)
  
-If there is no corresponding capture buffer defined, then it is a 
+If there is no corresponding capture buffer defined, then it is a
  fatal error.  Recursing deeper than 50 times without consuming any input
-string will also result in a fatal error.  The maximum depth is compiled 
+string will also result in a fatal error.  The maximum depth is compiled
  into perl, so changing it requires a custom build.
  
-B<Note> that this pattern does not behave the same way as the equivalent 
+B<Note> that this pattern does not behave the same way as the equivalent
  PCRE or Python construct of the same form. In perl you can backtrack into
  a recursed group, in PCRE and Python the recursed into group is treated
-as atomic. Also, constructs like (?i:(?1)) or (?:(?i)(?1)) do not affect 
-the pattern being recursed into. 
+as atomic. Also, constructs like (?i:(?1)) or (?:(?i)(?1)) do not affect
+the pattern being recursed into.
  
  =item C<< (?>pattern) >>
  X<backtrack> X<backtracking> X<atomic> X<possessive>
diff --git a/pod/perltodo.pod b/pod/perltodo.pod

index 50a79d9..4a54bcd 100644 (file)
--- a/pod/perltodo.pod
+++ b/pod/perltodo.pod
@@ -629,16 +629,6 @@ Fix (or rewrite) the implementation of the C</(?{...})/> closures.
  This will allow the use of a regex from inside (?{ }), (??{ }) and
  (?(?{ })|) constructs.
  
-=head2 Add named capture to regexp engine
-
-Named capture is supported by .NET, PCRE and Python. Its embarrassing
-Perl doesn't support it yet. 
-
-Jeffrey Friedl notes that "the most glaring omission [in perl's regexp
-engine] offered by other implementations is named capture".
-
-demerphq is working on this.
-
  =head2 Add possessive quantifiers to regexp engine
  
  Possessive quantifiers are a syntactic sugar that affords a more
diff --git a/pod/perlvar.pod b/pod/perlvar.pod

index 4d8c17e..8a486b2 100644 (file)
--- a/pod/perlvar.pod
+++ b/pod/perlvar.pod
@@ -313,6 +313,17 @@ past where $2 ends, and so on.  You can use C<$#+> to determine
  how many subgroups were in the last successful match.  See the
  examples given for the C<@-> variable.
  
+=item %+
+X<%+>
+
+Similar to C<@+>, the C<%+> hash allows access to the named capture
+buffers, should they exist, in the last successful match in the
+currently active dynamic scope.
+
+C<$+{foo}> is equivalent to C<$1> after the following match:
+
+  'foo'=~/(?<foo>foo)/;
+
  =item HANDLE->input_line_number(EXPR)
  
  =item $INPUT_LINE_NUMBER
@@ -322,7 +333,7 @@ examples given for the C<@-> variable.
  =item $.
  X<$.> X<$NR> X<$INPUT_LINE_NUMBER> X<line number>
  
-Current line number for the last filehandle accessed. 
+Current line number for the last filehandle accessed.
  
  Each filehandle in Perl counts the number of lines that have been read
  from it.  (Depending on the value of C<$/>, Perl's idea of what
diff --git a/pp.c b/pp.c

index 25279a3..f04b55d 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -3862,7 +3862,7 @@ PP(pp_each)
  {
      dVAR;
      dSP;
-    HV * const hash = (HV*)POPs;
+    HV * hash = (HV*)POPs;
      HE *entry;
      const I32 gimme = GIMME_V;
  
diff --git a/proto.h b/proto.h

index e10c8eb..dc740cb 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -1863,6 +1863,9 @@ PERL_CALLCONV regnode*    Perl_regnext(pTHX_ regnode* p)
                         __attribute__warn_unused_result__
                         __attribute__nonnull__(pTHX_1);
  
+PERL_CALLCONV SV*      Perl_reg_named_buff_sv(pTHX_ SV* namesv)
+                       __attribute__nonnull__(pTHX_1);
+
  PERL_CALLCONV void     Perl_regprop(pTHX_ const regexp *prog, SV* sv, const regnode* o)
                         __attribute__nonnull__(pTHX_2)
                         __attribute__nonnull__(pTHX_3);
diff --git a/regcomp.c b/regcomp.c

index 4895ea4..ca5830f 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -120,6 +120,7 @@ typedef struct RExC_state_t {
      regnode    **parens;               /* offsets of each paren */
      I32                utf8;
      HV         *charnames;             /* cache of named sequences */
+    HV         *paren_names;           /* Paren names */
  #if ADD_TO_REGEXEC
      char       *starttry;              /* -Dr: where regtry was called. */
  #define RExC_starttry  (pRExC_state->starttry)
@@ -153,6 +154,7 @@ typedef struct RExC_state_t {
  #define RExC_utf8      (pRExC_state->utf8)
  #define RExC_charnames  (pRExC_state->charnames)
  #define RExC_parens    (pRExC_state->parens)
+#define RExC_paren_names       (pRExC_state->paren_names)
  
  #define        ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
  #define        ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
@@ -3771,8 +3773,9 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
      RExC_emit = &PL_regdummy;
      RExC_whilem_seen = 0;
      RExC_charnames = NULL;
-    RExC_parens= NULL;
-    
+    RExC_parens = NULL;
+    RExC_paren_names = NULL;
+
  #if 0 /* REGC() is (currently) a NOP at the first pass.
         * Clever compilers notice this and complain. --jhi */
      REGC((U8)REG_MAGIC, (char*)RExC_emit);
@@ -3782,15 +3785,14 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
         RExC_precomp = NULL;
         return(NULL);
      }
-    DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
-    DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
-    DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
      DEBUG_PARSE_r({
+        PerlIO_printf(Perl_debug_log, 
+            "Required size %"IVdf" nodes\n"
+            "Starting second pass (creation)\n", 
+            (IV)RExC_size);
          RExC_lastnum=0; 
          RExC_lastparse=NULL; 
      });
-
-    
      /* Small enough for pointer-storage convention?
         If extralen==0, this means that we will not need long jumps. */
      if (RExC_size >= 0x10000L && RExC_extralen)
@@ -3826,8 +3828,9 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
  
      r->substrs = 0;                    /* Useful during FAIL. */
      r->startp = 0;                     /* Useful during FAIL. */
-
      r->endp = 0;                       
+    r->paren_names = 0;
+    
      if (RExC_seen & REG_SEEN_RECURSE) {
          Newx(RExC_parens, RExC_npar,regnode *);
          SAVEFREEPV(RExC_parens);
@@ -3997,13 +4000,13 @@ reStudy:
  
         /* Scan is after the zeroth branch, first is atomic matcher. */
  #ifdef TRIE_STUDY_OPT
-       DEBUG_COMPILE_r(
+       DEBUG_PARSE_r(
             if (!restudied)
                 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
                               (IV)(first - scan + 1))
          );
  #else
-       DEBUG_COMPILE_r(
+       DEBUG_PARSE_r(
             PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
                 (IV)(first - scan + 1))
          );
@@ -4252,6 +4255,11 @@ reStudy:
         r->reganch |= ROPT_EVAL_SEEN;
      if (RExC_seen & REG_SEEN_CANY)
         r->reganch |= ROPT_CANY_SEEN;
+    if (RExC_paren_names)
+        r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
+    else
+        r->paren_names = NULL;
+               
      Newxz(r->startp, RExC_npar, I32);
      Newxz(r->endp, RExC_npar, I32);
      
@@ -4280,6 +4288,41 @@ reStudy:
  #undef END_BLOCK
  #undef RE_ENGINE_PTR
  
+SV*
+Perl_reg_named_buff_sv(pTHX_ SV* namesv)
+{
+    I32 parno = 0; /* no match */
+    if (PL_curpm) {
+        const REGEXP * const rx = PM_GETRE(PL_curpm);
+        if (rx && rx->paren_names) {            
+            HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
+            if (he_str) {
+                IV i;
+                SV* sv_dat=HeVAL(he_str);
+                I32 *nums=(I32*)SvPVX(sv_dat);
+                for ( i=0; i<SvIVX(sv_dat); i++ ) {
+                    if ((I32)(rx->lastcloseparen) >= nums[i] &&
+                        rx->startp[nums[i]] != -1 &&
+                        rx->endp[nums[i]] != -1) 
+                    {
+                        parno = nums[i];
+                        break;
+                    }
+                }
+            }
+        }
+    }
+    if ( !parno ) {
+        return 0;
+    } else {
+        GV *gv_paren;
+        SV *sv= sv_newmortal();
+        Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
+        gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
+        return GvSVn(gv_paren);
+    }
+}
+
  #define DEBUG_PARSE_MSG(funcname)     DEBUG_PARSE_r({           \
      int rem=(int)(RExC_end - RExC_parse);                       \
      int cut;                                                    \
@@ -4387,12 +4430,66 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
             paren = *RExC_parse++;
             ret = NULL;                 /* For look-ahead/behind. */
             switch (paren) {
+               
             case '<':           /* (?<...) */
-               RExC_seen |= REG_SEEN_LOOKBEHIND;
                 if (*RExC_parse == '!')
                     paren = ',';
-               if (*RExC_parse != '=' && *RExC_parse != '!')
-                   goto unknown;
+               else if (*RExC_parse != '=') 
+               {               /* (?<...>) */
+                   char *name_start;
+                   paren= '>';
+            case '\'':          /* (?'...') */
+                   name_start= RExC_parse;
+                   if (UTF) {
+                       STRLEN numlen;
+                       while(isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse, 
+                           RExC_end - RExC_parse, 
+                           &numlen, UTF8_ALLOW_DEFAULT)))
+                               RExC_parse += numlen;
+                   } else {
+                       while(isIDFIRST(*RExC_parse))
+                           RExC_parse++;
+                   }
+                   if (RExC_parse == name_start)
+                       goto unknown;
+                   if (*RExC_parse != paren)
+                       vFAIL2("Sequence (?%c... not terminated",
+                           paren=='>' ? '<' : paren);
+                   if (SIZE_ONLY) {
+                        SV *svname= Perl_newSVpvf(aTHX_ "%.*s", 
+                            (int)(RExC_parse - name_start), name_start);
+                        HE *he_str;
+                        SV *sv_dat;
+                        
+                        if (!RExC_paren_names) {
+                            RExC_paren_names= newHV();
+                            sv_2mortal((SV*)RExC_paren_names);
+                        }
+                        he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
+                        if ( he_str ) {
+                            sv_dat = HeVAL(he_str);
+                        } else {
+                            /* croak baby croak */
+                        }
+                        if (SvPOK(sv_dat)) {
+                            IV count=SvIV(sv_dat);
+                            I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
+                            SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
+                            pv[count]=RExC_npar;
+                            SvIVX(sv_dat)++;
+                        } else {
+                            (void)SvUPGRADE(sv_dat,SVt_PVNV);
+                            sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
+                            SvIOK_on(sv_dat);
+                            SvIVX(sv_dat)= 1;
+                        }        
+                        /*sv_dump(sv_dat);*/
+                    }
+                    nextchar(pRExC_state);
+                   paren = 1;
+                   goto capturing_parens;
+               }
+                RExC_seen |= REG_SEEN_LOOKBEHIND;
                 RExC_parse++;
             case '=':           /* (?=...) */
             case '!':           /* (?!...) */
@@ -4412,6 +4509,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 nextchar(pRExC_state);
                 *flagp = TRYAGAIN;
                 return NULL;
+           case '0' :
              case 'R' :
                  if (*RExC_parse != ')')
                     FAIL("Sequence (?R) not terminated");
@@ -4657,6 +4755,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
             }
         }
         else {                  /* (...) */
+         capturing_parens:
             parno = RExC_npar;
             RExC_npar++;
             ret = reganode(pRExC_state, OPEN, parno);
@@ -5567,6 +5666,68 @@ tryagain:
              ++RExC_parse;
              ret= reg_namedseq(pRExC_state, NULL); 
              break;
+       case 'k':
+        {   
+            char ch= RExC_parse[1];        
+           if (ch != '<' && ch != '\'') {
+               if (SIZE_ONLY)
+                   vWARN( RExC_parse + 1, 
+                       "Possible broken named back reference treated as literal k");
+               parse_start--;
+               goto defchar;
+           } else {
+               char* name_start = (RExC_parse += 2);
+               I32 num = 0;
+               ch= (ch == '<') ? '>' : '\'';
+
+                if (UTF) {
+                    STRLEN numlen;
+                    while(isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse, 
+                        RExC_end - RExC_parse, 
+                        &numlen, UTF8_ALLOW_DEFAULT)))
+                            RExC_parse += numlen;
+               } else {
+                   while(isIDFIRST(*RExC_parse))
+                       RExC_parse++;
+               }
+                if (RExC_parse == name_start || *RExC_parse != ch)
+                    vFAIL2("Sequence \\k%c... not terminated",
+                        (ch == '>' ? '<' : ch));
+                
+                RExC_sawback = 1;
+                ret = reganode(pRExC_state,
+                          (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
+                          num);
+                *flagp |= HASWIDTH;
+                
+               
+                if (!SIZE_ONLY) {
+                    SV *svname = Perl_newSVpvf(aTHX_ "%.*s", 
+                            (int)(RExC_parse - name_start), name_start);
+                    HE *he_str;
+                    SV *sv_dat;
+                    if (UTF) 
+                        SvUTF8_on(svname);
+                    he_str = hv_fetch_ent( RExC_paren_names, svname, 0, 0 );
+                    SvREFCNT_dec(svname);
+                    if ( he_str ) {
+                        sv_dat = HeVAL(he_str);
+                    } else {
+                        vFAIL("Reference to nonexistent group");
+                    }               
+                    num = add_data( pRExC_state, 1, "S" );
+                    ARG_SET(ret,num);
+                    RExC_rx->data->data[num]=(void*)sv_dat;
+                    SvREFCNT_inc(sv_dat);
+                }    
+                /* override incorrect value set in reganode MJD */
+                Set_Node_Offset(ret, parse_start+1);
+                Set_Node_Cur_Length(ret); /* MJD */
+                nextchar(pRExC_state);
+                              
+            }
+            break;
+        }            
         case 'n':
         case 'r':
         case 't':
@@ -7690,6 +7851,8 @@ Perl_pregfree(pTHX_ struct regexp *r)
             SvREFCNT_dec(r->float_utf8);
         Safefree(r->substrs);
      }
+    if (r->paren_names)
+            SvREFCNT_dec(r->paren_names);
      if (r->data) {
         int n = r->data->count;
         PAD* new_comppad = NULL;
@@ -7700,6 +7863,7 @@ Perl_pregfree(pTHX_ struct regexp *r)
            /* If you add a ->what type here, update the comment in regcomp.h */
             switch (r->data->what[n]) {
             case 's':
+           case 'S':
                 SvREFCNT_dec((SV*)r->data->data[n]);
                 break;
             case 'f':
@@ -7793,6 +7957,7 @@ Perl_pregfree(pTHX_ struct regexp *r)
  
  #define sv_dup_inc(s,t)        SvREFCNT_inc(sv_dup(s,t))
  #define av_dup_inc(s,t)        (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
+#define hv_dup_inc(s,t)        (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
  #define SAVEPVN(p,n)   ((p) ? savepvn(p,n) : NULL)
  
  /* 
@@ -7856,6 +8021,7 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
                 /* legal options are one of: sfpont
                    see also regcomp.h and pregfree() */
             case 's':
+           case 'S':
                 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
                 break;
             case 'p':
@@ -7920,6 +8086,8 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
      ret->sublen         = r->sublen;
  
      ret->engine         = r->engine;
+    
+    ret->paren_names    = hv_dup_inc(r->paren_names, param);
  
      if (RX_MATCH_COPIED(ret))
         ret->subbeg  = SAVEPVN(r->subbeg, r->sublen);
diff --git a/regcomp.h b/regcomp.h

index 166be14..e7b5a2c 100644 (file)
--- a/regcomp.h
+++ b/regcomp.h
@@ -413,6 +413,7 @@ END_EXTERN_C
   *       in the character class
   *   t - trie struct
   *   T - aho-trie struct
+ *   S - sv for named capture lookup
   * 20010712 mjd@plover.com
   * (Remember to update re_dup() and pregfree() if you add any items.)
   */
diff --git a/regcomp.sym b/regcomp.sym

index f3f7164..21904e1 100644 (file)
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -77,9 +77,9 @@ BACK          BACK,   no      Match "", "next" ptr points backward.
  
  #*Literals (33..35)
  
-EXACT          EXACT,  sv      Match this string (preceded by length).
-EXACTF         EXACT,  sv      Match this string, folded (prec. by length).
-EXACTFL                EXACT,  sv      Match this string, folded in locale (w/len).
+EXACT          EXACT,  str     Match this string (preceded by length).
+EXACTF         EXACT,  str     Match this string, folded (prec. by length).
+EXACTFL                EXACT,  str     Match this string, folded in locale (w/len).
  
  #*Do nothing types (36..37)
  
@@ -154,15 +154,21 @@ TRIEC             TRIE,   trie charclass  Same as TRIE, but with embedded charclass data
  AHOCORASICK    TRIE,   trie 1  Aho Corasick stclass. flags==type
  AHOCORASICKC   TRIE,   trie charclass  Same as AHOCORASICK, but with embedded charclass data
  
-#*Recursion (65) 
+#*Recursion (65..66) 
  RECURSE                RECURSE,   num/ofs 2L   recurse to paren arg1 at (signed) ofs arg2
  SRECURSE       RECURSE,   no           recurse to start of pattern
  
+#*Named references (67..69)
+NREF           NREF,    no-sv 1        Match some already matched string
+NREFF          NREF,    no-sv 1        Match already matched string, folded
+NREFFL         NREF,    no-sv 1        Match already matched string, folded in loc.
+
+
  # NEW STUFF ABOVE THIS LINE -- Please update counts below. 
  
  ################################################################################
  
-#*SPECIAL  REGOPS (65, 66)
+#*SPECIAL  REGOPS (70, 71)
  
  # This is not really a node, but an optimized away piece of a "long" node.
  # To simplify debugging output, we mark it as if it were a node
diff --git a/regexec.c b/regexec.c

index bd061fd..2743c53 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -2028,6 +2028,8 @@ got_it:
                                                   the same. */
         restore_pos(aTHX_ prog);
      }
+    if (prog->paren_names) 
+        (void)hv_iterinit(prog->paren_names);
  
      /* make sure $`, $&, $', and $digit will work later */
      if ( !(flags & REXEC_NOT_FIRST) ) {
@@ -3288,13 +3290,39 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
                locinput++;
             nextchr = UCHARAT(locinput);
             break;
+            
+       case NREFFL:
+       {
+           char *s;
+           char type = OP(scan);
+           PL_reg_flags |= RF_tainted;
+           /* FALL THROUGH */
+       case NREF:
+       case NREFF:
+           {
+               SV *sv_dat=(SV*)rex->data->data[ ARG( scan ) ];    
+               I32 *nums=(I32*)SvPVX(sv_dat);
+                for ( n=0; n<SvIVX(sv_dat); n++ ) {
+                    if ((I32)*PL_reglastparen >= nums[n] &&
+                        PL_regstartp[nums[n]] != -1 &&
+                        PL_regendp[nums[n]] != -1) 
+                    {
+                        n = nums[n];
+                        type = REF + ( type - NREF );
+                        goto do_ref;    
+                    }
+                }
+                sayNO;
+                /* unreached */
+            } 
         case REFFL:
             PL_reg_flags |= RF_tainted;
             /* FALL THROUGH */
          case REF:
-       case REFF: {
-           char *s;
+       case REFF: 
             n = ARG(scan);  /* which paren pair */
+           type = OP(scan);
+         do_ref:  
             ln = PL_regstartp[n];
             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
             if ((I32)*PL_reglastparen < n || ln == -1)
@@ -3303,7 +3331,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
                 break;
  
             s = PL_bostr + ln;
-           if (do_utf8 && OP(scan) != REF) {   /* REF can do byte comparison */
+           if (do_utf8 && type != REF) {       /* REF can do byte comparison */
                 char *l = locinput;
                 const char *e = PL_bostr + PL_regendp[n];
                 /*
@@ -3311,7 +3339,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
                  * in the 8-bit case (no pun intended) because in Unicode we
                  * have to map both upper and title case to lower case.
                  */
-               if (OP(scan) == REFF) {
+               if (type == REFF) {
                     while (s < e) {
                         STRLEN ulen1, ulen2;
                         U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
@@ -3334,24 +3362,23 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
  
             /* Inline the first character, for speed. */
             if (UCHARAT(s) != nextchr &&
-               (OP(scan) == REF ||
-                (UCHARAT(s) != ((OP(scan) == REFF
-                                 ? PL_fold : PL_fold_locale)[nextchr]))))
+               (type == REF ||
+                (UCHARAT(s) != (type == REFF
+                                 ? PL_fold : PL_fold_locale)[nextchr])))
                 sayNO;
             ln = PL_regendp[n] - ln;
             if (locinput + ln > PL_regeol)
                 sayNO;
-           if (ln > 1 && (OP(scan) == REF
+           if (ln > 1 && (type == REF
                            ? memNE(s, locinput, ln)
-                          : (OP(scan) == REFF
+                          : (type == REFF
                               ? ibcmp(s, locinput, ln)
                               : ibcmp_locale(s, locinput, ln))))
                 sayNO;
             locinput += ln;
             nextchr = UCHARAT(locinput);
             break;
-           }
-
+       }
         case NOTHING:
         case TAIL:
             break;
diff --git a/regexp.h b/regexp.h

index 4048669..faed0ee 100644 (file)
--- a/regexp.h
+++ b/regexp.h
@@ -54,7 +54,8 @@ typedef struct regexp {
         U32 lastcloseparen;     /* last paren matched */
         U32 reganch;            /* Internal use only +
                                    Tainted information used by regexec? */
-        const struct regexp_engine* engine;
+       HV *paren_names;        /* Paren names */
+       const struct regexp_engine* engine;
         regnode program[1];     /* Unwarranted chumminess with compiler. */
  } regexp;
  
diff --git a/regnodes.h b/regnodes.h

index 78db033..3030e04 100644 (file)
--- a/regnodes.h
+++ b/regnodes.h
@@ -6,8 +6,8 @@
  
  /* Regops and State definitions */
  
-#define REGNODE_MAX            68
-#define REGMATCH_STATE_MAX     98
+#define REGNODE_MAX            71
+#define REGMATCH_STATE_MAX     101
  
  #define        END                     0       /* 0000 End of program. */
  #define        SUCCEED                 1       /* 0x01 Return from a subroutine, basically. */
@@ -76,41 +76,44 @@
  #define        AHOCORASICKC            64      /* 0x40 Same as AHOCORASICK, but with embedded charclass data */
  #define        RECURSE                 65      /* 0x41 recurse to paren arg1 at (signed) ofs arg2 */
  #define        SRECURSE                66      /* 0x42 recurse to start of pattern */
-#define        OPTIMIZED               67      /* 0x43 Placeholder for dump. */
-#define        PSEUDO                  68      /* 0x44 Pseudo opcode for internal use. */
+#define        NREF                    67      /* 0x43 Match some already matched string */
+#define        NREFF                   68      /* 0x44 Match already matched string, folded */
+#define        NREFFL                  69      /* 0x45 Match already matched string, folded in loc. */
+#define        OPTIMIZED               70      /* 0x46 Placeholder for dump. */
+#define        PSEUDO                  71      /* 0x47 Pseudo opcode for internal use. */
  
         /* ------------ States ------------- */
  
-#define        TRIE_next               69      /* 0x45 Regmatch state for TRIE */
-#define        TRIE_next_fail          70      /* 0x46 Regmatch state for TRIE */
-#define        EVAL_AB                 71      /* 0x47 Regmatch state for EVAL */
-#define        EVAL_AB_fail            72      /* 0x48 Regmatch state for EVAL */
-#define        CURLYX_end              73      /* 0x49 Regmatch state for CURLYX */
-#define        CURLYX_end_fail         74      /* 0x4a Regmatch state for CURLYX */
-#define        WHILEM_A_pre            75      /* 0x4b Regmatch state for WHILEM */
-#define        WHILEM_A_pre_fail       76      /* 0x4c Regmatch state for WHILEM */
-#define        WHILEM_A_min            77      /* 0x4d Regmatch state for WHILEM */
-#define        WHILEM_A_min_fail       78      /* 0x4e Regmatch state for WHILEM */
-#define        WHILEM_A_max            79      /* 0x4f Regmatch state for WHILEM */
-#define        WHILEM_A_max_fail       80      /* 0x50 Regmatch state for WHILEM */
-#define        WHILEM_B_min            81      /* 0x51 Regmatch state for WHILEM */
-#define        WHILEM_B_min_fail       82      /* 0x52 Regmatch state for WHILEM */
-#define        WHILEM_B_max            83      /* 0x53 Regmatch state for WHILEM */
-#define        WHILEM_B_max_fail       84      /* 0x54 Regmatch state for WHILEM */
-#define        BRANCH_next             85      /* 0x55 Regmatch state for BRANCH */
-#define        BRANCH_next_fail        86      /* 0x56 Regmatch state for BRANCH */
-#define        CURLYM_A                87      /* 0x57 Regmatch state for CURLYM */
-#define        CURLYM_A_fail           88      /* 0x58 Regmatch state for CURLYM */
-#define        CURLYM_B                89      /* 0x59 Regmatch state for CURLYM */
-#define        CURLYM_B_fail           90      /* 0x5a Regmatch state for CURLYM */
-#define        IFMATCH_A               91      /* 0x5b Regmatch state for IFMATCH */
-#define        IFMATCH_A_fail          92      /* 0x5c Regmatch state for IFMATCH */
-#define        CURLY_B_min_known       93      /* 0x5d Regmatch state for CURLY */
-#define        CURLY_B_min_known_fail  94      /* 0x5e Regmatch state for CURLY */
-#define        CURLY_B_min             95      /* 0x5f Regmatch state for CURLY */
-#define        CURLY_B_min_fail        96      /* 0x60 Regmatch state for CURLY */
-#define        CURLY_B_max             97      /* 0x61 Regmatch state for CURLY */
-#define        CURLY_B_max_fail        98      /* 0x62 Regmatch state for CURLY */
+#define        TRIE_next               72      /* 0x48 Regmatch state for TRIE */
+#define        TRIE_next_fail          73      /* 0x49 Regmatch state for TRIE */
+#define        EVAL_AB                 74      /* 0x4a Regmatch state for EVAL */
+#define        EVAL_AB_fail            75      /* 0x4b Regmatch state for EVAL */
+#define        CURLYX_end              76      /* 0x4c Regmatch state for CURLYX */
+#define        CURLYX_end_fail         77      /* 0x4d Regmatch state for CURLYX */
+#define        WHILEM_A_pre            78      /* 0x4e Regmatch state for WHILEM */
+#define        WHILEM_A_pre_fail       79      /* 0x4f Regmatch state for WHILEM */
+#define        WHILEM_A_min            80      /* 0x50 Regmatch state for WHILEM */
+#define        WHILEM_A_min_fail       81      /* 0x51 Regmatch state for WHILEM */
+#define        WHILEM_A_max            82      /* 0x52 Regmatch state for WHILEM */
+#define        WHILEM_A_max_fail       83      /* 0x53 Regmatch state for WHILEM */
+#define        WHILEM_B_min            84      /* 0x54 Regmatch state for WHILEM */
+#define        WHILEM_B_min_fail       85      /* 0x55 Regmatch state for WHILEM */
+#define        WHILEM_B_max            86      /* 0x56 Regmatch state for WHILEM */
+#define        WHILEM_B_max_fail       87      /* 0x57 Regmatch state for WHILEM */
+#define        BRANCH_next             88      /* 0x58 Regmatch state for BRANCH */
+#define        BRANCH_next_fail        89      /* 0x59 Regmatch state for BRANCH */
+#define        CURLYM_A                90      /* 0x5a Regmatch state for CURLYM */
+#define        CURLYM_A_fail           91      /* 0x5b Regmatch state for CURLYM */
+#define        CURLYM_B                92      /* 0x5c Regmatch state for CURLYM */
+#define        CURLYM_B_fail           93      /* 0x5d Regmatch state for CURLYM */
+#define        IFMATCH_A               94      /* 0x5e Regmatch state for IFMATCH */
+#define        IFMATCH_A_fail          95      /* 0x5f Regmatch state for IFMATCH */
+#define        CURLY_B_min_known       96      /* 0x60 Regmatch state for CURLY */
+#define        CURLY_B_min_known_fail  97      /* 0x61 Regmatch state for CURLY */
+#define        CURLY_B_min             98      /* 0x62 Regmatch state for CURLY */
+#define        CURLY_B_min_fail        99      /* 0x63 Regmatch state for CURLY */
+#define        CURLY_B_max             100     /* 0x64 Regmatch state for CURLY */
+#define        CURLY_B_max_fail        101     /* 0x65 Regmatch state for CURLY */
  
  /* PL_regkind[] What type of regop or state is this. */
  
@@ -185,6 +188,9 @@ EXTCONST U8 PL_regkind[] = {
         TRIE,           /* AHOCORASICKC           */
         RECURSE,        /* RECURSE                */
         RECURSE,        /* SRECURSE               */
+       NREF,           /* NREF                   */
+       NREF,           /* NREFF                  */
+       NREF,           /* NREFFL                 */
         NOTHING,        /* OPTIMIZED              */
         PSEUDO,         /* PSEUDO                 */
         /* ------------ States ------------- */
@@ -292,6 +298,9 @@ static const U8 regarglen[] = {
         EXTRA_SIZE(struct regnode_charclass),   /* AHOCORASICKC */
         EXTRA_SIZE(struct regnode_2L),          /* RECURSE      */
         0,                                      /* SRECURSE     */
+       EXTRA_SIZE(struct regnode_1),           /* NREF         */
+       EXTRA_SIZE(struct regnode_1),           /* NREFF        */
+       EXTRA_SIZE(struct regnode_1),           /* NREFFL       */
         0,                                      /* OPTIMIZED    */
         0,                                      /* PSEUDO       */
  };
@@ -366,6 +375,9 @@ static const char reg_off_by_arg[] = {
         0,      /* AHOCORASICKC */
         0,      /* RECURSE      */
         0,      /* SRECURSE     */
+       0,      /* NREF         */
+       0,      /* NREFF        */
+       0,      /* NREFFL       */
         0,      /* OPTIMIZED    */
         0,      /* PSEUDO       */
  };
@@ -441,39 +453,42 @@ const char * reg_name[] = {
         "AHOCORASICKC",                 /* 0x40 */
         "RECURSE",                      /* 0x41 */
         "SRECURSE",                     /* 0x42 */
-       "OPTIMIZED",                    /* 0x43 */
-       "PSEUDO",                       /* 0x44 */
+       "NREF",                         /* 0x43 */
+       "NREFF",                        /* 0x44 */
+       "NREFFL",                       /* 0x45 */
+       "OPTIMIZED",                    /* 0x46 */
+       "PSEUDO",                       /* 0x47 */
         /* ------------ States ------------- */
-       "TRIE_next",                    /* 0x45 */
-       "TRIE_next_fail",               /* 0x46 */
-       "EVAL_AB",                      /* 0x47 */
-       "EVAL_AB_fail",                 /* 0x48 */
-       "CURLYX_end",                   /* 0x49 */
-       "CURLYX_end_fail",              /* 0x4a */
-       "WHILEM_A_pre",                 /* 0x4b */
-       "WHILEM_A_pre_fail",            /* 0x4c */
-       "WHILEM_A_min",                 /* 0x4d */
-       "WHILEM_A_min_fail",            /* 0x4e */
-       "WHILEM_A_max",                 /* 0x4f */
-       "WHILEM_A_max_fail",            /* 0x50 */
-       "WHILEM_B_min",                 /* 0x51 */
-       "WHILEM_B_min_fail",            /* 0x52 */
-       "WHILEM_B_max",                 /* 0x53 */
-       "WHILEM_B_max_fail",            /* 0x54 */
-       "BRANCH_next",                  /* 0x55 */
-       "BRANCH_next_fail",             /* 0x56 */
-       "CURLYM_A",                     /* 0x57 */
-       "CURLYM_A_fail",                /* 0x58 */
-       "CURLYM_B",                     /* 0x59 */
-       "CURLYM_B_fail",                /* 0x5a */
-       "IFMATCH_A",                    /* 0x5b */
-       "IFMATCH_A_fail",               /* 0x5c */
-       "CURLY_B_min_known",            /* 0x5d */
-       "CURLY_B_min_known_fail",       /* 0x5e */
-       "CURLY_B_min",                  /* 0x5f */
-       "CURLY_B_min_fail",             /* 0x60 */
-       "CURLY_B_max",                  /* 0x61 */
-       "CURLY_B_max_fail",             /* 0x62 */
+       "TRIE_next",                    /* 0x48 */
+       "TRIE_next_fail",               /* 0x49 */
+       "EVAL_AB",                      /* 0x4a */
+       "EVAL_AB_fail",                 /* 0x4b */
+       "CURLYX_end",                   /* 0x4c */
+       "CURLYX_end_fail",              /* 0x4d */
+       "WHILEM_A_pre",                 /* 0x4e */
+       "WHILEM_A_pre_fail",            /* 0x4f */
+       "WHILEM_A_min",                 /* 0x50 */
+       "WHILEM_A_min_fail",            /* 0x51 */
+       "WHILEM_A_max",                 /* 0x52 */
+       "WHILEM_A_max_fail",            /* 0x53 */
+       "WHILEM_B_min",                 /* 0x54 */
+       "WHILEM_B_min_fail",            /* 0x55 */
+       "WHILEM_B_max",                 /* 0x56 */
+       "WHILEM_B_max_fail",            /* 0x57 */
+       "BRANCH_next",                  /* 0x58 */
+       "BRANCH_next_fail",             /* 0x59 */
+       "CURLYM_A",                     /* 0x5a */
+       "CURLYM_A_fail",                /* 0x5b */
+       "CURLYM_B",                     /* 0x5c */
+       "CURLYM_B_fail",                /* 0x5d */
+       "IFMATCH_A",                    /* 0x5e */
+       "IFMATCH_A_fail",               /* 0x5f */
+       "CURLY_B_min_known",            /* 0x60 */
+       "CURLY_B_min_known_fail",       /* 0x61 */
+       "CURLY_B_min",                  /* 0x62 */
+       "CURLY_B_min_fail",             /* 0x63 */
+       "CURLY_B_max",                  /* 0x64 */
+       "CURLY_B_max_fail",             /* 0x65 */
  };
  #endif /* DEBUGGING */
  #else
diff --git a/sv.c b/sv.c

index 16c6523..19e1d26 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -4484,6 +4484,9 @@ Perl_sv_magic(pTHX_ register SV *sv, SV *obj, int how, const char *name, I32 nam
      case PERL_MAGIC_regdata:
         vtable = &PL_vtbl_regdata;
         break;
+    case PERL_MAGIC_regdata_names:
+       vtable = &PL_vtbl_regdata_names;
+       break;
      case PERL_MAGIC_regdatum:
         vtable = &PL_vtbl_regdatum;
         break;
diff --git a/t/op/pat.t b/t/op/pat.t

index e20a6f7..e1ac167 100755 (executable)
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -3657,6 +3657,31 @@ SKIP:{
      }
          
  }
+{
+    my $s='123453456';
+    $s=~s/(?<digits>\d+)\k<digits>/$+{digits}/;
+    ok($s eq '123456','Named capture (angle brackets) s///');
+    $s='123453456';
+    $s=~s/(?'digits'\d+)\k'digits'/$+{digits}/;
+    ok($s eq '123456','Named capture (single quotes) s///');    
+}
+{
+    my $s='foo bar baz';
+    my (@k,@v,$count);
+    if ($s=~/(?<A>foo)\s+(?<B>bar)?\s+(?<C>baz)/) {
+        while (my ($k,$v)=each(%+)) {
+            $count++;
+        }
+        @k=sort keys(%+);
+        @v=sort values(%+);
+    }
+    ok($count==3,"Got 3 keys in %+ via each ($count)");
+    ok(@k == 3, 'Got 3 keys in %+ via keys');
+    ok("@k" eq "A B C", "Got expected keys");
+    ok("@v" eq "bar baz foo", "Got expected values");
+}
+        
+       
  # stress test CURLYX/WHILEM.
  #
  # This test includes varying levels of nesting, and according to
@@ -3771,5 +3796,5 @@ ok((q(a)x 100) =~ /^(??{'(.)'x 100})/,
      or print "# Unexpected outcome: should pass or crash perl\n";
  
  # Don't forget to update this!
-BEGIN{print "1..1264\n"};
+BEGIN{print "1..1270\n"};
  
diff --git a/t/op/re_tests b/t/op/re_tests

index 6759f34..08d45b2 100644 (file)
--- a/t/op/re_tests
+++ b/t/op/re_tests
@@ -1020,3 +1020,20 @@ X(?<=foo.)[YZ]   ..XfooXY..      y       pos     8
  ^(<(?:[^<>]+|(?1))*>)$ <<><<<><>>>>    y       $1      <<><<<><>>>>
  ((?2)*)([fF]o+)        fooFoFoo        y       $1-$2   fooFo-Foo
  (<(?:[^<>]+|(?R))*>)   <<><<<><>>>>    y       $1      <<><<<><>>>>
+(?<n>foo|bar|baz)      snofooewa       y       $1      foo
+(?<n>foo|bar|baz)      snofooewa       y       $+{n}   foo
+(?<n>foo|bar|baz)(?<m>[ew]+)   snofooewa       y       $+{n}   foo
+(?<n>foo|bar|baz)(?<m>[ew]+)   snofooewa       y       $+{m}   ew
+(?<n>foo)|(?<n>bar)|(?<n>baz)  snofooewa       y       $+{n}   foo
+(?<n>foo)(??{ $+{n} }) snofooefoofoowaa        y       $+{n}   foo
+/(?'n'foo|bar|baz)/    snofooewa       y       $1      foo
+/(?'n'foo|bar|baz)/    snofooewa       y       $+{n}   foo
+/(?'n'foo|bar|baz)(?'m'[ew]+)/ snofooewa       y       $+{n}   foo
+/(?'n'foo|bar|baz)(?'m'[ew]+)/ snofooewa       y       $+{m}   ew
+/(?'n'foo)|(?'n'bar)|(?<n>baz)/        snobazewa       y       $+{n}   baz
+/(?'n'foo)(??{ $+{n} })/       snofooefoofoowaa        y       $+{n}   foo
+/(?'n'foo)\k<n>/       ..foofoo..      y       $1      foo
+/(?'n'foo)\k<n>/       ..foofoo..      y       $+{n}   foo
+/(?<n>foo)\k'n'/       ..foofoo..      y       $1      foo
+/(?<n>foo)\k'n'/       ..foofoo..      y       $+{n}   foo
+/(?:(?<n>foo)|(?<n>bar))\k<n>/ ..barbar..      y       $+{n}   bar
diff --git a/t/op/regexp.t b/t/op/regexp.t

index 2b21766..6a469b7 100755 (executable)
--- a/t/op/regexp.t
+++ b/t/op/regexp.t
@@ -64,7 +64,7 @@ while (<TESTS>) {
      $input = join(':',$pat,$subject,$result,$repl,$expect);
      infty_subst(\$pat);
      infty_subst(\$expect);
-    $pat = "'$pat'" unless $pat =~ /^[:']/;
+    $pat = "'$pat'" unless $pat =~ /^[:'\/]/;
      $pat =~ s/(\$\{\w+\})/$1/eeg;
      $pat =~ s/\\n/\n/g;
      $subject = eval qq("$subject");
diff --git a/toke.c b/toke.c

index 1cce947..5c24cca 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -1793,7 +1793,7 @@ S_scan_const(pTHX_ char *start)
      const char * const leaveit = /* set of acceptably-backslashed characters */
         (const char *)
         (PL_lex_inpat
-        ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-Nnrtfeaxcz0123456789[{]} \t\n\r\f\v#"
+        ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-Nnrktfeaxcz0123456789[{]} \t\n\r\f\v#"
          : "");
  
      if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
author	Yves Orton <demerphq@gmail.com>
	Fri, 6 Oct 2006 19:16:01 +0000 (21:16 +0200)
committer	Rafael Garcia-Suarez <rgarciasuarez@gmail.com>
	Sat, 7 Oct 2006 14:30:32 +0000 (14:30 +0000)
XSUB.h		patch \| blob \| history
doop.c		patch \| blob \| history
dump.c		patch \| blob \| history
embed.fnc		patch \| blob \| history
embed.h		patch \| blob \| history
gv.c		patch \| blob \| history
hv.c		patch \| blob \| history
perl.h		patch \| blob \| history
pod/perlre.pod		patch \| blob \| history
pod/perltodo.pod		patch \| blob \| history
pod/perlvar.pod		patch \| blob \| history
pp.c		patch \| blob \| history
proto.h		patch \| blob \| history
regcomp.c		patch \| blob \| history
regcomp.h		patch \| blob \| history
regcomp.sym		patch \| blob \| history
regexec.c		patch \| blob \| history
regexp.h		patch \| blob \| history
regnodes.h		patch \| blob \| history
sv.c		patch \| blob \| history
t/op/pat.t		patch \| blob \| history
t/op/re_tests		patch \| blob \| history
t/op/regexp.t		patch \| blob \| history
toke.c		patch \| blob \| history