New regex syntax omnibus

author Yves Orton <demerphq@gmail.com>

Mon, 6 Nov 2006 13:06:28 +0000 (14:06 +0100)

committer Rafael Garcia-Suarez <rgarciasuarez@gmail.com>

Tue, 7 Nov 2006 10:21:25 +0000 (10:21 +0000)
author Yves Orton <demerphq@gmail.com>
Mon, 6 Nov 2006 13:06:28 +0000 (14:06 +0100)
committer Rafael Garcia-Suarez <rgarciasuarez@gmail.com>
Tue, 7 Nov 2006 10:21:25 +0000 (10:21 +0000)
diff --git a/embed.fnc b/embed.fnc

index 350b43342b627146d201b3fd3a444d14a8cd6288..a3251a093d5053120d4d396c9a492c11172ccdf6 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1359,7 +1359,7 @@ Es        |U8     |regtail_study  |NN struct RExC_state_t *state|NN regnode *p|NN const regn
  
  #if defined(PERL_IN_REGEXEC_C) || defined(PERL_DECL_PROT)
  ERs    |I32    |regmatch       |NN regmatch_info *reginfo|NN regnode *prog
-ERs    |I32    |regrepeat      |NN const regexp *prog|NN const regnode *p|I32 max
+ERs    |I32    |regrepeat      |NN const regexp *prog|NN const regnode *p|I32 max|int depth
  ERs    |I32    |regtry         |NN regmatch_info *reginfo|NN char **startpos
  ERs    |bool   |reginclass     |NULLOK const regexp *prog|NN const regnode *n|NN const U8 *p|NULLOK STRLEN *lenp\
                                 |bool do_utf8sv_is_utf8
diff --git a/embed.h b/embed.h

index 22595d54c2d226af82aca6e5d34b645a80d45f53..fea5b27915a82bce4b35946b6b279109e060fd2e 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -3555,7 +3555,7 @@
  #if defined(PERL_IN_REGEXEC_C) || defined(PERL_DECL_PROT)
  #if defined(PERL_CORE) || defined(PERL_EXT)
  #define regmatch(a,b)          S_regmatch(aTHX_ a,b)
-#define regrepeat(a,b,c)       S_regrepeat(aTHX_ a,b,c)
+#define regrepeat(a,b,c,d)     S_regrepeat(aTHX_ a,b,c,d)
  #define regtry(a,b)            S_regtry(aTHX_ a,b)
  #define reginclass(a,b,c,d,e)  S_reginclass(aTHX_ a,b,c,d,e)
  #define regcppush(a)           S_regcppush(aTHX_ a)
diff --git a/ext/re/t/regop.t b/ext/re/t/regop.t

index 1ccf8b3eca877415e62af7b817707122d444508a..f586d22eeda51eb9153d778ee1a883ac7cdcd41f 100644 (file)
--- a/ext/re/t/regop.t
+++ b/ext/re/t/regop.t
@@ -252,7 +252,7 @@ Matching stclass EXACTF <.> against ".exe"
  #Guessed: match at offset 0
  #%MATCHED%
  #Freeing REx: "[q]"
-Got 100 bytes for offset annotations.
-Offsets: [12]
+Got 108 bytes for offset annotations.
+Offsets: [13]
  1:1[3] 3:4[0]
  %MATCHED%        
diff --git a/pod/perl595delta.pod b/pod/perl595delta.pod

index ff8efcd62161c0e5b137f6e299e17929a8739176..a7e3b4050823b9d9f57041b2aa591534f15d7d89 100644 (file)
--- a/pod/perl595delta.pod
+++ b/pod/perl595delta.pod
@@ -107,8 +107,8 @@ quantifiers. (Yves Orton)
  =item Backtracking control verbs
  
  The regex engine now supports a number of special purpose backtrack
-control verbs: (?COMMIT), (?CUT), (?ERROR) and (?FAIL). See L<perlre>
-for their descriptions.
+control verbs: (*COMMIT), (*MARK), (*CUT), (*ERROR), (*FAIL) and
+(*ACCEPT). See L<perlre> for their descriptions.
  
  =back
  
diff --git a/pod/perldiag.pod b/pod/perldiag.pod

index c20b0602c234d41923ff8e9239e0137c9d523229..e9d23267bde776b6d3861f75d3a656cbb4666430 100644 (file)
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -4291,6 +4291,13 @@ category that is unknown to perl at this point.
  
  Note that if you want to enable a warnings category registered by a module
  (e.g. C<use warnings 'File::Find'>), you must have imported this module
+
+=item Unknown verb pattern '%s' in regex; marked by <-- HERE in m/%s/
+
+(F) You either made a typo or have incorrectly put a C<*> quantifier
+after an open brace in your pattern.  Check the pattern and review
+L<perlre> for details on legal verb patterns.
+
  first.
  
  =item unmatched [ in regex; marked by <-- HERE in m/%s/
@@ -4412,6 +4419,17 @@ character to get your parentheses to balance.  See L<attributes>.
  compressed integer format and could not be converted to an integer.
  See L<perlfunc/pack>.
  
+=item Unterminated verb pattern in regex; marked by <-- HERE in m/%s/
+
+(F) You used a pattern of the form C<(*VERB)> but did not terminate
+the pattern with a C<)>. Fix the pattern and retry.
+
+=item Unterminated verb pattern argument in regex; marked by <-- HERE in m/%s/
+
+(F) You used a pattern of the form C<(*VERB:ARG)> but did not terminate
+the pattern with a C<)>. Fix the pattern and retry.
+
+
  =item Unterminated <> operator
  
  (F) The lexer saw a left angle bracket in a place where it was expecting
@@ -4807,6 +4825,16 @@ anonymous, using the C<sub {}> syntax.  When inner anonymous subs that
  reference variables in outer subroutines are created, they
  are automatically rebound to the current values of such variables.
  
+=item Verb pattern '%s' has a mandatory argument in regex; marked by <-- HERE in m/%s/ 
+
+(F) You used a verb pattern that requires an argument. Supply an argument
+or check that you are using the right verb.
+
+=item Verb pattern '%s' may not have an argument in regex; marked by <-- HERE in m/%s/ 
+
+(F) You used a verb pattern that is not allowed an argument. Remove the 
+argument or check that you are using the right verb.
+
  =item Version number must be a constant number
  
  (P) The attempt to translate a C<use Module n.n LIST> statement into
diff --git a/pod/perlre.pod b/pod/perlre.pod

index bce72914fb876047ba9c91693266df0b4117ac7c..45e41e5f54d5dc65e0c3c8fd0376034087cd71ba 100644 (file)
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -933,14 +933,100 @@ the same name, then it recurses to the leftmost.
  It is an error to refer to a name that is not declared somewhere in the
  pattern.
  
-=item C<(?FAIL)> C<(?F)>
-X<(?FAIL)> X<(?F)>
+=item C<(?(condition)yes-pattern|no-pattern)>
+X<(?()>
  
-This pattern matches nothing and always fails. It can be used to force the
-engine to backtrack. It is equivalent to C<(?!)>, but easier to read. In
-fact, C<(?!)> gets optimised into C<(?FAIL)> internally.
+=item C<(?(condition)yes-pattern)>
  
-It is probably useful only when combined with C<(?{})> or C<(??{})>.
+Conditional expression.  C<(condition)> should be either an integer in
+parentheses (which is valid if the corresponding pair of parentheses
+matched), a look-ahead/look-behind/evaluate zero-width assertion, a
+name in angle brackets or single quotes (which is valid if a buffer
+with the given name matched), or the special symbol (R) (true when
+evaluated inside of recursion or eval). Additionally the R may be
+followed by a number, (which will be true when evaluated when recursing
+inside of the appropriate group), or by C<&NAME>, in which case it will
+be true only when evaluated during recursion in the named group.
+
+Here's a summary of the possible predicates:
+
+=over 4
+
+=item (1) (2) ...
+
+Checks if the numbered capturing buffer has matched something.
+
+=item (<NAME>) ('NAME')
+
+Checks if a buffer with the given name has matched something.
+
+=item (?{ CODE })
+
+Treats the code block as the condition.
+
+=item (R)
+
+Checks if the expression has been evaluated inside of recursion.
+
+=item (R1) (R2) ...
+
+Checks if the expression has been evaluated while executing directly
+inside of the n-th capture group. This check is the regex equivalent of
+
+  if ((caller(0))[3] eq 'subname') { ... }
+
+In other words, it does not check the full recursion stack.
+
+=item (R&NAME)
+
+Similar to C<(R1)>, this predicate checks to see if we're executing
+directly inside of the leftmost group with a given name (this is the same
+logic used by C<(?&NAME)> to disambiguate). It does not check the full
+stack, but only the name of the innermost active recursion.
+
+=item (DEFINE)
+
+In this case, the yes-pattern is never directly executed, and no
+no-pattern is allowed. Similar in spirit to C<(?{0})> but more efficient.
+See below for details.
+
+=back
+
+For example:
+
+    m{ ( \( )?
+       [^()]+
+       (?(1) \) )
+     }x
+
+matches a chunk of non-parentheses, possibly included in parentheses
+themselves.
+
+A special form is the C<(DEFINE)> predicate, which never executes directly
+its yes-pattern, and does not allow a no-pattern. This allows to define
+subpatterns which will be executed only by using the recursion mechanism.
+This way, you can define a set of regular expression rules that can be
+bundled into any pattern you choose.
+
+It is recommended that for this usage you put the DEFINE block at the
+end of the pattern, and that you name any subpatterns defined within it.
+
+Also, it's worth noting that patterns defined this way probably will
+not be as efficient, as the optimiser is not very clever about
+handling them.
+
+An example of how this might be used is as follows:
+
+  /(?<NAME>(&NAME_PAT))(?<ADDR>(&ADDRESS_PAT))
+   (?(DEFINE)
+     (<NAME_PAT>....)
+     (<ADRESS_PAT>....)
+   )/x
+
+Note that capture buffers matched inside of recursion are not accessible
+after the recursion returns, so the extra layer of capturing buffers are
+necessary. Thus C<$+{NAME_PAT}> would not be defined even though
+C<$+{NAME}> would be.
  
  =item C<< (?>pattern) >>
  X<backtrack> X<backtracking> X<atomic> X<possessive>
@@ -973,12 +1059,12 @@ in the rest of a regular expression.)
  Consider this pattern:
  
      m{ \(
-         ( 
-           [^()]+              # x+
-          | 
+          (
+            [^()]+             # x+
+          |
              \( [^()]* \)
            )+
-       \) 
+       \)
       }x
  
  That will efficiently match a nonempty group with matching parentheses
@@ -992,13 +1078,13 @@ seconds, but that each extra letter doubles this time.  This
  exponential performance will make it appear that your program has
  hung.  However, a tiny change to this pattern
  
-    m{ \( 
-         ( 
-           (?> [^()]+ )        # change x+ above to (?> x+ )
-          | 
+    m{ \(
+          (
+            (?> [^()]+ )       # change x+ above to (?> x+ )
+          |
              \( [^()]* \)
            )+
-       \) 
+       \)
       }x
  
  which uses C<< (?>...) >> matches exactly when the one above does (verifying
@@ -1046,13 +1132,50 @@ to inside of one of these constructs. The following equivalences apply:
      PAT?+               (?>PAT?)
      PAT{min,max}+       (?>PAT{min,max})
  
-=item C<(?COMMIT)>
-X<(?COMMIT)>
+=back
+
+=head2 Special Backtracking Control Verbs
+
+B<WARNING:> These patterns are experimental and subject to change or
+removal in a future version of perl. Their usage in production code should
+be noted to avoid problems during upgrades.
+
+These special patterns are generally of the form C<(*VERB:ARG)>. Unless
+otherwise stated the ARG argument is optional; in some cases, it is
+forbidden.
+
+Any pattern containing a special backtracking verb that allows an argument
+has the special behaviour that when executed it sets the current packages'
+C<$REGERROR> variable. In this case, the following rules apply:
+
+On failure, this variable will be set to the ARG value of the verb
+pattern, if the verb was involved in the failure of the match. If the ARG
+part of the pattern was omitted, then C<$REGERROR> will be set to TRUE.
+
+On a successful match this variable will be set to FALSE.
+
+B<NOTE:> C<$REGERROR> is not a magic variable in the same sense than
+C<$1> and most other regex related variables. It is not local to a
+scope, nor readonly but instead a volatile package variable similar to
+C<$AUTOLOAD>. Use C<local> to localize changes to it to a specific scope
+if necessary.
+
+If a pattern does not contain a special backtracking verb that allows an
+argument, then C<$REGERROR> is not touched at all.
+
+=over 4
+
+=item Verbs that take an argument
+
+=over 4
+
+=item C<(*NOMATCH)> C<(*NOMATCH:NAME)>
+X<(*NOMATCH)> X<(*NOMATCH:NAME)>
  
  This zero-width pattern commits the match at the current point, preventing
-the engine from back-tracking on failure to the left of the commit point.
-Consider the pattern C<A (?COMMIT) B>, where A and B are complex patterns.
-Until the C<(?COMMIT)> is reached, A may backtrack as necessary to match.
+the engine from backtracking on failure to the left of the this point.
+Consider the pattern C<A (*NOMATCH) B>, where A and B are complex patterns.
+Until the C<(*NOMATCH)> is reached, A may backtrack as necessary to match.
  Once it is reached, matching continues in B, which may also backtrack as
  necessary; however, should B not match, then no further backtracking will
  take place, and the pattern will fail outright at that starting position.
@@ -1060,7 +1183,7 @@ take place, and the pattern will fail outright at that starting position.
  The following example counts all the possible matching strings in a
  pattern (without actually matching any of them).
  
-    'aaab'=~/a+b?(?{print "$&\n"; $count++})(?FAIL)/;
+    'aaab' =~ /a+b?(?{print "$&\n"; $count++})(*FAIL)/;
      print "Count=$count\n";
  
  which produces:
@@ -1076,9 +1199,9 @@ which produces:
      a
      Count=9
  
-If we add a C<(?COMMIT)> before the count like the following
+If we add a C<(*NOMATCH)> before the count like the following
  
-    'aaab'=~/a+b?(?COMMIT)(?{print "$&\n"; $count++})(?FAIL)/;
+    'aaab' =~ /a+b?(*NOMATCH)(?{print "$&\n"; $count++})(*FAIL)/;
      print "Count=$count\n";
  
  we prevent backtracking and find the count of the longest matching
@@ -1089,23 +1212,47 @@ at each matching startpoint like so:
      ab
      Count=3
  
-Any number of C<(?COMMIT)> assertions may be used in a pattern.
+Any number of C<(*NOMATCH)> assertions may be used in a pattern.
  
  See also C<< (?>pattern) >> and possessive quantifiers for other
  ways to control backtracking.
  
-=item C<(?CUT)>
-X<(?CUT)>
-
-This zero-width pattern is similar to C<(?COMMIT)>, except that on
-failure it also signifies that whatever text that was matched leading
-up to the C<(?CUT)> pattern cannot match, I<even from another
-starting point>.
-
-Compare the following to the examples in C<(?COMMIT)>, note the string
+=item C<(*MARK)> C<(*MARK:NAME)>
+X<(*MARK)>
+
+This zero-width pattern can be used to mark the point in a string
+reached when a certain part of the pattern has been successfully
+matched. This mark may be given a name. A later C<(*CUT)> pattern
+will then cut at that point if backtracked into on failure. Any
+number of (*MARK) patterns are allowed, and the NAME portion is
+optional and may be duplicated.
+
+See C<*CUT> for more detail.
+
+=item C<(*CUT)> C<(*CUT:NAME)>
+X<(*CUT)>
+
+This zero-width pattern is similar to C<(*NOMATCH)>, except that on
+failure it also signifies that whatever text that was matched leading up
+to the C<(*CUT)> pattern being executed cannot be part of a match, I<even
+if started from a later point>. This effectively means that the regex
+engine moves forward to this position on failure and tries to match
+again, (assuming that there is sufficient room to match).
+
+The name of the C<(*CUT:NAME)> pattern has special significance. If a
+C<(*MARK:NAME)> was encountered while matching, then it is the position
+where that pattern was executed that is used for the "cut point" in the
+string. If no mark of that name was encountered, then the cut is done at
+the point where the C<(*CUT)> was. Similarly if no NAME is specified in
+the C<(*CUT)>, and if a C<(*MARK)> with any name (or none) is encountered,
+then that C<(*MARK)>'s cursor point will be used. If the C<(*CUT)> is not
+preceded by a C<(*MARK)>, then the cut point is where the string was when
+the C<(*CUT)> was encountered.
+
+Compare the following to the examples in C<(*NOMATCH)>, note the string
  is twice as long:
  
-    'aaabaaab'=~/a+b?(?CUT)(?{print "$&\n"; $count++})(?FAIL)/;
+    'aaabaaab' =~ /a+b?(*CUT)(?{print "$&\n"; $count++})(*FAIL)/;
      print "Count=$count\n";
  
  outputs
@@ -1114,17 +1261,17 @@ outputs
      aaab
      Count=2
  
-Once the 'aaab' at the start of the string has matched and the C<(?CUT)>
-executed the next startpoint will be where the cursor was when the
-C<(?CUT)> was executed.
+Once the 'aaab' at the start of the string has matched, and the C<(*CUT)>
+executed, the next startpoint will be where the cursor was when the
+C<(*CUT)> was executed.
  
-=item C<(?ERROR)>
-X<(?ERROR)>
+=item C<(*COMMIT)>
+X<(*COMMIT)>
  
-This zero-width pattern is similar to C<(?CUT)> except that it causes
+This zero-width pattern is similar to C<(*CUT)> except that it causes
  the match to fail outright. No attempts to match will occur again.
  
-    'aaabaaab'=~/a+b?(?ERROR)(?{print "$&\n"; $count++})(?FAIL)/;
+    'aaabaaab' =~ /a+b?(*COMMIT)(?{print "$&\n"; $count++})(*FAIL)/;
      print "Count=$count\n";
  
  outputs
@@ -1132,105 +1279,49 @@ outputs
      aaab
      Count=1
  
-In other words, once the C<(?ERROR)> has been entered and then pattern
-does not match then the regex engine will not try any further matching at
-all on the rest of the string.
-
-=item C<(?(condition)yes-pattern|no-pattern)>
-X<(?()>
-
-=item C<(?(condition)yes-pattern)>
+In other words, once the C<(*COMMIT)> has been entered, and if the pattern
+does not match, the regex engine will not try any further matching on the
+rest of the string.
  
-Conditional expression.  C<(condition)> should be either an integer in
-parentheses (which is valid if the corresponding pair of parentheses
-matched), a look-ahead/look-behind/evaluate zero-width assertion, a
-name in angle brackets or single quotes (which is valid if a buffer
-with the given name matched), the special symbol (R) (true when
-evaluated inside of recursion or eval). Additionally the R may be
-followed by a number, (which will be true when evaluated when recursing
-inside of the appropriate group), or by C<&NAME> in which case it will
-be true only when evaluated during recursion in the named group.
+=back
  
-Here's a summary of the possible predicates:
+=item Verbs without an argument
  
  =over 4
  
-=item (1) (2) ...
-
-Checks if the numbered capturing buffer has matched something.
-
-=item (<NAME>) ('NAME')
+=item C<(*FAIL)> C<(*F)>
+X<(*FAIL)> X<(*F)>
  
-Checks if a buffer with the given name has matched something.
-
-=item (?{ CODE })
-
-Treats the code block as the condition
-
-=item (R)
-
-Checks if the expression has been evaluated inside of recursion.
-
-=item (R1) (R2) ...
+This pattern matches nothing and always fails. It can be used to force the
+engine to backtrack. It is equivalent to C<(?!)>, but easier to read. In
+fact, C<(?!)> gets optimised into C<(*FAIL)> internally.
  
-Checks if the expression has been evaluated while executing directly
-inside of the n-th capture group. This check is the regex equivalent of
+It is probably useful only when combined with C<(?{})> or C<(??{})>.
  
-  if ((caller(0))[3] eq 'subname') { .. }
+=item C<(*ACCEPT)>
+X<(*ACCEPT)>
  
-In other words, it does not check the full recursion stack.
+B<WARNING:> This feature is highly experimental. It is not recommended
+for production code.
  
-=item (R&NAME)
+This pattern matches nothing and causes the end of successful matching at
+the point at which the C<(*ACCEPT)> pattern was encountered, regardless of
+whether there is actually more to match in the string. When inside of a
+nested pattern, such as recursion or a dynamically generated subbpattern
+via C<(??{})>, only the innermost pattern is ended immediately.
  
-Similar to C<(R1)>, this predicate checks to see if we're executing
-directly inside of the leftmost group with a given name (this is the same
-logic used by C<(?&NAME)> to disambiguate). It does not check the full
-stack, but only the name of the innermost active recursion.
+If the C<(*ACCEPT)> is inside of capturing buffers then the buffers are
+marked as ended at the point at which the C<(*ACCEPT)> was encountered.
+For instance:
  
-=item (DEFINE)
+  'AB' =~ /(A (A|B(*ACCEPT)|C) D)(E)/x;
  
-In this case, the yes-pattern is never directly executed, and no
-no-pattern is allowed. Similar in spirit to C<(?{0})> but more efficient.
-See below for details.
+will match, and C<$1> will be C<AB> and C<$2> will be C<B>, C<$3> will not
+be set. If another branch in the inner parens were matched, such as in the
+string 'ACDE', then the C<D> and C<E> would have to be matched as well.
  
  =back
  
-For example:
-
-    m{ ( \( )?
-       [^()]+
-       (?(1) \) )
-     }x
-
-matches a chunk of non-parentheses, possibly included in parentheses
-themselves.
-
-A special form is the C<(DEFINE)> predicate, which never executes directly
-its yes-pattern, and does not allow a no-pattern. This allows to define
-subpatterns which will be executed only by using the recursion mechanism.
-This way, you can define a set of regular expression rules that can be
-bundled into any pattern you choose.
-
-It is recommended that for this usage you put the DEFINE block at the
-end of the pattern, and that you name any subpatterns defined within it.
-
-Also, it's worth noting that patterns defined this way probably will
-not be as efficient, as the optimiser is not very clever about
-handling them. YMMV.
-
-An example of how this might be used is as follows:
-
-  /(?<NAME>(&NAME_PAT))(?<ADDR>(&ADDRESS_PAT))
-   (?(DEFINE)
-     (<NAME_PAT>....)
-     (<ADRESS_PAT>....)
-   )/x
-
-Note that capture buffers matched inside of recursion are not accessible
-after the recursion returns, so the extra layer of capturing buffers are
-necessary. Thus C<$+{NAME_PAT}> would not be defined even though
-C<$+{NAME}> would be.
-
  =back
  
  =head2 Backtracking
diff --git a/proto.h b/proto.h

index b141466c45fd56e3fd09b7e639f8fa26b8ddd542..531d58341565ce9b25100a31249aca22740a880e 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -3697,7 +3697,7 @@ STATIC I32        S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                         __attribute__nonnull__(pTHX_1)
                         __attribute__nonnull__(pTHX_2);
  
-STATIC I32     S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
+STATIC I32     S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
                         __attribute__warn_unused_result__
                         __attribute__nonnull__(pTHX_1)
                         __attribute__nonnull__(pTHX_2);
diff --git a/regcomp.c b/regcomp.c

index 80d7eecd8d307fc8631a03e1eafaacdc018e2276..3ce84c14f605f2eb5ee5e8e7b50b56abac777567 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -114,6 +114,7 @@ typedef struct RExC_state_t {
      U32                seen;
      I32                size;                   /* Code size. */
      I32                npar;                   /* () count. */
+    I32                nestroot;               /* root parens we are in - used by accept */
      I32                extralen;
      I32                seen_zerolen;
      I32                seen_evals;
@@ -152,6 +153,7 @@ typedef struct RExC_state_t {
  #define RExC_seen      (pRExC_state->seen)
  #define RExC_size      (pRExC_state->size)
  #define RExC_npar      (pRExC_state->npar)
+#define RExC_nestroot   (pRExC_state->nestroot)
  #define RExC_extralen  (pRExC_state->extralen)
  #define RExC_seen_zerolen      (pRExC_state->seen_zerolen)
  #define RExC_seen_evals        (pRExC_state->seen_evals)
@@ -335,7 +337,7 @@ static const scan_data_t zero_scan_data =
  #define SCF_WHILEM_VISITED_POS 0x2000
  
  #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
-
+#define SCF_SEEN_ACCEPT         0x8000 
  
  #define UTF (RExC_utf8 != 0)
  #define LOC ((RExC_flags & PMf_LOCALE) != 0)
@@ -2311,6 +2313,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
      scan_data_t data_fake;
      SV *re_trie_maxbuff = NULL;
      regnode *first_non_open = scan;
+    I32 stopmin = I32_MAX;
      GET_RE_DEBUG_FLAGS_DECL;
  #ifdef DEBUGGING
      StructCopy(&zero_scan_data, &data_fake, scan_data_t);
@@ -2411,6 +2414,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     scan = next;
                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
                         pars++;
+                   if (data_fake.flags & SCF_SEEN_ACCEPT) {
+                       if ( stopmin > minnext) 
+                           stopmin = min + min1;
+                       flags &= ~SCF_DO_SUBSTR;
+                       if (data)
+                           data->flags |= SCF_SEEN_ACCEPT;
+                   }
                     if (data) {
                         if (data_fake.flags & SF_HAS_EVAL)
                             data->flags |= SF_HAS_EVAL;
@@ -3580,11 +3590,16 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 if (data)
                     data->flags |= SF_HAS_EVAL;
         }
-       else if ( OP(scan)==OPFAIL ) {
+       else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
             if (flags & SCF_DO_SUBSTR) {
                 scan_commit(pRExC_state,data,minlenp);
                 flags &= ~SCF_DO_SUBSTR;
             }
+           if (data && OP(scan)==ACCEPT) {
+               data->flags |= SCF_SEEN_ACCEPT;
+               if (stopmin > min)
+                   stopmin = min;
+           }
         }
         else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
         {
@@ -3666,7 +3681,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                      
                      if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
                          pars++;
-                    
+                    if (data_fake.flags & SCF_SEEN_ACCEPT) {
+                        if ( stopmin > min + min1) 
+                           stopmin = min + min1;
+                       flags &= ~SCF_DO_SUBSTR;
+                       if (data)
+                           data->flags |= SCF_SEEN_ACCEPT;
+                   }
                      if (data) {
                          if (data_fake.flags & SF_HAS_EVAL)
                              data->flags |= SF_HAS_EVAL;
@@ -3758,7 +3779,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
      
      DEBUG_STUDYDATA(data,depth);
      
-    return min;
+    return min < stopmin ? min : stopmin;
  }
  
  STATIC I32
@@ -3915,6 +3936,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
      RExC_end = xend;
      RExC_naughty = 0;
      RExC_npar = 1;
+    RExC_nestroot = 0;
      RExC_size = 0L;
      RExC_emit = &PL_regdummy;
      RExC_whilem_seen = 0;
@@ -3952,6 +3974,11 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
      if (RExC_whilem_seen > 15)
         RExC_whilem_seen = 15;
  
+#ifdef DEBUGGING
+    /* Make room for a sentinel value at the end of the program */
+    RExC_size++;
+#endif
+
      /* Allocate space and zero-initialize. Note, the two step process 
         of zeroing when in debug mode, thus anything assigned has to 
         happen after that */
@@ -4008,6 +4035,11 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
      RExC_npar = 1;
      RExC_emit_start = r->program;
      RExC_emit = r->program;
+#ifdef DEBUGGING
+    /* put a sentinal on the end of the program so we can check for
+       overwrites */
+    r->program[RExC_size].type = 255;
+#endif
      /* Store the count of eval-groups for security checks: */
      RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
      REGC((U8)REG_MAGIC, (char*) RExC_emit++);
@@ -4415,6 +4447,8 @@ reStudy:
         r->reganch |= ROPT_EVAL_SEEN;
      if (RExC_seen & REG_SEEN_CANY)
         r->reganch |= ROPT_CANY_SEEN;
+    if (RExC_seen & REG_SEEN_VERBARG)
+       r->reganch |= ROPT_VERBARG_SEEN;
      if (RExC_paren_names)
          r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
      else
@@ -4605,6 +4639,10 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
  #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
  #endif
  
+/* this idea is borrowed from STR_WITH_LEN in handy.h */
+#define CHECK_WORD(s,v,l)  \
+    (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
+
  STATIC regnode *
  S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
      /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
@@ -4641,6 +4679,98 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
  
      /* Make an OPEN node, if parenthesized. */
      if (paren) {
+        if ( *RExC_parse == '*') { /* (*VERB:ARG) */
+           char *start_verb = RExC_parse;
+           STRLEN verb_len = 0;
+           char *start_arg = NULL;
+           unsigned char op = 0;
+           int argok = 1;
+           int internal_argval = 0; /* internal_argval is only useful if !argok */
+           while ( *RExC_parse && *RExC_parse != ')' ) {
+               if ( *RExC_parse == ':' ) {
+                   start_arg = RExC_parse + 1;
+                   break;
+               }
+               RExC_parse++;
+           }
+           ++start_verb;
+           verb_len = RExC_parse - start_verb;
+           if ( start_arg ) {
+               RExC_parse++;
+               while ( *RExC_parse && *RExC_parse != ')' ) 
+                   RExC_parse++;
+               if ( *RExC_parse != ')' ) 
+                   vFAIL("Unterminated verb pattern argument");
+               if ( RExC_parse == start_arg )
+                   start_arg = NULL;
+           } else {
+               if ( *RExC_parse != ')' )
+                   vFAIL("Unterminated verb pattern");
+           }
+           switch ( *start_verb ) {
+            case 'A':  /* (*ACCEPT) */
+                if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
+                   op = ACCEPT;
+                   internal_argval = RExC_nestroot;
+               }
+               break;
+            case 'C':  /* (*COMMIT) */
+                if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
+                    op = COMMIT;
+                else if ( CHECK_WORD("CUT",start_verb,verb_len) )
+                    op = CUT;
+                break;
+            case 'F':  /* (*FAIL) */
+                if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
+                   op = OPFAIL;
+                   argok = 0;
+               }
+               break;
+           case 'M':
+               if ( CHECK_WORD("MARK",start_verb,verb_len) )
+                    op = MARKPOINT;
+                break;
+            case 'N':  /* (*NOMATCH) */
+                if ( CHECK_WORD("NOMATCH",start_verb,verb_len) )
+                    op = NOMATCH;
+                break;
+           }
+           if ( ! op ) {
+               RExC_parse++;
+               vFAIL3("Unknown verb pattern '%.*s'",
+                   verb_len, start_verb);
+           }
+           if ( argok ) {
+                if ( start_arg && internal_argval ) {
+                   vFAIL3("Verb pattern '%.*s' may not have an argument",
+                       verb_len, start_verb); 
+               } else if ( argok < 0 && !start_arg ) {
+                    vFAIL3("Verb pattern '%.*s' has a mandatory argument",
+                       verb_len, start_verb);    
+               } else {
+                   ret = reganode(pRExC_state, op, internal_argval);
+                   if ( ! internal_argval && ! SIZE_ONLY ) {
+                        if (start_arg) {
+                            SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
+                            ARG(ret) = add_data( pRExC_state, 1, "S" );
+                            RExC_rx->data->data[ARG(ret)]=(void*)sv;
+                            ret->flags = 0;
+                        } else {
+                            ret->flags = 1; 
+                        }
+                    }              
+               }
+               if (!internal_argval)
+                   RExC_seen |= REG_SEEN_VERBARG;
+           } else if ( start_arg ) {
+               vFAIL3("Verb pattern '%.*s' may not have an argument",
+                       verb_len, start_verb);    
+           } else {
+               ret = reg_node(pRExC_state, op);
+           }
+           nextchar(pRExC_state);
+           return ret;
+        } else 
         if (*RExC_parse == '?') { /* (?...) */
             U32 posflags = 0, negflags = 0;
             U32 *flagsp = &posflags;
@@ -4711,62 +4841,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                 RExC_parse++;
             case '=':           /* (?=...) */
             case '!':           /* (?!...) */
-               if (*RExC_parse == ')')
-                   goto do_op_fail;
                 RExC_seen_zerolen++;
+               if (*RExC_parse == ')') {
+                   ret=reg_node(pRExC_state, OPFAIL);
+                   nextchar(pRExC_state);
+                   return ret;
+               }
             case ':':           /* (?:...) */
             case '>':           /* (?>...) */
                 break;
-            case 'C':           /* (?CUT) and (?COMMIT) */
-               if (RExC_parse[0] == 'O' &&
-                   RExC_parse[1] == 'M' &&
-                   RExC_parse[2] == 'M' &&
-                   RExC_parse[3] == 'I' &&
-                   RExC_parse[4] == 'T' &&
-                   RExC_parse[5] == ')')
-               {
-                   RExC_parse+=5;
-                   ret = reg_node(pRExC_state, COMMIT);
-                } else if (
-                    RExC_parse[0] == 'U' &&
-                    RExC_parse[1] == 'T' &&
-                    RExC_parse[2] == ')') 
-                {
-                    RExC_parse+=2;
-                    ret = reg_node(pRExC_state, CUT);
-               } else {
-                   vFAIL("Sequence (?C... not terminated");
-               }
-               nextchar(pRExC_state);
-               return ret;
-               break;
-            case 'E':            /* (?ERROR) */
-                if (RExC_parse[0] == 'R' &&
-                    RExC_parse[1] == 'R' &&
-                    RExC_parse[2] == 'O' &&
-                    RExC_parse[3] == 'R' &&
-                    RExC_parse[4] == ')') 
-                {
-                    RExC_parse+=4;
-                    ret = reg_node(pRExC_state, OPERROR);
-                } else {
-                    vFAIL("Sequence (?E... not terminated"); 
-                }
-               nextchar(pRExC_state);
-               return ret;
-                break;                
-            case 'F':
-                if (RExC_parse[0] == 'A' &&
-                    RExC_parse[1] == 'I' &&
-                    RExC_parse[2] == 'L')
-                    RExC_parse+=3;
-                if (*RExC_parse != ')')
-                   vFAIL("Sequence (?FAIL) or (?F) not terminated");
-             do_op_fail:
-               ret = reg_node(pRExC_state, OPFAIL);
-               nextchar(pRExC_state);
-               return ret;
-               break;
             case '$':           /* (?$...) */
             case '@':           /* (?@...) */
                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
@@ -5098,12 +5181,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
           capturing_parens:
             parno = RExC_npar;
             RExC_npar++;
+           
             ret = reganode(pRExC_state, OPEN, parno);
-           if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
-               DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
+           if (!SIZE_ONLY ){
+               if (!RExC_nestroot) 
+                   RExC_nestroot = parno;
+               if (RExC_seen & REG_SEEN_RECURSE) {
+                   DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
                         "Setting open paren #%"IVdf" to %d\n", 
                         (IV)parno, REG_NODE_NUM(ret)));
-               RExC_open_parens[parno-1]= ret;
+                   RExC_open_parens[parno-1]= ret;
+               }
             }
              Set_Node_Length(ret, 1); /* MJD */
              Set_Node_Offset(ret, RExC_parse); /* MJD */
@@ -5175,6 +5263,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         "Setting close paren #%"IVdf" to %d\n", 
                         (IV)parno, REG_NODE_NUM(ender)));
                 RExC_close_parens[parno-1]= ender;
+               if (RExC_nestroot == parno) 
+                   RExC_nestroot = 0;
             }       
              Set_Node_Offset(ender,RExC_parse+1); /* MJD */
              Set_Node_Length(ender,1); /* MJD */
@@ -7505,6 +7595,11 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
         RExC_size += 1;
         return(ret);
      }
+#ifdef DEBUGGING
+    if (OP(RExC_emit) == 255)
+        Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
+            reg_name[op], OP(RExC_emit));
+#endif  
      NODE_ALIGN_FILL(ret);
      ptr = ret;
      FILL_ADVANCE_NODE(ptr, op);
@@ -7521,7 +7616,6 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
      }
  
      RExC_emit = ptr;
-
      return(ret);
  }
  
@@ -7555,7 +7649,10 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
         */
         return(ret);
      }
-
+#ifdef DEBUGGING
+    if (OP(RExC_emit) == 255)
+        Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
+#endif 
      NODE_ALIGN_FILL(ret);
      ptr = ret;
      FILL_ADVANCE_NODE_ARG(ptr, op, arg);
@@ -7573,7 +7670,6 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
      }
              
      RExC_emit = ptr;
-
      return(ret);
  }
  
@@ -8006,11 +8102,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
      }
      else if (k == WHILEM && o->flags)                  /* Ordinal/of */
         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
-    else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP) 
+    else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) 
         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
      else if (k == GOSUB) 
         Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
-    else if (k == LOGICAL)
+    else if (k == VERB) {
+        if (!o->flags) 
+            Perl_sv_catpvf(aTHX_ sv, ":%"SVf, 
+                (SV*)prog->data->data[ ARG( o ) ]);
+    } else if (k == LOGICAL)
         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
      else if (k == ANYOF) {
         int i, rangestart = -1;
@@ -8401,7 +8501,7 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
         for (i = 0; i < count; i++) {
             d->what[i] = r->data->what[i];
             switch (d->what[i]) {
-               /* legal options are one of: sfpont
+               /* legal options are one of: sSfpont
                    see also regcomp.h and pregfree() */
             case 's':
             case 'S':
diff --git a/regcomp.h b/regcomp.h

index 360e2a987a75d14b6bef47dcb88d6de1e594a0c1..2774a27ef9e5aa64f270359d59950b4042ce95f2 100644 (file)
--- a/regcomp.h
+++ b/regcomp.h
@@ -351,6 +351,7 @@ struct regnode_charclass_class {    /* has [[:blah:]] classes */
  #define REG_SEEN_SANY          REG_SEEN_CANY /* src bckwrd cmpt */
  #define REG_SEEN_RECURSE        0x00000020
  #define REG_TOP_LEVEL_BRANCHES  0x00000040
+#define REG_SEEN_VERBARG        0x00000080
  
  START_EXTERN_C
  
diff --git a/regcomp.sym b/regcomp.sym

index e673313d446d2db7922f96c7113bcfc173229f0e..074af132846ad6d3511a0787e34788d360d50231 100644 (file)
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -169,10 +169,16 @@ INSUBP            INSUBP,    num 1        Whether we are in a specific recurse.
  DEFINEP                DEFINEP,   none 1       Never execute directly.               
  
  #*Bactracking 
-OPFAIL         OPFAIL, none            Same as (?!)
-COMMIT         COMMIT, none            Pattern fails if backtracking through this 
-CUT            COMMIT, none            ... and restarts at the cursor point
-OPERROR                OPERROR,none            Pattern fails outright if backtracking through this
+ENDLIKE                ENDLIKE,   none         Used only for the type field of verbs
+OPFAIL         ENDLIKE,   none         Same as (?!)
+ACCEPT         ENDLIKE,   parno 1      Accepts the current matched string.
+VERB           VERB,      no-sv 1      Used only for the type field of verbs
+NOMATCH                VERB,      no-sv 1      Pattern fails at this startpoint if no-backtracking through this 
+MARKPOINT      VERB,      no-sv 1      Push the current location for rollback by cut.
+CUT            VERB,      no-sv 1      On failure cut the string at the mark.
+COMMIT         VERB,      no-sv 1      Pattern fails outright if backtracking through this
+
+
  
  # NEW STUFF ABOVE THIS LINE -- Please update counts below. 
  
@@ -210,4 +216,5 @@ CURLYM      A,B:FAIL
  IFMATCH        A:FAIL  
  CURLY          B_min_known,B_min,B_max:FAIL    
  COMMIT         next:FAIL
-
+MARKPOINT      next:FAIL
+CUT            next:FAIL
diff --git a/regexec.c b/regexec.c

index f7fd347922a045768ebe9a83c67b3869ef6e01f6..8e0aabdb4e02672876fa3c08be370e62ba382d12 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -2571,11 +2571,17 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
      int nochange_depth = 0; /* depth of GOSUB recursion with nochange*/
      regmatch_state *yes_state = NULL; /* state to pop to on success of
                                                             subpattern */
+    /* mark_state piggy backs on the yes_state logic so that when we unwind 
+       the stack on success we can update the mark_state as we go */
+    regmatch_state *mark_state = NULL; /* last mark state we have seen */
      regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
      struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
      U32 state_num;
      bool no_final = 0;
-
+    char *startpoint = PL_reginput;
+    SV *popmark = NULL;
+    SV *sv_commit = NULL;
+    int lastopen = 0;
      /* these three flags are set by various ops to signal information to
       * the very next op. They have a useful lifetime of exactly one loop
       * iteration, and are not preserved or restored by state pushes/pops
@@ -3606,6 +3612,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
             PL_reg_start_tmp[n] = locinput;
             if (n > PL_regsize)
                 PL_regsize = n;
+            lastopen = n;
             break;
         case CLOSE:
             n = ARG(scan);  /* which paren pair */
@@ -3620,6 +3627,32 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                 goto fake_end;
             }    
             break;
+        case ACCEPT:
+            if (ARG(scan)){
+                regnode *cursor;
+                for (cursor=scan;
+                     cursor && OP(cursor)!=END; 
+                     cursor=regnext(cursor)) 
+                {
+                    if ( OP(cursor)==CLOSE ){
+                        n = ARG(cursor);
+                        if ( n <= lastopen ) {
+                            PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
+                            PL_regendp[n] = locinput - PL_bostr;
+                            /*if (n > PL_regsize)
+                            PL_regsize = n;*/
+                            if (n > (I32)*PL_reglastparen)
+                                *PL_reglastparen = n;
+                            *PL_reglastcloseparen = n;
+                            if ( n == ARG(scan) || (cur_eval && 
+                                cur_eval->u.eval.close_paren == (U32)n))
+                                break;
+                        }
+                    }
+                }
+            }
+           goto fake_end;
+           /*NOTREACHED*/          
         case GROUPP:
             n = ARG(scan);  /* which paren pair */
             sw = (bool)((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
@@ -4302,7 +4335,7 @@ NULL
             PL_reginput = locinput;
             if (minmod) {
                 minmod = 0;
-               if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
+               if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
                     sayNO;
                 ST.count = ST.min;
                 locinput = PL_reginput;
@@ -4335,7 +4368,7 @@ NULL
  
             }
             else {
-               ST.count = regrepeat(rex, ST.A, ST.max);
+               ST.count = regrepeat(rex, ST.A, ST.max, depth);
                 locinput = PL_reginput;
                 if (ST.count < ST.min)
                     sayNO;
@@ -4421,7 +4454,7 @@ NULL
                 /* PL_reginput == oldloc now */
                 if (n) {
                     ST.count += n;
-                   if (regrepeat(rex, ST.A, n) < n)
+                   if (regrepeat(rex, ST.A, n, depth) < n)
                         sayNO;
                 }
                 PL_reginput = locinput;
@@ -4443,7 +4476,7 @@ NULL
             REGCP_UNWIND(ST.cp);
             /* failed -- move forward one */
             PL_reginput = locinput;
-           if (regrepeat(rex, ST.A, 1)) {
+           if (regrepeat(rex, ST.A, 1, depth)) {
                 ST.count++;
                 locinput = PL_reginput;
                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
@@ -4622,17 +4655,13 @@ NULL
             if (next == scan)
                 next = NULL;
             break;
-       case OPERROR:
-           reginfo->cutpoint=PL_regeol;
-           goto do_commit;
-           /* NOTREACHED */
-       case CUT:
-           if ( locinput > reginfo->bol )
-               reginfo->cutpoint = HOPBACKc(locinput, 1);
-           /* FALLTHROUGH */       
         case COMMIT:
-         do_commit:
+           reginfo->cutpoint = PL_regeol;
+           /* FALLTHROUGH */
+       case NOMATCH:
             PL_reginput = locinput;
+           if (!scan->flags)
+               sv_commit = (SV*)rex->data->data[ ARG( scan ) ];
             PUSH_STATE_GOTO(COMMIT_next,next);
             /* NOTREACHED */
         case COMMIT_next_fail:
@@ -4640,6 +4669,71 @@ NULL
             /* FALLTHROUGH */       
         case OPFAIL:
             sayNO;
+           /* NOTREACHED */
+
+#define ST st->u.mark
+        case MARKPOINT:
+            ST.prev_mark = mark_state;
+            ST.mark_name = scan->flags ? &PL_sv_yes : 
+                (SV*)rex->data->data[ ARG( scan ) ];
+            mark_state = st;
+            ST.mark_loc = PL_reginput = locinput;
+            PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
+            /* NOTREACHED */
+        case MARKPOINT_next:
+            mark_state = ST.prev_mark;
+            sayYES;
+            /* NOTREACHED */
+        case MARKPOINT_next_fail:
+            if (popmark && ( popmark == &PL_sv_yes || 
+                 (ST.mark_name != &PL_sv_yes && 
+                  sv_eq(ST.mark_name,popmark)))) 
+            {
+                if (ST.mark_loc > startpoint)
+                   reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
+                popmark = NULL; /* we found our mark */
+                sv_commit = ST.mark_name;
+
+                DEBUG_EXECUTE_r({
+                    if (sv_commit != &PL_sv_yes) 
+                       PerlIO_printf(Perl_debug_log,
+                           "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
+                           REPORT_CODE_OFF+depth*2, "", 
+                           PL_colors[4], sv_commit, PL_colors[5]);
+                    else
+                        PerlIO_printf(Perl_debug_log,
+                           "%*s  %ssetting cutpoint to mark...%s\n",
+                           REPORT_CODE_OFF+depth*2, "", 
+                           PL_colors[4], PL_colors[5]);
+               });
+            }
+            mark_state = ST.prev_mark;
+            sayNO;
+            /* NOTREACHED */
+        case CUT:
+            ST.mark_name = scan->flags ? &PL_sv_yes : 
+                    (SV*)rex->data->data[ ARG( scan ) ];
+            if (mark_state) {
+                ST.mark_loc = NULL;
+            } else {
+                ST.mark_loc = locinput;
+            }    
+            PL_reginput = locinput;
+           PUSH_STATE_GOTO(CUT_next,next);
+           /* NOTREACHED */
+       case CUT_next_fail:
+           if (ST.mark_loc) {
+               if (ST.mark_loc > startpoint)
+                   reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
+               sv_commit = ST.mark_name;
+            } else {
+                popmark = ST.mark_name;           
+            }
+            no_final = 1; 
+            sayNO;
+            /* NOTREACHED */
+#undef ST
+
         default:
             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
                           PTR2UV(scan), OP(scan));
@@ -4716,13 +4810,13 @@ yes:
                 PL_regmatch_slab = PL_regmatch_slab->prev;
                 st = SLAB_LAST(PL_regmatch_slab);
             }
-            DEBUG_STATE_r({
+           DEBUG_STATE_r({
                 if (no_final) {
                     DEBUG_STATE_pp("pop (no final)");        
                 } else {
                     DEBUG_STATE_pp("pop (yes)");
                 }
-               }); 
+           });
             depth--;
         }
  #else
@@ -4789,7 +4883,14 @@ no_silent:
      result = 0;
  
    final_exit:
-
+    if (rex->reganch & ROPT_VERBARG_SEEN) {
+        SV *sv = get_sv("REGERROR", 1);
+        if (result) 
+            sv_commit = &PL_sv_no;
+        else if (!sv_commit) 
+            sv_commit = &PL_sv_yes;
+        sv_setsv(sv, sv_commit);
+    }
      /* restore original high-water mark */
      PL_regmatch_slab  = orig_slab;
      PL_regmatch_state = orig_state;
@@ -4817,7 +4918,7 @@ no_silent:
   * rather than incrementing count on every character.  [Er, except utf8.]]
   */
  STATIC I32
-S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
+S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
  {
      dVAR;
      register char *scan;
@@ -5048,7 +5149,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
             regprop(prog, prop, p);
             PerlIO_printf(Perl_debug_log,
                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
-                       REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
+                       REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
         });
      });
  
diff --git a/regexp.h b/regexp.h

index f13a5c5f808108ecf61903c0422cf8b953ac8f33..9b3ce7993a41000015be625cf0a35dcb57788373 100644 (file)
--- a/regexp.h
+++ b/regexp.h
@@ -97,7 +97,6 @@ typedef struct regexp_engine {
  #define ROPT_CANY_SEEN         0x00000800
  #define ROPT_SANY_SEEN         ROPT_CANY_SEEN /* src bckwrd cmpt */
  #define ROPT_GPOS_CHECK         (ROPT_GPOS_SEEN|ROPT_ANCH_GPOS)
-#define ROPT_RECURSE_SEEN       0x00001000
  
  /* 0xf800 of reganch is used by PMf_COMPILETIME */
  
@@ -106,6 +105,8 @@ typedef struct regexp_engine {
  #define ROPT_COPY_DONE         0x00040000      /* subbeg is a copy of the string */
  #define ROPT_TAINTED_SEEN      0x00080000
  #define ROPT_MATCH_UTF8                0x10000000 /* subbeg is utf-8 */
+#define ROPT_RECURSE_SEEN       0x20000000
+#define ROPT_VERBARG_SEEN       0x40000000
  
  #define RE_USE_INTUIT_NOML     0x00100000 /* Best to intuit before matching */
  #define RE_USE_INTUIT_ML       0x00200000
@@ -311,6 +312,14 @@ typedef struct regmatch_state {
             I32 logical;        /* saved copy of 'logical' var */
             regnode  *me; /* the IFMATCH/SUSPEND/UNLESSM node  */
         } ifmatch; /* and SUSPEND/UNLESSM */
+       
+       struct {
+           /* this first element must match u.yes */
+           struct regmatch_state *prev_yes_state;
+           struct regmatch_state *prev_mark;
+           SV* mark_name;
+           char *mark_loc;
+       } mark;
      } u;
  } regmatch_state;
  
diff --git a/regnodes.h b/regnodes.h

index 010b94303cb6d6f9e73d1c40b979d6919a6ba39b..005e409ab56c344651d428d72be935e41c23be30 100644 (file)
--- a/regnodes.h
+++ b/regnodes.h
@@ -6,8 +6,8 @@
  
  /* Regops and State definitions */
  
-#define REGNODE_MAX            78
-#define REGMATCH_STATE_MAX     110
+#define REGNODE_MAX            82
+#define REGMATCH_STATE_MAX     118
  
  #define        END                     0       /* 0000 End of program. */
  #define        SUCCEED                 1       /* 0x01 Return from a subroutine, basically. */
@@ -82,12 +82,16 @@
  #define        NGROUPP                 70      /* 0x46 Whether the group matched. */
  #define        INSUBP                  71      /* 0x47 Whether we are in a specific recurse. */
  #define        DEFINEP                 72      /* 0x48 Never execute directly. */
-#define        OPFAIL                  73      /* 0x49 Same as (?!) */
-#define        COMMIT                  74      /* 0x4a Pattern fails if backtracking through this */
-#define        CUT                     75      /* 0x4b ... and restarts at the cursor point */
-#define        OPERROR                 76      /* 0x4c Pattern fails outright if backtracking through this */
-#define        OPTIMIZED               77      /* 0x4d Placeholder for dump. */
-#define        PSEUDO                  78      /* 0x4e Pseudo opcode for internal use. */
+#define        ENDLIKE                 73      /* 0x49 Used only for the type field of verbs */
+#define        OPFAIL                  74      /* 0x4a Same as (?!) */
+#define        ACCEPT                  75      /* 0x4b Accepts the current matched string. */
+#define        VERB                    76      /* 0x4c    no-sv 1      Used only for the type field of verbs */
+#define        NOMATCH                 77      /* 0x4d Pattern fails at this startpoint if no-backtracking through this */
+#define        MARKPOINT               78      /* 0x4e Push the current location for rollback by cut. */
+#define        CUT                     79      /* 0x4f On failure cut the string at the mark. */
+#define        COMMIT                  80      /* 0x50 Pattern fails outright if backtracking through this */
+#define        OPTIMIZED               81      /* 0x51 Placeholder for dump. */
+#define        PSEUDO                  82      /* 0x52 Pseudo opcode for internal use. */
         /* ------------ States ------------- */
  #define        TRIE_next               (REGNODE_MAX + 1)       /* state for TRIE */
  #define        TRIE_next_fail          (REGNODE_MAX + 2)       /* state for TRIE */
@@ -121,6 +125,10 @@
  #define        CURLY_B_max_fail        (REGNODE_MAX + 30)      /* state for CURLY */
  #define        COMMIT_next             (REGNODE_MAX + 31)      /* state for COMMIT */
  #define        COMMIT_next_fail        (REGNODE_MAX + 32)      /* state for COMMIT */
+#define        MARKPOINT_next          (REGNODE_MAX + 33)      /* state for MARKPOINT */
+#define        MARKPOINT_next_fail     (REGNODE_MAX + 34)      /* state for MARKPOINT */
+#define        CUT_next                (REGNODE_MAX + 35)      /* state for CUT */
+#define        CUT_next_fail           (REGNODE_MAX + 36)      /* state for CUT */
  
  /* PL_regkind[] What type of regop or state is this. */
  
@@ -128,118 +136,126 @@
  EXTCONST U8 PL_regkind[];
  #else
  EXTCONST U8 PL_regkind[] = {
-       END,            /* END                    */
-       END,            /* SUCCEED                */
-       BOL,            /* BOL                    */
-       BOL,            /* MBOL                   */
-       BOL,            /* SBOL                   */
-       EOL,            /* EOS                    */
-       EOL,            /* EOL                    */
-       EOL,            /* MEOL                   */
-       EOL,            /* SEOL                   */
-       BOUND,          /* BOUND                  */
-       BOUND,          /* BOUNDL                 */
-       NBOUND,         /* NBOUND                 */
-       NBOUND,         /* NBOUNDL                */
-       GPOS,           /* GPOS                   */
-       REG_ANY,        /* REG_ANY                */
-       REG_ANY,        /* SANY                   */
-       REG_ANY,        /* CANY                   */
-       ANYOF,          /* ANYOF                  */
-       ALNUM,          /* ALNUM                  */
-       ALNUM,          /* ALNUML                 */
-       NALNUM,         /* NALNUM                 */
-       NALNUM,         /* NALNUML                */
-       SPACE,          /* SPACE                  */
-       SPACE,          /* SPACEL                 */
-       NSPACE,         /* NSPACE                 */
-       NSPACE,         /* NSPACEL                */
-       DIGIT,          /* DIGIT                  */
-       DIGIT,          /* DIGITL                 */
-       NDIGIT,         /* NDIGIT                 */
-       NDIGIT,         /* NDIGITL                */
-       CLUMP,          /* CLUMP                  */
-       BRANCH,         /* BRANCH                 */
-       BACK,           /* BACK                   */
-       EXACT,          /* EXACT                  */
-       EXACT,          /* EXACTF                 */
-       EXACT,          /* EXACTFL                */
-       NOTHING,        /* NOTHING                */
-       NOTHING,        /* TAIL                   */
-       STAR,           /* STAR                   */
-       PLUS,           /* PLUS                   */
-       CURLY,          /* CURLY                  */
-       CURLY,          /* CURLYN                 */
-       CURLY,          /* CURLYM                 */
-       CURLY,          /* CURLYX                 */
-       WHILEM,         /* WHILEM                 */
-       OPEN,           /* OPEN                   */
-       CLOSE,          /* CLOSE                  */
-       REF,            /* REF                    */
-       REF,            /* REFF                   */
-       REF,            /* REFFL                  */
-       BRANCHJ,        /* IFMATCH                */
-       BRANCHJ,        /* UNLESSM                */
-       BRANCHJ,        /* SUSPEND                */
-       BRANCHJ,        /* IFTHEN                 */
-       GROUPP,         /* GROUPP                 */
-       LONGJMP,        /* LONGJMP                */
-       BRANCHJ,        /* BRANCHJ                */
-       EVAL,           /* EVAL                   */
-       MINMOD,         /* MINMOD                 */
-       LOGICAL,        /* LOGICAL                */
-       BRANCHJ,        /* RENUM                  */
-       TRIE,           /* TRIE                   */
-       TRIE,           /* TRIEC                  */
-       TRIE,           /* AHOCORASICK            */
-       TRIE,           /* AHOCORASICKC           */
-       GOSUB,          /* GOSUB                  */
-       GOSTART,        /* GOSTART                */
-       NREF,           /* NREF                   */
-       NREF,           /* NREFF                  */
-       NREF,           /* NREFFL                 */
-       NGROUPP,        /* NGROUPP                */
-       INSUBP,         /* INSUBP                 */
-       DEFINEP,        /* DEFINEP                */
-       OPFAIL,         /* OPFAIL                 */
-       COMMIT,         /* COMMIT                 */
-       COMMIT,         /* CUT                    */
-       OPERROR,        /* OPERROR                */
-       NOTHING,        /* OPTIMIZED              */
-       PSEUDO,         /* PSEUDO                 */
+       END,            /* END                    */
+       END,            /* SUCCEED                */
+       BOL,            /* BOL                    */
+       BOL,            /* MBOL                   */
+       BOL,            /* SBOL                   */
+       EOL,            /* EOS                    */
+       EOL,            /* EOL                    */
+       EOL,            /* MEOL                   */
+       EOL,            /* SEOL                   */
+       BOUND,          /* BOUND                  */
+       BOUND,          /* BOUNDL                 */
+       NBOUND,         /* NBOUND                 */
+       NBOUND,         /* NBOUNDL                */
+       GPOS,           /* GPOS                   */
+       REG_ANY,        /* REG_ANY                */
+       REG_ANY,        /* SANY                   */
+       REG_ANY,        /* CANY                   */
+       ANYOF,          /* ANYOF                  */
+       ALNUM,          /* ALNUM                  */
+       ALNUM,          /* ALNUML                 */
+       NALNUM,         /* NALNUM                 */
+       NALNUM,         /* NALNUML                */
+       SPACE,          /* SPACE                  */
+       SPACE,          /* SPACEL                 */
+       NSPACE,         /* NSPACE                 */
+       NSPACE,         /* NSPACEL                */
+       DIGIT,          /* DIGIT                  */
+       DIGIT,          /* DIGITL                 */
+       NDIGIT,         /* NDIGIT                 */
+       NDIGIT,         /* NDIGITL                */
+       CLUMP,          /* CLUMP                  */
+       BRANCH,         /* BRANCH                 */
+       BACK,           /* BACK                   */
+       EXACT,          /* EXACT                  */
+       EXACT,          /* EXACTF                 */
+       EXACT,          /* EXACTFL                */
+       NOTHING,        /* NOTHING                */
+       NOTHING,        /* TAIL                   */
+       STAR,           /* STAR                   */
+       PLUS,           /* PLUS                   */
+       CURLY,          /* CURLY                  */
+       CURLY,          /* CURLYN                 */
+       CURLY,          /* CURLYM                 */
+       CURLY,          /* CURLYX                 */
+       WHILEM,         /* WHILEM                 */
+       OPEN,           /* OPEN                   */
+       CLOSE,          /* CLOSE                  */
+       REF,            /* REF                    */
+       REF,            /* REFF                   */
+       REF,            /* REFFL                  */
+       BRANCHJ,        /* IFMATCH                */
+       BRANCHJ,        /* UNLESSM                */
+       BRANCHJ,        /* SUSPEND                */
+       BRANCHJ,        /* IFTHEN                 */
+       GROUPP,         /* GROUPP                 */
+       LONGJMP,        /* LONGJMP                */
+       BRANCHJ,        /* BRANCHJ                */
+       EVAL,           /* EVAL                   */
+       MINMOD,         /* MINMOD                 */
+       LOGICAL,        /* LOGICAL                */
+       BRANCHJ,        /* RENUM                  */
+       TRIE,           /* TRIE                   */
+       TRIE,           /* TRIEC                  */
+       TRIE,           /* AHOCORASICK            */
+       TRIE,           /* AHOCORASICKC           */
+       GOSUB,          /* GOSUB                  */
+       GOSTART,        /* GOSTART                */
+       NREF,           /* NREF                   */
+       NREF,           /* NREFF                  */
+       NREF,           /* NREFFL                 */
+       NGROUPP,        /* NGROUPP                */
+       INSUBP,         /* INSUBP                 */
+       DEFINEP,        /* DEFINEP                */
+       ENDLIKE,        /* ENDLIKE                */
+       ENDLIKE,        /* OPFAIL                 */
+       ENDLIKE,        /* ACCEPT                 */
+       VERB,           /* VERB                   */
+       VERB,           /* NOMATCH                */
+       VERB,           /* MARKPOINT              */
+       VERB,           /* CUT                    */
+       VERB,           /* COMMIT                 */
+       NOTHING,        /* OPTIMIZED              */
+       PSEUDO,         /* PSEUDO                 */
         /* ------------ States ------------- */
-       TRIE,           /* TRIE_next              */
-       TRIE,           /* TRIE_next_fail         */
-       EVAL,           /* EVAL_AB                */
-       EVAL,           /* EVAL_AB_fail           */
-       CURLYX,         /* CURLYX_end             */
-       CURLYX,         /* CURLYX_end_fail        */
-       WHILEM,         /* WHILEM_A_pre           */
-       WHILEM,         /* WHILEM_A_pre_fail      */
-       WHILEM,         /* WHILEM_A_min           */
-       WHILEM,         /* WHILEM_A_min_fail      */
-       WHILEM,         /* WHILEM_A_max           */
-       WHILEM,         /* WHILEM_A_max_fail      */
-       WHILEM,         /* WHILEM_B_min           */
-       WHILEM,         /* WHILEM_B_min_fail      */
-       WHILEM,         /* WHILEM_B_max           */
-       WHILEM,         /* WHILEM_B_max_fail      */
-       BRANCH,         /* BRANCH_next            */
-       BRANCH,         /* BRANCH_next_fail       */
-       CURLYM,         /* CURLYM_A               */
-       CURLYM,         /* CURLYM_A_fail          */
-       CURLYM,         /* CURLYM_B               */
-       CURLYM,         /* CURLYM_B_fail          */
-       IFMATCH,        /* IFMATCH_A              */
-       IFMATCH,        /* IFMATCH_A_fail         */
-       CURLY,          /* CURLY_B_min_known      */
-       CURLY,          /* CURLY_B_min_known_fail */
-       CURLY,          /* CURLY_B_min            */
-       CURLY,          /* CURLY_B_min_fail       */
-       CURLY,          /* CURLY_B_max            */
-       CURLY,          /* CURLY_B_max_fail       */
-       COMMIT,         /* COMMIT_next            */
-       COMMIT,         /* COMMIT_next_fail       */
+       TRIE,           /* TRIE_next              */
+       TRIE,           /* TRIE_next_fail         */
+       EVAL,           /* EVAL_AB                */
+       EVAL,           /* EVAL_AB_fail           */
+       CURLYX,         /* CURLYX_end             */
+       CURLYX,         /* CURLYX_end_fail        */
+       WHILEM,         /* WHILEM_A_pre           */
+       WHILEM,         /* WHILEM_A_pre_fail      */
+       WHILEM,         /* WHILEM_A_min           */
+       WHILEM,         /* WHILEM_A_min_fail      */
+       WHILEM,         /* WHILEM_A_max           */
+       WHILEM,         /* WHILEM_A_max_fail      */
+       WHILEM,         /* WHILEM_B_min           */
+       WHILEM,         /* WHILEM_B_min_fail      */
+       WHILEM,         /* WHILEM_B_max           */
+       WHILEM,         /* WHILEM_B_max_fail      */
+       BRANCH,         /* BRANCH_next            */
+       BRANCH,         /* BRANCH_next_fail       */
+       CURLYM,         /* CURLYM_A               */
+       CURLYM,         /* CURLYM_A_fail          */
+       CURLYM,         /* CURLYM_B               */
+       CURLYM,         /* CURLYM_B_fail          */
+       IFMATCH,        /* IFMATCH_A              */
+       IFMATCH,        /* IFMATCH_A_fail         */
+       CURLY,          /* CURLY_B_min_known      */
+       CURLY,          /* CURLY_B_min_known_fail */
+       CURLY,          /* CURLY_B_min            */
+       CURLY,          /* CURLY_B_min_fail       */
+       CURLY,          /* CURLY_B_max            */
+       CURLY,          /* CURLY_B_max_fail       */
+       COMMIT,         /* COMMIT_next            */
+       COMMIT,         /* COMMIT_next_fail       */
+       MARKPOINT,      /* MARKPOINT_next         */
+       MARKPOINT,      /* MARKPOINT_next_fail    */
+       CUT,            /* CUT_next               */
+       CUT,            /* CUT_next_fail          */
  };
  #endif
  
@@ -320,10 +336,14 @@ static const U8 regarglen[] = {
         EXTRA_SIZE(struct regnode_1),           /* NGROUPP      */
         EXTRA_SIZE(struct regnode_1),           /* INSUBP       */
         EXTRA_SIZE(struct regnode_1),           /* DEFINEP      */
+       0,                                      /* ENDLIKE      */
         0,                                      /* OPFAIL       */
-       0,                                      /* COMMIT       */
-       0,                                      /* CUT          */
-       0,                                      /* OPERROR      */
+       EXTRA_SIZE(struct regnode_1),           /* ACCEPT       */
+       0,                                      /* VERB         */
+       EXTRA_SIZE(struct regnode_1),           /* NOMATCH      */
+       EXTRA_SIZE(struct regnode_1),           /* MARKPOINT    */
+       EXTRA_SIZE(struct regnode_1),           /* CUT          */
+       EXTRA_SIZE(struct regnode_1),           /* COMMIT       */
         0,                                      /* OPTIMIZED    */
         0,                                      /* PSEUDO       */
  };
@@ -404,10 +424,14 @@ static const char reg_off_by_arg[] = {
         0,      /* NGROUPP      */
         0,      /* INSUBP       */
         0,      /* DEFINEP      */
+       0,      /* ENDLIKE      */
         0,      /* OPFAIL       */
-       0,      /* COMMIT       */
+       0,      /* ACCEPT       */
+       0,      /* VERB         */
+       0,      /* NOMATCH      */
+       0,      /* MARKPOINT    */
         0,      /* CUT          */
-       0,      /* OPERROR      */
+       0,      /* COMMIT       */
         0,      /* OPTIMIZED    */
         0,      /* PSEUDO       */
  };
@@ -489,12 +513,16 @@ const char * reg_name[] = {
         "NGROUPP",                      /* 0x46 */
         "INSUBP",                       /* 0x47 */
         "DEFINEP",                      /* 0x48 */
-       "OPFAIL",                       /* 0x49 */
-       "COMMIT",                       /* 0x4a */
-       "CUT",                          /* 0x4b */
-       "OPERROR",                      /* 0x4c */
-       "OPTIMIZED",                    /* 0x4d */
-       "PSEUDO",                       /* 0x4e */
+       "ENDLIKE",                      /* 0x49 */
+       "OPFAIL",                       /* 0x4a */
+       "ACCEPT",                       /* 0x4b */
+       "VERB",                         /* 0x4c */
+       "NOMATCH",                      /* 0x4d */
+       "MARKPOINT",                    /* 0x4e */
+       "CUT",                          /* 0x4f */
+       "COMMIT",                       /* 0x50 */
+       "OPTIMIZED",                    /* 0x51 */
+       "PSEUDO",                       /* 0x52 */
         /* ------------ States ------------- */
         "TRIE_next",                    /* REGNODE_MAX +0x01 */
         "TRIE_next_fail",               /* REGNODE_MAX +0x02 */
@@ -528,6 +556,10 @@ const char * reg_name[] = {
         "CURLY_B_max_fail",             /* REGNODE_MAX +0x1e */
         "COMMIT_next",                  /* REGNODE_MAX +0x1f */
         "COMMIT_next_fail",             /* REGNODE_MAX +0x20 */
+       "MARKPOINT_next",               /* REGNODE_MAX +0x21 */
+       "MARKPOINT_next_fail",          /* REGNODE_MAX +0x22 */
+       "CUT_next",                     /* REGNODE_MAX +0x23 */
+       "CUT_next_fail",                /* REGNODE_MAX +0x24 */
  };
  #endif /* DEBUGGING */
  #else
diff --git a/t/op/pat.t b/t/op/pat.t

index 67be900c3cbf0c1507b88b1804a1057d3105e204..5405cf60994c07238b1b4253719ec15e450d4277 100755 (executable)
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -3851,54 +3851,136 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) {
      iseq($count,1,"should have matched once only [RT#36046]");
  }
  
-{   # Test the (?COMMIT) pattern
+{   # Test the (*NOMATCH) pattern
      our $count = 0;
-    'aaab'=~/a+b?(?{$count++})(?FAIL)/;
-    iseq($count,9,"expect 9 for no (?COMMIT)");
+    'aaab'=~/a+b?(?{$count++})(*FAIL)/;
+    iseq($count,9,"expect 9 for no (*NOMATCH)");
      $count = 0;
-    'aaab'=~/a+b?(?COMMIT)(?{$count++})(?FAIL)/;
-    iseq($count,3,"expect 3 with (?COMMIT)");
+    'aaab'=~/a+b?(*NOMATCH)(?{$count++})(*FAIL)/;
+    iseq($count,3,"expect 3 with (*NOMATCH)");
      local $_='aaab';
      $count=0;
-    1 while /.(?COMMIT)(?{$count++})(?FAIL)/g;
-    iseq($count,4,"/.(?COMMIT)/");
+    1 while /.(*NOMATCH)(?{$count++})(*FAIL)/g;
+    iseq($count,4,"/.(*NOMATCH)/");
      $count = 0;
-    'aaab'=~/a+b?(??{'(?COMMIT)'})(?{$count++})(?FAIL)/;
-    iseq($count,3,"expect 3 with (?COMMIT)");
+    'aaab'=~/a+b?(??{'(*NOMATCH)'})(?{$count++})(*FAIL)/;
+    iseq($count,3,"expect 3 with (*NOMATCH)");
      local $_='aaab';
      $count=0;
-    1 while /.(??{'(?COMMIT)'})(?{$count++})(?FAIL)/g;
-    iseq($count,4,"/.(?COMMIT)/");
+    1 while /.(??{'(*NOMATCH)'})(?{$count++})(*FAIL)/g;
+    iseq($count,4,"/.(*NOMATCH)/");
  }
-{   # Test the (?CUT) pattern
+{   # Test the (*CUT) pattern
      our $count = 0;
-    'aaab'=~/a+b?(?CUT)(?{$count++})(?FAIL)/;
-    iseq($count,1,"expect 1 with (?CUT)");
+    'aaab'=~/a+b?(*CUT)(?{$count++})(*FAIL)/;
+    iseq($count,1,"expect 1 with (*CUT)");
      local $_='aaab';
      $count=0;
-    1 while /.(?CUT)(?{$count++})(?FAIL)/g;
-    iseq($count,4,"/.(?CUT)/");
+    1 while /.(*CUT)(?{$count++})(*FAIL)/g;
+    iseq($count,4,"/.(*CUT)/");
      $_='aaabaaab';
      $count=0;
      our @res=();
-    1 while /(a+b?)(?CUT)(?{$count++; push @res,$1})(?FAIL)/g;
-    iseq($count,2,"Expect 2 with (?CUT)" );
-    iseq("@res","aaab aaab","adjacent (?CUT) works as expected" );
+    1 while /(a+b?)(*CUT)(?{$count++; push @res,$1})(*FAIL)/g;
+    iseq($count,2,"Expect 2 with (*CUT)" );
+    iseq("@res","aaab aaab","adjacent (*CUT) works as expected" );
  }
-{   # Test the (?ERROR) pattern
+{   # Test the (*CUT) pattern
      our $count = 0;
-    'aaabaaab'=~/a+b?(?ERROR)(?{$count++})(?FAIL)/;
-    iseq($count,1,"expect 1 with (?ERROR)");
+    'aaab'=~/a+b?(*MARK)(*CUT)(?{$count++})(*FAIL)/;
+    iseq($count,1,"expect 1 with (*CUT)");
      local $_='aaab';
      $count=0;
-    1 while /.(?ERROR)(?{$count++})(?FAIL)/g;
-    iseq($count,1,"/.(?ERROR)/");
+    1 while /.(*MARK)(*CUT)(?{$count++})(*FAIL)/g;
+    iseq($count,4,"/.(*CUT)/");
      $_='aaabaaab';
      $count=0;
      our @res=();
-    1 while /(a+b?)(?ERROR)(?{$count++; push @res,$1})(?FAIL)/g;
-    iseq($count,1,"Expect 1 with (?ERROR)" );
-    iseq("@res","aaab","adjacent (?ERROR) works as expected" );
+    1 while /(a+b?)(*MARK)(*CUT)(?{$count++; push @res,$1})(*FAIL)/g;
+    iseq($count,2,"Expect 2 with (*CUT)" );
+    iseq("@res","aaab aaab","adjacent (*CUT) works as expected" );
+}
+{   # Test the (*CUT) pattern
+    our $count = 0;
+    'aaab'=~/a*(*MARK:a)b?(*MARK:b)(*CUT:a)(?{$count++})(*FAIL)/;
+    iseq($count,3,"expect 3 with *MARK:a)b?(*MARK:b)(*CUT:a)");
+    local $_='aaabaaab';
+    $count=0;
+    our @res=();
+    1 while /(a*(*MARK:a)b?)(*MARK)(*CUT:a)(?{$count++; push @res,$1})(*FAIL)/g;
+    iseq($count,5,"Expect 5 with (*MARK:a)b?)(*MARK)(*CUT:a)" );
+    iseq("@res","aaab b aaab b ","adjacent (*MARK:a)b?)(*MARK)(*CUT:a) works as expected" );
+}
+{   # Test the (*COMMIT) pattern
+    our $count = 0;
+    'aaabaaab'=~/a+b?(*COMMIT)(?{$count++})(*FAIL)/;
+    iseq($count,1,"expect 1 with (*COMMIT)");
+    local $_='aaab';
+    $count=0;
+    1 while /.(*COMMIT)(?{$count++})(*FAIL)/g;
+    iseq($count,1,"/.(*COMMIT)/");
+    $_='aaabaaab';
+    $count=0;
+    our @res=();
+    1 while /(a+b?)(*COMMIT)(?{$count++; push @res,$1})(*FAIL)/g;
+    iseq($count,1,"Expect 1 with (*COMMIT)" );
+    iseq("@res","aaab","adjacent (*COMMIT) works as expected" );
+}
+{
+    # Test named commits and the $REGERROR var
+    our $REGERROR;
+    for my $name ('',':foo') 
+    {
+        for my $pat ("(*NOMATCH$name)","(*MARK$name)(*CUT)",
+                         "(*CUT$name)","(*COMMIT$name)")
+        {                         
+            for my $suffix ('(*FAIL)','') 
+            {
+                'aaaab'=~/a+b$pat$suffix/;
+                iseq(
+                    $REGERROR,
+                    ($suffix ? ($name ? 'foo' : "1") : ""),
+                    "Test $pat and \$REGERROR $suffix"
+                );
+            }
+        }
+    }      
+}    
+{
+    # Test named commits and the $REGERROR var
+    package Fnorble;
+    our $REGERROR;
+    for my $name ('',':foo') 
+    {
+        for my $pat ("(*NOMATCH$name)","(*MARK$name)(*CUT)",
+                         "(*CUT$name)","(*COMMIT$name)")
+        {                         
+            for my $suffix ('(*FAIL)','') 
+            {
+                'aaaab'=~/a+b$pat$suffix/;
+                ::iseq(
+                    $REGERROR,
+                    ($suffix ? ($name ? 'foo' : "1") : ""),
+                    "Test $pat and \$REGERROR $suffix"
+                );
+            }
+        }
+    }      
+}    
+{
+    # Test named commits and the $REGERROR var
+    our $REGERROR;
+    for $word (qw(bar baz bop)) {
+        $REGERROR="";
+        "aaaaa$word"=~/a+(?:bar(*COMMIT:bar)|baz(*COMMIT:baz)|bop(*COMMIT:bop))(*FAIL)/;
+        iseq($REGERROR,$word);
+    }    
+}
+{   #Regression test for perlbug 40684
+    my $s = "abc\ndef";
+    my $rex = qr'^abc$'m;
+    ok($s =~ m/$rex/);
+    ok($s =~ m/^abc$/m);
  }
  #-------------------------------------------------------------------
  
@@ -3914,5 +3996,5 @@ ok((q(a)x 100) =~ /^(??{'(.)'x 100})/,
  # Put new tests above the line, not here.
  
  # Don't forget to update this!
-BEGIN{print "1..1300\n"};
+BEGIN{print "1..1344\n"};
  
diff --git a/t/op/re_tests b/t/op/re_tests

index 9b9e5f80560d43f04021e857b0651a558255387e..99c68243e58ac97a8c52adee8e5dc44ae93e89ab 100644 (file)
--- a/t/op/re_tests
+++ b/t/op/re_tests
@@ -138,7 +138,8 @@ ab|cd       abcd    y       $&      ab
  ()ef   def     y       $-[1]   1
  ()ef   def     y       $+[1]   1
  *a     -       c       -       Quantifier follows nothing
-(*)b   -       c       -       Quantifier follows nothing
+(|*)b  -       c       -       Quantifier follows nothing
+(*)b   -       c       -       Unknown verb
  $b     b       n       -       -
  a\     -       c       -       Search pattern not terminated
  a\(b   a(b     y       $&-$1   a(b-
@@ -325,7 +326,8 @@ a[-]?c      ac      y       $&      ac
  'ab|cd'i       ABCD    y       $&      AB
  '()ef'i        DEF     y       $&-$1   EF-
  '*a'i  -       c       -       Quantifier follows nothing
-'(*)b'i        -       c       -       Quantifier follows nothing
+'(|*)b'i       -       c       -       Quantifier follows nothing
+'(*)b'i        -       c       -       Unknown verb
  '$b'i  B       n       -       -
  'a\'i  -       c       -       Search pattern not terminated
  'a\(b'i        A(B     y       $&-$1   A(B-
@@ -1178,5 +1180,9 @@ round\(([^()]++)\)        _I(round(xs * sz),1)    y       $1      xs * sz
  
  
  a*(?!) aaaab   n       -       -
-a*(?FAIL)      aaaab   n       -       -
-a*(?F) aaaab   n       -       -
+a*(*FAIL)      aaaab   n       -       -
+a*(*F) aaaab   n       -       -
+
+(A(A|B(*ACCEPT)|C)D)(E)        AB      y       $1      AB
+(A(A|B(*ACCEPT)|C)D)(E)        ACDE    y       $1$2$3  ACDCE
+
author	Yves Orton <demerphq@gmail.com>
	Mon, 6 Nov 2006 13:06:28 +0000 (14:06 +0100)
committer	Rafael Garcia-Suarez <rgarciasuarez@gmail.com>
	Tue, 7 Nov 2006 10:21:25 +0000 (10:21 +0000)
embed.fnc		patch \| blob \| history
embed.h		patch \| blob \| history
ext/re/t/regop.t		patch \| blob \| history
pod/perl595delta.pod		patch \| blob \| history
pod/perldiag.pod		patch \| blob \| history
pod/perlre.pod		patch \| blob \| history
proto.h		patch \| blob \| history
regcomp.c		patch \| blob \| history
regcomp.h		patch \| blob \| history
regcomp.sym		patch \| blob \| history
regexec.c		patch \| blob \| history
regexp.h		patch \| blob \| history
regnodes.h		patch \| blob \| history
t/op/pat.t		patch \| blob \| history
t/op/re_tests		patch \| blob \| history