New regex syntax omnibus
authorYves Orton <demerphq@gmail.com>
Mon, 6 Nov 2006 13:06:28 +0000 (14:06 +0100)
committerRafael Garcia-Suarez <rgarciasuarez@gmail.com>
Tue, 7 Nov 2006 10:21:25 +0000 (10:21 +0000)
Message-ID: <9b18b3110611060406u2fa1572as57073949a5df9e62@mail.gmail.com>

Plus a portability fix (in string comparison for regex verbs)
and doc tweaks / podchecker fixes

p4raw-id: //depot/perl@29222

15 files changed:
embed.fnc
embed.h
ext/re/t/regop.t
pod/perl595delta.pod
pod/perldiag.pod
pod/perlre.pod
proto.h
regcomp.c
regcomp.h
regcomp.sym
regexec.c
regexp.h
regnodes.h
t/op/pat.t
t/op/re_tests

index 350b433..a3251a0 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1359,7 +1359,7 @@ Es        |U8     |regtail_study  |NN struct RExC_state_t *state|NN regnode *p|NN const regn
 
 #if defined(PERL_IN_REGEXEC_C) || defined(PERL_DECL_PROT)
 ERs    |I32    |regmatch       |NN regmatch_info *reginfo|NN regnode *prog
-ERs    |I32    |regrepeat      |NN const regexp *prog|NN const regnode *p|I32 max
+ERs    |I32    |regrepeat      |NN const regexp *prog|NN const regnode *p|I32 max|int depth
 ERs    |I32    |regtry         |NN regmatch_info *reginfo|NN char **startpos
 ERs    |bool   |reginclass     |NULLOK const regexp *prog|NN const regnode *n|NN const U8 *p|NULLOK STRLEN *lenp\
                                |bool do_utf8sv_is_utf8
diff --git a/embed.h b/embed.h
index 22595d5..fea5b27 100644 (file)
--- a/embed.h
+++ b/embed.h
 #if defined(PERL_IN_REGEXEC_C) || defined(PERL_DECL_PROT)
 #if defined(PERL_CORE) || defined(PERL_EXT)
 #define regmatch(a,b)          S_regmatch(aTHX_ a,b)
-#define regrepeat(a,b,c)       S_regrepeat(aTHX_ a,b,c)
+#define regrepeat(a,b,c,d)     S_regrepeat(aTHX_ a,b,c,d)
 #define regtry(a,b)            S_regtry(aTHX_ a,b)
 #define reginclass(a,b,c,d,e)  S_reginclass(aTHX_ a,b,c,d,e)
 #define regcppush(a)           S_regcppush(aTHX_ a)
index 1ccf8b3..f586d22 100644 (file)
@@ -252,7 +252,7 @@ Matching stclass EXACTF <.> against ".exe"
 #Guessed: match at offset 0
 #%MATCHED%
 #Freeing REx: "[q]"
-Got 100 bytes for offset annotations.
-Offsets: [12]
+Got 108 bytes for offset annotations.
+Offsets: [13]
 1:1[3] 3:4[0]
 %MATCHED%        
index ff8efcd..a7e3b40 100644 (file)
@@ -107,8 +107,8 @@ quantifiers. (Yves Orton)
 =item Backtracking control verbs
 
 The regex engine now supports a number of special purpose backtrack
-control verbs: (?COMMIT), (?CUT), (?ERROR) and (?FAIL). See L<perlre>
-for their descriptions.
+control verbs: (*COMMIT), (*MARK), (*CUT), (*ERROR), (*FAIL) and
+(*ACCEPT). See L<perlre> for their descriptions.
 
 =back
 
index c20b060..e9d2326 100644 (file)
@@ -4291,6 +4291,13 @@ category that is unknown to perl at this point.
 
 Note that if you want to enable a warnings category registered by a module
 (e.g. C<use warnings 'File::Find'>), you must have imported this module
+
+=item Unknown verb pattern '%s' in regex; marked by <-- HERE in m/%s/
+
+(F) You either made a typo or have incorrectly put a C<*> quantifier
+after an open brace in your pattern.  Check the pattern and review
+L<perlre> for details on legal verb patterns.
+
 first.
 
 =item unmatched [ in regex; marked by <-- HERE in m/%s/
@@ -4412,6 +4419,17 @@ character to get your parentheses to balance.  See L<attributes>.
 compressed integer format and could not be converted to an integer.
 See L<perlfunc/pack>.
 
+=item Unterminated verb pattern in regex; marked by <-- HERE in m/%s/
+
+(F) You used a pattern of the form C<(*VERB)> but did not terminate
+the pattern with a C<)>. Fix the pattern and retry.
+
+=item Unterminated verb pattern argument in regex; marked by <-- HERE in m/%s/
+
+(F) You used a pattern of the form C<(*VERB:ARG)> but did not terminate
+the pattern with a C<)>. Fix the pattern and retry.
+
+
 =item Unterminated <> operator
 
 (F) The lexer saw a left angle bracket in a place where it was expecting
@@ -4807,6 +4825,16 @@ anonymous, using the C<sub {}> syntax.  When inner anonymous subs that
 reference variables in outer subroutines are created, they
 are automatically rebound to the current values of such variables.
 
+=item Verb pattern '%s' has a mandatory argument in regex; marked by <-- HERE in m/%s/ 
+
+(F) You used a verb pattern that requires an argument. Supply an argument
+or check that you are using the right verb.
+
+=item Verb pattern '%s' may not have an argument in regex; marked by <-- HERE in m/%s/ 
+
+(F) You used a verb pattern that is not allowed an argument. Remove the 
+argument or check that you are using the right verb.
+
 =item Version number must be a constant number
 
 (P) The attempt to translate a C<use Module n.n LIST> statement into
index bce7291..45e41e5 100644 (file)
@@ -933,14 +933,100 @@ the same name, then it recurses to the leftmost.
 It is an error to refer to a name that is not declared somewhere in the
 pattern.
 
-=item C<(?FAIL)> C<(?F)>
-X<(?FAIL)> X<(?F)>
+=item C<(?(condition)yes-pattern|no-pattern)>
+X<(?()>
 
-This pattern matches nothing and always fails. It can be used to force the
-engine to backtrack. It is equivalent to C<(?!)>, but easier to read. In
-fact, C<(?!)> gets optimised into C<(?FAIL)> internally.
+=item C<(?(condition)yes-pattern)>
 
-It is probably useful only when combined with C<(?{})> or C<(??{})>.
+Conditional expression.  C<(condition)> should be either an integer in
+parentheses (which is valid if the corresponding pair of parentheses
+matched), a look-ahead/look-behind/evaluate zero-width assertion, a
+name in angle brackets or single quotes (which is valid if a buffer
+with the given name matched), or the special symbol (R) (true when
+evaluated inside of recursion or eval). Additionally the R may be
+followed by a number, (which will be true when evaluated when recursing
+inside of the appropriate group), or by C<&NAME>, in which case it will
+be true only when evaluated during recursion in the named group.
+
+Here's a summary of the possible predicates:
+
+=over 4
+
+=item (1) (2) ...
+
+Checks if the numbered capturing buffer has matched something.
+
+=item (<NAME>) ('NAME')
+
+Checks if a buffer with the given name has matched something.
+
+=item (?{ CODE })
+
+Treats the code block as the condition.
+
+=item (R)
+
+Checks if the expression has been evaluated inside of recursion.
+
+=item (R1) (R2) ...
+
+Checks if the expression has been evaluated while executing directly
+inside of the n-th capture group. This check is the regex equivalent of
+
+  if ((caller(0))[3] eq 'subname') { ... }
+
+In other words, it does not check the full recursion stack.
+
+=item (R&NAME)
+
+Similar to C<(R1)>, this predicate checks to see if we're executing
+directly inside of the leftmost group with a given name (this is the same
+logic used by C<(?&NAME)> to disambiguate). It does not check the full
+stack, but only the name of the innermost active recursion.
+
+=item (DEFINE)
+
+In this case, the yes-pattern is never directly executed, and no
+no-pattern is allowed. Similar in spirit to C<(?{0})> but more efficient.
+See below for details.
+
+=back
+
+For example:
+
+    m{ ( \( )?
+       [^()]+
+       (?(1) \) )
+     }x
+
+matches a chunk of non-parentheses, possibly included in parentheses
+themselves.
+
+A special form is the C<(DEFINE)> predicate, which never executes directly
+its yes-pattern, and does not allow a no-pattern. This allows to define
+subpatterns which will be executed only by using the recursion mechanism.
+This way, you can define a set of regular expression rules that can be
+bundled into any pattern you choose.
+
+It is recommended that for this usage you put the DEFINE block at the
+end of the pattern, and that you name any subpatterns defined within it.
+
+Also, it's worth noting that patterns defined this way probably will
+not be as efficient, as the optimiser is not very clever about
+handling them.
+
+An example of how this might be used is as follows:
+
+  /(?<NAME>(&NAME_PAT))(?<ADDR>(&ADDRESS_PAT))
+   (?(DEFINE)
+     (<NAME_PAT>....)
+     (<ADRESS_PAT>....)
+   )/x
+
+Note that capture buffers matched inside of recursion are not accessible
+after the recursion returns, so the extra layer of capturing buffers are
+necessary. Thus C<$+{NAME_PAT}> would not be defined even though
+C<$+{NAME}> would be.
 
 =item C<< (?>pattern) >>
 X<backtrack> X<backtracking> X<atomic> X<possessive>
@@ -973,12 +1059,12 @@ in the rest of a regular expression.)
 Consider this pattern:
 
     m{ \(
-         ( 
-           [^()]+              # x+
-          | 
+          (
+            [^()]+             # x+
+          |
             \( [^()]* \)
           )+
-       \) 
+       \)
      }x
 
 That will efficiently match a nonempty group with matching parentheses
@@ -992,13 +1078,13 @@ seconds, but that each extra letter doubles this time.  This
 exponential performance will make it appear that your program has
 hung.  However, a tiny change to this pattern
 
-    m{ \( 
-         ( 
-           (?> [^()]+ )        # change x+ above to (?> x+ )
-          | 
+    m{ \(
+          (
+            (?> [^()]+ )       # change x+ above to (?> x+ )
+          |
             \( [^()]* \)
           )+
-       \) 
+       \)
      }x
 
 which uses C<< (?>...) >> matches exactly when the one above does (verifying
@@ -1046,13 +1132,50 @@ to inside of one of these constructs. The following equivalences apply:
     PAT?+               (?>PAT?)
     PAT{min,max}+       (?>PAT{min,max})
 
-=item C<(?COMMIT)>
-X<(?COMMIT)>
+=back
+
+=head2 Special Backtracking Control Verbs
+
+B<WARNING:> These patterns are experimental and subject to change or
+removal in a future version of perl. Their usage in production code should
+be noted to avoid problems during upgrades.
+
+These special patterns are generally of the form C<(*VERB:ARG)>. Unless
+otherwise stated the ARG argument is optional; in some cases, it is
+forbidden.
+
+Any pattern containing a special backtracking verb that allows an argument
+has the special behaviour that when executed it sets the current packages'
+C<$REGERROR> variable. In this case, the following rules apply:
+
+On failure, this variable will be set to the ARG value of the verb
+pattern, if the verb was involved in the failure of the match. If the ARG
+part of the pattern was omitted, then C<$REGERROR> will be set to TRUE.
+
+On a successful match this variable will be set to FALSE.
+
+B<NOTE:> C<$REGERROR> is not a magic variable in the same sense than
+C<$1> and most other regex related variables. It is not local to a
+scope, nor readonly but instead a volatile package variable similar to
+C<$AUTOLOAD>. Use C<local> to localize changes to it to a specific scope
+if necessary.
+
+If a pattern does not contain a special backtracking verb that allows an
+argument, then C<$REGERROR> is not touched at all.
+
+=over 4
+
+=item Verbs that take an argument
+
+=over 4
+
+=item C<(*NOMATCH)> C<(*NOMATCH:NAME)>
+X<(*NOMATCH)> X<(*NOMATCH:NAME)>
 
 This zero-width pattern commits the match at the current point, preventing
-the engine from back-tracking on failure to the left of the commit point.
-Consider the pattern C<A (?COMMIT) B>, where A and B are complex patterns.
-Until the C<(?COMMIT)> is reached, A may backtrack as necessary to match.
+the engine from backtracking on failure to the left of the this point.
+Consider the pattern C<A (*NOMATCH) B>, where A and B are complex patterns.
+Until the C<(*NOMATCH)> is reached, A may backtrack as necessary to match.
 Once it is reached, matching continues in B, which may also backtrack as
 necessary; however, should B not match, then no further backtracking will
 take place, and the pattern will fail outright at that starting position.
@@ -1060,7 +1183,7 @@ take place, and the pattern will fail outright at that starting position.
 The following example counts all the possible matching strings in a
 pattern (without actually matching any of them).
 
-    'aaab'=~/a+b?(?{print "$&\n"; $count++})(?FAIL)/;
+    'aaab' =~ /a+b?(?{print "$&\n"; $count++})(*FAIL)/;
     print "Count=$count\n";
 
 which produces:
@@ -1076,9 +1199,9 @@ which produces:
     a
     Count=9
 
-If we add a C<(?COMMIT)> before the count like the following
+If we add a C<(*NOMATCH)> before the count like the following
 
-    'aaab'=~/a+b?(?COMMIT)(?{print "$&\n"; $count++})(?FAIL)/;
+    'aaab' =~ /a+b?(*NOMATCH)(?{print "$&\n"; $count++})(*FAIL)/;
     print "Count=$count\n";
 
 we prevent backtracking and find the count of the longest matching
@@ -1089,23 +1212,47 @@ at each matching startpoint like so:
     ab
     Count=3
 
-Any number of C<(?COMMIT)> assertions may be used in a pattern.
+Any number of C<(*NOMATCH)> assertions may be used in a pattern.
 
 See also C<< (?>pattern) >> and possessive quantifiers for other
 ways to control backtracking.
 
-=item C<(?CUT)>
-X<(?CUT)>
-
-This zero-width pattern is similar to C<(?COMMIT)>, except that on
-failure it also signifies that whatever text that was matched leading
-up to the C<(?CUT)> pattern cannot match, I<even from another
-starting point>.
-
-Compare the following to the examples in C<(?COMMIT)>, note the string
+=item C<(*MARK)> C<(*MARK:NAME)>
+X<(*MARK)>
+
+This zero-width pattern can be used to mark the point in a string
+reached when a certain part of the pattern has been successfully
+matched. This mark may be given a name. A later C<(*CUT)> pattern
+will then cut at that point if backtracked into on failure. Any
+number of (*MARK) patterns are allowed, and the NAME portion is
+optional and may be duplicated.
+
+See C<*CUT> for more detail.
+
+=item C<(*CUT)> C<(*CUT:NAME)>
+X<(*CUT)>
+
+This zero-width pattern is similar to C<(*NOMATCH)>, except that on
+failure it also signifies that whatever text that was matched leading up
+to the C<(*CUT)> pattern being executed cannot be part of a match, I<even
+if started from a later point>. This effectively means that the regex
+engine moves forward to this position on failure and tries to match
+again, (assuming that there is sufficient room to match).
+
+The name of the C<(*CUT:NAME)> pattern has special significance. If a
+C<(*MARK:NAME)> was encountered while matching, then it is the position
+where that pattern was executed that is used for the "cut point" in the
+string. If no mark of that name was encountered, then the cut is done at
+the point where the C<(*CUT)> was. Similarly if no NAME is specified in
+the C<(*CUT)>, and if a C<(*MARK)> with any name (or none) is encountered,
+then that C<(*MARK)>'s cursor point will be used. If the C<(*CUT)> is not
+preceded by a C<(*MARK)>, then the cut point is where the string was when
+the C<(*CUT)> was encountered.
+
+Compare the following to the examples in C<(*NOMATCH)>, note the string
 is twice as long:
 
-    'aaabaaab'=~/a+b?(?CUT)(?{print "$&\n"; $count++})(?FAIL)/;
+    'aaabaaab' =~ /a+b?(*CUT)(?{print "$&\n"; $count++})(*FAIL)/;
     print "Count=$count\n";
 
 outputs
@@ -1114,17 +1261,17 @@ outputs
     aaab
     Count=2
 
-Once the 'aaab' at the start of the string has matched and the C<(?CUT)>
-executed the next startpoint will be where the cursor was when the
-C<(?CUT)> was executed.
+Once the 'aaab' at the start of the string has matched, and the C<(*CUT)>
+executed, the next startpoint will be where the cursor was when the
+C<(*CUT)> was executed.
 
-=item C<(?ERROR)>
-X<(?ERROR)>
+=item C<(*COMMIT)>
+X<(*COMMIT)>
 
-This zero-width pattern is similar to C<(?CUT)> except that it causes
+This zero-width pattern is similar to C<(*CUT)> except that it causes
 the match to fail outright. No attempts to match will occur again.
 
-    'aaabaaab'=~/a+b?(?ERROR)(?{print "$&\n"; $count++})(?FAIL)/;
+    'aaabaaab' =~ /a+b?(*COMMIT)(?{print "$&\n"; $count++})(*FAIL)/;
     print "Count=$count\n";
 
 outputs
@@ -1132,105 +1279,49 @@ outputs
     aaab
     Count=1
 
-In other words, once the C<(?ERROR)> has been entered and then pattern
-does not match then the regex engine will not try any further matching at
-all on the rest of the string.
-
-=item C<(?(condition)yes-pattern|no-pattern)>
-X<(?()>
-
-=item C<(?(condition)yes-pattern)>
+In other words, once the C<(*COMMIT)> has been entered, and if the pattern
+does not match, the regex engine will not try any further matching on the
+rest of the string.
 
-Conditional expression.  C<(condition)> should be either an integer in
-parentheses (which is valid if the corresponding pair of parentheses
-matched), a look-ahead/look-behind/evaluate zero-width assertion, a
-name in angle brackets or single quotes (which is valid if a buffer
-with the given name matched), the special symbol (R) (true when
-evaluated inside of recursion or eval). Additionally the R may be
-followed by a number, (which will be true when evaluated when recursing
-inside of the appropriate group), or by C<&NAME> in which case it will
-be true only when evaluated during recursion in the named group.
+=back
 
-Here's a summary of the possible predicates:
+=item Verbs without an argument
 
 =over 4
 
-=item (1) (2) ...
-
-Checks if the numbered capturing buffer has matched something.
-
-=item (<NAME>) ('NAME')
+=item C<(*FAIL)> C<(*F)>
+X<(*FAIL)> X<(*F)>
 
-Checks if a buffer with the given name has matched something.
-
-=item (?{ CODE })
-
-Treats the code block as the condition
-
-=item (R)
-
-Checks if the expression has been evaluated inside of recursion.
-
-=item (R1) (R2) ...
+This pattern matches nothing and always fails. It can be used to force the
+engine to backtrack. It is equivalent to C<(?!)>, but easier to read. In
+fact, C<(?!)> gets optimised into C<(*FAIL)> internally.
 
-Checks if the expression has been evaluated while executing directly
-inside of the n-th capture group. This check is the regex equivalent of
+It is probably useful only when combined with C<(?{})> or C<(??{})>.
 
-  if ((caller(0))[3] eq 'subname') { .. }
+=item C<(*ACCEPT)>
+X<(*ACCEPT)>
 
-In other words, it does not check the full recursion stack.
+B<WARNING:> This feature is highly experimental. It is not recommended
+for production code.
 
-=item (R&NAME)
+This pattern matches nothing and causes the end of successful matching at
+the point at which the C<(*ACCEPT)> pattern was encountered, regardless of
+whether there is actually more to match in the string. When inside of a
+nested pattern, such as recursion or a dynamically generated subbpattern
+via C<(??{})>, only the innermost pattern is ended immediately.
 
-Similar to C<(R1)>, this predicate checks to see if we're executing
-directly inside of the leftmost group with a given name (this is the same
-logic used by C<(?&NAME)> to disambiguate). It does not check the full
-stack, but only the name of the innermost active recursion.
+If the C<(*ACCEPT)> is inside of capturing buffers then the buffers are
+marked as ended at the point at which the C<(*ACCEPT)> was encountered.
+For instance:
 
-=item (DEFINE)
+  'AB' =~ /(A (A|B(*ACCEPT)|C) D)(E)/x;
 
-In this case, the yes-pattern is never directly executed, and no
-no-pattern is allowed. Similar in spirit to C<(?{0})> but more efficient.
-See below for details.
+will match, and C<$1> will be C<AB> and C<$2> will be C<B>, C<$3> will not
+be set. If another branch in the inner parens were matched, such as in the
+string 'ACDE', then the C<D> and C<E> would have to be matched as well.
 
 =back
 
-For example:
-
-    m{ ( \( )?
-       [^()]+
-       (?(1) \) )
-     }x
-
-matches a chunk of non-parentheses, possibly included in parentheses
-themselves.
-
-A special form is the C<(DEFINE)> predicate, which never executes directly
-its yes-pattern, and does not allow a no-pattern. This allows to define
-subpatterns which will be executed only by using the recursion mechanism.
-This way, you can define a set of regular expression rules that can be
-bundled into any pattern you choose.
-
-It is recommended that for this usage you put the DEFINE block at the
-end of the pattern, and that you name any subpatterns defined within it.
-
-Also, it's worth noting that patterns defined this way probably will
-not be as efficient, as the optimiser is not very clever about
-handling them. YMMV.
-
-An example of how this might be used is as follows:
-
-  /(?<NAME>(&NAME_PAT))(?<ADDR>(&ADDRESS_PAT))
-   (?(DEFINE)
-     (<NAME_PAT>....)
-     (<ADRESS_PAT>....)
-   )/x
-
-Note that capture buffers matched inside of recursion are not accessible
-after the recursion returns, so the extra layer of capturing buffers are
-necessary. Thus C<$+{NAME_PAT}> would not be defined even though
-C<$+{NAME}> would be.
-
 =back
 
 =head2 Backtracking
diff --git a/proto.h b/proto.h
index b141466..531d583 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -3697,7 +3697,7 @@ STATIC I32        S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                        __attribute__nonnull__(pTHX_1)
                        __attribute__nonnull__(pTHX_2);
 
-STATIC I32     S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
+STATIC I32     S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
                        __attribute__warn_unused_result__
                        __attribute__nonnull__(pTHX_1)
                        __attribute__nonnull__(pTHX_2);
index 80d7eec..3ce84c1 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -114,6 +114,7 @@ typedef struct RExC_state_t {
     U32                seen;
     I32                size;                   /* Code size. */
     I32                npar;                   /* () count. */
+    I32                nestroot;               /* root parens we are in - used by accept */
     I32                extralen;
     I32                seen_zerolen;
     I32                seen_evals;
@@ -152,6 +153,7 @@ typedef struct RExC_state_t {
 #define RExC_seen      (pRExC_state->seen)
 #define RExC_size      (pRExC_state->size)
 #define RExC_npar      (pRExC_state->npar)
+#define RExC_nestroot   (pRExC_state->nestroot)
 #define RExC_extralen  (pRExC_state->extralen)
 #define RExC_seen_zerolen      (pRExC_state->seen_zerolen)
 #define RExC_seen_evals        (pRExC_state->seen_evals)
@@ -335,7 +337,7 @@ static const scan_data_t zero_scan_data =
 #define SCF_WHILEM_VISITED_POS 0x2000
 
 #define SCF_TRIE_RESTUDY        0x4000 /* Do restudy? */
-
+#define SCF_SEEN_ACCEPT         0x8000 
 
 #define UTF (RExC_utf8 != 0)
 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
@@ -2311,6 +2313,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
     scan_data_t data_fake;
     SV *re_trie_maxbuff = NULL;
     regnode *first_non_open = scan;
+    I32 stopmin = I32_MAX;
     GET_RE_DEBUG_FLAGS_DECL;
 #ifdef DEBUGGING
     StructCopy(&zero_scan_data, &data_fake, scan_data_t);
@@ -2411,6 +2414,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    scan = next;
                    if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
                        pars++;
+                   if (data_fake.flags & SCF_SEEN_ACCEPT) {
+                       if ( stopmin > minnext) 
+                           stopmin = min + min1;
+                       flags &= ~SCF_DO_SUBSTR;
+                       if (data)
+                           data->flags |= SCF_SEEN_ACCEPT;
+                   }
                    if (data) {
                        if (data_fake.flags & SF_HAS_EVAL)
                            data->flags |= SF_HAS_EVAL;
@@ -3580,11 +3590,16 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                if (data)
                    data->flags |= SF_HAS_EVAL;
        }
-       else if ( OP(scan)==OPFAIL ) {
+       else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
            if (flags & SCF_DO_SUBSTR) {
                scan_commit(pRExC_state,data,minlenp);
                flags &= ~SCF_DO_SUBSTR;
            }
+           if (data && OP(scan)==ACCEPT) {
+               data->flags |= SCF_SEEN_ACCEPT;
+               if (stopmin > min)
+                   stopmin = min;
+           }
        }
        else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
        {
@@ -3666,7 +3681,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     
                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
                         pars++;
-                    
+                    if (data_fake.flags & SCF_SEEN_ACCEPT) {
+                        if ( stopmin > min + min1) 
+                           stopmin = min + min1;
+                       flags &= ~SCF_DO_SUBSTR;
+                       if (data)
+                           data->flags |= SCF_SEEN_ACCEPT;
+                   }
                     if (data) {
                         if (data_fake.flags & SF_HAS_EVAL)
                             data->flags |= SF_HAS_EVAL;
@@ -3758,7 +3779,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
     
     DEBUG_STUDYDATA(data,depth);
     
-    return min;
+    return min < stopmin ? min : stopmin;
 }
 
 STATIC I32
@@ -3915,6 +3936,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     RExC_end = xend;
     RExC_naughty = 0;
     RExC_npar = 1;
+    RExC_nestroot = 0;
     RExC_size = 0L;
     RExC_emit = &PL_regdummy;
     RExC_whilem_seen = 0;
@@ -3952,6 +3974,11 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     if (RExC_whilem_seen > 15)
        RExC_whilem_seen = 15;
 
+#ifdef DEBUGGING
+    /* Make room for a sentinel value at the end of the program */
+    RExC_size++;
+#endif
+
     /* Allocate space and zero-initialize. Note, the two step process 
        of zeroing when in debug mode, thus anything assigned has to 
        happen after that */
@@ -4008,6 +4035,11 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     RExC_npar = 1;
     RExC_emit_start = r->program;
     RExC_emit = r->program;
+#ifdef DEBUGGING
+    /* put a sentinal on the end of the program so we can check for
+       overwrites */
+    r->program[RExC_size].type = 255;
+#endif
     /* Store the count of eval-groups for security checks: */
     RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
@@ -4415,6 +4447,8 @@ reStudy:
        r->reganch |= ROPT_EVAL_SEEN;
     if (RExC_seen & REG_SEEN_CANY)
        r->reganch |= ROPT_CANY_SEEN;
+    if (RExC_seen & REG_SEEN_VERBARG)
+       r->reganch |= ROPT_VERBARG_SEEN;
     if (RExC_paren_names)
         r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
     else
@@ -4605,6 +4639,10 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
 #endif
 
+/* this idea is borrowed from STR_WITH_LEN in handy.h */
+#define CHECK_WORD(s,v,l)  \
+    (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
+
 STATIC regnode *
 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
@@ -4641,6 +4679,98 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 
     /* Make an OPEN node, if parenthesized. */
     if (paren) {
+        if ( *RExC_parse == '*') { /* (*VERB:ARG) */
+           char *start_verb = RExC_parse;
+           STRLEN verb_len = 0;
+           char *start_arg = NULL;
+           unsigned char op = 0;
+           int argok = 1;
+           int internal_argval = 0; /* internal_argval is only useful if !argok */
+           while ( *RExC_parse && *RExC_parse != ')' ) {
+               if ( *RExC_parse == ':' ) {
+                   start_arg = RExC_parse + 1;
+                   break;
+               }
+               RExC_parse++;
+           }
+           ++start_verb;
+           verb_len = RExC_parse - start_verb;
+           if ( start_arg ) {
+               RExC_parse++;
+               while ( *RExC_parse && *RExC_parse != ')' ) 
+                   RExC_parse++;
+               if ( *RExC_parse != ')' ) 
+                   vFAIL("Unterminated verb pattern argument");
+               if ( RExC_parse == start_arg )
+                   start_arg = NULL;
+           } else {
+               if ( *RExC_parse != ')' )
+                   vFAIL("Unterminated verb pattern");
+           }
+           switch ( *start_verb ) {
+            case 'A':  /* (*ACCEPT) */
+                if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
+                   op = ACCEPT;
+                   internal_argval = RExC_nestroot;
+               }
+               break;
+            case 'C':  /* (*COMMIT) */
+                if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
+                    op = COMMIT;
+                else if ( CHECK_WORD("CUT",start_verb,verb_len) )
+                    op = CUT;
+                break;
+            case 'F':  /* (*FAIL) */
+                if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
+                   op = OPFAIL;
+                   argok = 0;
+               }
+               break;
+           case 'M':
+               if ( CHECK_WORD("MARK",start_verb,verb_len) )
+                    op = MARKPOINT;
+                break;
+            case 'N':  /* (*NOMATCH) */
+                if ( CHECK_WORD("NOMATCH",start_verb,verb_len) )
+                    op = NOMATCH;
+                break;
+           }
+           if ( ! op ) {
+               RExC_parse++;
+               vFAIL3("Unknown verb pattern '%.*s'",
+                   verb_len, start_verb);
+           }
+           if ( argok ) {
+                if ( start_arg && internal_argval ) {
+                   vFAIL3("Verb pattern '%.*s' may not have an argument",
+                       verb_len, start_verb); 
+               } else if ( argok < 0 && !start_arg ) {
+                    vFAIL3("Verb pattern '%.*s' has a mandatory argument",
+                       verb_len, start_verb);    
+               } else {
+                   ret = reganode(pRExC_state, op, internal_argval);
+                   if ( ! internal_argval && ! SIZE_ONLY ) {
+                        if (start_arg) {
+                            SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
+                            ARG(ret) = add_data( pRExC_state, 1, "S" );
+                            RExC_rx->data->data[ARG(ret)]=(void*)sv;
+                            ret->flags = 0;
+                        } else {
+                            ret->flags = 1; 
+                        }
+                    }              
+               }
+               if (!internal_argval)
+                   RExC_seen |= REG_SEEN_VERBARG;
+           } else if ( start_arg ) {
+               vFAIL3("Verb pattern '%.*s' may not have an argument",
+                       verb_len, start_verb);    
+           } else {
+               ret = reg_node(pRExC_state, op);
+           }
+           nextchar(pRExC_state);
+           return ret;
+        } else 
        if (*RExC_parse == '?') { /* (?...) */
            U32 posflags = 0, negflags = 0;
            U32 *flagsp = &posflags;
@@ -4711,62 +4841,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                RExC_parse++;
            case '=':           /* (?=...) */
            case '!':           /* (?!...) */
-               if (*RExC_parse == ')')
-                   goto do_op_fail;
                RExC_seen_zerolen++;
+               if (*RExC_parse == ')') {
+                   ret=reg_node(pRExC_state, OPFAIL);
+                   nextchar(pRExC_state);
+                   return ret;
+               }
            case ':':           /* (?:...) */
            case '>':           /* (?>...) */
                break;
-            case 'C':           /* (?CUT) and (?COMMIT) */
-               if (RExC_parse[0] == 'O' &&
-                   RExC_parse[1] == 'M' &&
-                   RExC_parse[2] == 'M' &&
-                   RExC_parse[3] == 'I' &&
-                   RExC_parse[4] == 'T' &&
-                   RExC_parse[5] == ')')
-               {
-                   RExC_parse+=5;
-                   ret = reg_node(pRExC_state, COMMIT);
-                } else if (
-                    RExC_parse[0] == 'U' &&
-                    RExC_parse[1] == 'T' &&
-                    RExC_parse[2] == ')') 
-                {
-                    RExC_parse+=2;
-                    ret = reg_node(pRExC_state, CUT);
-               } else {
-                   vFAIL("Sequence (?C... not terminated");
-               }
-               nextchar(pRExC_state);
-               return ret;
-               break;
-            case 'E':            /* (?ERROR) */
-                if (RExC_parse[0] == 'R' &&
-                    RExC_parse[1] == 'R' &&
-                    RExC_parse[2] == 'O' &&
-                    RExC_parse[3] == 'R' &&
-                    RExC_parse[4] == ')') 
-                {
-                    RExC_parse+=4;
-                    ret = reg_node(pRExC_state, OPERROR);
-                } else {
-                    vFAIL("Sequence (?E... not terminated"); 
-                }
-               nextchar(pRExC_state);
-               return ret;
-                break;                
-            case 'F':
-                if (RExC_parse[0] == 'A' &&
-                    RExC_parse[1] == 'I' &&
-                    RExC_parse[2] == 'L')
-                    RExC_parse+=3;
-                if (*RExC_parse != ')')
-                   vFAIL("Sequence (?FAIL) or (?F) not terminated");
-             do_op_fail:
-               ret = reg_node(pRExC_state, OPFAIL);
-               nextchar(pRExC_state);
-               return ret;
-               break;
            case '$':           /* (?$...) */
            case '@':           /* (?@...) */
                vFAIL2("Sequence (?%c...) not implemented", (int)paren);
@@ -5098,12 +5181,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
          capturing_parens:
            parno = RExC_npar;
            RExC_npar++;
+           
            ret = reganode(pRExC_state, OPEN, parno);
-           if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
-               DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
+           if (!SIZE_ONLY ){
+               if (!RExC_nestroot) 
+                   RExC_nestroot = parno;
+               if (RExC_seen & REG_SEEN_RECURSE) {
+                   DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
                        "Setting open paren #%"IVdf" to %d\n", 
                        (IV)parno, REG_NODE_NUM(ret)));
-               RExC_open_parens[parno-1]= ret;
+                   RExC_open_parens[parno-1]= ret;
+               }
            }
             Set_Node_Length(ret, 1); /* MJD */
             Set_Node_Offset(ret, RExC_parse); /* MJD */
@@ -5175,6 +5263,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                        "Setting close paren #%"IVdf" to %d\n", 
                        (IV)parno, REG_NODE_NUM(ender)));
                RExC_close_parens[parno-1]= ender;
+               if (RExC_nestroot == parno) 
+                   RExC_nestroot = 0;
            }       
             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
             Set_Node_Length(ender,1); /* MJD */
@@ -7505,6 +7595,11 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
        RExC_size += 1;
        return(ret);
     }
+#ifdef DEBUGGING
+    if (OP(RExC_emit) == 255)
+        Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
+            reg_name[op], OP(RExC_emit));
+#endif  
     NODE_ALIGN_FILL(ret);
     ptr = ret;
     FILL_ADVANCE_NODE(ptr, op);
@@ -7521,7 +7616,6 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
     }
 
     RExC_emit = ptr;
-
     return(ret);
 }
 
@@ -7555,7 +7649,10 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
        */
        return(ret);
     }
-
+#ifdef DEBUGGING
+    if (OP(RExC_emit) == 255)
+        Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
+#endif 
     NODE_ALIGN_FILL(ret);
     ptr = ret;
     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
@@ -7573,7 +7670,6 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
     }
             
     RExC_emit = ptr;
-
     return(ret);
 }
 
@@ -8006,11 +8102,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
     }
     else if (k == WHILEM && o->flags)                  /* Ordinal/of */
        Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
-    else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP) 
+    else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT
        Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
     else if (k == GOSUB) 
        Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
-    else if (k == LOGICAL)
+    else if (k == VERB) {
+        if (!o->flags) 
+            Perl_sv_catpvf(aTHX_ sv, ":%"SVf, 
+                (SV*)prog->data->data[ ARG( o ) ]);
+    } else if (k == LOGICAL)
        Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
     else if (k == ANYOF) {
        int i, rangestart = -1;
@@ -8401,7 +8501,7 @@ Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
        for (i = 0; i < count; i++) {
            d->what[i] = r->data->what[i];
            switch (d->what[i]) {
-               /* legal options are one of: sfpont
+               /* legal options are one of: sSfpont
                   see also regcomp.h and pregfree() */
            case 's':
            case 'S':
index 360e2a9..2774a27 100644 (file)
--- a/regcomp.h
+++ b/regcomp.h
@@ -351,6 +351,7 @@ struct regnode_charclass_class {    /* has [[:blah:]] classes */
 #define REG_SEEN_SANY          REG_SEEN_CANY /* src bckwrd cmpt */
 #define REG_SEEN_RECURSE        0x00000020
 #define REG_TOP_LEVEL_BRANCHES  0x00000040
+#define REG_SEEN_VERBARG        0x00000080
 
 START_EXTERN_C
 
index e673313..074af13 100644 (file)
@@ -169,10 +169,16 @@ INSUBP            INSUBP,    num 1        Whether we are in a specific recurse.
 DEFINEP                DEFINEP,   none 1       Never execute directly.               
 
 #*Bactracking 
-OPFAIL         OPFAIL, none            Same as (?!)
-COMMIT         COMMIT, none            Pattern fails if backtracking through this 
-CUT            COMMIT, none            ... and restarts at the cursor point
-OPERROR                OPERROR,none            Pattern fails outright if backtracking through this
+ENDLIKE                ENDLIKE,   none         Used only for the type field of verbs
+OPFAIL         ENDLIKE,   none         Same as (?!)
+ACCEPT         ENDLIKE,   parno 1      Accepts the current matched string.
+VERB           VERB,      no-sv 1      Used only for the type field of verbs
+NOMATCH                VERB,      no-sv 1      Pattern fails at this startpoint if no-backtracking through this 
+MARKPOINT      VERB,      no-sv 1      Push the current location for rollback by cut.
+CUT            VERB,      no-sv 1      On failure cut the string at the mark.
+COMMIT         VERB,      no-sv 1      Pattern fails outright if backtracking through this
+
+
 
 # NEW STUFF ABOVE THIS LINE -- Please update counts below. 
 
@@ -210,4 +216,5 @@ CURLYM      A,B:FAIL
 IFMATCH        A:FAIL  
 CURLY          B_min_known,B_min,B_max:FAIL    
 COMMIT         next:FAIL
-
+MARKPOINT      next:FAIL
+CUT            next:FAIL
index f7fd347..8e0aabd 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -2571,11 +2571,17 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
     int nochange_depth = 0; /* depth of GOSUB recursion with nochange*/
     regmatch_state *yes_state = NULL; /* state to pop to on success of
                                                            subpattern */
+    /* mark_state piggy backs on the yes_state logic so that when we unwind 
+       the stack on success we can update the mark_state as we go */
+    regmatch_state *mark_state = NULL; /* last mark state we have seen */
     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
     U32 state_num;
     bool no_final = 0;
-
+    char *startpoint = PL_reginput;
+    SV *popmark = NULL;
+    SV *sv_commit = NULL;
+    int lastopen = 0;
     /* these three flags are set by various ops to signal information to
      * the very next op. They have a useful lifetime of exactly one loop
      * iteration, and are not preserved or restored by state pushes/pops
@@ -3606,6 +3612,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
            PL_reg_start_tmp[n] = locinput;
            if (n > PL_regsize)
                PL_regsize = n;
+            lastopen = n;
            break;
        case CLOSE:
            n = ARG(scan);  /* which paren pair */
@@ -3620,6 +3627,32 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                goto fake_end;
            }    
            break;
+        case ACCEPT:
+            if (ARG(scan)){
+                regnode *cursor;
+                for (cursor=scan;
+                     cursor && OP(cursor)!=END; 
+                     cursor=regnext(cursor)) 
+                {
+                    if ( OP(cursor)==CLOSE ){
+                        n = ARG(cursor);
+                        if ( n <= lastopen ) {
+                            PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
+                            PL_regendp[n] = locinput - PL_bostr;
+                            /*if (n > PL_regsize)
+                            PL_regsize = n;*/
+                            if (n > (I32)*PL_reglastparen)
+                                *PL_reglastparen = n;
+                            *PL_reglastcloseparen = n;
+                            if ( n == ARG(scan) || (cur_eval && 
+                                cur_eval->u.eval.close_paren == (U32)n))
+                                break;
+                        }
+                    }
+                }
+            }
+           goto fake_end;
+           /*NOTREACHED*/          
        case GROUPP:
            n = ARG(scan);  /* which paren pair */
            sw = (bool)((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
@@ -4302,7 +4335,7 @@ NULL
            PL_reginput = locinput;
            if (minmod) {
                minmod = 0;
-               if (ST.min && regrepeat(rex, ST.A, ST.min) < ST.min)
+               if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
                    sayNO;
                ST.count = ST.min;
                locinput = PL_reginput;
@@ -4335,7 +4368,7 @@ NULL
 
            }
            else {
-               ST.count = regrepeat(rex, ST.A, ST.max);
+               ST.count = regrepeat(rex, ST.A, ST.max, depth);
                locinput = PL_reginput;
                if (ST.count < ST.min)
                    sayNO;
@@ -4421,7 +4454,7 @@ NULL
                /* PL_reginput == oldloc now */
                if (n) {
                    ST.count += n;
-                   if (regrepeat(rex, ST.A, n) < n)
+                   if (regrepeat(rex, ST.A, n, depth) < n)
                        sayNO;
                }
                PL_reginput = locinput;
@@ -4443,7 +4476,7 @@ NULL
            REGCP_UNWIND(ST.cp);
            /* failed -- move forward one */
            PL_reginput = locinput;
-           if (regrepeat(rex, ST.A, 1)) {
+           if (regrepeat(rex, ST.A, 1, depth)) {
                ST.count++;
                locinput = PL_reginput;
                if (ST.count <= ST.max || (ST.max == REG_INFTY &&
@@ -4622,17 +4655,13 @@ NULL
            if (next == scan)
                next = NULL;
            break;
-       case OPERROR:
-           reginfo->cutpoint=PL_regeol;
-           goto do_commit;
-           /* NOTREACHED */
-       case CUT:
-           if ( locinput > reginfo->bol )
-               reginfo->cutpoint = HOPBACKc(locinput, 1);
-           /* FALLTHROUGH */       
        case COMMIT:
-         do_commit:
+           reginfo->cutpoint = PL_regeol;
+           /* FALLTHROUGH */
+       case NOMATCH:
            PL_reginput = locinput;
+           if (!scan->flags)
+               sv_commit = (SV*)rex->data->data[ ARG( scan ) ];
            PUSH_STATE_GOTO(COMMIT_next,next);
            /* NOTREACHED */
        case COMMIT_next_fail:
@@ -4640,6 +4669,71 @@ NULL
            /* FALLTHROUGH */       
        case OPFAIL:
            sayNO;
+           /* NOTREACHED */
+
+#define ST st->u.mark
+        case MARKPOINT:
+            ST.prev_mark = mark_state;
+            ST.mark_name = scan->flags ? &PL_sv_yes : 
+                (SV*)rex->data->data[ ARG( scan ) ];
+            mark_state = st;
+            ST.mark_loc = PL_reginput = locinput;
+            PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
+            /* NOTREACHED */
+        case MARKPOINT_next:
+            mark_state = ST.prev_mark;
+            sayYES;
+            /* NOTREACHED */
+        case MARKPOINT_next_fail:
+            if (popmark && ( popmark == &PL_sv_yes || 
+                 (ST.mark_name != &PL_sv_yes && 
+                  sv_eq(ST.mark_name,popmark)))) 
+            {
+                if (ST.mark_loc > startpoint)
+                   reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
+                popmark = NULL; /* we found our mark */
+                sv_commit = ST.mark_name;
+
+                DEBUG_EXECUTE_r({
+                    if (sv_commit != &PL_sv_yes) 
+                       PerlIO_printf(Perl_debug_log,
+                           "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
+                           REPORT_CODE_OFF+depth*2, "", 
+                           PL_colors[4], sv_commit, PL_colors[5]);
+                    else
+                        PerlIO_printf(Perl_debug_log,
+                           "%*s  %ssetting cutpoint to mark...%s\n",
+                           REPORT_CODE_OFF+depth*2, "", 
+                           PL_colors[4], PL_colors[5]);
+               });
+            }
+            mark_state = ST.prev_mark;
+            sayNO;
+            /* NOTREACHED */
+        case CUT:
+            ST.mark_name = scan->flags ? &PL_sv_yes : 
+                    (SV*)rex->data->data[ ARG( scan ) ];
+            if (mark_state) {
+                ST.mark_loc = NULL;
+            } else {
+                ST.mark_loc = locinput;
+            }    
+            PL_reginput = locinput;
+           PUSH_STATE_GOTO(CUT_next,next);
+           /* NOTREACHED */
+       case CUT_next_fail:
+           if (ST.mark_loc) {
+               if (ST.mark_loc > startpoint)
+                   reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
+               sv_commit = ST.mark_name;
+            } else {
+                popmark = ST.mark_name;           
+            }
+            no_final = 1; 
+            sayNO;
+            /* NOTREACHED */
+#undef ST
+
        default:
            PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
                          PTR2UV(scan), OP(scan));
@@ -4716,13 +4810,13 @@ yes:
                PL_regmatch_slab = PL_regmatch_slab->prev;
                st = SLAB_LAST(PL_regmatch_slab);
            }
-            DEBUG_STATE_r({
+           DEBUG_STATE_r({
                if (no_final) {
                    DEBUG_STATE_pp("pop (no final)");        
                } else {
                    DEBUG_STATE_pp("pop (yes)");
                }
-               }); 
+           });
            depth--;
        }
 #else
@@ -4789,7 +4883,14 @@ no_silent:
     result = 0;
 
   final_exit:
-
+    if (rex->reganch & ROPT_VERBARG_SEEN) {
+        SV *sv = get_sv("REGERROR", 1);
+        if (result) 
+            sv_commit = &PL_sv_no;
+        else if (!sv_commit) 
+            sv_commit = &PL_sv_yes;
+        sv_setsv(sv, sv_commit);
+    }
     /* restore original high-water mark */
     PL_regmatch_slab  = orig_slab;
     PL_regmatch_state = orig_state;
@@ -4817,7 +4918,7 @@ no_silent:
  * rather than incrementing count on every character.  [Er, except utf8.]]
  */
 STATIC I32
-S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
+S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
 {
     dVAR;
     register char *scan;
@@ -5048,7 +5149,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max)
            regprop(prog, prop, p);
            PerlIO_printf(Perl_debug_log,
                        "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
-                       REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
+                       REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
        });
     });
 
index f13a5c5..9b3ce79 100644 (file)
--- a/regexp.h
+++ b/regexp.h
@@ -97,7 +97,6 @@ typedef struct regexp_engine {
 #define ROPT_CANY_SEEN         0x00000800
 #define ROPT_SANY_SEEN         ROPT_CANY_SEEN /* src bckwrd cmpt */
 #define ROPT_GPOS_CHECK         (ROPT_GPOS_SEEN|ROPT_ANCH_GPOS)
-#define ROPT_RECURSE_SEEN       0x00001000
 
 /* 0xf800 of reganch is used by PMf_COMPILETIME */
 
@@ -106,6 +105,8 @@ typedef struct regexp_engine {
 #define ROPT_COPY_DONE         0x00040000      /* subbeg is a copy of the string */
 #define ROPT_TAINTED_SEEN      0x00080000
 #define ROPT_MATCH_UTF8                0x10000000 /* subbeg is utf-8 */
+#define ROPT_RECURSE_SEEN       0x20000000
+#define ROPT_VERBARG_SEEN       0x40000000
 
 #define RE_USE_INTUIT_NOML     0x00100000 /* Best to intuit before matching */
 #define RE_USE_INTUIT_ML       0x00200000
@@ -311,6 +312,14 @@ typedef struct regmatch_state {
            I32 logical;        /* saved copy of 'logical' var */
            regnode  *me; /* the IFMATCH/SUSPEND/UNLESSM node  */
        } ifmatch; /* and SUSPEND/UNLESSM */
+       
+       struct {
+           /* this first element must match u.yes */
+           struct regmatch_state *prev_yes_state;
+           struct regmatch_state *prev_mark;
+           SV* mark_name;
+           char *mark_loc;
+       } mark;
     } u;
 } regmatch_state;
 
index 010b943..005e409 100644 (file)
@@ -6,8 +6,8 @@
 
 /* Regops and State definitions */
 
-#define REGNODE_MAX            78
-#define REGMATCH_STATE_MAX     110
+#define REGNODE_MAX            82
+#define REGMATCH_STATE_MAX     118
 
 #define        END                     0       /* 0000 End of program. */
 #define        SUCCEED                 1       /* 0x01 Return from a subroutine, basically. */
 #define        NGROUPP                 70      /* 0x46 Whether the group matched. */
 #define        INSUBP                  71      /* 0x47 Whether we are in a specific recurse. */
 #define        DEFINEP                 72      /* 0x48 Never execute directly. */
-#define        OPFAIL                  73      /* 0x49 Same as (?!) */
-#define        COMMIT                  74      /* 0x4a Pattern fails if backtracking through this */
-#define        CUT                     75      /* 0x4b ... and restarts at the cursor point */
-#define        OPERROR                 76      /* 0x4c Pattern fails outright if backtracking through this */
-#define        OPTIMIZED               77      /* 0x4d Placeholder for dump. */
-#define        PSEUDO                  78      /* 0x4e Pseudo opcode for internal use. */
+#define        ENDLIKE                 73      /* 0x49 Used only for the type field of verbs */
+#define        OPFAIL                  74      /* 0x4a Same as (?!) */
+#define        ACCEPT                  75      /* 0x4b Accepts the current matched string. */
+#define        VERB                    76      /* 0x4c    no-sv 1      Used only for the type field of verbs */
+#define        NOMATCH                 77      /* 0x4d Pattern fails at this startpoint if no-backtracking through this */
+#define        MARKPOINT               78      /* 0x4e Push the current location for rollback by cut. */
+#define        CUT                     79      /* 0x4f On failure cut the string at the mark. */
+#define        COMMIT                  80      /* 0x50 Pattern fails outright if backtracking through this */
+#define        OPTIMIZED               81      /* 0x51 Placeholder for dump. */
+#define        PSEUDO                  82      /* 0x52 Pseudo opcode for internal use. */
        /* ------------ States ------------- */
 #define        TRIE_next               (REGNODE_MAX + 1)       /* state for TRIE */
 #define        TRIE_next_fail          (REGNODE_MAX + 2)       /* state for TRIE */
 #define        CURLY_B_max_fail        (REGNODE_MAX + 30)      /* state for CURLY */
 #define        COMMIT_next             (REGNODE_MAX + 31)      /* state for COMMIT */
 #define        COMMIT_next_fail        (REGNODE_MAX + 32)      /* state for COMMIT */
+#define        MARKPOINT_next          (REGNODE_MAX + 33)      /* state for MARKPOINT */
+#define        MARKPOINT_next_fail     (REGNODE_MAX + 34)      /* state for MARKPOINT */
+#define        CUT_next                (REGNODE_MAX + 35)      /* state for CUT */
+#define        CUT_next_fail           (REGNODE_MAX + 36)      /* state for CUT */
 
 /* PL_regkind[] What type of regop or state is this. */
 
 EXTCONST U8 PL_regkind[];
 #else
 EXTCONST U8 PL_regkind[] = {
-       END,            /* END                    */
-       END,            /* SUCCEED                */
-       BOL,            /* BOL                    */
-       BOL,            /* MBOL                   */
-       BOL,            /* SBOL                   */
-       EOL,            /* EOS                    */
-       EOL,            /* EOL                    */
-       EOL,            /* MEOL                   */
-       EOL,            /* SEOL                   */
-       BOUND,          /* BOUND                  */
-       BOUND,          /* BOUNDL                 */
-       NBOUND,         /* NBOUND                 */
-       NBOUND,         /* NBOUNDL                */
-       GPOS,           /* GPOS                   */
-       REG_ANY,        /* REG_ANY                */
-       REG_ANY,        /* SANY                   */
-       REG_ANY,        /* CANY                   */
-       ANYOF,          /* ANYOF                  */
-       ALNUM,          /* ALNUM                  */
-       ALNUM,          /* ALNUML                 */
-       NALNUM,         /* NALNUM                 */
-       NALNUM,         /* NALNUML                */
-       SPACE,          /* SPACE                  */
-       SPACE,          /* SPACEL                 */
-       NSPACE,         /* NSPACE                 */
-       NSPACE,         /* NSPACEL                */
-       DIGIT,          /* DIGIT                  */
-       DIGIT,          /* DIGITL                 */
-       NDIGIT,         /* NDIGIT                 */
-       NDIGIT,         /* NDIGITL                */
-       CLUMP,          /* CLUMP                  */
-       BRANCH,         /* BRANCH                 */
-       BACK,           /* BACK                   */
-       EXACT,          /* EXACT                  */
-       EXACT,          /* EXACTF                 */
-       EXACT,          /* EXACTFL                */
-       NOTHING,        /* NOTHING                */
-       NOTHING,        /* TAIL                   */
-       STAR,           /* STAR                   */
-       PLUS,           /* PLUS                   */
-       CURLY,          /* CURLY                  */
-       CURLY,          /* CURLYN                 */
-       CURLY,          /* CURLYM                 */
-       CURLY,          /* CURLYX                 */
-       WHILEM,         /* WHILEM                 */
-       OPEN,           /* OPEN                   */
-       CLOSE,          /* CLOSE                  */
-       REF,            /* REF                    */
-       REF,            /* REFF                   */
-       REF,            /* REFFL                  */
-       BRANCHJ,        /* IFMATCH                */
-       BRANCHJ,        /* UNLESSM                */
-       BRANCHJ,        /* SUSPEND                */
-       BRANCHJ,        /* IFTHEN                 */
-       GROUPP,         /* GROUPP                 */
-       LONGJMP,        /* LONGJMP                */
-       BRANCHJ,        /* BRANCHJ                */
-       EVAL,           /* EVAL                   */
-       MINMOD,         /* MINMOD                 */
-       LOGICAL,        /* LOGICAL                */
-       BRANCHJ,        /* RENUM                  */
-       TRIE,           /* TRIE                   */
-       TRIE,           /* TRIEC                  */
-       TRIE,           /* AHOCORASICK            */
-       TRIE,           /* AHOCORASICKC           */
-       GOSUB,          /* GOSUB                  */
-       GOSTART,        /* GOSTART                */
-       NREF,           /* NREF                   */
-       NREF,           /* NREFF                  */
-       NREF,           /* NREFFL                 */
-       NGROUPP,        /* NGROUPP                */
-       INSUBP,         /* INSUBP                 */
-       DEFINEP,        /* DEFINEP                */
-       OPFAIL,         /* OPFAIL                 */
-       COMMIT,         /* COMMIT                 */
-       COMMIT,         /* CUT                    */
-       OPERROR,        /* OPERROR                */
-       NOTHING,        /* OPTIMIZED              */
-       PSEUDO,         /* PSEUDO                 */
+       END,            /* END                    */
+       END,            /* SUCCEED                */
+       BOL,            /* BOL                    */
+       BOL,            /* MBOL                   */
+       BOL,            /* SBOL                   */
+       EOL,            /* EOS                    */
+       EOL,            /* EOL                    */
+       EOL,            /* MEOL                   */
+       EOL,            /* SEOL                   */
+       BOUND,          /* BOUND                  */
+       BOUND,          /* BOUNDL                 */
+       NBOUND,         /* NBOUND                 */
+       NBOUND,         /* NBOUNDL                */
+       GPOS,           /* GPOS                   */
+       REG_ANY,        /* REG_ANY                */
+       REG_ANY,        /* SANY                   */
+       REG_ANY,        /* CANY                   */
+       ANYOF,          /* ANYOF                  */
+       ALNUM,          /* ALNUM                  */
+       ALNUM,          /* ALNUML                 */
+       NALNUM,         /* NALNUM                 */
+       NALNUM,         /* NALNUML                */
+       SPACE,          /* SPACE                  */
+       SPACE,          /* SPACEL                 */
+       NSPACE,         /* NSPACE                 */
+       NSPACE,         /* NSPACEL                */
+       DIGIT,          /* DIGIT                  */
+       DIGIT,          /* DIGITL                 */
+       NDIGIT,         /* NDIGIT                 */
+       NDIGIT,         /* NDIGITL                */
+       CLUMP,          /* CLUMP                  */
+       BRANCH,         /* BRANCH                 */
+       BACK,           /* BACK                   */
+       EXACT,          /* EXACT                  */
+       EXACT,          /* EXACTF                 */
+       EXACT,          /* EXACTFL                */
+       NOTHING,        /* NOTHING                */
+       NOTHING,        /* TAIL                   */
+       STAR,           /* STAR                   */
+       PLUS,           /* PLUS                   */
+       CURLY,          /* CURLY                  */
+       CURLY,          /* CURLYN                 */
+       CURLY,          /* CURLYM                 */
+       CURLY,          /* CURLYX                 */
+       WHILEM,         /* WHILEM                 */
+       OPEN,           /* OPEN                   */
+       CLOSE,          /* CLOSE                  */
+       REF,            /* REF                    */
+       REF,            /* REFF                   */
+       REF,            /* REFFL                  */
+       BRANCHJ,        /* IFMATCH                */
+       BRANCHJ,        /* UNLESSM                */
+       BRANCHJ,        /* SUSPEND                */
+       BRANCHJ,        /* IFTHEN                 */
+       GROUPP,         /* GROUPP                 */
+       LONGJMP,        /* LONGJMP                */
+       BRANCHJ,        /* BRANCHJ                */
+       EVAL,           /* EVAL                   */
+       MINMOD,         /* MINMOD                 */
+       LOGICAL,        /* LOGICAL                */
+       BRANCHJ,        /* RENUM                  */
+       TRIE,           /* TRIE                   */
+       TRIE,           /* TRIEC                  */
+       TRIE,           /* AHOCORASICK            */
+       TRIE,           /* AHOCORASICKC           */
+       GOSUB,          /* GOSUB                  */
+       GOSTART,        /* GOSTART                */
+       NREF,           /* NREF                   */
+       NREF,           /* NREFF                  */
+       NREF,           /* NREFFL                 */
+       NGROUPP,        /* NGROUPP                */
+       INSUBP,         /* INSUBP                 */
+       DEFINEP,        /* DEFINEP                */
+       ENDLIKE,        /* ENDLIKE                */
+       ENDLIKE,        /* OPFAIL                 */
+       ENDLIKE,        /* ACCEPT                 */
+       VERB,           /* VERB                   */
+       VERB,           /* NOMATCH                */
+       VERB,           /* MARKPOINT              */
+       VERB,           /* CUT                    */
+       VERB,           /* COMMIT                 */
+       NOTHING,        /* OPTIMIZED              */
+       PSEUDO,         /* PSEUDO                 */
        /* ------------ States ------------- */
-       TRIE,           /* TRIE_next              */
-       TRIE,           /* TRIE_next_fail         */
-       EVAL,           /* EVAL_AB                */
-       EVAL,           /* EVAL_AB_fail           */
-       CURLYX,         /* CURLYX_end             */
-       CURLYX,         /* CURLYX_end_fail        */
-       WHILEM,         /* WHILEM_A_pre           */
-       WHILEM,         /* WHILEM_A_pre_fail      */
-       WHILEM,         /* WHILEM_A_min           */
-       WHILEM,         /* WHILEM_A_min_fail      */
-       WHILEM,         /* WHILEM_A_max           */
-       WHILEM,         /* WHILEM_A_max_fail      */
-       WHILEM,         /* WHILEM_B_min           */
-       WHILEM,         /* WHILEM_B_min_fail      */
-       WHILEM,         /* WHILEM_B_max           */
-       WHILEM,         /* WHILEM_B_max_fail      */
-       BRANCH,         /* BRANCH_next            */
-       BRANCH,         /* BRANCH_next_fail       */
-       CURLYM,         /* CURLYM_A               */
-       CURLYM,         /* CURLYM_A_fail          */
-       CURLYM,         /* CURLYM_B               */
-       CURLYM,         /* CURLYM_B_fail          */
-       IFMATCH,        /* IFMATCH_A              */
-       IFMATCH,        /* IFMATCH_A_fail         */
-       CURLY,          /* CURLY_B_min_known      */
-       CURLY,          /* CURLY_B_min_known_fail */
-       CURLY,          /* CURLY_B_min            */
-       CURLY,          /* CURLY_B_min_fail       */
-       CURLY,          /* CURLY_B_max            */
-       CURLY,          /* CURLY_B_max_fail       */
-       COMMIT,         /* COMMIT_next            */
-       COMMIT,         /* COMMIT_next_fail       */
+       TRIE,           /* TRIE_next              */
+       TRIE,           /* TRIE_next_fail         */
+       EVAL,           /* EVAL_AB                */
+       EVAL,           /* EVAL_AB_fail           */
+       CURLYX,         /* CURLYX_end             */
+       CURLYX,         /* CURLYX_end_fail        */
+       WHILEM,         /* WHILEM_A_pre           */
+       WHILEM,         /* WHILEM_A_pre_fail      */
+       WHILEM,         /* WHILEM_A_min           */
+       WHILEM,         /* WHILEM_A_min_fail      */
+       WHILEM,         /* WHILEM_A_max           */
+       WHILEM,         /* WHILEM_A_max_fail      */
+       WHILEM,         /* WHILEM_B_min           */
+       WHILEM,         /* WHILEM_B_min_fail      */
+       WHILEM,         /* WHILEM_B_max           */
+       WHILEM,         /* WHILEM_B_max_fail      */
+       BRANCH,         /* BRANCH_next            */
+       BRANCH,         /* BRANCH_next_fail       */
+       CURLYM,         /* CURLYM_A               */
+       CURLYM,         /* CURLYM_A_fail          */
+       CURLYM,         /* CURLYM_B               */
+       CURLYM,         /* CURLYM_B_fail          */
+       IFMATCH,        /* IFMATCH_A              */
+       IFMATCH,        /* IFMATCH_A_fail         */
+       CURLY,          /* CURLY_B_min_known      */
+       CURLY,          /* CURLY_B_min_known_fail */
+       CURLY,          /* CURLY_B_min            */
+       CURLY,          /* CURLY_B_min_fail       */
+       CURLY,          /* CURLY_B_max            */
+       CURLY,          /* CURLY_B_max_fail       */
+       COMMIT,         /* COMMIT_next            */
+       COMMIT,         /* COMMIT_next_fail       */
+       MARKPOINT,      /* MARKPOINT_next         */
+       MARKPOINT,      /* MARKPOINT_next_fail    */
+       CUT,            /* CUT_next               */
+       CUT,            /* CUT_next_fail          */
 };
 #endif
 
@@ -320,10 +336,14 @@ static const U8 regarglen[] = {
        EXTRA_SIZE(struct regnode_1),           /* NGROUPP      */
        EXTRA_SIZE(struct regnode_1),           /* INSUBP       */
        EXTRA_SIZE(struct regnode_1),           /* DEFINEP      */
+       0,                                      /* ENDLIKE      */
        0,                                      /* OPFAIL       */
-       0,                                      /* COMMIT       */
-       0,                                      /* CUT          */
-       0,                                      /* OPERROR      */
+       EXTRA_SIZE(struct regnode_1),           /* ACCEPT       */
+       0,                                      /* VERB         */
+       EXTRA_SIZE(struct regnode_1),           /* NOMATCH      */
+       EXTRA_SIZE(struct regnode_1),           /* MARKPOINT    */
+       EXTRA_SIZE(struct regnode_1),           /* CUT          */
+       EXTRA_SIZE(struct regnode_1),           /* COMMIT       */
        0,                                      /* OPTIMIZED    */
        0,                                      /* PSEUDO       */
 };
@@ -404,10 +424,14 @@ static const char reg_off_by_arg[] = {
        0,      /* NGROUPP      */
        0,      /* INSUBP       */
        0,      /* DEFINEP      */
+       0,      /* ENDLIKE      */
        0,      /* OPFAIL       */
-       0,      /* COMMIT       */
+       0,      /* ACCEPT       */
+       0,      /* VERB         */
+       0,      /* NOMATCH      */
+       0,      /* MARKPOINT    */
        0,      /* CUT          */
-       0,      /* OPERROR      */
+       0,      /* COMMIT       */
        0,      /* OPTIMIZED    */
        0,      /* PSEUDO       */
 };
@@ -489,12 +513,16 @@ const char * reg_name[] = {
        "NGROUPP",                      /* 0x46 */
        "INSUBP",                       /* 0x47 */
        "DEFINEP",                      /* 0x48 */
-       "OPFAIL",                       /* 0x49 */
-       "COMMIT",                       /* 0x4a */
-       "CUT",                          /* 0x4b */
-       "OPERROR",                      /* 0x4c */
-       "OPTIMIZED",                    /* 0x4d */
-       "PSEUDO",                       /* 0x4e */
+       "ENDLIKE",                      /* 0x49 */
+       "OPFAIL",                       /* 0x4a */
+       "ACCEPT",                       /* 0x4b */
+       "VERB",                         /* 0x4c */
+       "NOMATCH",                      /* 0x4d */
+       "MARKPOINT",                    /* 0x4e */
+       "CUT",                          /* 0x4f */
+       "COMMIT",                       /* 0x50 */
+       "OPTIMIZED",                    /* 0x51 */
+       "PSEUDO",                       /* 0x52 */
        /* ------------ States ------------- */
        "TRIE_next",                    /* REGNODE_MAX +0x01 */
        "TRIE_next_fail",               /* REGNODE_MAX +0x02 */
@@ -528,6 +556,10 @@ const char * reg_name[] = {
        "CURLY_B_max_fail",             /* REGNODE_MAX +0x1e */
        "COMMIT_next",                  /* REGNODE_MAX +0x1f */
        "COMMIT_next_fail",             /* REGNODE_MAX +0x20 */
+       "MARKPOINT_next",               /* REGNODE_MAX +0x21 */
+       "MARKPOINT_next_fail",          /* REGNODE_MAX +0x22 */
+       "CUT_next",                     /* REGNODE_MAX +0x23 */
+       "CUT_next_fail",                /* REGNODE_MAX +0x24 */
 };
 #endif /* DEBUGGING */
 #else
index 67be900..5405cf6 100755 (executable)
@@ -3851,54 +3851,136 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) {
     iseq($count,1,"should have matched once only [RT#36046]");
 }
 
-{   # Test the (?COMMIT) pattern
+{   # Test the (*NOMATCH) pattern
     our $count = 0;
-    'aaab'=~/a+b?(?{$count++})(?FAIL)/;
-    iseq($count,9,"expect 9 for no (?COMMIT)");
+    'aaab'=~/a+b?(?{$count++})(*FAIL)/;
+    iseq($count,9,"expect 9 for no (*NOMATCH)");
     $count = 0;
-    'aaab'=~/a+b?(?COMMIT)(?{$count++})(?FAIL)/;
-    iseq($count,3,"expect 3 with (?COMMIT)");
+    'aaab'=~/a+b?(*NOMATCH)(?{$count++})(*FAIL)/;
+    iseq($count,3,"expect 3 with (*NOMATCH)");
     local $_='aaab';
     $count=0;
-    1 while /.(?COMMIT)(?{$count++})(?FAIL)/g;
-    iseq($count,4,"/.(?COMMIT)/");
+    1 while /.(*NOMATCH)(?{$count++})(*FAIL)/g;
+    iseq($count,4,"/.(*NOMATCH)/");
     $count = 0;
-    'aaab'=~/a+b?(??{'(?COMMIT)'})(?{$count++})(?FAIL)/;
-    iseq($count,3,"expect 3 with (?COMMIT)");
+    'aaab'=~/a+b?(??{'(*NOMATCH)'})(?{$count++})(*FAIL)/;
+    iseq($count,3,"expect 3 with (*NOMATCH)");
     local $_='aaab';
     $count=0;
-    1 while /.(??{'(?COMMIT)'})(?{$count++})(?FAIL)/g;
-    iseq($count,4,"/.(?COMMIT)/");
+    1 while /.(??{'(*NOMATCH)'})(?{$count++})(*FAIL)/g;
+    iseq($count,4,"/.(*NOMATCH)/");
 }
-{   # Test the (?CUT) pattern
+{   # Test the (*CUT) pattern
     our $count = 0;
-    'aaab'=~/a+b?(?CUT)(?{$count++})(?FAIL)/;
-    iseq($count,1,"expect 1 with (?CUT)");
+    'aaab'=~/a+b?(*CUT)(?{$count++})(*FAIL)/;
+    iseq($count,1,"expect 1 with (*CUT)");
     local $_='aaab';
     $count=0;
-    1 while /.(?CUT)(?{$count++})(?FAIL)/g;
-    iseq($count,4,"/.(?CUT)/");
+    1 while /.(*CUT)(?{$count++})(*FAIL)/g;
+    iseq($count,4,"/.(*CUT)/");
     $_='aaabaaab';
     $count=0;
     our @res=();
-    1 while /(a+b?)(?CUT)(?{$count++; push @res,$1})(?FAIL)/g;
-    iseq($count,2,"Expect 2 with (?CUT)" );
-    iseq("@res","aaab aaab","adjacent (?CUT) works as expected" );
+    1 while /(a+b?)(*CUT)(?{$count++; push @res,$1})(*FAIL)/g;
+    iseq($count,2,"Expect 2 with (*CUT)" );
+    iseq("@res","aaab aaab","adjacent (*CUT) works as expected" );
 }
-{   # Test the (?ERROR) pattern
+{   # Test the (*CUT) pattern
     our $count = 0;
-    'aaabaaab'=~/a+b?(?ERROR)(?{$count++})(?FAIL)/;
-    iseq($count,1,"expect 1 with (?ERROR)");
+    'aaab'=~/a+b?(*MARK)(*CUT)(?{$count++})(*FAIL)/;
+    iseq($count,1,"expect 1 with (*CUT)");
     local $_='aaab';
     $count=0;
-    1 while /.(?ERROR)(?{$count++})(?FAIL)/g;
-    iseq($count,1,"/.(?ERROR)/");
+    1 while /.(*MARK)(*CUT)(?{$count++})(*FAIL)/g;
+    iseq($count,4,"/.(*CUT)/");
     $_='aaabaaab';
     $count=0;
     our @res=();
-    1 while /(a+b?)(?ERROR)(?{$count++; push @res,$1})(?FAIL)/g;
-    iseq($count,1,"Expect 1 with (?ERROR)" );
-    iseq("@res","aaab","adjacent (?ERROR) works as expected" );
+    1 while /(a+b?)(*MARK)(*CUT)(?{$count++; push @res,$1})(*FAIL)/g;
+    iseq($count,2,"Expect 2 with (*CUT)" );
+    iseq("@res","aaab aaab","adjacent (*CUT) works as expected" );
+}
+{   # Test the (*CUT) pattern
+    our $count = 0;
+    'aaab'=~/a*(*MARK:a)b?(*MARK:b)(*CUT:a)(?{$count++})(*FAIL)/;
+    iseq($count,3,"expect 3 with *MARK:a)b?(*MARK:b)(*CUT:a)");
+    local $_='aaabaaab';
+    $count=0;
+    our @res=();
+    1 while /(a*(*MARK:a)b?)(*MARK)(*CUT:a)(?{$count++; push @res,$1})(*FAIL)/g;
+    iseq($count,5,"Expect 5 with (*MARK:a)b?)(*MARK)(*CUT:a)" );
+    iseq("@res","aaab b aaab b ","adjacent (*MARK:a)b?)(*MARK)(*CUT:a) works as expected" );
+}
+{   # Test the (*COMMIT) pattern
+    our $count = 0;
+    'aaabaaab'=~/a+b?(*COMMIT)(?{$count++})(*FAIL)/;
+    iseq($count,1,"expect 1 with (*COMMIT)");
+    local $_='aaab';
+    $count=0;
+    1 while /.(*COMMIT)(?{$count++})(*FAIL)/g;
+    iseq($count,1,"/.(*COMMIT)/");
+    $_='aaabaaab';
+    $count=0;
+    our @res=();
+    1 while /(a+b?)(*COMMIT)(?{$count++; push @res,$1})(*FAIL)/g;
+    iseq($count,1,"Expect 1 with (*COMMIT)" );
+    iseq("@res","aaab","adjacent (*COMMIT) works as expected" );
+}
+{
+    # Test named commits and the $REGERROR var
+    our $REGERROR;
+    for my $name ('',':foo') 
+    {
+        for my $pat ("(*NOMATCH$name)","(*MARK$name)(*CUT)",
+                         "(*CUT$name)","(*COMMIT$name)")
+        {                         
+            for my $suffix ('(*FAIL)','') 
+            {
+                'aaaab'=~/a+b$pat$suffix/;
+                iseq(
+                    $REGERROR,
+                    ($suffix ? ($name ? 'foo' : "1") : ""),
+                    "Test $pat and \$REGERROR $suffix"
+                );
+            }
+        }
+    }      
+}    
+{
+    # Test named commits and the $REGERROR var
+    package Fnorble;
+    our $REGERROR;
+    for my $name ('',':foo') 
+    {
+        for my $pat ("(*NOMATCH$name)","(*MARK$name)(*CUT)",
+                         "(*CUT$name)","(*COMMIT$name)")
+        {                         
+            for my $suffix ('(*FAIL)','') 
+            {
+                'aaaab'=~/a+b$pat$suffix/;
+                ::iseq(
+                    $REGERROR,
+                    ($suffix ? ($name ? 'foo' : "1") : ""),
+                    "Test $pat and \$REGERROR $suffix"
+                );
+            }
+        }
+    }      
+}    
+{
+    # Test named commits and the $REGERROR var
+    our $REGERROR;
+    for $word (qw(bar baz bop)) {
+        $REGERROR="";
+        "aaaaa$word"=~/a+(?:bar(*COMMIT:bar)|baz(*COMMIT:baz)|bop(*COMMIT:bop))(*FAIL)/;
+        iseq($REGERROR,$word);
+    }    
+}
+{   #Regression test for perlbug 40684
+    my $s = "abc\ndef";
+    my $rex = qr'^abc$'m;
+    ok($s =~ m/$rex/);
+    ok($s =~ m/^abc$/m);
 }
 #-------------------------------------------------------------------
 
@@ -3914,5 +3996,5 @@ ok((q(a)x 100) =~ /^(??{'(.)'x 100})/,
 # Put new tests above the line, not here.
 
 # Don't forget to update this!
-BEGIN{print "1..1300\n"};
+BEGIN{print "1..1344\n"};
 
index 9b9e5f8..99c6824 100644 (file)
@@ -138,7 +138,8 @@ ab|cd       abcd    y       $&      ab
 ()ef   def     y       $-[1]   1
 ()ef   def     y       $+[1]   1
 *a     -       c       -       Quantifier follows nothing
-(*)b   -       c       -       Quantifier follows nothing
+(|*)b  -       c       -       Quantifier follows nothing
+(*)b   -       c       -       Unknown verb
 $b     b       n       -       -
 a\     -       c       -       Search pattern not terminated
 a\(b   a(b     y       $&-$1   a(b-
@@ -325,7 +326,8 @@ a[-]?c      ac      y       $&      ac
 'ab|cd'i       ABCD    y       $&      AB
 '()ef'i        DEF     y       $&-$1   EF-
 '*a'i  -       c       -       Quantifier follows nothing
-'(*)b'i        -       c       -       Quantifier follows nothing
+'(|*)b'i       -       c       -       Quantifier follows nothing
+'(*)b'i        -       c       -       Unknown verb
 '$b'i  B       n       -       -
 'a\'i  -       c       -       Search pattern not terminated
 'a\(b'i        A(B     y       $&-$1   A(B-
@@ -1178,5 +1180,9 @@ round\(([^()]++)\)        _I(round(xs * sz),1)    y       $1      xs * sz
 
 
 a*(?!) aaaab   n       -       -
-a*(?FAIL)      aaaab   n       -       -
-a*(?F) aaaab   n       -       -
+a*(*FAIL)      aaaab   n       -       -
+a*(*F) aaaab   n       -       -
+
+(A(A|B(*ACCEPT)|C)D)(E)        AB      y       $1      AB
+(A(A|B(*ACCEPT)|C)D)(E)        ACDE    y       $1$2$3  ACDCE
+