Add possessive quantifiers to regex engine.

author Yves Orton <demerphq@gmail.com>

Thu, 12 Oct 2006 21:23:15 +0000 (23:23 +0200)

committer Rafael Garcia-Suarez <rgarciasuarez@gmail.com>

Fri, 13 Oct 2006 09:02:05 +0000 (09:02 +0000)
author Yves Orton <demerphq@gmail.com>
Thu, 12 Oct 2006 21:23:15 +0000 (23:23 +0200)
committer Rafael Garcia-Suarez <rgarciasuarez@gmail.com>
Fri, 13 Oct 2006 09:02:05 +0000 (09:02 +0000)
diff --git a/pod/perl595delta.pod b/pod/perl595delta.pod

index e3c24d4453fa6cf96adbae91299ad10e86782a13..e0c60796d649cfd48fd11a94a2f2d236b7774a20 100644 (file)
--- a/pod/perl595delta.pod
+++ b/pod/perl595delta.pod
@@ -78,6 +78,15 @@ $1 will be 'A', $2 will be 'B', $3 will be 'C' and $4 will be 'D' and not
  $1 is 'A', $2 is 'C' and $3 is 'B' and $4 is 'D' that a .NET programmer
  would expect. This is considered a feature. :-)
  
+=item Possessive Quantifiers
+
+Perl now supports the "possessive quantifier" syntax of the "atomic match" 
+pattern. Basically a possessive quantifier matches as much as it can and never
+gives any back. Thus it can be used to control backtracking. The syntax is 
+similar to non-greedy matching, except instead of using a '?' as the modifier
+the '+' is used. Thus C<?+>, C<*+>, C<++>, C<{min,max}+> are now legal
+quantifiers.
+
  =back
  
  =head1 Modules and Pragmas
diff --git a/pod/perlre.pod b/pod/perlre.pod

index c2da3bdf9127a9cfed314bc2a74916b1e0bfecd3..c89d29f71409cd3f49e5bc86ee87c8861bfe150d 100644 (file)
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -154,6 +154,37 @@ X<?> X<*?> X<+?> X<??> X<{n}?> X<{n,}?> X<{n,m}?>
      {n,}?  Match at least n times
      {n,m}? Match at least n but not more than m times
  
+By default, when a quantified subpattern does not allow the rest of the
+overall pattern to match, Perl will backtrack. However, this behaviour is
+sometimes undesirable. Thus Perl provides the "possesive" quantifier form
+as well.
+
+    *+    Match 0 or more times and give nothing back
+    +?    Match 1 or more times and give nothing back
+    ?+    Match 0 or 1 time and give nothing back
+    {n}+   Match exactly n times and give nothing back (redundant)
+    {n,}?  Match at least n times and give nothing back
+    {n,m}? Match at least n but not more than m times and give nothing back
+
+For instance,
+
+   'aaaa' =~ /a++a/
+
+will never match, as the C<a++> will gobble up all the C<a>'s in the
+string and won't leave any for the remaining part of the pattern. This
+feature can be extremely useful to give perl hints about where it
+shouldn't backtrack. For instance, the typical "match a double-quoted
+string" problem can be most efficiently performed when written as:
+
+   /"(?:[^"\\]++|\\.)*+"/
+
+as we know that if the final quote does not match, bactracking will not
+help. See the independent subexpression C<< (?>...) >> for more details;
+possessive quantifiers are just syntactic sugar for that construct. For
+instance the above example could also be written as follows:
+
+   /"(?>(?:(?>[^"\\]+)|\\.)*)"/
+
  Because patterns are processed as double quoted strings, the following
  also work:
  X<\t> X<\n> X<\r> X<\f> X<\a> X<\l> X<\u> X<\L> X<\U> X<\E> X<\Q>
@@ -690,7 +721,9 @@ Both forms are equivalent.
  X<(?{})> X<regex, code in> X<regexp, code in> X<regular expression, code in>
  
  B<WARNING>: This extended regular expression feature is considered
-highly experimental, and may be changed or deleted without notice.
+experimental, and may be changed without notice. Code executed that
+has side effects may not perform identically from version to version
+due to the effect of future optimisations in the regex engine.
  
  This zero-width assertion evaluates any embedded Perl code.  It
  always succeeds, and its C<code> is not interpolated.  Currently,
@@ -777,9 +810,9 @@ X<(??{})>
  X<regex, postponed> X<regexp, postponed> X<regular expression, postponed>
  
  B<WARNING>: This extended regular expression feature is considered
-highly experimental, and may be changed or deleted without notice.
-A simplified version of the syntax may be introduced for commonly
-used idioms.
+experimental, and may be changed without notice. Code executed that
+has side effects may not perform identically from version to version
+due to the effect of future optimisations in the regex engine.
  
  This is a "postponed" regular subexpression.  The C<code> is evaluated
  at run time, at the moment this subexpression may match.  The result
@@ -824,9 +857,6 @@ changing it requires a custom build.
  X<(?PARNO)> X<(?1)> X<(?R)> X<(?0)>
  X<regex, recursive> X<regexp, recursive> X<regular expression, recursive>
  
-B<WARNING>:  This extended regular expression feature is considered
-highly experimental, and may be changed or deleted without notice.
-
  Similar to C<(??{ code })> except it does not involve compiling any code,
  instead it treats the contents of a capture buffer as an independent
  pattern that must match at the current position.  Capture buffers
@@ -894,9 +924,6 @@ pattern.
  =item C<< (?>pattern) >>
  X<backtrack> X<backtracking> X<atomic> X<possessive>
  
-B<WARNING>: This extended regular expression feature is considered
-highly experimental, and may be changed or deleted without notice.
-
  An "independent" subexpression, one which matches the substring
  that a I<standalone> C<pattern> would match if anchored at the given
  position, and it matches I<nothing other than this substring>.  This
@@ -988,14 +1015,21 @@ the above specification of comments.
  In some literature this construct is called "atomic matching" or
  "possessive matching".
  
+Possessive quantifiers are equivalent to putting the item they are applied
+to inside of one of these constructs. The following equivalences apply:
+
+    Quantifier Form     Bracketing Form
+    ---------------     ---------------
+    PAT*+               (?>PAT*)
+    PAT++               (?>PAT+)
+    PAT?+               (?>PAT?)
+    PAT{min,max}+       (?>PAT{min,max})
+
  =item C<(?(condition)yes-pattern|no-pattern)>
  X<(?()>
  
  =item C<(?(condition)yes-pattern)>
  
-B<WARNING>: This extended regular expression feature is considered
-highly experimental, and may be changed or deleted without notice.
-
  Conditional expression.  C<(condition)> should be either an integer in
  parentheses (which is valid if the corresponding pair of parentheses
  matched), a look-ahead/look-behind/evaluate zero-width assertion, a
diff --git a/pod/perltodo.pod b/pod/perltodo.pod

index 4a54bcddb6fc00a6edca06715da90a54b4ebea7c..b65009e8012025f4467ac4bb7374edf45b2d7dc5 100644 (file)
--- a/pod/perltodo.pod
+++ b/pod/perltodo.pod
@@ -629,29 +629,6 @@ Fix (or rewrite) the implementation of the C</(?{...})/> closures.
  This will allow the use of a regex from inside (?{ }), (??{ }) and
  (?(?{ })|) constructs.
  
-=head2 Add possessive quantifiers to regexp engine
-
-Possessive quantifiers are a syntactic sugar that affords a more
-elegant way to express (?>A+). They are also provided by many other 
-regex engines. Most importantly they allow various patterns to be 
-optimised more efficiently than (?>...) allows, and allow various data 
-driven optimisations to be implemented (such as auto-possesification of 
-quantifiers followed by contrary suffixes). Common syntax for them is 
-  
-  ++        possessive 1 or more
-  *+        possessive 0 or more
-  {n,m}+    possessive n..m
-  
-A possessive quantifier basically absorbs as much as it can and doesn't 
-give any back. 
-
-Jeffrey Friedl documents possessive quantifiers in Mastering Regular 
-Expressions 2nd edition and explicitly pleads for them to be added to 
-perl. We should oblige him, lest he leaves us out of a future edition. 
-;-)
-
-demerphq has this on his todo list
-
  =head2 Add (?YES) (?NO) to regexp enigne
  
  YES/NO would allow a subpattern to be passed/failed but allow backtracking.
diff --git a/regcomp.c b/regcomp.c

index 3b694cbe2880c09b5741376ea16113d162c2f69e..89ce4200ab27c31e44daa0eb5252a9b52736dd26 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -3699,14 +3699,17 @@ Perl_reginitcolors(pTHX)
   * Beware that the optimization-preparation code in here knows about some
   * of the structure of the compiled regexp.  [I'll say.]
   */
+
+
+
  #ifndef PERL_IN_XSUB_RE
-#define CORE_ONLY_BLOCK(c) {c}{
  #define RE_ENGINE_PTR &PL_core_reg_engine
  #else
-#define CORE_ONLY_BLOCK(c) {
  extern const struct regexp_engine my_reg_engine;
  #define RE_ENGINE_PTR &my_reg_engine
  #endif
+/* these make a few things look better, to avoid indentation */
+#define BEGIN_BLOCK {
  #define END_BLOCK }
   
  regexp *
@@ -3715,7 +3718,8 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
      dVAR;
      GET_RE_DEBUG_FLAGS_DECL;
      DEBUG_r(if (!PL_colorset) reginitcolors());
-    CORE_ONLY_BLOCK(
+#ifndef PERL_IN_XSUB_RE
+    BEGIN_BLOCK
      /* Dispatch a request to compile a regexp to correct 
         regexp engine. */
      HV * const table = GvHV(PL_hintgv);
@@ -3729,7 +3733,10 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
              });            
              return CALLREGCOMP_ENG(eng, exp, xend, pm);
          } 
-    })
+    }
+    END_BLOCK
+#endif
+    BEGIN_BLOCK    
      register regexp *r;
      regnode *scan;
      regnode *first;
@@ -5206,10 +5213,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
         *flagp = flags;
         return(ret);
      }
-    /* else if (OP(ret)==RECURSE) {
-        RExC_parse++;
-        vFAIL("Illegal quantifier on recursion group");
-    } */
  
  #if 0                          /* Now runtime fix should be reliable. */
  
@@ -5262,12 +5265,27 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                origparse);
      }
  
-    if (*RExC_parse == '?') {
+    if (RExC_parse < RExC_end && *RExC_parse == '?') {
         nextchar(pRExC_state);
         reginsert(pRExC_state, MINMOD, ret, depth+1);
          REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
      }
-    if (ISMULT2(RExC_parse)) {
+#ifndef REG_ALLOW_MINMOD_SUSPEND
+    else
+#endif
+    if (RExC_parse < RExC_end && *RExC_parse == '+') {
+        regnode *ender;
+        nextchar(pRExC_state);
+        ender = reg_node(pRExC_state, SUCCEED);
+        REGTAIL(pRExC_state, ret, ender);
+        reginsert(pRExC_state, SUSPEND, ret, depth+1);
+        ret->flags = 0;
+        ender = reg_node(pRExC_state, TAIL);
+        REGTAIL(pRExC_state, ret, ender);
+        /*ret= ender;*/
+    }
+
+    if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
         RExC_parse++;
         vFAIL("Nested quantifiers");
      }
diff --git a/regcomp.h b/regcomp.h

index e7b5a2c291bc030538b4fed1a7cc9519df7e6602..5fb6b14ae6df12a821c133af009c3529bd8164e7 100644 (file)
--- a/regcomp.h
+++ b/regcomp.h
@@ -15,6 +15,12 @@ typedef OP OP_4tree;                 /* Will be redefined later. */
  #define PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 1
  #define PERL_ENABLE_POSITIVE_ASSERTION_STUDY 1
  #define PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 0
+/* Unless the next line is uncommented it is illegal to combine lazy 
+   matching with possessive matching. Frankly it doesn't make much sense 
+   to allow it as X*?+ matches nothing, X+?+ matches a single char only, 
+   and X{min,max}?+ matches min times only.
+ */
+/* #define REG_ALLOW_MINMOD_SUSPEND */
  
  /*
   * The "internal use only" fields in regexp.h are present to pass info from
diff --git a/t/op/re_tests b/t/op/re_tests

index 9f0e06b4d2185f8f5c5ad5d80bf2ae90f2546315..dbbe99307320f01e320833081d7de788dadd5948 100644 (file)
--- a/t/op/re_tests
+++ b/t/op/re_tests
@@ -1062,3 +1062,88 @@ X(?<=foo.)[YZ]   ..XfooXY..      y       pos     8
  /(ab)+((?1))(fox)/     ababfox y       $1-$2-$3        ab-ab-fox
  /(ab){1,100}((?1))(fox)/       ababfox y       $1-$2-$3        ab-ab-fox
  /(ab){0,100}((?1))(fox)/       ababfox y       $1-$2-$3        ab-ab-fox
+# possessive captures
+a++a   aaaaa   n       -       -
+a*+a   aaaaa   n       -       -
+a{1,5}+a       aaaaa   n       -       -
+a?+a   ab      n       -       -
+a++b   aaaaab  y       $&      aaaaab
+a*+b   aaaaab  y       $&      aaaaab
+a{1,5}+b       aaaaab  y       $&      aaaaab
+a?+b   ab      y       $&      ab
+fooa++a        fooaaaaa        n       -       -
+fooa*+a        fooaaaaa        n       -       -
+fooa{1,5}+a    fooaaaaa        n       -       -
+fooa?+a        fooab   n       -       -
+fooa++b        fooaaaaab       y       $&      fooaaaaab
+fooa*+b        fooaaaaab       y       $&      fooaaaaab
+fooa{1,5}+b    fooaaaaab       y       $&      fooaaaaab
+fooa?+b        fooab   y       $&      fooab
+(?:aA)++(?:aA) aAaAaAaAaA      n       -       aAaAaAaAaA
+(aA)++(aA)     aAaAaAaAaA      n       -       aAaAaAaAaA
+(aA|bB)++(aA|bB)       aAaAbBaAbB      n       -       aAaAbBaAbB
+(?:aA|bB)++(?:aA|bB)   aAbBbBbBaA      n       -       aAbBbBbBaA
+(?:aA)*+(?:aA) aAaAaAaAaA      n       -       aAaAaAaAaA
+(aA)*+(aA)     aAaAaAaAaA      n       -       aAaAaAaAaA
+(aA|bB)*+(aA|bB)       aAaAbBaAaA      n       -       aAaAbBaAaA
+(?:aA|bB)*+(?:aA|bB)   aAaAaAbBaA      n       -       aAaAaAbBaA
+(?:aA){1,5}+(?:aA)     aAaAaAaAaA      n       -       aAaAaAaAaA
+(aA){1,5}+(aA) aAaAaAaAaA      n       -       aAaAaAaAaA
+(aA|bB){1,5}+(aA|bB)   aAaAbBaAaA      n       -       aAaAbBaAaA
+(?:aA|bB){1,5}+(?:aA|bB)       bBbBbBbBbB      n       -       bBbBbBbBbB
+(?:aA)?+(?:aA) aAb     n       -       aAb
+(aA)?+(aA)     aAb     n       -       aAb
+(aA|bB)?+(aA|bB)       bBb     n       -       bBb
+(?:aA|bB)?+(?:aA|bB)   aAb     n       -       aAb
+(?:aA)++b      aAaAaAaAaAb     y       $&      aAaAaAaAaAb
+(aA)++b        aAaAaAaAaAb     y       $&      aAaAaAaAaAb
+(aA|bB)++b     aAbBaAaAbBb     y       $&      aAbBaAaAbBb
+(?:aA|bB)++b   aAbBbBaAaAb     y       $&      aAbBbBaAaAb
+(?:aA)*+b      aAaAaAaAaAb     y       $&      aAaAaAaAaAb
+(aA)*+b        aAaAaAaAaAb     y       $&      aAaAaAaAaAb
+(aA|bB)*+b     bBbBbBbBbBb     y       $&      bBbBbBbBbBb
+(?:aA|bB)*+b   bBaAbBbBaAb     y       $&      bBaAbBbBaAb
+(?:aA){1,5}+b  aAaAaAaAaAb     y       $&      aAaAaAaAaAb
+(aA){1,5}+b    aAaAaAaAaAb     y       $&      aAaAaAaAaAb
+(aA|bB){1,5}+b bBaAbBaAbBb     y       $&      bBaAbBaAbBb
+(?:aA|bB){1,5}+b       aAbBaAbBbBb     y       $&      aAbBaAbBbBb
+(?:aA)?+b      aAb     y       $&      aAb
+(aA)?+b        aAb     y       $&      aAb
+(aA|bB)?+b     bBb     y       $&      bBb
+(?:aA|bB)?+b   bBb     y       $&      bBb
+foo(?:aA)++(?:aA)      fooaAaAaAaAaA   n       -       fooaAaAaAaAaA
+foo(aA)++(aA)  fooaAaAaAaAaA   n       -       fooaAaAaAaAaA
+foo(aA|bB)++(aA|bB)    foobBbBbBaAaA   n       -       foobBbBbBaAaA
+foo(?:aA|bB)++(?:aA|bB)        fooaAaAaAaAaA   n       -       fooaAaAaAaAaA
+foo(?:aA)*+(?:aA)      fooaAaAaAaAaA   n       -       fooaAaAaAaAaA
+foo(aA)*+(aA)  fooaAaAaAaAaA   n       -       fooaAaAaAaAaA
+foo(aA|bB)*+(aA|bB)    foobBaAbBaAaA   n       -       foobBaAbBaAaA
+foo(?:aA|bB)*+(?:aA|bB)        fooaAaAbBbBaA   n       -       fooaAaAbBbBaA
+foo(?:aA){1,5}+(?:aA)  fooaAaAaAaAaA   n       -       fooaAaAaAaAaA
+foo(aA){1,5}+(aA)      fooaAaAaAaAaA   n       -       fooaAaAaAaAaA
+foo(aA|bB){1,5}+(aA|bB)        fooaAbBbBaAaA   n       -       fooaAbBbBaAaA
+foo(?:aA|bB){1,5}+(?:aA|bB)    fooaAbBbBaAbB   n       -       fooaAbBbBaAbB
+foo(?:aA)?+(?:aA)      fooaAb  n       -       fooaAb
+foo(aA)?+(aA)  fooaAb  n       -       fooaAb
+foo(aA|bB)?+(aA|bB)    foobBb  n       -       foobBb
+foo(?:aA|bB)?+(?:aA|bB)        fooaAb  n       -       fooaAb
+foo(?:aA)++b   fooaAaAaAaAaAb  y       $&      fooaAaAaAaAaAb
+foo(aA)++b     fooaAaAaAaAaAb  y       $&      fooaAaAaAaAaAb
+foo(aA|bB)++b  foobBaAbBaAbBb  y       $&      foobBaAbBaAbBb
+foo(?:aA|bB)++b        fooaAaAbBaAaAb  y       $&      fooaAaAbBaAaAb
+foo(?:aA)*+b   fooaAaAaAaAaAb  y       $&      fooaAaAaAaAaAb
+foo(aA)*+b     fooaAaAaAaAaAb  y       $&      fooaAaAaAaAaAb
+foo(aA|bB)*+b  foobBbBaAaAaAb  y       $&      foobBbBaAaAaAb
+foo(?:aA|bB)*+b        foobBaAaAbBaAb  y       $&      foobBaAaAbBaAb
+foo(?:aA){1,5}+b       fooaAaAaAaAaAb  y       $&      fooaAaAaAaAaAb
+foo(aA){1,5}+b fooaAaAaAaAaAb  y       $&      fooaAaAaAaAaAb
+foo(aA|bB){1,5}+b      foobBaAaAaAaAb  y       $&      foobBaAaAaAaAb
+foo(?:aA|bB){1,5}+b    fooaAbBaAbBbBb  y       $&      fooaAbBaAbBbBb
+foo(?:aA)?+b   fooaAb  y       $&      fooaAb
+foo(aA)?+b     fooaAb  y       $&      fooaAb
+foo(aA|bB)?+b  foobBb  y       $&      foobBb
+foo(?:aA|bB)?+b        foobBb  y       $&      foobBb
+
+([^()]++|\([^()]*\))+  ((abc(ade)ufh()()x      y       $&      abc(ade)ufh()()x
+round\(([^()]++)\)     _I(round(xs * sz),1)    y       $1      xs * sz
+
diff --git a/t/op/regexp.t b/t/op/regexp.t

index 6a469b7699faafec08a0732127f925988ff6e43f..cce19fc03da463a2ee6d8c34af0dae0ad578fc63 100755 (executable)
--- a/t/op/regexp.t
+++ b/t/op/regexp.t
@@ -28,6 +28,9 @@
  #
  # \n in the tests are interpolated, as are variables of the form ${\w+}.
  #
+# Blanks lines are treated as PASSING tests to keep the line numbers
+# linked to the test number.
+#
  # If you want to add a regular expression test that can't be expressed
  # in this format, don't add it here: put it in op/pat.t instead.
  #
@@ -58,6 +61,11 @@ $| = 1;
  print "1..$numtests\n# $iters iterations\n";
  TEST:
  while (<TESTS>) {
+    if (!/\S/ || /^\s*#/) {
+        print "ok $. # (Blank line or comment)\n";
+        if (/\S/) { print $_ };
+        next;
+    }
      chomp;
      s/\\n/\n/g;
      ($pat, $subject, $result, $repl, $expect, $reason) = split(/\t/,$_,6);
author	Yves Orton <demerphq@gmail.com>
	Thu, 12 Oct 2006 21:23:15 +0000 (23:23 +0200)
committer	Rafael Garcia-Suarez <rgarciasuarez@gmail.com>
	Fri, 13 Oct 2006 09:02:05 +0000 (09:02 +0000)
pod/perl595delta.pod		patch \| blob \| history
pod/perlre.pod		patch \| blob \| history
pod/perltodo.pod		patch \| blob \| history
regcomp.c		patch \| blob \| history
regcomp.h		patch \| blob \| history
t/op/re_tests		patch \| blob \| history
t/op/regexp.t		patch \| blob \| history