Add Notoneloopgreedy to go with One/Setloopgreedy
author Stephen Toub <stoub@microsoft.com>
Fri, 3 Jan 2020 17:13:59 +0000 (12:13 -0500)
committer Stephen Toub <stoub@microsoft.com>
Thu, 9 Jan 2020 03:50:09 +0000 (22:50 -0500)
I initially mistakenly thought this wouldn't be useful, but it is.  For example, the expression ".*\n" can be made non-backtracking.

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFCD.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs

index 0651f42..bc4e4cb 100644 (file)
@@ -85,7 +85,8 @@ namespace System.Text.RegularExpressions
         // These exist to reduce backtracking (both actually performing it and spitting code for it).
 
         public const int Oneloopgreedy = 43;      // lef,back char,min,max    (?> a {,n} )
-        public const int Setloopgreedy = 44;      // lef,back set,min,max     (?> [\d]{,n} )
+        public const int Notoneloopgreedy = 44;   // lef,back char,min,max    (?> . {,n} )
+        public const int Setloopgreedy = 45;      // lef,back set,min,max     (?> [\d]{,n} )
 
         // Modifiers for alternate modes
         public const int Mask = 63;   // Mask to get unmodified ordinary operator
@@ -207,6 +208,7 @@ namespace System.Text.RegularExpressions
                 case Oneloop:
                 case Oneloopgreedy:
                 case Notoneloop:
+                case Notoneloopgreedy:
                 case Onelazy:
                 case Notonelazy:
                 case Setlazy:
@@ -236,7 +238,7 @@ namespace System.Text.RegularExpressions
             "Setjump", "Backjump", "Forejump", "Testref", "Goto",
             "Prune", "Stop",
             "ECMABoundary", "NonECMABoundary",
-            "Oneloopgreedy", "Setloopgreedy"
+            "Oneloopgreedy", "Notoneloopgreedy", "Setloopgreedy"
         };
 
         private static string OperatorDescription(int Opcode)
@@ -275,6 +277,7 @@ namespace System.Text.RegularExpressions
                 case Oneloop:
                 case Oneloopgreedy:
                 case Notoneloop:
+                case Notoneloopgreedy:
                 case Onelazy:
                 case Notonelazy:
                     sb.Append("Ch = ");
@@ -335,6 +338,7 @@ namespace System.Text.RegularExpressions
                 case Oneloop:
                 case Oneloopgreedy:
                 case Notoneloop:
+                case Notoneloopgreedy:
                 case Onelazy:
                 case Notonelazy:
                 case Setrep:
index 4afd909..b791d4c 100644 (file)
@@ -1572,8 +1572,9 @@ namespace System.Text.RegularExpressions
                         // {Set/One}loopgreedy are optimized nodes that represent non-backtracking variable-length loops.
                         // These consume their {Set/One} inputs as long as they match, and don't give up anything they
                         // matched, which means we can support them without backtracking.
-                        case RegexNode.Setloopgreedy:
                         case RegexNode.Oneloopgreedy:
+                        case RegexNode.Notoneloopgreedy:
+                        case RegexNode.Setloopgreedy:
                             // TODO: Add support for greedy {Lazy}Loop around supported elements, namely Concatenate.
                             //       Nested loops will require multiple iteration variables to be defined.
                             supported = true;
@@ -1714,6 +1715,7 @@ namespace System.Text.RegularExpressions
                         break;
 
                     case RegexNode.Oneloopgreedy:
+                    case RegexNode.Notoneloopgreedy:
                     case RegexNode.Setloopgreedy:
                         EmitGreedyLoop(node);
                         break;
@@ -1782,7 +1784,7 @@ namespace System.Text.RegularExpressions
                         break;
 
                     default:
-                        Debug.Assert(node.Type == RegexNode.Notone || node.Type == RegexNode.Notonelazy || node.Type == RegexNode.Notoneloop);
+                        Debug.Assert(node.Type == RegexNode.Notone || node.Type == RegexNode.Notonelazy || node.Type == RegexNode.Notoneloop || node.Type == RegexNode.Notoneloopgreedy);
                         if (IsCaseInsensitive(node)) CallToLower();
                         Ldc(node.Ch);
                         BeqFar(doneLabel);
@@ -1896,7 +1898,7 @@ namespace System.Text.RegularExpressions
             // Emits the code to handle a non-backtracking, variable-length loop (Oneloopgreedy or Setloopgreedy).
             void EmitGreedyLoop(RegexNode node)
             {
-                Debug.Assert(node.Type == RegexNode.Oneloopgreedy || node.Type == RegexNode.Setloopgreedy);
+                Debug.Assert(node.Type == RegexNode.Oneloopgreedy || node.Type == RegexNode.Notoneloopgreedy || node.Type == RegexNode.Setloopgreedy);
                 Debug.Assert(node.M < int.MaxValue);
 
                 // First generate the code to handle the required number of iterations.
@@ -1934,16 +1936,22 @@ namespace System.Text.RegularExpressions
                     Add();
                     Call(s_spanGetItemMethod);
                     LdindU2();
-                    if (node.Type == RegexNode.Oneloopgreedy)
-                    {
-                        if (IsCaseInsensitive(node)) CallToLower();
-                        Ldc(node.Ch);
-                        BneFar(doneLabel);
-                    }
-                    else // Setloopgreedy
+                    switch (node.Type)
                     {
-                        EmitCallCharInClass(node.Str!, IsCaseInsensitive(node), setScratchLocal);
-                        BrfalseFar(doneLabel);
+                        case RegexNode.Oneloopgreedy:
+                            if (IsCaseInsensitive(node)) CallToLower();
+                            Ldc(node.Ch);
+                            BneFar(doneLabel);
+                            break;
+                        case RegexNode.Notoneloopgreedy:
+                            if (IsCaseInsensitive(node)) CallToLower();
+                            Ldc(node.Ch);
+                            BeqFar(doneLabel);
+                            break;
+                        case RegexNode.Setloopgreedy:
+                            EmitCallCharInClass(node.Str!, IsCaseInsensitive(node), setScratchLocal);
+                            BrfalseFar(doneLabel);
+                            break;
                     }
 
                     // i++;
@@ -3281,12 +3289,16 @@ namespace System.Text.RegularExpressions
                 case RegexCode.Notoneloop | RegexCode.Ci | RegexCode.Rtl:
                 case RegexCode.Setloop | RegexCode.Ci | RegexCode.Rtl:
                 case RegexCode.Oneloopgreedy:
+                case RegexCode.Notoneloopgreedy:
                 case RegexCode.Setloopgreedy:
                 case RegexCode.Oneloopgreedy | RegexCode.Rtl:
+                case RegexCode.Notoneloopgreedy | RegexCode.Rtl:
                 case RegexCode.Setloopgreedy | RegexCode.Rtl:
                 case RegexCode.Oneloopgreedy | RegexCode.Ci:
+                case RegexCode.Notoneloopgreedy | RegexCode.Ci:
                 case RegexCode.Setloopgreedy | RegexCode.Ci:
                 case RegexCode.Oneloopgreedy | RegexCode.Ci | RegexCode.Rtl:
+                case RegexCode.Notoneloopgreedy | RegexCode.Ci | RegexCode.Rtl:
                 case RegexCode.Setloopgreedy | RegexCode.Ci | RegexCode.Rtl:
                     //: int c = Operand(1);
                     //: if (c > Rightchars())
@@ -3388,6 +3400,7 @@ namespace System.Text.RegularExpressions
                             }
                             else
                             {
+                                Debug.Assert(Code() == RegexCode.Notoneloop || Code() == RegexCode.Notoneloopgreedy);
                                 Bne(l1);
                             }
                         }
@@ -3399,7 +3412,7 @@ namespace System.Text.RegularExpressions
 
                         MarkLabel(l2);
 
-                        if (Code() != RegexCode.Oneloopgreedy && Code() != RegexCode.Setloopgreedy)
+                        if (Code() != RegexCode.Oneloopgreedy && Code() != RegexCode.Notoneloopgreedy && Code() != RegexCode.Setloopgreedy)
                         {
                             Ldloc(lenLocal);
                             Ldloc(cLocal);
index 5fbe927..3f595b1 100644 (file)
@@ -446,6 +446,7 @@ namespace System.Text.RegularExpressions
                     break;
 
                 case RegexNode.Notoneloop:
+                case RegexNode.Notoneloopgreedy:
                 case RegexNode.Notonelazy:
                     PushFC(new RegexFC(node.Ch, true, node.M == 0, ci));
                     break;
index 45b39cf..895a0a1 100644 (file)
@@ -1163,6 +1163,7 @@ namespace System.Text.RegularExpressions
                         }
 
                     case RegexCode.Notoneloop:
+                    case RegexCode.Notoneloopgreedy:
                         {
                             int c = Operand(1);
 
@@ -1182,8 +1183,10 @@ namespace System.Text.RegularExpressions
                                 }
                             }
 
-                            if (c > i)
+                            if (c > i && Operator() == RegexCode.Notoneloop)
+                            {
                                 TrackPush(c - i - 1, Textpos() - Bump());
+                            }
 
                             advance = 2;
                             continue;
index 39e307d..e0c4332 100644 (file)
@@ -78,8 +78,9 @@ namespace System.Text.RegularExpressions
         public const int EndZ = RegexCode.EndZ;                       //          \Z
         public const int End = RegexCode.End;                         //          \z
 
-        public const int Oneloopgreedy = RegexCode.Oneloopgreedy;     // c,n      (?> a*)
-        public const int Setloopgreedy = RegexCode.Setloopgreedy;     // set,n    (?> \d*)
+        public const int Oneloopgreedy = RegexCode.Oneloopgreedy;        // c,n      (?> a*)
+        public const int Notoneloopgreedy = RegexCode.Notoneloopgreedy;  // c,n      (?> .*)
+        public const int Setloopgreedy = RegexCode.Setloopgreedy;        // set,n    (?> \d*)
 
         // Interior nodes do not correspond to primitive operations, but
         // control structures compositing other operations
@@ -187,14 +188,18 @@ namespace System.Text.RegularExpressions
                 {
                     switch (node.Type)
                     {
-                        case Setloop:
-                            node.Type = Setloopgreedy;
-                            break;
-
                         case Oneloop:
                             node.Type = Oneloopgreedy;
                             break;
 
+                        case Notoneloop:
+                            node.Type = Notoneloopgreedy;
+                            break;
+
+                        case Setloop:
+                            node.Type = Setloopgreedy;
+                            break;
+
                         case Capture:
                         case Greedy:
                             Debug.Assert(node.ChildCount() == 1);
@@ -304,11 +309,16 @@ namespace System.Text.RegularExpressions
                     child.Type = Oneloopgreedy;
                     return child;
 
+                case Notoneloop:
+                    child.Type = Notoneloopgreedy;
+                    return child;
+
                 case Setloop:
                     child.Type = Setloopgreedy;
                     return child;
 
                 case Oneloopgreedy:
+                case Notoneloopgreedy:
                 case Setloopgreedy:
                     return child;
             }
@@ -346,6 +356,7 @@ namespace System.Text.RegularExpressions
                             case Oneloop:
                             case Oneloopgreedy:
                             case Notoneloop:
+                            case Notoneloopgreedy:
                             case Setloop:
                             case Setloopgreedy:
                                 valid = true;
@@ -739,8 +750,8 @@ namespace System.Text.RegularExpressions
                     continue;
                 }
 
-                // If this node is a oneloop or a setloop, see if it overlaps with its successor in the concatenation.
-                // If it doesn't, then we can upgrade it to being a oneloopgreedy or setloopgreedy, respectively.
+                // If this node is a one/notone/setloop, see if it overlaps with its successor in the concatenation.
+                // If it doesn't, then we can upgrade it to being a one/notone/setloopgreedy.
                 // Doing so avoids unnecessary backtracking.
                 switch (node.Type)
                 {
@@ -754,6 +765,7 @@ namespace System.Text.RegularExpressions
                             case Notone when node.Ch == subsequent.Ch:
                             case Notonelazy when subsequent.M > 0 && node.Ch == subsequent.Ch:
                             case Notoneloop when subsequent.M > 0 && node.Ch == subsequent.Ch:
+                            case Notoneloopgreedy when subsequent.M > 0 && node.Ch == subsequent.Ch:
                             case Multi when node.Ch != subsequent.Str![0]:
                             case Set when !RegexCharClass.CharInClass(node.Ch, subsequent.Str!):
                             case Setlazy when subsequent.M > 0 && !RegexCharClass.CharInClass(node.Ch, subsequent.Str!):
@@ -769,6 +781,20 @@ namespace System.Text.RegularExpressions
                         }
                         break;
 
+                    case Notoneloop:
+                        switch (subsequent.Type)
+                        {
+                            case One when node.Ch == subsequent.Ch:
+                            case Onelazy when subsequent.M > 0 && node.Ch == subsequent.Ch:
+                            case Oneloop when subsequent.M > 0 && node.Ch == subsequent.Ch:
+                            case Oneloopgreedy when subsequent.M > 0 && node.Ch == subsequent.Ch:
+                            case Multi when node.Ch == subsequent.Str![0]:
+                            case End:
+                                node.Type = Notoneloopgreedy;
+                                break;
+                        }
+                        break;
+
                     case Setloop:
                         switch (subsequent.Type)
                         {
@@ -779,7 +805,8 @@ namespace System.Text.RegularExpressions
                             case Notone when RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
                             case Notonelazy when subsequent.M > 0 && RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
                             case Notoneloop when subsequent.M > 0 && RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
-                            case Multi when !string.IsNullOrEmpty(subsequent.Str) && !RegexCharClass.CharInClass(subsequent.Str[0], node.Str!):
+                            case Notoneloopgreedy when subsequent.M > 0 && RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
+                            case Multi when !RegexCharClass.CharInClass(subsequent.Str![0], node.Str!):
                             case Set when !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
                             case Setlazy when subsequent.M > 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
                             case Setloop when subsequent.M > 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
@@ -881,7 +908,7 @@ namespace System.Text.RegularExpressions
             "Testref", "Testgroup",
             "", "", "", "", "", "",
             "ECMABoundary", "NonECMABoundary",
-            "Oneloopgreedy", "Setloopgreedy",
+            "Oneloopgreedy", "Notoneloopgreedy", "Setloopgreedy",
         };
 
         public string Description()
@@ -910,6 +937,7 @@ namespace System.Text.RegularExpressions
                 case Oneloop:
                 case Oneloopgreedy:
                 case Notoneloop:
+                case Notoneloopgreedy:
                 case Onelazy:
                 case Notonelazy:
                 case One:
@@ -939,6 +967,7 @@ namespace System.Text.RegularExpressions
                 case Oneloop:
                 case Oneloopgreedy:
                 case Notoneloop:
+                case Notoneloopgreedy:
                 case Onelazy:
                 case Notonelazy:
                 case Setloop:
index 3f048fb..b8f30b5 100644 (file)
@@ -442,6 +442,7 @@ namespace System.Text.RegularExpressions
                     break;
 
                 case RegexNode.Notoneloop:
+                case RegexNode.Notoneloopgreedy:
                 case RegexNode.Notonelazy:
                 case RegexNode.Oneloop:
                 case RegexNode.Oneloopgreedy: