Reduce unnecessary Regex match attempts for expressions beginning with atomic loops...
authorStephen Toub <stoub@microsoft.com>
Tue, 5 May 2020 18:22:13 +0000 (14:22 -0400)
committerGitHub <noreply@github.com>
Tue, 5 May 2020 18:22:13 +0000 (11:22 -0700)
* Reduce unnecessary Regex match attempts for expressions beginning with atomic loops

* Fix corner-case overflows in ComputeMinLength

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs
src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs
src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs
src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs

index 8df7e06..f327e8c 100644 (file)
@@ -85,6 +85,7 @@ namespace System.Text.RegularExpressions
         public const int Oneloopatomic = 43;      // lef,back char,min,max    (?> a {,n} )
         public const int Notoneloopatomic = 44;   // lef,back set,min,max     (?> . {,n} )
         public const int Setloopatomic = 45;      // lef,back set,min,max     (?> [\d]{,n} )
+        public const int UpdateBumpalong = 46;    // updates the bumpalong position to the current position
 
         // Modifiers for alternate modes
         public const int Mask = 63;   // Mask to get unmodified ordinary operator
@@ -184,6 +185,7 @@ namespace System.Text.RegularExpressions
                 case Backjump:
                 case Forejump:
                 case Stop:
+                case UpdateBumpalong:
                     return 1;
 
                 case One:
@@ -273,6 +275,7 @@ namespace System.Text.RegularExpressions
                 Oneloopatomic => nameof(Oneloopatomic),
                 Notoneloopatomic => nameof(Notoneloopatomic),
                 Setloopatomic => nameof(Setloopatomic),
+                UpdateBumpalong => nameof(UpdateBumpalong),
                 _ => "(unknown)"
             };
 
index ecd8c0b..e3d87bf 100644 (file)
@@ -300,6 +300,9 @@ namespace System.Text.RegularExpressions
         /// <summary>A macro for _ilg.Emit(OpCodes.Newobj, constructor).</summary>
         protected void Newobj(ConstructorInfo constructor) => _ilg!.Emit(OpCodes.Newobj, constructor);
 
+        /// <summary>A macro for _ilg.Emit(OpCodes.Dup).</summary>
+        protected void Dup() => _ilg!.Emit(OpCodes.Dup);
+
         /// <summary>A macro for _ilg.Emit(OpCodes.Rem_Un).</summary>
         private void RemUn() => _ilg!.Emit(OpCodes.Rem_Un);
 
@@ -1924,8 +1927,10 @@ namespace System.Text.RegularExpressions
                         case RegexNode.Setloopatomic:
                         // "Empty" is easy: nothing is emitted for it.
                         // "Nothing" is also easy: it doesn't match anything.
+                        // "UpdateBumpalong" doesn't match anything, it's just an optional directive to the engine.
                         case RegexNode.Empty:
                         case RegexNode.Nothing:
+                        case RegexNode.UpdateBumpalong:
                             supported = true;
                             break;
 
@@ -2441,12 +2446,28 @@ namespace System.Text.RegularExpressions
                         // Emit nothing.
                         break;
 
+                    case RegexNode.UpdateBumpalong:
+                        EmitUpdateBumpalong();
+                        break;
+
                     default:
                         Debug.Fail($"Unexpected node type: {node.Type}");
                         break;
                 }
             }
 
+            // Emits the code to handle updating base.runtextpos to runtextpos in response to
+            // an UpdateBumpalong node.  This is used when we want to inform the scan loop that
+            // it should bump from this location rather than from the original location.
+            void EmitUpdateBumpalong()
+            {
+                // base.runtextpos = runtextpos;
+                TransferTextSpanPosToRunTextPos();
+                Ldthis();
+                Ldloc(runtextposLocal);
+                Stfld(s_runtextposField);
+            }
+
             // Emits the code to handle a single-character match.
             void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? offset = null)
             {
@@ -3381,6 +3402,20 @@ namespace System.Text.RegularExpressions
                     Back();
                     break;
 
+                case RegexCode.UpdateBumpalong:
+                    // UpdateBumpalong should only exist in the code stream at such a point where the root
+                    // of the backtracking stack contains the runtextpos from the start of this Go call. Replace
+                    // that tracking value with the current runtextpos value.
+                    //: base.runtrack[base.runtrack.Length - 1] = runtextpos;
+                    Ldloc(_runtrackLocal!);
+                    Dup();
+                    Ldlen();
+                    Ldc(1);
+                    Sub();
+                    Ldloc(_runtextposLocal!);
+                    StelemI4();
+                    break;
+
                 case RegexCode.Goto:
                     //: Goto(Operand(0));
                     Goto(Operand(0));
index 2de366b..e479b37 100644 (file)
@@ -1345,6 +1345,14 @@ namespace System.Text.RegularExpressions
                         advance = 2;
                         continue;
 
+                    case RegexCode.UpdateBumpalong:
+                        // UpdateBumpalong should only exist in the code stream at such a point where the root
+                        // of the backtracking stack contains the runtextpos from the start of this Go call. Replace
+                        // that tracking value with the current runtextpos value.
+                        runtrack![runtrack.Length - 1] = runtextpos;
+                        advance = 0;
+                        continue;
+
                     default:
                         Debug.Fail($"Unimplemented state: {_operator:X8}");
                         break;
index 64d7052..1479c2e 100644 (file)
@@ -80,6 +80,7 @@ namespace System.Text.RegularExpressions
         public const int Oneloopatomic = RegexCode.Oneloopatomic;        // c,n      (?> a*)
         public const int Notoneloopatomic = RegexCode.Notoneloopatomic;  // c,n      (?> .*)
         public const int Setloopatomic = RegexCode.Setloopatomic;        // set,n    (?> \d*)
+        public const int UpdateBumpalong = RegexCode.UpdateBumpalong;
 
         // Interior nodes do not correspond to primitive operations, but
         // control structures compositing other operations
@@ -235,6 +236,7 @@ namespace System.Text.RegularExpressions
                     case Setloop:
                     case Setloopatomic:
                     case Start:
+                    case UpdateBumpalong:
                         Debug.Assert(childCount == 0, $"Expected zero children for {node.TypeName}, got {childCount}.");
                         break;
 
@@ -308,6 +310,44 @@ namespace System.Text.RegularExpressions
                 // to implementations that don't support backtracking.
                 EliminateEndingBacktracking(rootNode.Child(0), DefaultMaxRecursionDepth);
 
+                // Optimization: unnecessary re-processing of atomic starting groups.
+                // If an expression is guaranteed to begin with a single-character infinite atomic group that isn't part of an alternation (in which case it
+                // wouldn't be guaranteed to be at the beginning) or a capture (in which case a back reference could be influenced by its length), then we
+                // can update the tree with a temporary node to indicate that the implementation should use that node's ending position in the input text
+                // as the next starting position at which to start the next match. This avoids redoing matches we've already performed, e.g. matching
+                // "\w+@dot.net" against "is this a valid address@dot.net", the \w+ will initially match the "is" and then will fail to match the "@".
+                // Rather than bumping the scan loop by 1 and trying again to match at the "s", we can instead start at the " ".  We limit ourselves to
+                // one/set atomic loops with a min iteration count of 1 so that we know we'll get something in exchange for the extra overhead of storing
+                // the updated position. For functional correctness we can only consider infinite atomic loops, as to be able to start at the end of the
+                // loop we need the loop to have consumed all possible matches; otherwise, you could end up with a pattern like "a{1,3}b" matching
+                // against "aaaabc", which should match, but if we pre-emptively stop consuming after the first three a's and re-start from that position,
+                // we'll end up failing the match even though it should have succeeded.
+                {
+                    RegexNode node = rootNode.Child(0); // skip implicit root capture node
+                    while (true)
+                    {
+                        switch (node.Type)
+                        {
+                            case Atomic:
+                            case Concatenate:
+                                node = node.Child(0);
+                                continue;
+
+                            case Oneloopatomic when node.M > 0 && node.N == int.MaxValue:
+                            case Notoneloopatomic when node.M > 0 && node.N == int.MaxValue:
+                            case Setloopatomic when node.M > 0 && node.N == int.MaxValue:
+                                RegexNode? parent = node.Next;
+                                if (parent != null && parent.Type == Concatenate)
+                                {
+                                    parent.InsertChild(1, new RegexNode(UpdateBumpalong, node.Options));
+                                }
+                                break;
+                        }
+
+                        break;
+                    }
+                }
+
                 // Optimization: implicit anchoring.
                 // If the expression begins with a .* loop, add an anchor to the beginning:
                 // - If Singleline is set such that '.' eats anything, the .* will zip to the end of the string and then backtrack through
@@ -1603,7 +1643,7 @@ namespace System.Text.RegularExpressions
                     case Lazyloop:
                     case Loop:
                         // A node graph repeated at least M times.
-                        return node.M * ComputeMinLength(node.Child(0), maxDepth - 1);
+                        return (int)Math.Min(int.MaxValue, (long)node.M * ComputeMinLength(node.Child(0), maxDepth - 1));
 
                     case Alternate:
                         // The minimum required length for any of the alternation's branches.
@@ -1621,13 +1661,13 @@ namespace System.Text.RegularExpressions
                     case Concatenate:
                         // The sum of all of the concatenation's children.
                         {
-                            int sum = 0;
+                            long sum = 0;
                             int childCount = node.ChildCount();
                             for (int i = 0; i < childCount; i++)
                             {
                                 sum += ComputeMinLength(node.Child(i), maxDepth - 1);
                             }
-                            return sum;
+                            return (int)Math.Min(int.MaxValue, sum);
                         }
 
                     case Atomic:
@@ -1639,8 +1679,9 @@ namespace System.Text.RegularExpressions
 
                     case Empty:
                     case Nothing:
+                    case UpdateBumpalong:
                     // Nothing to match. In the future, we could potentially use Nothing to say that the min length
-                    // is infinite, but that would require a different structure, as that would only applies if the
+                    // is infinite, but that would require a different structure, as that would only apply if the
                     // Nothing match is required in all cases (rather than, say, as one branch of an alternation).
                     case Beginning:
                     case Bol:
@@ -1668,7 +1709,7 @@ namespace System.Text.RegularExpressions
 #if DEBUG
                         Debug.Fail($"Unknown node: {node.TypeName}");
 #endif
-                        return 0;
+                        goto case Empty;
                 }
             }
         }
@@ -1716,6 +1757,17 @@ namespace System.Text.RegularExpressions
             }
         }
 
+        public void InsertChild(int index, RegexNode newChild)
+        {
+            Debug.Assert(Children is List<RegexNode>);
+
+            newChild.Next = this; // so that the child can see its parent while being reduced
+            newChild = newChild.Reduce();
+            newChild.Next = this; // in case Reduce returns a different node that needs to be reparented
+
+            ((List<RegexNode>)Children).Insert(index, newChild);
+        }
+
         public void ReplaceChild(int index, RegexNode newChild)
         {
             Debug.Assert(Children != null);
@@ -1799,6 +1851,7 @@ namespace System.Text.RegularExpressions
                 Atomic => nameof(Atomic),
                 Testref => nameof(Testref),
                 Testgroup => nameof(Testgroup),
+                UpdateBumpalong => nameof(UpdateBumpalong),
                 _ => $"(unknown {Type})"
             };
 
index 627a485..21e3856 100644 (file)
@@ -608,6 +608,7 @@ namespace System.Text.RegularExpressions
                 case RegexNode.Start:
                 case RegexNode.EndZ:
                 case RegexNode.End:
+                case RegexNode.UpdateBumpalong:
                     PushFC(new RegexFC(true));
                     break;
 
index 33fe44f..f09b383 100644 (file)
@@ -518,6 +518,7 @@ namespace System.Text.RegularExpressions
                 case RegexNode.Start:
                 case RegexNode.EndZ:
                 case RegexNode.End:
+                case RegexNode.UpdateBumpalong:
                     Emit(node.Type);
                     break;
 
index 60537db..827f664 100644 (file)
@@ -391,7 +391,6 @@ namespace System.Text.RegularExpressions.Tests
             if (!PlatformDetection.IsNetFramework) // missing fix for https://github.com/dotnet/runtime/issues/24759
             {
                 yield return new object[] { null, @"(cat)(\c[*)(dog)", "asdlkcat\u001bdogiwod", RegexOptions.None, new string[] { "cat\u001bdog", "cat", "\u001b", "dog" } };
-                yield return new object[] { null, @"(cat)(\c[*)(dog)", "asdlkcat\u001Bdogiwod", RegexOptions.None, new string[] { "cat\u001Bdog", "cat", "\u001B", "dog" } };
             }
 
             // Atomic Zero-Width Assertions \A \G ^ \Z \z \b \B
index 1f02723..cf25c63 100644 (file)
@@ -96,6 +96,35 @@ namespace System.Text.RegularExpressions.Tests
                 yield return new object[] { Case(@"a[^wyz]*w"), "abczw", RegexOptions.IgnoreCase, 0, 0, false, string.Empty };
             }
 
+            // Loops at beginning of expressions
+            yield return new object[] { @"a+", "aaa", RegexOptions.None, 0, 3, true, "aaa" };
+            yield return new object[] { @"a+\d+", "a1", RegexOptions.None, 0, 2, true, "a1" };
+            yield return new object[] { @".+\d+", "a1", RegexOptions.None, 0, 2, true, "a1" };
+            yield return new object[] { ".+\nabc", "a\nabc", RegexOptions.None, 0, 5, true, "a\nabc" };
+            yield return new object[] { @"\d+", "abcd123efg", RegexOptions.None, 0, 10, true, "123" };
+            yield return new object[] { @"\d+\d+", "abcd123efg", RegexOptions.None, 0, 10, true, "123" };
+            yield return new object[] { @"\w+123\w+", "abcd123efg", RegexOptions.None, 0, 10, true, "abcd123efg" };
+            yield return new object[] { @"\d+\w+", "abcd123efg", RegexOptions.None, 0, 10, true, "123efg" };
+            yield return new object[] { @"\w+@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, true, "abc@def.com" };
+            yield return new object[] { @"\w{3,}@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, true, "abc@def.com" };
+            yield return new object[] { @"\w{4,}@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, false, string.Empty };
+            yield return new object[] { @"\w{2,5}@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, true, "abc@def.com" };
+            yield return new object[] { @"\w{3}@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, true, "abc@def.com" };
+            yield return new object[] { @"\w{0,3}@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, true, "abc@def.com" };
+            yield return new object[] { @"\w{0,2}c@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, true, "abc@def.com" };
+            yield return new object[] { @"\w*@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, true, "abc@def.com" };
+            yield return new object[] { @"(\w+)@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, true, "abc@def.com" };
+            yield return new object[] { @"((\w+))@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, true, "abc@def.com" };
+            yield return new object[] { @"(\w+)c@\w+.com", "abc@def.comabcdef", RegexOptions.None, 0, 17, true, "abc@def.com" };
+            yield return new object[] { @"(\w+)c@\w+.com\1", "abc@def.comabcdef", RegexOptions.None, 0, 17, true, "abc@def.comab" };
+            yield return new object[] { @"(\w+)@def.com\1", "abc@def.comab", RegexOptions.None, 0, 13, false, string.Empty };
+            yield return new object[] { @"(\w+)@def.com\1", "abc@def.combc", RegexOptions.None, 0, 13, true, "bc@def.combc" };
+            yield return new object[] { @"(\w*)@def.com\1", "abc@def.com", RegexOptions.None, 0, 11, true, "@def.com" };
+            yield return new object[] { @"\w+(?<!a)", "a", RegexOptions.None, 0, 1, false, string.Empty };
+            yield return new object[] { @"\w+(?<!a)", "aa", RegexOptions.None, 0, 2, false, string.Empty };
+            yield return new object[] { @"(?>\w+)(?<!a)", "a", RegexOptions.None, 0, 1, false, string.Empty };
+            yield return new object[] { @"(?>\w+)(?<!a)", "aa", RegexOptions.None, 0, 2, false, string.Empty };
+
             // Using beginning/end of string chars \A, \Z: Actual - "\\Aaaa\\w+zzz\\Z"
             yield return new object[] { @"\Aaaa\w+zzz\Z", "aaaasdfajsdlfjzzz", RegexOptions.IgnoreCase, 0, 17, true, "aaaasdfajsdlfjzzz" };
             yield return new object[] { @"\Aaaaaa\w+zzz\Z", "aaaa", RegexOptions.IgnoreCase, 0, 4, false, string.Empty };
index a4fd802..1c416b8 100644 (file)
@@ -495,6 +495,8 @@ namespace System.Text.RegularExpressions.Tests
         [InlineData(@"abcd(?<=cd)efgh", 8)]
         [InlineData(@"abcd(?!ab)efgh", 8)]
         [InlineData(@"abcd(?<!ef)efgh", 8)]
+        [InlineData(@"(a{1073741824}){2}", 2147483647)]
+        [InlineData(@"a{1073741824}b{1073741824}", 2147483647)]
         // we stop computing after a certain depth; if that logic changes in the future, these tests can be updated
         [InlineData(@"((((((((((((((((((((((((((((((ab|cd+)|ef+)|gh+)|ij+)|kl+)|mn+)|op+)|qr+)|st+)|uv+)|wx+)|yz+)|01+)|23+)|45+)|67+)|89+)|AB+)|CD+)|EF+)|GH+)|IJ+)|KL+)|MN+)|OP+)|QR+)|ST+)|UV+)|WX+)|YZ)", 0)]
         [InlineData(@"(YZ+|(WX+|(UV+|(ST+|(QR+|(OP+|(MN+|(KL+|(IJ+|(GH+|(EF+|(CD+|(AB+|(89+|(67+|(45+|(23+|(01+|(yz+|(wx+|(uv+|(st+|(qr+|(op+|(mn+|(kl+|(ij+|(gh+|(ef+|(de+|(a|bc+)))))))))))))))))))))))))))))))", 0)]