From 1657df9afb04aae206176f0ca465837fd56cdaa8 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Tue, 5 May 2020 14:22:13 -0400 Subject: [PATCH] Reduce unnecessary Regex match attempts for expressions beginning with atomic loops (#35824) * Reduce unnecessary Regex match attempts for expressions beginning with atomic loops * Fix corner-case overflows in ComputeMinLength --- .../System/Text/RegularExpressions/RegexCode.cs | 3 ++ .../Text/RegularExpressions/RegexCompiler.cs | 35 ++++++++++++ .../Text/RegularExpressions/RegexInterpreter.cs | 8 +++ .../System/Text/RegularExpressions/RegexNode.cs | 63 ++++++++++++++++++++-- .../Text/RegularExpressions/RegexPrefixAnalyzer.cs | 1 + .../System/Text/RegularExpressions/RegexWriter.cs | 1 + .../tests/Regex.Groups.Tests.cs | 1 - .../tests/Regex.Match.Tests.cs | 29 ++++++++++ .../tests/RegexReductionTests.cs | 2 + 9 files changed, 137 insertions(+), 6 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs index 8df7e06..f327e8c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCode.cs @@ -85,6 +85,7 @@ namespace System.Text.RegularExpressions public const int Oneloopatomic = 43; // lef,back char,min,max (?> a {,n} ) public const int Notoneloopatomic = 44; // lef,back set,min,max (?> . {,n} ) public const int Setloopatomic = 45; // lef,back set,min,max (?> [\d]{,n} ) + public const int UpdateBumpalong = 46; // updates the bumpalong position to the current position // Modifiers for alternate modes public const int Mask = 63; // Mask to get unmodified ordinary operator @@ -184,6 +185,7 @@ namespace System.Text.RegularExpressions case Backjump: case Forejump: case Stop: + case UpdateBumpalong: return 1; case One: @@ -273,6 +275,7 @@ namespace System.Text.RegularExpressions Oneloopatomic => nameof(Oneloopatomic), Notoneloopatomic => nameof(Notoneloopatomic), Setloopatomic => nameof(Setloopatomic), + UpdateBumpalong => nameof(UpdateBumpalong), _ => "(unknown)" }; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index ecd8c0b..e3d87bf 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -300,6 +300,9 @@ namespace System.Text.RegularExpressions /// A macro for _ilg.Emit(OpCodes.Newobj, constructor). protected void Newobj(ConstructorInfo constructor) => _ilg!.Emit(OpCodes.Newobj, constructor); + /// A macro for _ilg.Emit(OpCodes.Dup). + protected void Dup() => _ilg!.Emit(OpCodes.Dup); + /// A macro for _ilg.Emit(OpCodes.Rem_Un). private void RemUn() => _ilg!.Emit(OpCodes.Rem_Un); @@ -1924,8 +1927,10 @@ namespace System.Text.RegularExpressions case RegexNode.Setloopatomic: // "Empty" is easy: nothing is emitted for it. // "Nothing" is also easy: it doesn't match anything. + // "UpdateBumpalong" doesn't match anything, it's just an optional directive to the engine. case RegexNode.Empty: case RegexNode.Nothing: + case RegexNode.UpdateBumpalong: supported = true; break; @@ -2441,12 +2446,28 @@ namespace System.Text.RegularExpressions // Emit nothing. break; + case RegexNode.UpdateBumpalong: + EmitUpdateBumpalong(); + break; + default: Debug.Fail($"Unexpected node type: {node.Type}"); break; } } + // Emits the code to handle updating base.runtextpos to runtextpos in response to + // an UpdateBumpalong node. This is used when we want to inform the scan loop that + // it should bump from this location rather than from the original location. + void EmitUpdateBumpalong() + { + // base.runtextpos = runtextpos; + TransferTextSpanPosToRunTextPos(); + Ldthis(); + Ldloc(runtextposLocal); + Stfld(s_runtextposField); + } + // Emits the code to handle a single-character match. void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? offset = null) { @@ -3381,6 +3402,20 @@ namespace System.Text.RegularExpressions Back(); break; + case RegexCode.UpdateBumpalong: + // UpdateBumpalong should only exist in the code stream at such a point where the root + // of the backtracking stack contains the runtextpos from the start of this Go call. Replace + // that tracking value with the current runtextpos value. + //: base.runtrack[base.runtrack.Length - 1] = runtextpos; + Ldloc(_runtrackLocal!); + Dup(); + Ldlen(); + Ldc(1); + Sub(); + Ldloc(_runtextposLocal!); + StelemI4(); + break; + case RegexCode.Goto: //: Goto(Operand(0)); Goto(Operand(0)); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index 2de366b..e479b37 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -1345,6 +1345,14 @@ namespace System.Text.RegularExpressions advance = 2; continue; + case RegexCode.UpdateBumpalong: + // UpdateBumpalong should only exist in the code stream at such a point where the root + // of the backtracking stack contains the runtextpos from the start of this Go call. Replace + // that tracking value with the current runtextpos value. + runtrack![runtrack.Length - 1] = runtextpos; + advance = 0; + continue; + default: Debug.Fail($"Unimplemented state: {_operator:X8}"); break; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 64d7052..1479c2e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -80,6 +80,7 @@ namespace System.Text.RegularExpressions public const int Oneloopatomic = RegexCode.Oneloopatomic; // c,n (?> a*) public const int Notoneloopatomic = RegexCode.Notoneloopatomic; // c,n (?> .*) public const int Setloopatomic = RegexCode.Setloopatomic; // set,n (?> \d*) + public const int UpdateBumpalong = RegexCode.UpdateBumpalong; // Interior nodes do not correspond to primitive operations, but // control structures compositing other operations @@ -235,6 +236,7 @@ namespace System.Text.RegularExpressions case Setloop: case Setloopatomic: case Start: + case UpdateBumpalong: Debug.Assert(childCount == 0, $"Expected zero children for {node.TypeName}, got {childCount}."); break; @@ -308,6 +310,44 @@ namespace System.Text.RegularExpressions // to implementations that don't support backtracking. EliminateEndingBacktracking(rootNode.Child(0), DefaultMaxRecursionDepth); + // Optimization: unnecessary re-processing of atomic starting groups. + // If an expression is guaranteed to begin with a single-character infinite atomic group that isn't part of an alternation (in which case it + // wouldn't be guaranteed to be at the beginning) or a capture (in which case a back reference could be influenced by its length), then we + // can update the tree with a temporary node to indicate that the implementation should use that node's ending position in the input text + // as the next starting position at which to start the next match. This avoids redoing matches we've already performed, e.g. matching + // "\w+@dot.net" against "is this a valid address@dot.net", the \w+ will initially match the "is" and then will fail to match the "@". + // Rather than bumping the scan loop by 1 and trying again to match at the "s", we can instead start at the " ". We limit ourselves to + // one/set atomic loops with a min iteration count of 1 so that we know we'll get something in exchange for the extra overhead of storing + // the updated position. For functional correctness we can only consider infinite atomic loops, as to be able to start at the end of the + // loop we need the loop to have consumed all possible matches; otherwise, you could end up with a pattern like "a{1,3}b" matching + // against "aaaabc", which should match, but if we pre-emptively stop consuming after the first three a's and re-start from that position, + // we'll end up failing the match even though it should have succeeded. + { + RegexNode node = rootNode.Child(0); // skip implicit root capture node + while (true) + { + switch (node.Type) + { + case Atomic: + case Concatenate: + node = node.Child(0); + continue; + + case Oneloopatomic when node.M > 0 && node.N == int.MaxValue: + case Notoneloopatomic when node.M > 0 && node.N == int.MaxValue: + case Setloopatomic when node.M > 0 && node.N == int.MaxValue: + RegexNode? parent = node.Next; + if (parent != null && parent.Type == Concatenate) + { + parent.InsertChild(1, new RegexNode(UpdateBumpalong, node.Options)); + } + break; + } + + break; + } + } + // Optimization: implicit anchoring. // If the expression begins with a .* loop, add an anchor to the beginning: // - If Singleline is set such that '.' eats anything, the .* will zip to the end of the string and then backtrack through @@ -1603,7 +1643,7 @@ namespace System.Text.RegularExpressions case Lazyloop: case Loop: // A node graph repeated at least M times. - return node.M * ComputeMinLength(node.Child(0), maxDepth - 1); + return (int)Math.Min(int.MaxValue, (long)node.M * ComputeMinLength(node.Child(0), maxDepth - 1)); case Alternate: // The minimum required length for any of the alternation's branches. @@ -1621,13 +1661,13 @@ namespace System.Text.RegularExpressions case Concatenate: // The sum of all of the concatenation's children. { - int sum = 0; + long sum = 0; int childCount = node.ChildCount(); for (int i = 0; i < childCount; i++) { sum += ComputeMinLength(node.Child(i), maxDepth - 1); } - return sum; + return (int)Math.Min(int.MaxValue, sum); } case Atomic: @@ -1639,8 +1679,9 @@ namespace System.Text.RegularExpressions case Empty: case Nothing: + case UpdateBumpalong: // Nothing to match. In the future, we could potentially use Nothing to say that the min length - // is infinite, but that would require a different structure, as that would only applies if the + // is infinite, but that would require a different structure, as that would only apply if the // Nothing match is required in all cases (rather than, say, as one branch of an alternation). case Beginning: case Bol: @@ -1668,7 +1709,7 @@ namespace System.Text.RegularExpressions #if DEBUG Debug.Fail($"Unknown node: {node.TypeName}"); #endif - return 0; + goto case Empty; } } } @@ -1716,6 +1757,17 @@ namespace System.Text.RegularExpressions } } + public void InsertChild(int index, RegexNode newChild) + { + Debug.Assert(Children is List); + + newChild.Next = this; // so that the child can see its parent while being reduced + newChild = newChild.Reduce(); + newChild.Next = this; // in case Reduce returns a different node that needs to be reparented + + ((List)Children).Insert(index, newChild); + } + public void ReplaceChild(int index, RegexNode newChild) { Debug.Assert(Children != null); @@ -1799,6 +1851,7 @@ namespace System.Text.RegularExpressions Atomic => nameof(Atomic), Testref => nameof(Testref), Testgroup => nameof(Testgroup), + UpdateBumpalong => nameof(UpdateBumpalong), _ => $"(unknown {Type})" }; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index 627a485..21e3856 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -608,6 +608,7 @@ namespace System.Text.RegularExpressions case RegexNode.Start: case RegexNode.EndZ: case RegexNode.End: + case RegexNode.UpdateBumpalong: PushFC(new RegexFC(true)); break; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs index 33fe44f..f09b383 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs @@ -518,6 +518,7 @@ namespace System.Text.RegularExpressions case RegexNode.Start: case RegexNode.EndZ: case RegexNode.End: + case RegexNode.UpdateBumpalong: Emit(node.Type); break; diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs index 60537db..827f664 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs @@ -391,7 +391,6 @@ namespace System.Text.RegularExpressions.Tests if (!PlatformDetection.IsNetFramework) // missing fix for https://github.com/dotnet/runtime/issues/24759 { yield return new object[] { null, @"(cat)(\c[*)(dog)", "asdlkcat\u001bdogiwod", RegexOptions.None, new string[] { "cat\u001bdog", "cat", "\u001b", "dog" } }; - yield return new object[] { null, @"(cat)(\c[*)(dog)", "asdlkcat\u001Bdogiwod", RegexOptions.None, new string[] { "cat\u001Bdog", "cat", "\u001B", "dog" } }; } // Atomic Zero-Width Assertions \A \G ^ \Z \z \b \B diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 1f02723..cf25c63 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -96,6 +96,35 @@ namespace System.Text.RegularExpressions.Tests yield return new object[] { Case(@"a[^wyz]*w"), "abczw", RegexOptions.IgnoreCase, 0, 0, false, string.Empty }; } + // Loops at beginning of expressions + yield return new object[] { @"a+", "aaa", RegexOptions.None, 0, 3, true, "aaa" }; + yield return new object[] { @"a+\d+", "a1", RegexOptions.None, 0, 2, true, "a1" }; + yield return new object[] { @".+\d+", "a1", RegexOptions.None, 0, 2, true, "a1" }; + yield return new object[] { ".+\nabc", "a\nabc", RegexOptions.None, 0, 5, true, "a\nabc" }; + yield return new object[] { @"\d+", "abcd123efg", RegexOptions.None, 0, 10, true, "123" }; + yield return new object[] { @"\d+\d+", "abcd123efg", RegexOptions.None, 0, 10, true, "123" }; + yield return new object[] { @"\w+123\w+", "abcd123efg", RegexOptions.None, 0, 10, true, "abcd123efg" }; + yield return new object[] { @"\d+\w+", "abcd123efg", RegexOptions.None, 0, 10, true, "123efg" }; + yield return new object[] { @"\w+@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, true, "abc@def.com" }; + yield return new object[] { @"\w{3,}@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, true, "abc@def.com" }; + yield return new object[] { @"\w{4,}@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, false, string.Empty }; + yield return new object[] { @"\w{2,5}@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, true, "abc@def.com" }; + yield return new object[] { @"\w{3}@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, true, "abc@def.com" }; + yield return new object[] { @"\w{0,3}@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, true, "abc@def.com" }; + yield return new object[] { @"\w{0,2}c@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, true, "abc@def.com" }; + yield return new object[] { @"\w*@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, true, "abc@def.com" }; + yield return new object[] { @"(\w+)@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, true, "abc@def.com" }; + yield return new object[] { @"((\w+))@\w+.com", "abc@def.com", RegexOptions.None, 0, 11, true, "abc@def.com" }; + yield return new object[] { @"(\w+)c@\w+.com", "abc@def.comabcdef", RegexOptions.None, 0, 17, true, "abc@def.com" }; + yield return new object[] { @"(\w+)c@\w+.com\1", "abc@def.comabcdef", RegexOptions.None, 0, 17, true, "abc@def.comab" }; + yield return new object[] { @"(\w+)@def.com\1", "abc@def.comab", RegexOptions.None, 0, 13, false, string.Empty }; + yield return new object[] { @"(\w+)@def.com\1", "abc@def.combc", RegexOptions.None, 0, 13, true, "bc@def.combc" }; + yield return new object[] { @"(\w*)@def.com\1", "abc@def.com", RegexOptions.None, 0, 11, true, "@def.com" }; + yield return new object[] { @"\w+(?\w+)(?\w+)(?