Fix boundary handling in Regex auto-atomicity optimization (#79088)
authorStephen Toub <stoub@microsoft.com>
Thu, 1 Dec 2022 19:29:59 +0000 (14:29 -0500)
committerGitHub <noreply@github.com>
Thu, 1 Dec 2022 19:29:59 +0000 (14:29 -0500)
This fixes a regression that occurred between .NET Core 3.1 and .NET 5, when we added the auto-atomicity optimization.  This optimization is based on the premise that if a loop is followed by something that can't possibly match anything the loop could give up when backtracking, then backtracking for that loop is wasted effort, and thus the loop can be made atomic.  For example, given the expression `a*b`, there's nothing the `a*` loop could give up when backtracking that would also match `b`, thus this can be transformed into `(?>a*)b`.  As part of this, we also factor in word boundaries, but we're too aggressive in our handling of them.  If you have `a+\b`, there's nothing that `a+` could give up that would enable the `\b` to match.  However, if you have `a*\b`, since `\b` is a zero-width assertion, it is actually possible for the `a*` to give up something (the empty match) that could match `\b`, yet we're erroneously still converting that to be `(?>a*)\b`.  The fix is to constrain that part of the optimization to require a non-zero minimum bound on the loop in order to make it atomic.

While it's simple to repro incorrect handling here, it's also rare to find in real-world use, as word boundary anchors are used to demarcate words, this issue really only affects cases of empty words, and it's unusual that someone would write an expression to try to identify empty words.

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.MultipleMatches.Tests.cs
src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs

index 80ec75b..a94be74 100644 (file)
@@ -2117,10 +2117,10 @@ namespace System.Text.RegularExpressions
                             case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && node.Ch != subsequent.Ch:
                             case RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic when subsequent.M == 0 && node.Ch == subsequent.Ch:
                             case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(node.Ch, subsequent.Str!):
-                            case RegexNodeKind.Boundary when RegexCharClass.IsBoundaryWordChar(node.Ch):
-                            case RegexNodeKind.NonBoundary when !RegexCharClass.IsBoundaryWordChar(node.Ch):
-                            case RegexNodeKind.ECMABoundary when RegexCharClass.IsECMAWordChar(node.Ch):
-                            case RegexNodeKind.NonECMABoundary when !RegexCharClass.IsECMAWordChar(node.Ch):
+                            case RegexNodeKind.Boundary when node.M > 0 && RegexCharClass.IsBoundaryWordChar(node.Ch):
+                            case RegexNodeKind.NonBoundary when node.M > 0 && !RegexCharClass.IsBoundaryWordChar(node.Ch):
+                            case RegexNodeKind.ECMABoundary when node.M > 0 && RegexCharClass.IsECMAWordChar(node.Ch):
+                            case RegexNodeKind.NonECMABoundary when node.M > 0 && !RegexCharClass.IsECMAWordChar(node.Ch):
                                 // The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well.
                                 break;
 
@@ -2163,10 +2163,10 @@ namespace System.Text.RegularExpressions
 
                             case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
                             case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
-                            case RegexNodeKind.Boundary when node.Str is RegexCharClass.WordClass or RegexCharClass.DigitClass:
-                            case RegexNodeKind.NonBoundary when node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass:
-                            case RegexNodeKind.ECMABoundary when node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass:
-                            case RegexNodeKind.NonECMABoundary when node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass:
+                            case RegexNodeKind.Boundary when node.M > 0 && node.Str is RegexCharClass.WordClass or RegexCharClass.DigitClass:
+                            case RegexNodeKind.NonBoundary when node.M > 0 && node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass:
+                            case RegexNodeKind.ECMABoundary when node.M > 0 && node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass:
+                            case RegexNodeKind.NonECMABoundary when node.M > 0 && node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass:
                                 // The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well.
                                 break;
 
index bd0b1ea..990d489 100644 (file)
@@ -292,6 +292,25 @@ namespace System.Text.RegularExpressions.Tests
                     }
                 };
 
+                foreach (RegexOptions options in new[] { RegexOptions.None, RegexOptions.ECMAScript })
+                {
+                    if (RegexHelpers.IsNonBacktracking(engine))
+                    {
+                        continue;
+                    }
+
+                    yield return new object[]
+                    {
+                        engine,
+                        @"a?\b", "ac", options,
+                        new[]
+                        {
+                            new CaptureData("", 0, 0),
+                            new CaptureData("", 2, 0),
+                        }
+                    };
+                }
+
                 if (!PlatformDetection.IsNetFramework)
                 {
                     // .NET Framework missing fix in https://github.com/dotnet/runtime/pull/1075
index dbb22cf..c7c8cc5 100644 (file)
@@ -334,8 +334,10 @@ namespace System.Text.RegularExpressions.Tests
         [InlineData("[^\n]*\n+", "(?>[^\n]*)(?>\n+)")]
         [InlineData("(a+)b", "((?>a+))b")]
         [InlineData("a*(?:bcd|efg)", "(?>a*)(?:bcd|efg)")]
-        [InlineData("\\w*\\b", "(?>\\w*)\\b")]
-        [InlineData("\\d*\\b", "(?>\\d*)\\b")]
+        [InlineData("\\w+\\b", "(?>\\w+)\\b")]
+        [InlineData("\\d+\\b", "(?>\\d+)\\b")]
+        [InlineData("\\W+\\B", "(?>\\W+)\\B")]
+        [InlineData("\\D+\\B", "(?>\\D+)\\B")]
         [InlineData("(?:abc*|def*)g", "(?:ab(?>c*)|de(?>f*))g")]
         [InlineData("(?:a[ce]*|b*)g", "(?:a(?>[ce]*)|(?>b*))g")]
         [InlineData("(?:a[ce]*|b*)c", "(?:a[ce]*|(?>b*))c")]
@@ -476,6 +478,11 @@ namespace System.Text.RegularExpressions.Tests
         [InlineData(@"\w*\b\w+", @"(?>\w*)\b\w+")]
         [InlineData(@"\W+\B\W+", @"(?>\W+)\B\W")]
         [InlineData(@"\W*\B\W+", @"(?>\W*)\B\W")]
+        [InlineData(@"a?\b", @"(?>a?)\b")]
+        [InlineData(@"\w*\b", @"(?>\w*)\b")]
+        [InlineData(@"\d*\b", @"(?>\d*)\b")]
+        [InlineData(@"\W*\B", @"(?>\W*)\B")]
+        [InlineData(@"\D*\B", @"(?>\D*)\B")]
         // Loops inside alternation constructs
         [InlineData("(abc*|def)chi", "(ab(?>c*)|def)chi")]
         [InlineData("(abc|def*)fhi", "(abc|de(?>f*))fhi")]