Fix beginning fixups of captures after Regex span support (#66713)
authorStephen Toub <stoub@microsoft.com>
Wed, 16 Mar 2022 21:18:38 +0000 (17:18 -0400)
committerGitHub <noreply@github.com>
Wed, 16 Mar 2022 21:18:38 +0000 (17:18 -0400)
The Regex span support changed the scanning infrastructure to always be based on spans.  That means that when a string is passed in by the caller, internally we still operate on it as a span.  That also means we can take advantage of slicing, and if the caller has specified via a beginning/length set of arguments that we should only process a substring, we can just slice to that substring.  That, however, then means that all offsets computed by the scanning implementation are 0-based rather than beginning-based.  The span change included a fixup for the overall match position, but not for the position of each individual capture, and that then meant that captures were providing the wrong values.  We unfortunately didn't have any tests for validating groups that also involved non-0 beginnings with string inputs.

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Capture.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Match.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexRunner.cs
src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Groups.Tests.cs

index 2eee81f..fba98ab 100644 (file)
@@ -19,17 +19,6 @@ namespace System.Text.RegularExpressions
         /// <summary>Returns the position in the original string where the first character of captured substring was found.</summary>
         public int Index { get; private protected set; }
 
-        /// <summary>
-        /// This method should only be called when the text for matching was sliced with a different beginning, so the resulting index of
-        /// the match is not from the start of the text, but instead the start of the slice. This method will add back that extra indices
-        /// to account for the original text beginning.
-        /// </summary>
-        /// <param name="beginning">The original text's beginning offset.</param>
-        internal void AddBeginningToIndex(int beginning)
-        {
-            Index += beginning;
-        }
-
         /// <summary>Returns the length of the captured substring.</summary>
         public int Length { get; private protected set; }
 
index adf727a..882ff11 100644 (file)
@@ -45,8 +45,8 @@ namespace System.Text.RegularExpressions
         internal int _textstart;
 
         // output from the match
-        internal int[][] _matches;
-        internal int[] _matchcount;
+        internal readonly int[][] _matches;
+        internal readonly int[] _matchcount;
         internal bool _balancing;        // whether we've done any balancing with this match.  If we
                                          // have done balancing, we'll need to do extra work in Tidy().
 
@@ -89,7 +89,7 @@ namespace System.Text.RegularExpressions
         /// </summary>
         /// <remarks>
         /// The main difference between the public <see cref="Group.Success"/> property and this one, is that <see cref="Group.Success"/> requires
-        /// for a <see cref="Match"/> to call <see cref="Match.Tidy(int)"/> first, in order to report the correct value, while this API will return
+        /// for a <see cref="Match"/> to call <see cref="Tidy"/> first, in order to report the correct value, while this API will return
         /// the correct value right after a Match gets calculated, meaning that it will return <see langword="true"/> right after <see cref="RegexRunner.Capture(int, int, int)"/>
         /// </remarks>
         internal bool FoundMatch => _matchcount[0] > 0;
@@ -272,17 +272,43 @@ namespace System.Text.RegularExpressions
         }
 
         /// <summary>Tidy the match so that it can be used as an immutable result</summary>
-        internal void Tidy(int textpos)
+        internal void Tidy(int textpos, int beginningOfSpanSlice)
         {
+            int[] matchcount = _matchcount;
+            int[][] matches = _matches;
+
             _textpos = textpos;
-            _capcount = _matchcount[0];
-            int[] interval = _matches[0];
+            _capcount = matchcount[0];
+            int[] interval = matches[0];
             Index = interval[0];
             Length = interval[1];
             if (_balancing)
             {
                 TidyBalancing();
             }
+
+            // At this point the Match is consistent for handing back to a caller, with regards to the span that was processed.
+            // However, the caller may have actually provided a string, and may have done so with a non-zero beginning.
+            // In such a case, all offsets need to be shifted by beginning, e.g. if beginning is 5 and a capture occurred at
+            // offset 17, that 17 offset needs to be increased to 22 to account for the fact that it was actually 17 from the
+            // beginning, which the implementation saw as 0 but which from the caller's perspective was 5.
+            Debug.Assert(!_balancing);
+            if (beginningOfSpanSlice != 0)
+            {
+                Index += beginningOfSpanSlice;
+                for (int groupNumber = 0; groupNumber < matches.Length; groupNumber++)
+                {
+                    int[] captures = matches[groupNumber];
+                    if (captures is not null)
+                    {
+                        int capturesLength = matchcount[groupNumber] * 2; // each capture has an offset and a length
+                        for (int c = 0; c < capturesLength; c += 2)
+                        {
+                            captures[c] += beginningOfSpanSlice;
+                        }
+                    }
+                }
+            }
         }
 
         private void TidyBalancing()
index c2cd8e8..07b435a 100644 (file)
@@ -570,13 +570,7 @@ namespace System.Text.RegularExpressions
                     return null;
                 }
 
-                match.Tidy(runner.runtextpos);
-
-                // If the passed in beginning was not 0 then we need to adjust the offsets on the match object.
-                if (beginning != 0)
-                {
-                    match.AddBeginningToIndex(beginning);
-                }
+                match.Tidy(runner.runtextpos, beginning);
 
                 return match;
             }
index 13a20bb..bd25add 100644 (file)
@@ -184,7 +184,7 @@ namespace System.Text.RegularExpressions
                 }
 
                 runmatch = null;
-                match.Tidy(runtextpos);
+                match.Tidy(runtextpos, 0);
             }
             else
             {
index 58e78b2..3d297ad 100644 (file)
@@ -961,22 +961,26 @@ namespace System.Text.RegularExpressions.Tests
                 return;
             }
 
-            Match match = regex.Match(input);
-
-            Assert.True(match.Success);
-            Assert.Equal(expectedGroups[0], match.Value);
+            foreach (string prefix in new[] { "", "IGNORETHIS" })
+            {
+                Match match = prefix.Length == 0 ?
+                    regex.Match(input) : // validate the original input
+                    regex.Match(prefix + input, prefix.Length, input.Length); // validate we handle groups and beginning/length correctly
 
-            Assert.Equal(expectedGroups.Length, match.Groups.Count);
+                Assert.True(match.Success);
+                Assert.Equal(expectedGroups[0], match.Value);
+                Assert.Equal(expectedGroups.Length, match.Groups.Count);
 
-            int[] groupNumbers = regex.GetGroupNumbers();
-            string[] groupNames = regex.GetGroupNames();
-            for (int i = 0; i < expectedGroups.Length; i++)
-            {
-                Assert.Equal(expectedGroups[i], match.Groups[groupNumbers[i]].Value);
-                Assert.Equal(match.Groups[groupNumbers[i]], match.Groups[groupNames[i]]);
+                int[] groupNumbers = regex.GetGroupNumbers();
+                string[] groupNames = regex.GetGroupNames();
+                for (int i = 0; i < expectedGroups.Length; i++)
+                {
+                    Assert.Equal(expectedGroups[i], match.Groups[groupNumbers[i]].Value);
+                    Assert.Equal(match.Groups[groupNumbers[i]], match.Groups[groupNames[i]]);
 
-                Assert.Equal(groupNumbers[i], regex.GroupNumberFromName(groupNames[i]));
-                Assert.Equal(groupNames[i], regex.GroupNameFromNumber(groupNumbers[i]));
+                    Assert.Equal(groupNumbers[i], regex.GroupNumberFromName(groupNames[i]));
+                    Assert.Equal(groupNames[i], regex.GroupNameFromNumber(groupNumbers[i]));
+                }
             }
         }