Broaden use of SearchValues in TryFindNextPossibleStartingPosition in Regex (#89205)
author: Stephen Toub <stoub@microsoft.com>
Thu, 20 Jul 2023 13:23:23 +0000 (09:23 -0400)
committer: GitHub <noreply@github.com>
Thu, 20 Jul 2023 13:23:23 +0000 (09:23 -0400)
SearchValues has been updated to have an ASCII fast-path for inputs that are not only ASCII.  This means we can simplify TryFindNextPossibleStartingPosition in Regex to not track AsciiSet specially and instead just increase the number of characters we query the set for (from 5 to 128).  That way, we'll use SearchValues rather than emitting our own helper up until a (semi-arbitrary) point where we deem it impossible or infeasible to enumerate all the chars that make up the set.

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs

index e54ffd3..1948870 100644 (file)
@@ -399,54 +399,65 @@ namespace System.Text.RegularExpressions.Generator
         }
 
         /// <summary>Adds a SearchValues instance declaration to the required helpers collection.</summary>
-        private static string EmitSearchValues(char[] asciiChars, Dictionary<string, string[]> requiredHelpers)
+        private static string EmitSearchValues(char[] chars, Dictionary<string, string[]> requiredHelpers)
         {
-            Debug.Assert(RegexCharClass.IsAscii(asciiChars));
+            Array.Sort(chars);
 
-            // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
-            byte[] bitmap = new byte[16];
-            foreach (char c in asciiChars)
+            string fieldName;
+            if (RegexCharClass.IsAscii(chars))
             {
-                bitmap[c >> 3] |= (byte)(1 << (c & 7));
+                // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
+                var bitmap = new byte[16];
+                foreach (char c in chars)
+                {
+                    bitmap[c >> 3] |= (byte)(1 << (c & 7));
+                }
+
+                string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty);
+
+                fieldName = hexBitmap switch
+                {
+                    "FFFFFFFF000000000000000000000080" => "s_asciiControl",
+                    "000000000000FF030000000000000000" => "s_asciiDigits",
+                    "0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters",
+                    "000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits",
+                    "000000000000FF037E0000007E000000" => "s_asciiHexDigits",
+                    "000000000000FF03000000007E000000" => "s_asciiHexDigitsLower",
+                    "000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper",
+                    "00000000EEF7008C010000B800000028" => "s_asciiPunctuation",
+                    "00000000010000000000000000000000" => "s_asciiSeparators",
+                    "00000000100800700000004001000050" => "s_asciiSymbols",
+                    "003E0000010000000000000000000000" => "s_asciiWhiteSpace",
+                    "000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars",
+
+                    "00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl",
+                    "FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits",
+                    "FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters",
+                    "FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits",
+                    "FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower",
+                    "FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation",
+                    "FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators",
+                    "FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols",
+                    "FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper",
+                    "FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace",
+                    "FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars",
+
+                    _ => $"s_ascii_{hexBitmap.TrimStart('0')}"
+                };
             }
-
-            string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty);
-
-            string fieldName = hexBitmap switch
+            else
             {
-                "FFFFFFFF000000000000000000000080" => "s_asciiControl",
-                "000000000000FF030000000000000000" => "s_asciiDigits",
-                "0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters",
-                "000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits",
-                "000000000000FF037E0000007E000000" => "s_asciiHexDigits",
-                "000000000000FF03000000007E000000" => "s_asciiHexDigitsLower",
-                "000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper",
-                "00000000EEF7008C010000B800000028" => "s_asciiPunctuation",
-                "00000000010000000000000000000000" => "s_asciiSeparators",
-                "00000000100800700000004001000050" => "s_asciiSymbols",
-                "003E0000010000000000000000000000" => "s_asciiWhiteSpace",
-                "000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars",
-
-                "00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl",
-                "FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits",
-                "FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters",
-                "FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits",
-                "FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower",
-                "FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation",
-                "FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators",
-                "FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols",
-                "FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper",
-                "FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace",
-                "FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars",
-
-                _ => $"s_ascii_{hexBitmap.TrimStart('0')}"
-            };
+                using (SHA256 sha = SHA256.Create())
+                {
+#pragma warning disable CA1850 // SHA256.HashData isn't available on netstandard2.0
+                    fieldName = $"s_nonAscii_{BitConverter.ToString(sha.ComputeHash(Encoding.UTF8.GetBytes(chars))).Replace("-", "")}";
+#pragma warning restore CA1850
+                }
+            }
 
             if (!requiredHelpers.ContainsKey(fieldName))
             {
-                Array.Sort(asciiChars);
-
-                string setLiteral = Literal(new string(asciiChars));
+                string setLiteral = Literal(new string(chars));
 
                 requiredHelpers.Add(fieldName, new string[]
                 {
@@ -465,12 +476,12 @@ namespace System.Text.RegularExpressions.Generator
             // a sequential walk).  In order to do that search, we actually build up a set for all of the ASCII
             // characters _not_ contained in the set, and then do a search for the inverse of that, which will be
             // all of the target ASCII characters and all of non-ASCII.
-            var asciiChars = new List<char>();
+            var excludedAsciiChars = new List<char>();
             for (int i = 0; i < 128; i++)
             {
                 if (!RegexCharClass.CharInClass((char)i, set))
                 {
-                    asciiChars.Add((char)i);
+                    excludedAsciiChars.Add((char)i);
                 }
             }
 
@@ -538,9 +549,9 @@ namespace System.Text.RegularExpressions.Generator
                 lines.Add($"internal static int {helperName}(this ReadOnlySpan<char> span)");
                 lines.Add($"{{");
                 int uncheckedStart = lines.Count;
-                lines.Add(asciiChars.Count == 128 ?
+                lines.Add(excludedAsciiChars.Count == 128 ?
                           $"    int i = span.IndexOfAnyExceptInRange('\0', '\u007f');" :
-                          $"    int i = span.IndexOfAnyExcept({EmitSearchValues(asciiChars.ToArray(), requiredHelpers)});");
+                          $"    int i = span.IndexOfAnyExcept({EmitSearchValues(excludedAsciiChars.ToArray(), requiredHelpers)});");
                 lines.Add($"    if ((uint)i < (uint)span.Length)");
                 lines.Add($"    {{");
                 lines.Add($"        if (char.IsAscii(span[i]))");
@@ -1067,6 +1078,8 @@ namespace System.Text.RegularExpressions.Generator
                     string indexOf;
                     if (primarySet.Chars is not null)
                     {
+                        Debug.Assert(primarySet.Chars.Length > 0);
+
                         // We have a chars array, so we can use IndexOf{Any}{Except} to search for it. Choose the best overload.
                         string indexOfName = "IndexOf", indexOfAnyName = "IndexOfAny";
                         if (primarySet.Negated)
@@ -1076,18 +1089,19 @@ namespace System.Text.RegularExpressions.Generator
 
                         indexOf = primarySet.Chars.Length switch
                         {
+                            // 1, 2, 3 have dedicated optimized IndexOfAny overloads
                             1 => $"{span}.{indexOfName}({Literal(primarySet.Chars[0])})",
                             2 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
                             3 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
-                            _ => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
+
+                            // 4, 5 have dedicated optimized IndexOfAny overloads accessible via the ReadOnlySpan<char> overload,
+                            // but can also be handled via SearchValues
+                            4 or 5 => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
+
+                            // > 5 can only be handled efficiently via SearchValues
+                            _ => $"{span}.{indexOfAnyName}({EmitSearchValues(primarySet.Chars, requiredHelpers)})",
                         };
                     }
-                    else if (primarySet.AsciiSet is not null)
-                    {
-                        // We have a set of ASCII chars, so we can use IndexOfAny(SearchValues) to search for it.
-                        Debug.Assert(!primarySet.Negated);
-                        indexOf = $"{span}.IndexOfAny({EmitSearchValues(primarySet.AsciiSet, requiredHelpers)})";
-                    }
                     else if (primarySet.Range is not null)
                     {
                         // We have a range, so we can use IndexOfAny{Except}InRange to search for it.  In the corner case,
@@ -1102,8 +1116,8 @@ namespace System.Text.RegularExpressions.Generator
                     }
                     else
                     {
-                        // We have an arbitrary set of characters that includes at least one non-ASCII char.  We use a custom IndexOfAny helper that
-                        // will perform the search as efficiently as possible.
+                        // We have an arbitrary set of characters that's really large or otherwise not enumerable.
+                        // We use a custom IndexOfAny helper that will perform the search as efficiently as possible.
                         indexOf = $"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()";
                     }
 
index 97e728a..20ccc3a 100644 (file)
@@ -903,6 +903,7 @@ namespace System.Text.RegularExpressions
 
                     if (primarySet.Chars is not null)
                     {
+                        Debug.Assert(primarySet.Chars.Length > 0);
                         switch (primarySet.Chars.Length)
                         {
                             case 1:
@@ -926,19 +927,23 @@ namespace System.Text.RegularExpressions
                                 Call(primarySet.Negated ? s_spanIndexOfAnyExceptCharCharChar : s_spanIndexOfAnyCharCharChar);
                                 break;
 
-                            default:
+                            case 4 or 5:
+                                // tmp = ...IndexOfAny("abcd");
+                                // Note that this case differs slightly from the source generator, where it might choose to use
+                                // SearchValues instead of a literal, but there's extra cost to doing so for RegexCompiler so
+                                // it just always uses IndexOfAny(span).
                                 Ldstr(new string(primarySet.Chars));
                                 Call(s_stringAsSpanMethod);
                                 Call(primarySet.Negated ? s_spanIndexOfAnyExceptSpan : s_spanIndexOfAnySpan);
                                 break;
+
+                            default:
+                                // tmp = ...IndexOfAny(s_searchValues);
+                                LoadSearchValues(primarySet.Chars);
+                                Call(primarySet.Negated ? s_spanIndexOfAnyExceptSearchValues : s_spanIndexOfAnySearchValues);
+                                break;
                         }
                     }
-                    else if (primarySet.AsciiSet is not null)
-                    {
-                        Debug.Assert(!primarySet.Negated);
-                        LoadSearchValues(primarySet.AsciiSet);
-                        Call(s_spanIndexOfAnySearchValues);
-                    }
                     else if (primarySet.Range is not null)
                     {
                         if (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive)
index 7597b37..517c9da 100644 (file)
@@ -271,8 +271,6 @@ namespace System.Text.RegularExpressions
             public int Distance;
             /// <summary>As an alternative to <see cref="Chars"/>, a description of the single range the set represents, if it does.</summary>
             public (char LowInclusive, char HighInclusive)? Range;
-            /// <summary>As an alternative to <see cref="Chars"/>, a description of the set of ASCII characters it represents, if it does.</summary>
-            public char[]? AsciiSet;
         }
 
         /// <summary>When in literal after set loop node, gets the literal to search for and the RegexNode representing the leading loop.</summary>
@@ -593,7 +591,7 @@ namespace System.Text.RegularExpressions
                         char[]? chars = primarySet.Chars;
 
                         ReadOnlySpan<char> span = textSpan.Slice(pos);
-                        if (chars is not null)
+                        if (chars is { Length: <= 5 }) // 5 == currently the max length efficiently handled by IndexOfAny{Except} without SearchValues
                         {
                             int i = primarySet.Negated ? span.IndexOfAnyExcept(chars) : span.IndexOfAny(chars);
                             if (i >= 0)
@@ -660,7 +658,7 @@ namespace System.Text.RegularExpressions
 
                         int endMinusRequiredLength = textSpan.Length - Math.Max(1, MinRequiredLength);
 
-                        if (primarySet.Chars is not null)
+                        if (primarySet.Chars is { Length: <= 5 }) // 5 == currently the max length efficiently handled by IndexOfAny{Except}
                         {
                             for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++)
                             {
index 56ff64c..96c5021 100644 (file)
@@ -194,7 +194,7 @@ namespace System.Text.RegularExpressions
             TryFindRawFixedSets(root, results, ref distance, thorough);
 #if DEBUG
             results.ForEach(r => Debug.Assert(
-                !r.Negated && r.Chars is null && r.AsciiSet is null && r.Range is null,
+                !r.Negated && r.Chars is null && r.Range is null,
                 $"{nameof(TryFindRawFixedSets)} should have only populated {nameof(r.Set)} and {nameof(r.Distance)}"));
 #endif
 
@@ -225,31 +225,25 @@ namespace System.Text.RegularExpressions
 
             // For every entry, try to get the chars that make up the set, if there are few enough.
             // For any for which we couldn't get the small chars list, see if we can get other useful info.
-            Span<char> scratch = stackalloc char[5]; // max efficiently optimized by IndexOfAny today
+            Span<char> scratch = stackalloc char[128]; // limit based on what's currently efficiently handled by SearchValues
             for (int i = 0; i < results.Count; i++)
             {
                 RegexFindOptimizations.FixedDistanceSet result = results[i];
                 result.Negated = RegexCharClass.IsNegated(result.Set);
 
                 int count = RegexCharClass.GetSetChars(result.Set, scratch);
-
                 if (count > 0)
                 {
                     result.Chars = scratch.Slice(0, count).ToArray();
                 }
 
-                if (thorough)
+                // Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range.
+                if (thorough &&
+                    (result.Chars is null || result.Chars.Length > 2) &&
+                    RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive))
                 {
-                    // Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range.
-                    if ((result.Chars is null || count > 2) && RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive))
-                    {
-                        result.Chars = null;
-                        result.Range = (lowInclusive, highInclusive);
-                    }
-                    else if (result.Chars is null && !result.Negated && RegexCharClass.TryGetAsciiSetChars(result.Set, out char[]? asciiChars))
-                    {
-                        result.AsciiSet = asciiChars;
-                    }
+                    result.Chars = null;
+                    result.Range = (lowInclusive, highInclusive);
                 }
 
                 results[i] = result;
@@ -472,8 +466,8 @@ namespace System.Text.RegularExpressions
             // for the fastest and that have the best chance of matching as few false positives as possible.
             results.Sort(static (s1, s2) =>
             {
-                char[]? s1Chars = s1.Chars ?? s1.AsciiSet;
-                char[]? s2Chars = s2.Chars ?? s2.AsciiSet;
+                char[]? s1Chars = s1.Chars;
+                char[]? s2Chars = s2.Chars;
                 int s1CharsLength = s1Chars?.Length ?? 0;
                 int s2CharsLength = s2Chars?.Length ?? 0;
                 bool s1Negated = s1.Negated;