Improve choice of IndexOfXx routine for some TryFindNextStartingPosition implementati...
author: Stephen Toub <stoub@microsoft.com>
Tue, 18 Jul 2023 21:38:16 +0000 (17:38 -0400)
committer: GitHub <noreply@github.com>
Tue, 18 Jul 2023 21:38:16 +0000 (17:38 -0400)
Earlier in .NET 8, we updated the Regex compiler and source generator to be able to vectorize a search for any set, not just simple ones.  When one of the main routines couldn't be used, we emit a specialized IndexOfAny helper that uses SearchValues to search for any matching ASCII character or a Unicode character, and if it encounters a Unicode character, it falls back to a linear scan.  This meant that a bunch of sets that wouldn't previously have taken these paths now do, but some of those sets have more efficient means of searching; for example, for the set `[^aA]` that searches case-insensitively for anything other than an 'A', with this scheme we'll emit a whole routine that uses SearchValues with a fallback, but we could just use IndexOfAnyExcept('A', 'a').  This fixes the compiler / source generator to prefer such helpers instead when available.

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs

index 021453e..e54ffd3 100644 (file)
@@ -1063,25 +1063,49 @@ namespace System.Text.RegularExpressions.Generator
                         (true, _) => $"{span}.Slice(i + {primarySet.Distance})",
                     };
 
-                    Debug.Assert(!primarySet.Negated || (primarySet.Chars is null && primarySet.AsciiSet is null));
+                    // Get the IndexOf* expression to use to perform the search.
+                    string indexOf;
+                    if (primarySet.Chars is not null)
+                    {
+                        // We have a chars array, so we can use IndexOf{Any}{Except} to search for it. Choose the best overload.
+                        string indexOfName = "IndexOf", indexOfAnyName = "IndexOfAny";
+                        if (primarySet.Negated)
+                        {
+                            indexOfName = indexOfAnyName = "IndexOfAnyExcept";
+                        }
 
-                    string indexOf =
-                        primarySet.Chars is not null ? primarySet.Chars.Length switch
+                        indexOf = primarySet.Chars.Length switch
                         {
-                            1 => $"{span}.IndexOf({Literal(primarySet.Chars[0])})",
-                            2 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
-                            3 => $"{span}.IndexOfAny({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
-                            _ => $"{span}.IndexOfAny({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
-                        } :
-                        primarySet.AsciiSet is not null ? $"{span}.IndexOfAny({EmitSearchValues(primarySet.AsciiSet, requiredHelpers)})" :
-                        primarySet.Range is not null ? (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Negated) switch
+                            1 => $"{span}.{indexOfName}({Literal(primarySet.Chars[0])})",
+                            2 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
+                            3 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
+                            _ => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
+                        };
+                    }
+                    else if (primarySet.AsciiSet is not null)
+                    {
+                        // We have a set of ASCII chars, so we can use IndexOfAny(SearchValues) to search for it.
+                        Debug.Assert(!primarySet.Negated);
+                        indexOf = $"{span}.IndexOfAny({EmitSearchValues(primarySet.AsciiSet, requiredHelpers)})";
+                    }
+                    else if (primarySet.Range is not null)
+                    {
+                        // We have a range, so we can use IndexOfAny{Except}InRange to search for it.  In the corner case,
+                        // where we end up with a set of a single char, we can use IndexOf instead.
+                        indexOf = (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive, primarySet.Negated) switch
                         {
                             (false, false) => $"{span}.IndexOfAnyInRange({Literal(primarySet.Range.Value.LowInclusive)}, {Literal(primarySet.Range.Value.HighInclusive)})",
                             (true, false) => $"{span}.IndexOf({Literal(primarySet.Range.Value.LowInclusive)})",
                             (false, true) => $"{span}.IndexOfAnyExceptInRange({Literal(primarySet.Range.Value.LowInclusive)}, {Literal(primarySet.Range.Value.HighInclusive)})",
                             (true, true) => $"{span}.IndexOfAnyExcept({Literal(primarySet.Range.Value.LowInclusive)})",
-                        } :
-                        $"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()";
+                        };
+                    }
+                    else
+                    {
+                        // We have an arbitrary set of characters that includes at least one non-ASCII char.  We use a custom IndexOfAny helper that
+                        // will perform the search as efficiently as possible.
+                        indexOf = $"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()";
+                    }
 
                     if (needLoop)
                     {
@@ -1184,6 +1208,7 @@ namespace System.Text.RegularExpressions.Generator
 
                 if (set.Chars is { Length: 1 })
                 {
+                    Debug.Assert(!set.Negated);
                     writer.WriteLine($"pos = inputSpan.Slice(0, pos).LastIndexOf({Literal(set.Chars[0])});");
                     using (EmitBlock(writer, "if (pos >= 0)"))
                     {
@@ -3307,7 +3332,7 @@ namespace System.Text.RegularExpressions.Generator
                 {
                     if (iterationCount is null &&
                         node.Kind is RegexNodeKind.Notonelazy &&
-                        subsequent?.FindStartingLiteral(4) is RegexNode.StartingLiteralData literal && // 5 == max optimized by IndexOfAny, and we need to reserve 1 for node.Ch
+                        subsequent?.FindStartingLiteral(4) is RegexNode.StartingLiteralData literal && // 5 == max efficiently optimized by IndexOfAny, and we need to reserve 1 for node.Ch
                         !literal.Negated && // not negated; can't search for both the node.Ch and a negated subsequent char with an IndexOf* method
                         (literal.String is not null ||
                          literal.SetChars is not null ||
index afbad70..97e728a 100644 (file)
@@ -901,8 +901,6 @@ namespace System.Text.RegularExpressions
                         Ldloc(textSpanLocal);
                     }
 
-                    Debug.Assert(!primarySet.Negated || (primarySet.Chars is null && primarySet.AsciiSet is null));
-
                     if (primarySet.Chars is not null)
                     {
                         switch (primarySet.Chars.Length)
@@ -910,14 +908,14 @@ namespace System.Text.RegularExpressions
                             case 1:
                                 // tmp = ...IndexOf(setChars[0]);
                                 Ldc(primarySet.Chars[0]);
-                                Call(s_spanIndexOfChar);
+                                Call(primarySet.Negated ? s_spanIndexOfAnyExceptChar : s_spanIndexOfChar);
                                 break;
 
                             case 2:
                                 // tmp = ...IndexOfAny(setChars[0], setChars[1]);
                                 Ldc(primarySet.Chars[0]);
                                 Ldc(primarySet.Chars[1]);
-                                Call(s_spanIndexOfAnyCharChar);
+                                Call(primarySet.Negated ? s_spanIndexOfAnyExceptCharChar : s_spanIndexOfAnyCharChar);
                                 break;
 
                             case 3:
@@ -925,13 +923,13 @@ namespace System.Text.RegularExpressions
                                 Ldc(primarySet.Chars[0]);
                                 Ldc(primarySet.Chars[1]);
                                 Ldc(primarySet.Chars[2]);
-                                Call(s_spanIndexOfAnyCharCharChar);
+                                Call(primarySet.Negated ? s_spanIndexOfAnyExceptCharCharChar : s_spanIndexOfAnyCharCharChar);
                                 break;
 
                             default:
                                 Ldstr(new string(primarySet.Chars));
                                 Call(s_stringAsSpanMethod);
-                                Call(s_spanIndexOfAnySpan);
+                                Call(primarySet.Negated ? s_spanIndexOfAnyExceptSpan : s_spanIndexOfAnySpan);
                                 break;
                         }
                     }
@@ -1166,6 +1164,8 @@ namespace System.Text.RegularExpressions
 
                 if (set.Chars is { Length: 1 })
                 {
+                    Debug.Assert(!set.Negated);
+
                     // pos = inputSpan.Slice(0, pos).LastIndexOf(set.Chars[0]);
                     Ldloca(inputSpan);
                     Ldc(0);
index 113c074..7597b37 100644 (file)
@@ -3,7 +3,6 @@
 
 using System.Collections.Generic;
 using System.Diagnostics;
-using System.Globalization;
 
 namespace System.Text.RegularExpressions
 {
@@ -95,7 +94,7 @@ namespace System.Text.RegularExpressions
                 if (RegexPrefixAnalyzer.FindFirstCharClass(root) is string charClass)
                 {
                     // See if the set is limited to holding only a few characters.
-                    Span<char> scratch = stackalloc char[5]; // max optimized by IndexOfAny today
+                    Span<char> scratch = stackalloc char[5]; // max efficiently optimized by IndexOfAny today
                     int scratchCount;
                     char[]? chars = null;
                     if (!RegexCharClass.IsNegated(charClass) &&
@@ -109,12 +108,14 @@ namespace System.Text.RegularExpressions
                     {
                         // The set contains one and only one character, meaning every match starts
                         // with the same literal value (potentially case-insensitive). Search for that.
+                        Debug.Assert(!RegexCharClass.IsNegated(charClass));
                         FixedDistanceLiteral = (chars[0], null, 0);
                         FindMode = FindNextStartingPositionMode.LeadingChar_RightToLeft;
                     }
                     else
                     {
                         // The set may match multiple characters.  Search for that.
+                        Debug.Assert(!RegexCharClass.IsNegated(charClass) || chars is null);
                         FixedDistanceSets = new List<FixedDistanceSet>()
                         {
                             new FixedDistanceSet(chars, charClass, 0)
@@ -154,22 +155,32 @@ namespace System.Text.RegularExpressions
 
             // As a backup, see if we can find a literal after a leading atomic loop.  That might be better than whatever sets we find, so
             // we want to know whether we have one in our pocket before deciding whether to use a leading set (we'll prefer a leading
-            // set if it's something for which we can vectorize a search).
+            // set if it's something for which we can search efficiently).
             (RegexNode LoopNode, (char Char, string? String, char[]? Chars) Literal)? literalAfterLoop = RegexPrefixAnalyzer.FindLiteralFollowingLeadingLoop(root);
 
-            // If we got such sets, we'll likely use them.  However, if the best of them is something that doesn't support a vectorized
-            // search and we did successfully find a literal after an atomic loop we could search instead, we prefer the vectorizable search.
+            // If we got such sets, we'll likely use them.  However, if the best of them is something that doesn't support an efficient
+            // search and we did successfully find a literal after an atomic loop we could search instead, we prefer the efficient search.
+            // For example, if we have a negated set, we will still prefer the literal-after-an-atomic-loop because negated sets typically
+            // contain _many_ characters (e.g. [^a] is everything but 'a') and are thus more likely to very quickly match, which means any
+            // vectorization employed is less likely to kick in and be worth the startup overhead.
             if (fixedDistanceSets is not null)
             {
+                // Sort the sets by "quality", such that whatever set is first is the one deemed most efficient to use.
+                // In some searches, we may use multiple sets, so we want the subsequent ones to also be the efficiency runners-up.
                 RegexPrefixAnalyzer.SortFixedDistanceSetsByQuality(fixedDistanceSets);
-                if (fixedDistanceSets[0].Chars is not null || literalAfterLoop is null)
+
+                // If there is no literal after the loop, use whatever set we got.
+                // If there is a literal after the loop, consider it to be better than a negated set and better than a set with many characters.
+                if (literalAfterLoop is null ||
+                    (fixedDistanceSets[0].Chars is not null && !fixedDistanceSets[0].Negated))
                 {
                     // Determine whether to do searching based on one or more sets or on a single literal. Compiled engines
                     // don't need to special-case literals as they already do codegen to create the optimal lookup based on
                     // the set's characteristics.
                     if (!compiled &&
                         fixedDistanceSets.Count == 1 &&
-                        fixedDistanceSets[0].Chars is { Length: 1 })
+                        fixedDistanceSets[0].Chars is { Length: 1 } &&
+                        !fixedDistanceSets[0].Negated)
                     {
                         FixedDistanceLiteral = (fixedDistanceSets[0].Chars![0], null, fixedDistanceSets[0].Distance);
                         FindMode = FindNextStartingPositionMode.FixedDistanceChar_LeftToRight;
@@ -186,8 +197,9 @@ namespace System.Text.RegularExpressions
 
                         // Store the sets, and compute which mode to use.
                         FixedDistanceSets = fixedDistanceSets;
-                        FindMode = (fixedDistanceSets.Count == 1 && fixedDistanceSets[0].Distance == 0) ? FindNextStartingPositionMode.LeadingSet_LeftToRight
-                            : FindNextStartingPositionMode.FixedDistanceSets_LeftToRight;
+                        FindMode = (fixedDistanceSets.Count == 1 && fixedDistanceSets[0].Distance == 0) ?
+                            FindNextStartingPositionMode.LeadingSet_LeftToRight :
+                            FindNextStartingPositionMode.FixedDistanceSets_LeftToRight;
                         _asciiLookups = new uint[fixedDistanceSets.Count][];
                     }
                     return;
@@ -322,6 +334,7 @@ namespace System.Text.RegularExpressions
             return best;
         }
 
+#if SYSTEM_TEXT_REGULAREXPRESSIONS
         /// <summary>Try to advance to the next starting position that might be a location for a match.</summary>
         /// <param name="textSpan">The text to search.</param>
         /// <param name="pos">The position in <paramref name="textSpan"/>.  This is updated with the found position.</param>
@@ -578,12 +591,11 @@ namespace System.Text.RegularExpressions
                     {
                         FixedDistanceSet primarySet = FixedDistanceSets![0];
                         char[]? chars = primarySet.Chars;
-                        string set = primarySet.Set;
 
                         ReadOnlySpan<char> span = textSpan.Slice(pos);
                         if (chars is not null)
                         {
-                            int i = span.IndexOfAny(chars);
+                            int i = primarySet.Negated ? span.IndexOfAnyExcept(chars) : span.IndexOfAny(chars);
                             if (i >= 0)
                             {
                                 pos += i;
@@ -595,7 +607,7 @@ namespace System.Text.RegularExpressions
                             ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
                             for (int i = 0; i < span.Length; i++)
                             {
-                                if (RegexCharClass.CharInClass(span[i], set, ref startingAsciiLookup))
+                                if (RegexCharClass.CharInClass(span[i], primarySet.Set, ref startingAsciiLookup))
                                 {
                                     pos += i;
                                     return true;
@@ -653,7 +665,8 @@ namespace System.Text.RegularExpressions
                             for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++)
                             {
                                 int offset = inputPosition + primarySet.Distance;
-                                int index = textSpan.Slice(offset).IndexOfAny(primarySet.Chars);
+                                ReadOnlySpan<char> textSpanAtOffset = textSpan.Slice(offset);
+                                int index = primarySet.Negated ? textSpanAtOffset.IndexOfAnyExcept(primarySet.Chars) : textSpanAtOffset.IndexOfAny(primarySet.Chars);
                                 if (index < 0)
                                 {
                                     break;
@@ -769,6 +782,7 @@ namespace System.Text.RegularExpressions
                     return true;
             }
         }
+#endif
     }
 
     /// <summary>Mode to use for searching for the next location of a possible match.</summary>
index 7cf3b9f..e579ad1 100644 (file)
@@ -1419,7 +1419,7 @@ namespace System.Text.RegularExpressions
         /// A tuple of data about the literal: only one of the Char/String/SetChars fields is relevant.
         /// The Negated value indicates whether the Char/SetChars should be considered exclusionary.
         /// </returns>
-        public StartingLiteralData? FindStartingLiteral(int maxSetCharacters = 5) // 5 is max optimized by IndexOfAny today
+        public StartingLiteralData? FindStartingLiteral(int maxSetCharacters = 5) // 5 is max efficiently optimized by IndexOfAny today
         {
             Debug.Assert(maxSetCharacters >= 0 && maxSetCharacters <= 128, $"{nameof(maxSetCharacters)} == {maxSetCharacters} should be small enough to be stack allocated.");
 
index 88553b3..56ff64c 100644 (file)
@@ -3,7 +3,6 @@
 
 using System.Collections.Generic;
 using System.Diagnostics;
-using System.Globalization;
 using System.Runtime.CompilerServices;
 using System.Threading;
 
@@ -192,7 +191,12 @@ namespace System.Text.RegularExpressions
             // Find all fixed-distance sets.
             var results = new List<RegexFindOptimizations.FixedDistanceSet>();
             int distance = 0;
-            TryFindFixedSets(root, results, ref distance, thorough);
+            TryFindRawFixedSets(root, results, ref distance, thorough);
+#if DEBUG
+            results.ForEach(r => Debug.Assert(
+                !r.Negated && r.Chars is null && r.AsciiSet is null && r.Range is null,
+                $"{nameof(TryFindRawFixedSets)} should have only populated {nameof(r.Set)} and {nameof(r.Distance)}"));
+#endif
 
             // Remove any sets that match everything; they're not helpful.  (This check exists primarily to weed
             // out use of . in Singleline mode, but also filters out explicit sets like [\s\S].)
@@ -221,7 +225,7 @@ namespace System.Text.RegularExpressions
 
             // For every entry, try to get the chars that make up the set, if there are few enough.
             // For any for which we couldn't get the small chars list, see if we can get other useful info.
-            Span<char> scratch = stackalloc char[5]; // max optimized by IndexOfAny today
+            Span<char> scratch = stackalloc char[5]; // max efficiently optimized by IndexOfAny today
             for (int i = 0; i < results.Count; i++)
             {
                 RegexFindOptimizations.FixedDistanceSet result = results[i];
@@ -229,7 +233,7 @@ namespace System.Text.RegularExpressions
 
                 int count = RegexCharClass.GetSetChars(result.Set, scratch);
 
-                if (!result.Negated && count > 0)
+                if (count > 0)
                 {
                     result.Chars = scratch.Slice(0, count).ToArray();
                 }
@@ -258,8 +262,10 @@ namespace System.Text.RegularExpressions
             // is at a fixed distance, in which case distance will have been updated to include the full length
             // of the node.  If it returns false, the node isn't entirely fixed, in which case subsequent nodes
             // shouldn't be examined and distance should no longer be trusted.  However, regardless of whether it
-            // returns true or false, it may have populated results, and all populated results are valid.
-            static bool TryFindFixedSets(RegexNode node, List<RegexFindOptimizations.FixedDistanceSet> results, ref int distance, bool thorough)
+            // returns true or false, it may have populated results, and all populated results are valid. Each
+            // FixedDistanceSet result will only have its Set string and Distance populated; the rest is left
+            // to be populated by FindFixedDistanceSets after this returns.
+            static bool TryFindRawFixedSets(RegexNode node, List<RegexFindOptimizations.FixedDistanceSet> results, ref int distance, bool thorough)
             {
                 if (!StackHelper.TryEnsureSufficientExecutionStack())
                 {
@@ -357,7 +363,7 @@ namespace System.Text.RegularExpressions
                     case RegexNodeKind.Atomic:
                     case RegexNodeKind.Group:
                     case RegexNodeKind.Capture:
-                        return TryFindFixedSets(node.Child(0), results, ref distance, thorough);
+                        return TryFindRawFixedSets(node.Child(0), results, ref distance, thorough);
 
                     case RegexNodeKind.Lazyloop or RegexNodeKind.Loop when node.M > 0:
                         // This effectively only iterates the loop once.  If deemed valuable,
@@ -366,7 +372,7 @@ namespace System.Text.RegularExpressions
                         // summed distance for all node.M iterations.  If node.M == node.N,
                         // this would then also allow continued evaluation of the rest of the
                         // expression after the loop.
-                        TryFindFixedSets(node.Child(0), results, ref distance, thorough);
+                        TryFindRawFixedSets(node.Child(0), results, ref distance, thorough);
                         return false;
 
                     case RegexNodeKind.Concatenate:
@@ -374,7 +380,7 @@ namespace System.Text.RegularExpressions
                             int childCount = node.ChildCount();
                             for (int i = 0; i < childCount; i++)
                             {
-                                if (!TryFindFixedSets(node.Child(i), results, ref distance, thorough))
+                                if (!TryFindRawFixedSets(node.Child(i), results, ref distance, thorough))
                                 {
                                     return false;
                                 }
@@ -394,7 +400,7 @@ namespace System.Text.RegularExpressions
                             {
                                 localResults.Clear();
                                 int localDistance = 0;
-                                allSameSize &= TryFindFixedSets(node.Child(i), localResults, ref localDistance, thorough);
+                                allSameSize &= TryFindRawFixedSets(node.Child(i), localResults, ref localDistance, thorough);
 
                                 if (localResults.Count == 0)
                                 {
@@ -475,64 +481,75 @@ namespace System.Text.RegularExpressions
                 int s1RangeLength = s1.Range is not null ? GetRangeLength(s1.Range.Value, s1Negated) : 0;
                 int s2RangeLength = s2.Range is not null ? GetRangeLength(s2.Range.Value, s2Negated) : 0;
 
-                Debug.Assert(!s1Negated || s1Chars is null);
-                Debug.Assert(!s2Negated || s2Chars is null);
+                // If one set is negated and the other isn't, prefer the non-negated set. In general, negated
+                // sets are large and thus likely to match more frequently, making them slower to search for.
+                if (s1Negated != s2Negated)
+                {
+                    return s2Negated ? -1 : 1;
+                }
 
-                // If both have chars, prioritize the one with the smaller frequency for those chars.
-                if (s1Chars is not null && s2Chars is not null)
+                // If we extracted only a few chars and the sets are negated, they both represent very large
+                // sets that are difficult to compare for quality.
+                if (!s1Negated)
                 {
-                    // Prefer sets with less frequent values.  The frequency is only an approximation,
-                    // used as a tie-breaker when we'd otherwise effectively be picking randomly.
-                    // True frequencies will vary widely based on the actual data being searched, the language of the data, etc.
-                    float s1Frequency = SumFrequencies(s1Chars);
-                    float s2Frequency = SumFrequencies(s2Chars);
+                    Debug.Assert(!s2Negated);
 
-                    if (s1Frequency != s2Frequency)
+                    // If both have chars, prioritize the one with the smaller frequency for those chars.
+                    if (s1Chars is not null && s2Chars is not null)
                     {
-                        return s1Frequency.CompareTo(s2Frequency);
-                    }
+                        // Prefer sets with less frequent values.  The frequency is only an approximation,
+                        // used as a tie-breaker when we'd otherwise effectively be picking randomly.
+                        // True frequencies will vary widely based on the actual data being searched, the language of the data, etc.
+                        float s1Frequency = SumFrequencies(s1Chars);
+                        float s2Frequency = SumFrequencies(s2Chars);
 
-                    if (!RegexCharClass.IsAscii(s1Chars) && !RegexCharClass.IsAscii(s2Chars))
-                    {
-                        // Prefer the set with fewer values.
-                        return s1CharsLength.CompareTo(s2CharsLength);
-                    }
+                        if (s1Frequency != s2Frequency)
+                        {
+                            return s1Frequency.CompareTo(s2Frequency);
+                        }
 
-                    [MethodImpl(MethodImplOptions.AggressiveInlining)]
-                    static float SumFrequencies(char[] chars)
-                    {
-                        float sum = 0;
-                        foreach (char c in chars)
+                        if (!RegexCharClass.IsAscii(s1Chars) && !RegexCharClass.IsAscii(s2Chars))
+                        {
+                            // Prefer the set with fewer values.
+                            return s1CharsLength.CompareTo(s2CharsLength);
+                        }
+
+                        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+                        static float SumFrequencies(char[] chars)
                         {
-                            // Lookup each character in the table.  Values >= 128 are ignored
-                            // and thus we'll get skew in the data.  It's already a gross approximation, though,
-                            // and it is primarily meant for disambiguation of ASCII letters.
-                            if (c < 128)
+                            float sum = 0;
+                            foreach (char c in chars)
                             {
-                                sum += Frequency[c];
+                                // Lookup each character in the table.  Values >= 128 are ignored
+                                // and thus we'll get skew in the data.  It's already a gross approximation, though,
+                                // and it is primarily meant for disambiguation of ASCII letters.
+                                if (c < 128)
+                                {
+                                    sum += Frequency[c];
+                                }
                             }
+                            return sum;
                         }
-                        return sum;
                     }
-                }
 
-                // If one has chars and the other has a range, prefer the shorter set.
-                if ((s1CharsLength > 0 && s2RangeLength > 0) || (s1RangeLength > 0 && s2CharsLength > 0))
-                {
-                    int c = Math.Max(s1CharsLength, s1RangeLength).CompareTo(Math.Max(s2CharsLength, s2RangeLength));
-                    if (c != 0)
+                    // If one has chars and the other has a range, prefer the shorter set.
+                    if ((s1CharsLength > 0 && s2RangeLength > 0) || (s1RangeLength > 0 && s2CharsLength > 0))
                     {
-                        return c;
-                    }
+                        int c = Math.Max(s1CharsLength, s1RangeLength).CompareTo(Math.Max(s2CharsLength, s2RangeLength));
+                        if (c != 0)
+                        {
+                            return c;
+                        }
 
-                    // If lengths are the same, prefer the chars.
-                    return s1CharsLength > 0 ? -1 : 1;
-                }
+                        // If lengths are the same, prefer the chars.
+                        return s1CharsLength > 0 ? -1 : 1;
+                    }
 
-                // If one has chars and the other doesn't, prioritize the one with chars.
-                if ((s1CharsLength > 0) != (s2CharsLength > 0))
-                {
-                    return s1CharsLength > 0 ? -1 : 1;
+                    // If one has chars and the other doesn't, prioritize the one with chars.
+                    if ((s1CharsLength > 0) != (s2CharsLength > 0))
+                    {
+                        return s1CharsLength > 0 ? -1 : 1;
+                    }
                 }
 
                 // If one has a range and the other doesn't, prioritize the one with a range.