Lots of tweaks to FrozenDictionary/Set analysis and hashing (#86293)
author Stephen Toub <stoub@microsoft.com>
Mon, 22 May 2023 14:34:54 +0000 (10:34 -0400)
committer GitHub <noreply@github.com>
Mon, 22 May 2023 14:34:54 +0000 (10:34 -0400)
- Some minor internal refactorings, e.g. changed the internal KeyAnalyzer.Analyze from returning void with an out results parameter to simply returning those results, and did some internal renaming.
- We were previously computing the minimum/maximum string lengths in KeyAnalyzer.Analyze twice, once in UseSubstring and then again afterwards in Analyze.  We were also computing them in the length buckets analyzer.  Changed the code to compute the lengths just once at the beginning and then pass that information to everywhere that needs it.
- With that information passed in, AnalysisResults can be made readonly, rather than being mutated to store the min/max after construction.
- If the minimum length is 0, i.e. there's an empty string in the mix, TryUseSubstring can never find a usable substring, so skip the call to it entirely.
- In TryUseSubstring, for a given substring length, first check all the left justifications and then check all the right justifications, rather than intermingling them.  Left justifications are a bit cheaper to look up, and we avoid creating any objects related to right justification if a left-justified substring of that length is found first.
- We can also skip the right justification checks entirely if all of the input strings are the same length, as every right-justified substring then lines up with a left-justified one, so there's nothing right justification could find that left justification couldn't.
- When constructing the HashSets used to evaluate uniqueness, on more recent .NET versions we can presize them to avoid growth as all the items are added.
- Importantly, set a limit on the maximum substring length we'll consider.  This significantly curtails the worst-case analysis cost for very large inputs that don't yield any unique substrings.  It's certainly possible to construct cases where the limit causes us to miss a substring we otherwise would have found, but such cases are much rarer, and the longer the substring would have to be, the less we'd be saving on hashing costs anyway, which is the only thing substring hashing is avoiding.
- For hashing, create dedicated branchless hash code paths for each of lengths 0 through 4.
- When we're doing the ASCII check to see whether we can use an ASCII-optimized comparer, if OrdinalIgnoreCase was requested, we can also check whether the substrings contain any ASCII letters; if the substrings contain only ASCII non-letters, we can switch to being case-sensitive, as casing can't impact the comparisons.
- The downlevel ASCII check was erroneously treating 0x7f as non-ASCII. Fixed the boundary to be 0x80 instead of 0x7f.
- Changed GetHashCodeOrdinalIgnoreCase to delegate to GetHashCodeOrdinal after doing its non-ASCII casing work.
- On .NET 6+, we can use CollectionsMarshal.GetValueRefOrAddDefault to avoid some dictionary lookups (a sketch of the pattern follows this list).
- For the length-bucketing implementation, we can do a quick up-front check that rules out many inputs: just from the number of input strings and the min/max lengths, we may already know that some bucket would be forced to be too big (a sketch of that check also follows this list).
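
As a minimal sketch of the GetValueRefOrAddDefault pattern mentioned above (the LengthBucketsFrozenDictionary hunk below is truncated, so the actual call site isn't shown here), grouping strings into per-length buckets might look roughly like the following; the LengthGroupingSketch/GroupByLength names and the List<string> bucket shape are illustrative assumptions, not the commit's code:

using System.Collections.Generic;
using System.Runtime.InteropServices;

static class LengthGroupingSketch
{
    // Group keys by length with a single dictionary probe per key:
    // GetValueRefOrAddDefault (.NET 6+) returns a writable ref to the slot,
    // so there is no separate TryGetValue followed by an Add, and the key is
    // hashed and located only once per iteration.
    public static Dictionary<int, List<string>> GroupByLength(IEnumerable<string> keys)
    {
        var buckets = new Dictionary<int, List<string>>();
        foreach (string key in keys)
        {
            ref List<string>? bucket = ref CollectionsMarshal.GetValueRefOrAddDefault(buckets, key.Length, out _);
            bucket ??= new List<string>();
            bucket.Add(key);
        }
        return buckets;
    }
}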
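
And a sketch of the up-front pigeonhole check from the last bullet; the MightFitLengthBuckets/MaxPerLength names and the limit value are assumptions for illustration, not the commit's exact code:

static class LengthBucketCheckSketch
{
    // Sketch only: the keys can spread across at most (maxLength - minLength + 1)
    // distinct lengths. If there are more than MaxPerLength keys per possible length,
    // then by the pigeonhole principle at least one bucket would exceed the limit,
    // so length bucketing can be rejected without examining any individual key.
    public static bool MightFitLengthBuckets(int count, int minLength, int maxLength)
    {
        const int MaxPerLength = 5; // assumed per-bucket limit for illustration
        int possibleLengths = maxLength - minLength + 1;
        return count / possibleLengths <= MaxPerLength;
    }
}

For example, 100 keys with lengths between 5 and 8 can occupy at most 4 buckets, so at least one bucket would need 25 entries; with a small per-bucket limit, the bucketing strategy can be rejected immediately.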

src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenDictionary.cs
src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenSet.cs
src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/Hashing.cs
src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs
src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/LengthBucketsFrozenDictionary.cs
src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/LengthBucketsFrozenSet.cs
src/libraries/System.Collections.Immutable/tests/Frozen/KeyAnalyzerTests.cs

index 607284e..5369038 100644 (file)
@@ -193,109 +193,115 @@ namespace System.Collections.Frozen
         {
             IEqualityComparer<TKey> comparer = source.Comparer;
 
-            if (typeof(TKey).IsValueType)
+            // Optimize for value types when the default comparer is being used. In such a case, the implementation
+            // may use {Equality}Comparer<TKey>.Default.Compare/Equals/GetHashCode directly, with generic specialization enabling
+            // the Equals/GetHashCode methods to be devirtualized and possibly inlined.
+            if (typeof(TKey).IsValueType && ReferenceEquals(comparer, EqualityComparer<TKey>.Default))
             {
-                // Optimize for value types when the default comparer is being used. In such a case, the implementation
-                // may use {Equality}Comparer<TKey>.Default.Compare/Equals/GetHashCode directly, with generic specialization enabling
-                // the Equals/GetHashCode methods to be devirtualized and possibly inlined.
-                if (ReferenceEquals(comparer, EqualityComparer<TKey>.Default))
+                if (source.Count <= Constants.MaxItemsInSmallValueTypeFrozenCollection)
                 {
-                    if (source.Count <= Constants.MaxItemsInSmallValueTypeFrozenCollection)
+                    // If the key is a something we know we can efficiently compare, use a specialized implementation
+                    // that will enable quickly ruling out values outside of the range of keys stored.
+                    if (Constants.IsKnownComparable<TKey>())
                     {
-                        // If the key is a something we know we can efficiently compare, use a specialized implementation
-                        // that will enable quickly ruling out values outside of the range of keys stored.
-                        if (Constants.IsKnownComparable<TKey>())
-                        {
-                            return (FrozenDictionary<TKey, TValue>)(object)new SmallValueTypeComparableFrozenDictionary<TKey, TValue>(source);
-                        }
-
-                        // Otherwise, use an implementation optimized for a small number of value types using the default comparer.
-                        return (FrozenDictionary<TKey, TValue>)(object)new SmallValueTypeDefaultComparerFrozenDictionary<TKey, TValue>(source);
+                        return (FrozenDictionary<TKey, TValue>)(object)new SmallValueTypeComparableFrozenDictionary<TKey, TValue>(source);
                     }
 
-                    // Use a hash-based implementation.
+                    // Otherwise, use an implementation optimized for a small number of value types using the default comparer.
+                    return (FrozenDictionary<TKey, TValue>)(object)new SmallValueTypeDefaultComparerFrozenDictionary<TKey, TValue>(source);
+                }
 
-                    // For Int32 keys, we can reuse the key storage as the hash storage, saving on space and extra indirection.
-                    if (typeof(TKey) == typeof(int))
-                    {
-                        return (FrozenDictionary<TKey, TValue>)(object)new Int32FrozenDictionary<TValue>((Dictionary<int, TValue>)(object)source);
-                    }
+                // Use a hash-based implementation.
 
-                    // Fallback to an implementation usable with any value type and the default comparer.
-                    return new ValueTypeDefaultComparerFrozenDictionary<TKey, TValue>(source);
+                // For Int32 keys, we can reuse the key storage as the hash storage, saving on space and extra indirection.
+                if (typeof(TKey) == typeof(int))
+                {
+                    return (FrozenDictionary<TKey, TValue>)(object)new Int32FrozenDictionary<TValue>((Dictionary<int, TValue>)(object)source);
                 }
+
+                // Fallback to an implementation usable with any value type and the default comparer.
+                return new ValueTypeDefaultComparerFrozenDictionary<TKey, TValue>(source);
             }
-            else if (typeof(TKey) == typeof(string))
+
+            // Optimize for string keys with the default, Ordinal, or OrdinalIgnoreCase comparers.
+            // If the key is a string and the comparer is known to provide ordinal (case-sensitive or case-insensitive) semantics,
+            // we can use an implementation that's able to examine and optimize based on lengths and/or subsequences within those strings.
+            if (typeof(TKey) == typeof(string) &&
+                (ReferenceEquals(comparer, EqualityComparer<TKey>.Default) || ReferenceEquals(comparer, StringComparer.Ordinal) || ReferenceEquals(comparer, StringComparer.OrdinalIgnoreCase)))
             {
-                // If the key is a string and the comparer is known to provide ordinal (case-sensitive or case-insensitive) semantics,
-                // we can use an implementation that's able to examine and optimize based on lengths and/or subsequences within those strings.
-                if (ReferenceEquals(comparer, EqualityComparer<TKey>.Default) ||
-                    ReferenceEquals(comparer, StringComparer.Ordinal) ||
-                    ReferenceEquals(comparer, StringComparer.OrdinalIgnoreCase))
-                {
-                    Dictionary<string, TValue> stringEntries = (Dictionary<string, TValue>)(object)source;
-                    IEqualityComparer<string> stringComparer = (IEqualityComparer<string>)(object)comparer;
+                Dictionary<string, TValue> stringEntries = (Dictionary<string, TValue>)(object)source;
+                IEqualityComparer<string> stringComparer = (IEqualityComparer<string>)(object)comparer;
 
-                    FrozenDictionary<string, TValue>? frozenDictionary = LengthBucketsFrozenDictionary<TValue>.CreateLengthBucketsFrozenDictionaryIfAppropriate(stringEntries, stringComparer);
-                    if (frozenDictionary is not null)
-                    {
-                        return (FrozenDictionary<TKey, TValue>)(object)frozenDictionary;
-                    }
+                // Calculate the minimum and maximum lengths of the strings in the dictionary. Several of the analyses need this.
+                int minLength = int.MaxValue, maxLength = 0;
+                foreach (KeyValuePair<string, TValue> kvp in stringEntries)
+                {
+                    if (kvp.Key.Length < minLength) minLength = kvp.Key.Length;
+                    if (kvp.Key.Length > maxLength) maxLength = kvp.Key.Length;
+                }
+                Debug.Assert(minLength >= 0 && maxLength >= minLength);
 
-                    string[] entries = (string[])(object)source.Keys.ToArray();
+                // Try to create an implementation that uses length buckets, where each bucket contains up to only a few strings of the same length.
+                FrozenDictionary<string, TValue>? frozenDictionary = LengthBucketsFrozenDictionary<TValue>.CreateLengthBucketsFrozenDictionaryIfAppropriate(stringEntries, stringComparer, minLength, maxLength);
+                if (frozenDictionary is not null)
+                {
+                    return (FrozenDictionary<TKey, TValue>)(object)frozenDictionary;
+                }
 
-                    KeyAnalyzer.Analyze(entries, ReferenceEquals(stringComparer, StringComparer.OrdinalIgnoreCase), out KeyAnalyzer.AnalysisResults results);
-                    if (results.SubstringHashing)
+                // Analyze the keys for unique substrings and create an implementation that minimizes the cost of hashing keys.
+                string[] entries = (string[])(object)source.Keys.ToArray();
+                KeyAnalyzer.AnalysisResults analysis = KeyAnalyzer.Analyze(entries, ReferenceEquals(stringComparer, StringComparer.OrdinalIgnoreCase), minLength, maxLength);
+                if (analysis.SubstringHashing)
+                {
+                    if (analysis.RightJustifiedSubstring)
                     {
-                        if (results.RightJustifiedSubstring)
+                        if (analysis.IgnoreCase)
                         {
-                            if (results.IgnoreCase)
-                            {
-                                frozenDictionary = results.AllAscii
-                                    ? new OrdinalStringFrozenDictionary_RightJustifiedCaseInsensitiveAsciiSubstring<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount)
-                                    : new OrdinalStringFrozenDictionary_RightJustifiedCaseInsensitiveSubstring<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount);
-                            }
-                            else
-                            {
-                                frozenDictionary = results.HashCount == 1
-                                    ? new OrdinalStringFrozenDictionary_RightJustifiedSingleChar<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex)
-                                    : new OrdinalStringFrozenDictionary_RightJustifiedSubstring<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount);
-                            }
+                            frozenDictionary = analysis.AllAsciiIfIgnoreCase
+                                ? new OrdinalStringFrozenDictionary_RightJustifiedCaseInsensitiveAsciiSubstring<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount)
+                                : new OrdinalStringFrozenDictionary_RightJustifiedCaseInsensitiveSubstring<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount);
                         }
                         else
                         {
-                            if (results.IgnoreCase)
-                            {
-                                frozenDictionary = results.AllAscii
-                                    ? new OrdinalStringFrozenDictionary_LeftJustifiedCaseInsensitiveAsciiSubstring<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount)
-                                    : new OrdinalStringFrozenDictionary_LeftJustifiedCaseInsensitiveSubstring<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount);
-                            }
-                            else
-                            {
-                                frozenDictionary = results.HashCount == 1
-                                    ? new OrdinalStringFrozenDictionary_LeftJustifiedSingleChar<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex)
-                                    : new OrdinalStringFrozenDictionary_LeftJustifiedSubstring<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount);
-                            }
+                            frozenDictionary = analysis.HashCount == 1
+                                ? new OrdinalStringFrozenDictionary_RightJustifiedSingleChar<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex)
+                                : new OrdinalStringFrozenDictionary_RightJustifiedSubstring<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount);
                         }
                     }
                     else
                     {
-                        if (results.IgnoreCase)
+                        if (analysis.IgnoreCase)
                         {
-                            frozenDictionary = results.AllAscii
-                                ? new OrdinalStringFrozenDictionary_FullCaseInsensitiveAscii<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff)
-                                : new OrdinalStringFrozenDictionary_FullCaseInsensitive<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff);
+                            frozenDictionary = analysis.AllAsciiIfIgnoreCase
+                                ? new OrdinalStringFrozenDictionary_LeftJustifiedCaseInsensitiveAsciiSubstring<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount)
+                                : new OrdinalStringFrozenDictionary_LeftJustifiedCaseInsensitiveSubstring<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount);
                         }
                         else
                         {
-                            frozenDictionary = new OrdinalStringFrozenDictionary_Full<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff);
+                            frozenDictionary = analysis.HashCount == 1
+                                ? new OrdinalStringFrozenDictionary_LeftJustifiedSingleChar<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex)
+                                : new OrdinalStringFrozenDictionary_LeftJustifiedSubstring<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount);
                         }
                     }
-
-                    return (FrozenDictionary<TKey, TValue>)(object)frozenDictionary;
                 }
+                else
+                {
+                    if (analysis.IgnoreCase)
+                    {
+                        frozenDictionary = analysis.AllAsciiIfIgnoreCase
+                            ? new OrdinalStringFrozenDictionary_FullCaseInsensitiveAscii<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff)
+                            : new OrdinalStringFrozenDictionary_FullCaseInsensitive<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff);
+                    }
+                    else
+                    {
+                        frozenDictionary = new OrdinalStringFrozenDictionary_Full<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff);
+                    }
+                }
+
+                return (FrozenDictionary<TKey, TValue>)(object)frozenDictionary;
             }
 
+            // Optimize for very small numbers of items by using a specialized implementation that just does a linear search.
             if (source.Count <= Constants.MaxItemsInSmallFrozenCollection)
             {
                 // Use the specialized dictionary for low item counts.
index 7237e79..31214ff 100644 (file)
@@ -5,8 +5,6 @@ using System.Collections.Generic;
 using System.Collections.Immutable;
 using System.Diagnostics;
 using System.Diagnostics.CodeAnalysis;
-using System.Linq;
-using System.Numerics;
 using System.Runtime.InteropServices;
 
 namespace System.Collections.Frozen
@@ -125,115 +123,117 @@ namespace System.Collections.Frozen
         {
             IEqualityComparer<T> comparer = source.Comparer;
 
-            if (typeof(T).IsValueType)
+            // Optimize for value types when the default comparer is being used. In such a case, the implementation
+            // may use {Equality}Comparer<T>.Default.Compare/Equals/GetHashCode directly, with generic specialization enabling
+            // the Equals/GetHashCode methods to be devirtualized and possibly inlined.
+            if (typeof(T).IsValueType && ReferenceEquals(comparer, EqualityComparer<T>.Default))
             {
-                // Optimize for value types when the default comparer is being used. In such a case, the implementation
-                // may use {Equality}Comparer<T>.Default.Compare/Equals/GetHashCode directly, with generic specialization enabling
-                // the Equals/GetHashCode methods to be devirtualized and possibly inlined.
-                if (ReferenceEquals(comparer, EqualityComparer<T>.Default))
+                if (source.Count <= Constants.MaxItemsInSmallValueTypeFrozenCollection)
                 {
-                    if (source.Count <= Constants.MaxItemsInSmallValueTypeFrozenCollection)
+                    // If the type is a something we know we can efficiently compare, use a specialized implementation
+                    // that will enable quickly ruling out values outside of the range of keys stored.
+                    if (Constants.IsKnownComparable<T>())
                     {
-                        // If the type is a something we know we can efficiently compare, use a specialized implementation
-                        // that will enable quickly ruling out values outside of the range of keys stored.
-                        if (Constants.IsKnownComparable<T>())
-                        {
-                            return (FrozenSet<T>)(object)new SmallValueTypeComparableFrozenSet<T>(source);
-                        }
-
-                        // Otherwise, use an implementation optimized for a small number of value types using the default comparer.
-                        return (FrozenSet<T>)(object)new SmallValueTypeDefaultComparerFrozenSet<T>(source);
+                        return (FrozenSet<T>)(object)new SmallValueTypeComparableFrozenSet<T>(source);
                     }
 
-                    // Use a hash-based implementation.
+                    // Otherwise, use an implementation optimized for a small number of value types using the default comparer.
+                    return (FrozenSet<T>)(object)new SmallValueTypeDefaultComparerFrozenSet<T>(source);
+                }
 
-                    // For Int32 values, we can reuse the item storage as the hash storage, saving on space and extra indirection.
-                    if (typeof(T) == typeof(int))
-                    {
-                        return (FrozenSet<T>)(object)new Int32FrozenSet((HashSet<int>)(object)source);
-                    }
+                // Use a hash-based implementation.
 
-                    // Fallback to an implementation usable with any value type and the default comparer.
-                    return new ValueTypeDefaultComparerFrozenSet<T>(source);
+                // For Int32 values, we can reuse the item storage as the hash storage, saving on space and extra indirection.
+                if (typeof(T) == typeof(int))
+                {
+                    return (FrozenSet<T>)(object)new Int32FrozenSet((HashSet<int>)(object)source);
                 }
+
+                // Fallback to an implementation usable with any value type and the default comparer.
+                return new ValueTypeDefaultComparerFrozenSet<T>(source);
             }
-            else if (typeof(T) == typeof(string))
+
+            // Optimize for strings when the default, Ordinal, or OrdinalIgnoreCase comparer is being used.
+            // Null is rare as a value in the set and we don't optimize for it.  This enables the ordinal string
+            // implementation to fast-path out on null inputs rather than having to accommodate null inputs.
+            if (typeof(T) == typeof(string) &&
+                !source.Contains(default!) &&
+                (ReferenceEquals(comparer, EqualityComparer<T>.Default) || ReferenceEquals(comparer, StringComparer.Ordinal) || ReferenceEquals(comparer, StringComparer.OrdinalIgnoreCase)))
             {
-                // Null is rare as a value in the set and we don't optimize for it.  This enables the ordinal string
-                // implementation to fast-path out on null inputs rather than having to accommodate null inputs.
-                if (!source.Contains(default!))
+                HashSet<string> stringValues = (HashSet<string>)(object)source;
+                var entries = new string[stringValues.Count];
+                stringValues.CopyTo(entries);
+                IEqualityComparer<string> stringComparer = (IEqualityComparer<string>)(object)comparer;
+
+                // Calculate the minimum and maximum lengths of the strings in the set. Several of the analyses need this.
+                int minLength = int.MaxValue, maxLength = 0;
+                foreach (string s in entries)
                 {
-                    // If the value is a string and the comparer is known to provide ordinal (case-sensitive or case-insensitive) semantics,
-                    // we can use an implementation that's able to examine and optimize based on lengths and/or subsequences within those strings.
-                    if (ReferenceEquals(comparer, EqualityComparer<T>.Default) ||
-                        ReferenceEquals(comparer, StringComparer.Ordinal) ||
-                        ReferenceEquals(comparer, StringComparer.OrdinalIgnoreCase))
-                    {
-                        HashSet<string> stringValues = (HashSet<string>)(object)source;
-                        var entries = new string[stringValues.Count];
-                        stringValues.CopyTo(entries);
+                    if (s.Length < minLength) minLength = s.Length;
+                    if (s.Length > maxLength) maxLength = s.Length;
+                }
+                Debug.Assert(minLength >= 0 && maxLength >= minLength);
 
-                        IEqualityComparer<string> stringComparer = (IEqualityComparer<string>)(object)comparer;
+                // Try to create an implementation that uses length buckets, where each bucket contains up to only a few strings of the same length.
+                FrozenSet<string>? frozenSet = LengthBucketsFrozenSet.CreateLengthBucketsFrozenSetIfAppropriate(entries, stringComparer, minLength, maxLength);
+                if (frozenSet is not null)
+                {
+                    return (FrozenSet<T>)(object)frozenSet;
+                }
 
-                        FrozenSet<string>? frozenSet = LengthBucketsFrozenSet.CreateLengthBucketsFrozenSetIfAppropriate(entries, stringComparer);
-                        if (frozenSet is not null)
+                // Analyze the values for unique substrings and create an implementation that minimizes the cost of hashing keys.
+                KeyAnalyzer.AnalysisResults analysis = KeyAnalyzer.Analyze(entries, ReferenceEquals(stringComparer, StringComparer.OrdinalIgnoreCase), minLength, maxLength);
+                if (analysis.SubstringHashing)
+                {
+                    if (analysis.RightJustifiedSubstring)
+                    {
+                        if (analysis.IgnoreCase)
                         {
-                            return (FrozenSet<T>)(object)frozenSet;
+                            frozenSet = analysis.AllAsciiIfIgnoreCase
+                                ? new OrdinalStringFrozenSet_RightJustifiedCaseInsensitiveAsciiSubstring(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount)
+                                : new OrdinalStringFrozenSet_RightJustifiedCaseInsensitiveSubstring(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount);
                         }
-
-                        KeyAnalyzer.Analyze(entries, ReferenceEquals(stringComparer, StringComparer.OrdinalIgnoreCase), out KeyAnalyzer.AnalysisResults results);
-                        if (results.SubstringHashing)
+                        else
                         {
-                            if (results.RightJustifiedSubstring)
-                            {
-                                if (results.IgnoreCase)
-                                {
-                                    frozenSet = results.AllAscii
-                                        ? new OrdinalStringFrozenSet_RightJustifiedCaseInsensitiveAsciiSubstring(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount)
-                                        : new OrdinalStringFrozenSet_RightJustifiedCaseInsensitiveSubstring(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount);
-                                }
-                                else
-                                {
-                                    frozenSet = results.HashCount == 1
-                                        ? new OrdinalStringFrozenSet_RightJustifiedSingleChar(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex)
-                                        : new OrdinalStringFrozenSet_RightJustifiedSubstring(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount);
-                                }
-                            }
-                            else
-                            {
-                                if (results.IgnoreCase)
-                                {
-                                    frozenSet = results.AllAscii
-                                        ? new OrdinalStringFrozenSet_LeftJustifiedCaseInsensitiveAsciiSubstring(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount)
-                                        : new OrdinalStringFrozenSet_LeftJustifiedCaseInsensitiveSubstring(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount);
-                                }
-                                else
-                                {
-                                    frozenSet = results.HashCount == 1
-                                        ? new OrdinalStringFrozenSet_LeftJustifiedSingleChar(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex)
-                                        : new OrdinalStringFrozenSet_LeftJustifiedSubstring(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount);
-                                }
-                            }
+                            frozenSet = analysis.HashCount == 1
+                                ? new OrdinalStringFrozenSet_RightJustifiedSingleChar(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex)
+                                : new OrdinalStringFrozenSet_RightJustifiedSubstring(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount);
+                        }
+                    }
+                    else
+                    {
+                        if (analysis.IgnoreCase)
+                        {
+                            frozenSet = analysis.AllAsciiIfIgnoreCase
+                                ? new OrdinalStringFrozenSet_LeftJustifiedCaseInsensitiveAsciiSubstring(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount)
+                                : new OrdinalStringFrozenSet_LeftJustifiedCaseInsensitiveSubstring(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount);
                         }
                         else
                         {
-                            if (results.IgnoreCase)
-                            {
-                                frozenSet = results.AllAscii
-                                    ? new OrdinalStringFrozenSet_FullCaseInsensitiveAscii(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff)
-                                    : new OrdinalStringFrozenSet_FullCaseInsensitive(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff);
-                            }
-                            else
-                            {
-                                frozenSet = new OrdinalStringFrozenSet_Full(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff);
-                            }
+                            frozenSet = analysis.HashCount == 1
+                                ? new OrdinalStringFrozenSet_LeftJustifiedSingleChar(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex)
+                                : new OrdinalStringFrozenSet_LeftJustifiedSubstring(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount);
                         }
-
-                        return (FrozenSet<T>)(object)frozenSet;
                     }
                 }
+                else
+                {
+                    if (analysis.IgnoreCase)
+                    {
+                        frozenSet = analysis.AllAsciiIfIgnoreCase
+                            ? new OrdinalStringFrozenSet_FullCaseInsensitiveAscii(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff)
+                            : new OrdinalStringFrozenSet_FullCaseInsensitive(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff);
+                    }
+                    else
+                    {
+                        frozenSet = new OrdinalStringFrozenSet_Full(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff);
+                    }
+                }
+
+                return (FrozenSet<T>)(object)frozenSet;
             }
 
+            // Optimize for very small numbers of items by using a specialized implementation that just does a linear search.
             if (source.Count <= Constants.MaxItemsInSmallFrozenCollection)
             {
                 // use the specialized set for low item counts
index 03fc4a9..4db4d2f 100644 (file)
@@ -12,64 +12,127 @@ namespace System.Collections.Frozen
         // TODO https://github.com/dotnet/runtime/issues/77679:
         // Replace these once non-randomized implementations are available.
 
+        // Lengths 0 to 4 are unrolled manually due to their commonality, especially
+        // with the substring-based dictionary/sets that use substrings with length <= 4.
+
+        private const uint Hash1Start = (5381 << 16) + 5381;
+        private const uint Factor = 1_566_083_941;
+
         public static unsafe int GetHashCodeOrdinal(ReadOnlySpan<char> s)
         {
             int length = s.Length;
             fixed (char* src = &MemoryMarshal.GetReference(s))
             {
-                uint hash1 = (5381 << 16) + 5381;
-                uint hash2 = hash1;
-
-                uint* ptrUInt32 = (uint*)src;
-                while (length > 3)
-                {
-                    hash1 = BitOperations.RotateLeft(hash1, 5) + hash1 ^ ptrUInt32[0];
-                    hash2 = BitOperations.RotateLeft(hash2, 5) + hash2 ^ ptrUInt32[1];
-                    ptrUInt32 += 2;
-                    length -= 4;
-                }
-
-                char* ptrChar = (char*)ptrUInt32;
-                while (length-- > 0)
+                uint hash1, hash2;
+                switch (length)
                 {
-                    hash2 = BitOperations.RotateLeft(hash2, 5) + hash2 ^ *ptrChar++;
+                    case 0:
+                        return (int)(Hash1Start + unchecked(Hash1Start * Factor));
+
+                    case 1:
+                        hash2 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ src[0];
+                        return (int)(Hash1Start + (hash2 * Factor));
+
+                    case 2:
+                        hash2 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ src[0];
+                        hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ src[1];
+                        return (int)(Hash1Start + (hash2 * Factor));
+
+                    case 3:
+                        hash2 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ src[0];
+                        hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ src[1];
+                        hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ src[2];
+                        return (int)(Hash1Start + (hash2 * Factor));
+
+                    case 4:
+                        hash1 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ ((uint*)src)[0];
+                        hash2 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ ((uint*)src)[1];
+                        return (int)(hash1 + (hash2 * Factor));
+
+                    default:
+                        hash1 = Hash1Start;
+                        hash2 = hash1;
+
+                        uint* ptrUInt32 = (uint*)src;
+                        while (length >= 4)
+                        {
+                            hash1 = (BitOperations.RotateLeft(hash1, 5) + hash1) ^ ptrUInt32[0];
+                            hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ ptrUInt32[1];
+                            ptrUInt32 += 2;
+                            length -= 4;
+                        }
+
+                        char* ptrChar = (char*)ptrUInt32;
+                        while (length-- > 0)
+                        {
+                            hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ *ptrChar++;
+                        }
+
+                        return (int)(hash1 + (hash2 * Factor));
                 }
-
-                return (int)(hash1 + (hash2 * 1_566_083_941));
             }
         }
 
         // useful if the string only contains ASCII characters
         public static unsafe int GetHashCodeOrdinalIgnoreCaseAscii(ReadOnlySpan<char> s)
         {
+            // We "normalize to lowercase" every char by ORing with 0x20. This casts
+            // a very wide net because it will change, e.g., '^' to '~'. But that should
+            // be ok because we expect this to be very rare in practice.
+            const uint LowercaseChar = 0x20u;
+            const uint LowercaseUInt32 = 0x0020_0020u;
+
             int length = s.Length;
             fixed (char* src = &MemoryMarshal.GetReference(s))
             {
-                uint hash1 = (5381 << 16) + 5381;
-                uint hash2 = hash1;
-
-                // We "normalize to lowercase" every char by ORing with 0x0020. This casts
-                // a very wide net because it will change, e.g., '^' to '~'. But that should
-                // be ok because we expect this to be very rare in practice.
-                const uint NormalizeToLowercase = 0x0020_0020u; // valid both for big-endian and for little-endian
-
-                uint* ptrUInt32 = (uint*)src;
-                while (length > 3)
+                uint hash1, hash2;
+                switch (length)
                 {
-                    hash1 = BitOperations.RotateLeft(hash1, 5) + hash1 ^ (ptrUInt32[0] | NormalizeToLowercase);
-                    hash2 = BitOperations.RotateLeft(hash2, 5) + hash2 ^ (ptrUInt32[1] | NormalizeToLowercase);
-                    ptrUInt32 += 2;
-                    length -= 4;
+                    case 0:
+                        return (int)(Hash1Start + unchecked(Hash1Start * Factor));
+
+                    case 1:
+                        hash2 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ (src[0] | LowercaseChar);
+                        return (int)(Hash1Start + (hash2 * Factor));
+
+                    case 2:
+                        hash2 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ (src[0] | LowercaseChar);
+                        hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ (src[1] | LowercaseChar);
+                        return (int)(Hash1Start + (hash2 * Factor));
+
+                    case 3:
+                        hash2 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ (src[0] | LowercaseChar);
+                        hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ (src[1] | LowercaseChar);
+                        hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ (src[2] | LowercaseChar);
+                        return (int)(Hash1Start + (hash2 * Factor));
+
+                    case 4:
+                        hash1 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ (((uint*)src)[0] | LowercaseUInt32);
+                        hash2 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ (((uint*)src)[1] | LowercaseUInt32);
+                        return (int)(hash1 + (hash2 * Factor));
+
+                    default:
+                        hash1 = Hash1Start;
+                        hash2 = hash1;
+
+                        uint* ptrUInt32 = (uint*)src;
+                        while (length >= 4)
+                        {
+                            hash1 = (BitOperations.RotateLeft(hash1, 5) + hash1) ^ (ptrUInt32[0] | LowercaseUInt32);
+                            hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ (ptrUInt32[1] | LowercaseUInt32);
+                            ptrUInt32 += 2;
+                            length -= 4;
+                        }
+
+                        char* ptrChar = (char*)ptrUInt32;
+                        while (length-- > 0)
+                        {
+                            hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ (*ptrChar | LowercaseUInt32);
+                            ptrChar++;
+                        }
+
+                        return (int)(hash1 + (hash2 * Factor));
                 }
-
-                char* ptrChar = (char*)ptrUInt32;
-                while (length-- > 0)
-                {
-                    hash2 = BitOperations.RotateLeft(hash2, 5) + hash2 ^ (*ptrChar | NormalizeToLowercase);
-                    ptrChar++;
-                }
-
-                return (int)(hash1 + (hash2 * 1_566_083_941));
             }
         }
 
@@ -83,34 +146,14 @@ namespace System.Collections.Frozen
                 (rentedArray = ArrayPool<char>.Shared.Rent(length));
 
             length = s.ToUpperInvariant(scratch); // NOTE: this really should be the (non-existent) ToUpperOrdinal
-
-            uint hash1 = (5381 << 16) + 5381;
-            uint hash2 = hash1;
-
-            fixed (char* src = &MemoryMarshal.GetReference(scratch))
-            {
-                uint* ptrUInt32 = (uint*)src;
-                while (length > 3)
-                {
-                    hash1 = (BitOperations.RotateLeft(hash1, 5) + hash1) ^ ptrUInt32[0];
-                    hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ ptrUInt32[1];
-                    ptrUInt32 += 2;
-                    length -= 4;
-                }
-
-                char* ptrChar = (char*)ptrUInt32;
-                while (length-- > 0)
-                {
-                    hash2 = BitOperations.RotateLeft(hash2, 5) + hash2 ^ *ptrChar++;
-                }
-            }
+            int hash = GetHashCodeOrdinal(scratch.Slice(0, length));
 
             if (rentedArray is not null)
             {
                 ArrayPool<char>.Shared.Return(rentedArray);
             }
 
-            return (int)(hash1 + (hash2 * 1_566_083_941));
+            return hash;
         }
     }
 }
index 1060df2..ee80e6a 100644 (file)
@@ -1,8 +1,12 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+using System.Buffers;
 using System.Collections.Generic;
+using System.Diagnostics;
+#if !NET8_0_OR_GREATER
 using System.Runtime.CompilerServices;
+#endif
 
 namespace System.Collections.Frozen
 {
@@ -24,119 +28,152 @@ namespace System.Collections.Frozen
         /// In whatever slice we end up with, if all the characters involved in the slice are ASCII and we're doing case-insensitive
         /// operations, then we can select an ASCII-specific case-insensitive comparer which yields faster overall performance.
         /// </remarks>
-        public static void Analyze(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, out AnalysisResults results)
+        public static AnalysisResults Analyze(
+            ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength)
         {
-            // First, try to pick a substring comparer.
-            // if we can't find a good substring comparer, fallback to a full string comparer.
-            if (!UseSubstring(uniqueStrings, ignoreCase, out results))
-            {
-                UseFullString(uniqueStrings, ignoreCase, out results);
-            }
+            Debug.Assert(!uniqueStrings.IsEmpty);
 
-            // Calculate the trivial rejection boundaries.
-            int min = int.MaxValue, max = 0;
-            foreach (string s in uniqueStrings)
+            // Try to pick a substring comparer. If we can't find a good substring comparer, fallback to a full string comparer.
+            AnalysisResults results;
+            if (minLength == 0 || !TryUseSubstring(uniqueStrings, ignoreCase, minLength, maxLength, out results))
             {
-                if (s.Length < min)
-                {
-                    min = s.Length;
-                }
-
-                if (s.Length > max)
-                {
-                    max = s.Length;
-                }
+                results = CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, 0, 0, static (s, _, _) => s.AsSpan());
             }
 
-            results.MinimumLength = min;
-            results.MaximumLengthDiff = max - min;
+            return results;
         }
 
-        private static bool UseSubstring(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, out AnalysisResults results)
+        /// <summary>Try to find the minimal unique substring index/length to use for comparisons.</summary>
+        private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results)
         {
             const double SufficientUniquenessFactor = 0.95; // 95% is good enough
-
-            // What is the shortest string? This represents the maximum substring length we consider
-            int maxSubstringLength = int.MaxValue;
-            foreach (string s in uniqueStrings)
-            {
-                if (s.Length < maxSubstringLength)
-                {
-                    maxSubstringLength = s.Length;
-                }
-            }
+            const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... it's not worth the increase in algorithmic complexity to analyze longer substrings
 
             SubstringComparer leftComparer = ignoreCase ? new LeftJustifiedCaseInsensitiveSubstringComparer() : new LeftJustifiedSubstringComparer();
-            SubstringComparer rightComparer = ignoreCase ? new RightJustifiedCaseInsensitiveSubstringComparer() : new RightJustifiedSubstringComparer();
+            HashSet<string> leftSet = new HashSet<string>(
+#if NET6_0_OR_GREATER
+                uniqueStrings.Length,
+#endif
+                leftComparer);
+
+            HashSet<string>? rightSet = null;
+            SubstringComparer? rightComparer = null;
 
-            // try to find the minimal unique substring to use for comparisons
-            var leftSet = new HashSet<string>(leftComparer);
-            var rightSet = new HashSet<string>(rightComparer);
+            // For each substring length...
+            int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit);
             for (int count = 1; count <= maxSubstringLength; count++)
             {
-                for (int index = 0; index <= maxSubstringLength - count; index++)
+                leftComparer.Count = count;
+
+                // For each index, get a uniqueness factor for the left-justified substrings.
+                // If any is above our threshold, we're done.
+                for (int index = 0; index <= minLength - count; index++)
                 {
                     leftComparer.Index = index;
-                    leftComparer.Count = count;
-
                     double factor = GetUniquenessFactor(leftSet, uniqueStrings);
                     if (factor >= SufficientUniquenessFactor)
                     {
-                        bool allAscii = true;
-                        foreach (string s in uniqueStrings)
-                        {
-                            if (!IsAllAscii(s.AsSpan(leftComparer.Index, leftComparer.Count)))
-                            {
-                                allAscii = false;
-                                break;
-                            }
-                        }
-
-                        results = new(allAscii, ignoreCase, 0, 0, leftComparer.Index, leftComparer.Count);
+                        results = CreateAnalysisResults(
+                            uniqueStrings, ignoreCase, minLength, maxLength, index, count,
+                            static (string s, int index, int count) => s.AsSpan(index, count));
                         return true;
                     }
+                }
 
-                    rightComparer.Index = -index - count;
+                // There were no left-justified substrings of this length available.
+                // If all of the strings are of the same length, then just checking left-justification is sufficient.
+                // But if any strings are of different lengths, then we'll get different alignments for left- vs
+                // right-justified substrings, and so we also check right-justification.
+                if (minLength != maxLength)
+                {
+                    // Lazily-initialize the right-comparer/set state, as it's often not needed.
+                    if (rightComparer is null)
+                    {
+                        rightComparer = ignoreCase ? new RightJustifiedCaseInsensitiveSubstringComparer() : new RightJustifiedSubstringComparer();
+                        rightSet = new HashSet<string>(
+#if NET6_0_OR_GREATER
+                            uniqueStrings.Length,
+#endif
+                            rightComparer);
+                    }
                     rightComparer.Count = count;
+                    Debug.Assert(rightSet is not null);
 
-                    factor = GetUniquenessFactor(rightSet, uniqueStrings);
-                    if (factor >= SufficientUniquenessFactor)
+                    // For each index, get a uniqueness factor for the right-justified substrings.
+                    // If any is above our threshold, we're done.
+                    for (int index = 0; index <= minLength - count; index++)
                     {
-                        bool allAscii = true;
-                        foreach (string s in uniqueStrings)
+                        // Get a uniqueness factor for the right-justified substrings.
+                        // If it's above our threshold, we're done.
+                        rightComparer.Index = -index - count;
+                        double factor = GetUniquenessFactor(rightSet, uniqueStrings);
+                        if (factor >= SufficientUniquenessFactor)
                         {
-                            if (!IsAllAscii(s.AsSpan(s.Length + rightComparer.Index, rightComparer.Count)))
-                            {
-                                allAscii = false;
-                                break;
-                            }
+                            results = CreateAnalysisResults(
+                                uniqueStrings, ignoreCase, minLength, maxLength, rightComparer.Index, count,
+                                static (string s, int index, int count) => s.AsSpan(s.Length + index, count));
+                            return true;
                         }
-
-                        results = new(allAscii, ignoreCase, 0, 0, rightComparer.Index, rightComparer.Count);
-                        return true;
                     }
                 }
             }
 
+            // Could not find a substring index/length that was good enough.
             results = default;
             return false;
         }
 
-        private static void UseFullString(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, out AnalysisResults results)
+        private static AnalysisResults CreateAnalysisResults(
+            ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, int index, int count, GetSpan getSubstringSpan)
         {
-            bool allAscii = true;
-            foreach (string s in uniqueStrings)
+            // Start off by assuming all strings are ASCII
+            bool allAsciiIfIgnoreCase = true;
+
+            // If we're case-sensitive, it doesn't matter if the strings are ASCII or not.
+            // But if we're case-insensitive, we can switch to a faster comparer if all the
+            // substrings are ASCII, so we check each.
+            if (ignoreCase)
             {
-                if (!IsAllAscii(s.AsSpan()))
+                // Further, if the ASCII substrings don't contain any letters, then we can
+                // actually perform the comparison as case-sensitive even if case-insensitive
+                // was requested, as there's nothing that would compare equally to the substring
+                // other than the substring itself.
+                bool canSwitchIgnoreCaseToCaseSensitive = ignoreCase;
+
+                foreach (string s in uniqueStrings)
+                {
+                    // Get the span for the substring.
+                    ReadOnlySpan<char> substring = getSubstringSpan(s, index, count);
+
+                    // If the substring isn't ASCII, bail out to return the results.
+                    if (!IsAllAscii(substring))
+                    {
+                        allAsciiIfIgnoreCase = false;
+                        canSwitchIgnoreCaseToCaseSensitive = false;
+                        break;
+                    }
+
+                    // All substrings so far are still ASCII only.  If this one contains any ASCII
+                    // letters, mark that we can't switch to case-sensitive.
+                    if (canSwitchIgnoreCaseToCaseSensitive && ContainsAnyLetters(substring))
+                    {
+                        canSwitchIgnoreCaseToCaseSensitive = false;
+                    }
+                }
+
+                // If we can switch to case-sensitive, do so.
+                if (canSwitchIgnoreCaseToCaseSensitive)
                 {
-                    allAscii = false;
-                    break;
+                    ignoreCase = false;
                 }
             }
 
-            results = new(allAscii, ignoreCase, 0, 0, 0, 0);
+            // Return the analysis results.
+            return new AnalysisResults(ignoreCase, allAsciiIfIgnoreCase, index, count, minLength, maxLength);
         }
 
+        private delegate ReadOnlySpan<char> GetSpan(string s, int index, int count);
+
         internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s)
         {
 #if NET8_0_OR_GREATER
@@ -147,7 +184,7 @@ namespace System.Collections.Frozen
                 uint* ptrUInt32 = (uint*)src;
                 int length = s.Length;
 
-                while (length > 3)
+                while (length >= 4)
                 {
                     if (!AllCharsInUInt32AreAscii(ptrUInt32[0] | ptrUInt32[1]))
                     {
@@ -162,7 +199,7 @@ namespace System.Collections.Frozen
                 while (length-- > 0)
                 {
                     char ch = *ptrChar++;
-                    if (ch >= 0x7f)
+                    if (ch >= 0x80)
                     {
                         return false;
                     }
@@ -176,6 +213,28 @@ namespace System.Collections.Frozen
 #endif
         }
 
+#if NET8_0_OR_GREATER
+        private static readonly SearchValues<char> s_asciiLetters = SearchValues.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
+#endif
+        private static bool ContainsAnyLetters(ReadOnlySpan<char> s)
+        {
+            Debug.Assert(IsAllAscii(s));
+
+#if NET8_0_OR_GREATER
+            return s.IndexOfAny(s_asciiLetters) >= 0;
+#else
+            foreach (char c in s)
+            {
+                Debug.Assert(c <= 0x7f);
+                if ((uint)((c | 0x20) - 'a') <= (uint)('z' - 'a'))
+                {
+                    return true;
+                }
+            }
+            return false;
+#endif
+        }
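
The non-NET8 branch above relies on the classic ASCII folding trick; a standalone restatement for clarity (helper name is illustrative):

    // For ASCII input, OR-ing with 0x20 maps 'A'..'Z' onto 'a'..'z' and leaves
    // 'a'..'z' unchanged, so one unsigned range test detects letters of either
    // case without a per-case branch.
    static bool IsAsciiLetterSketch(char c) =>
        (uint)((c | 0x20) - 'a') <= (uint)('z' - 'a');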
+
         private static double GetUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings)
         {
             set.Clear();
@@ -187,30 +246,24 @@ namespace System.Collections.Frozen
             return set.Count / (double)uniqueStrings.Length;
         }
 
-        internal struct AnalysisResults
+        internal readonly struct AnalysisResults
         {
-            public AnalysisResults(
-                bool allAscii,
-                bool ignoreCase,
-                int minimumLength,
-                int maximumLengthDiff,
-                int hashIndex,
-                int hashCount)
+            public AnalysisResults(bool ignoreCase, bool allAsciiIfIgnoreCase, int hashIndex, int hashCount, int minLength, int maxLength)
             {
-                AllAscii = allAscii;
                 IgnoreCase = ignoreCase;
-                MinimumLength = minimumLength;
-                MaximumLengthDiff = maximumLengthDiff;
+                AllAsciiIfIgnoreCase = allAsciiIfIgnoreCase;
                 HashIndex = hashIndex;
                 HashCount = hashCount;
+                MinimumLength = minLength;
+                MaximumLengthDiff = maxLength - minLength;
             }
 
-            public bool AllAscii { get; }
             public bool IgnoreCase { get; }
-            public int MinimumLength { get; set; }
-            public int MaximumLengthDiff { get; set; }
+            public bool AllAsciiIfIgnoreCase { get; }
             public int HashIndex { get; }
             public int HashCount { get; }
+            public int MinimumLength { get; }
+            public int MaximumLengthDiff { get; }
 
             public bool SubstringHashing => HashCount != 0;
             public bool RightJustifiedSubstring => HashIndex < 0;
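
Judging from the two properties just above and the expectations in the tests later in this diff, a negative HashIndex encodes a start offset measured from the end of the key. A hedged sketch (illustrative helper, applicable when SubstringHashing is true) of how a caller would locate the hashed substring:

    // Assumption: left-justified substrings start at HashIndex; right-justified
    // ones (HashIndex < 0) start at s.Length + HashIndex, e.g. HashIndex == -2
    // with HashCount == 1 selects the second-to-last character.
    static ReadOnlySpan<char> GetHashSpanSketch(string s, KeyAnalyzer.AnalysisResults r) =>
        r.RightJustifiedSubstring
            ? s.AsSpan(s.Length + r.HashIndex, r.HashCount)
            : s.AsSpan(r.HashIndex, r.HashCount);
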
index d8c3e2f..c48b7ae 100644 (file)
@@ -2,9 +2,9 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 using System.Collections.Generic;
-using System.Collections.Immutable;
 using System.Diagnostics;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 
 namespace System.Collections.Frozen
 {
@@ -35,25 +35,36 @@ namespace System.Collections.Frozen
             _ignoreCase = ReferenceEquals(comparer, StringComparer.OrdinalIgnoreCase);
         }
 
-        internal static LengthBucketsFrozenDictionary<TValue>? CreateLengthBucketsFrozenDictionaryIfAppropriate(Dictionary<string, TValue> source, IEqualityComparer<string> comparer)
+        internal static LengthBucketsFrozenDictionary<TValue>? CreateLengthBucketsFrozenDictionaryIfAppropriate(
+            Dictionary<string, TValue> source, IEqualityComparer<string> comparer, int minLength, int maxLength)
         {
             Debug.Assert(source.Count != 0);
             Debug.Assert(comparer == EqualityComparer<string>.Default || comparer == StringComparer.Ordinal || comparer == StringComparer.OrdinalIgnoreCase);
+            Debug.Assert(minLength >= 0 && maxLength >= minLength);
+
+            // If, without even looking at the keys, we know that some bucket must exceed the max per-bucket
+            // limit (pigeonhole principle), we can exit early without doing any further work.
+            int spread = maxLength - minLength + 1;
+            if (source.Count / spread > MaxPerLength)
+            {
+                return null;
+            }
 
             // Iterate through all of the inputs, bucketing them based on the length of the string.
             var groupedByLength = new Dictionary<int, List<KeyValuePair<string, TValue>>>();
-            int minLength = int.MaxValue, maxLength = int.MinValue;
             foreach (KeyValuePair<string, TValue> pair in source)
             {
                 string s = pair.Key;
+                Debug.Assert(s.Length >= minLength && s.Length <= maxLength);
 
-                if (s.Length < minLength) minLength = s.Length;
-                if (s.Length > maxLength) maxLength = s.Length;
-
+#if NET6_0_OR_GREATER
+                List<KeyValuePair<string, TValue>> list = CollectionsMarshal.GetValueRefOrAddDefault(groupedByLength, s.Length, out _) ??= new List<KeyValuePair<string, TValue>>(MaxPerLength);
+#else
                 if (!groupedByLength.TryGetValue(s.Length, out List<KeyValuePair<string, TValue>>? list))
                 {
                     groupedByLength[s.Length] = list = new List<KeyValuePair<string, TValue>>(MaxPerLength);
                 }
+#endif
 
                 // If we've already hit the max per-bucket limit, bail.
                 if (list.Count == MaxPerLength)
@@ -65,15 +76,14 @@ namespace System.Collections.Frozen
             }
 
             // If there would be too much empty space in the lookup array, bail.
-            int spread = maxLength - minLength + 1;
             if (groupedByLength.Count / (double)spread < EmptyLengthsRatio)
             {
                 return null;
             }
 
-            string[] keys = new string[source.Count];
-            TValue[] values = new TValue[keys.Length];
-            var lengthBuckets = new KeyValuePair<string, int>[maxLength - minLength + 1][];
+            var keys = new string[source.Count];
+            var values = new TValue[keys.Length];
+            var lengthBuckets = new KeyValuePair<string, int>[spread][];
 
             // Iterate through each bucket, filling the keys/values arrays, and creating a lookup array such that
             // given a string length we can index into that array to find the array of string,int pairs: the string
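
The early exit added in this file (and mirrored in the set below) is pigeonhole arithmetic; a hedged sketch with hypothetical numbers, since MaxPerLength's value isn't shown in this diff:

    // If even the floor of the average bucket size exceeds the per-bucket cap,
    // some bucket must overflow, so bail before iterating the keys at all.
    static bool CannotBucket(int count, int minLength, int maxLength, int maxPerLength)
    {
        int spread = maxLength - minLength + 1;   // number of possible length buckets
        return count / spread > maxPerLength;     // floor(average) already over the cap
    }
    // e.g. 1_000 keys with lengths in [3, 12] and a hypothetical cap of 5:
    // spread == 10 and 1_000 / 10 == 100 > 5, so bucketing is ruled out immediately.
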
index 123df43..880ef00 100644 (file)
@@ -3,6 +3,7 @@
 
 using System.Collections.Generic;
 using System.Diagnostics;
+using System.Runtime.InteropServices;
 
 namespace System.Collections.Frozen
 {
@@ -31,25 +32,35 @@ namespace System.Collections.Frozen
             _ignoreCase = ReferenceEquals(comparer, StringComparer.OrdinalIgnoreCase);
         }
 
-        internal static LengthBucketsFrozenSet? CreateLengthBucketsFrozenSetIfAppropriate(string[] entries, IEqualityComparer<string> comparer)
+        internal static LengthBucketsFrozenSet? CreateLengthBucketsFrozenSetIfAppropriate(string[] entries, IEqualityComparer<string> comparer, int minLength, int maxLength)
         {
             Debug.Assert(entries.Length != 0);
             Debug.Assert(comparer == EqualityComparer<string>.Default || comparer == StringComparer.Ordinal || comparer == StringComparer.OrdinalIgnoreCase);
+            Debug.Assert(minLength >= 0 && maxLength >= minLength);
+
+            // If, without even looking at the keys, we know that some bucket must exceed the max per-bucket
+            // limit (pigeonhole principle), we can exit early without doing any further work.
+            int spread = maxLength - minLength + 1;
+            if (entries.Length / spread > MaxPerLength)
+            {
+                return null;
+            }
 
             // Iterate through all of the inputs, bucketing them based on the length of the string.
             var groupedByLength = new Dictionary<int, List<string>>();
-            int minLength = int.MaxValue, maxLength = int.MinValue;
             foreach (string s in entries)
             {
                 Debug.Assert(s is not null, "This implementation should not be used with null source values.");
+                Debug.Assert(s.Length >= minLength && s.Length <= maxLength);
 
-                if (s.Length < minLength) minLength = s.Length;
-                if (s.Length > maxLength) maxLength = s.Length;
-
+#if NET6_0_OR_GREATER
+                List<string> list = CollectionsMarshal.GetValueRefOrAddDefault(groupedByLength, s.Length, out _) ??= new List<string>(MaxPerLength);
+#else
                 if (!groupedByLength.TryGetValue(s.Length, out List<string>? list))
                 {
                     groupedByLength[s.Length] = list = new List<string>(MaxPerLength);
                 }
+#endif
 
                 // If we've already hit the max per-bucket limit, bail.
                 if (list.Count == MaxPerLength)
@@ -61,13 +72,12 @@ namespace System.Collections.Frozen
             }
 
             // If there would be too much empty space in the lookup array, bail.
-            int spread = maxLength - minLength + 1;
             if (groupedByLength.Count / (double)spread < EmptyLengthsRatio)
             {
                 return null;
             }
 
-            var lengthBuckets = new KeyValuePair<string, int>[maxLength - minLength + 1][];
+            var lengthBuckets = new KeyValuePair<string, int>[spread][];
 
             // Iterate through each bucket, filling the items array, and creating a lookup array such that
             // given a string length we can index into that array to find the array of string,int pairs: the string
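
Both files now use the same .NET 6+ pattern to fold the lookup and insert into a single hash probe via CollectionsMarshal.GetValueRefOrAddDefault. A minimal self-contained sketch of that pattern (hypothetical names, not the code above):

    using System.Collections.Generic;
    using System.Runtime.InteropServices;

    // GetValueRefOrAddDefault returns a writable ref to the value slot for the key,
    // adding a default (null) entry if the key was absent, so ??= can create and
    // store the bucket list with one dictionary lookup instead of two.
    static List<string> GetOrAddBucket(Dictionary<int, List<string>> buckets, int length, int capacity)
    {
        ref List<string>? bucket = ref CollectionsMarshal.GetValueRefOrAddDefault(buckets, length, out _);
        return bucket ??= new List<string>(capacity);
    }
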
index c329ae4..6ddec56 100644 (file)
@@ -10,14 +10,18 @@ namespace System.Collections.Frozen.Tests
     {
         private static KeyAnalyzer.AnalysisResults RunAnalysis(string[] values, bool ignoreCase)
         {
-            KeyAnalyzer.Analyze(values, ignoreCase, out KeyAnalyzer.AnalysisResults r);
-
+            int minLength = int.MaxValue, maxLength = 0;
             foreach (string s in values)
             {
-                Assert.True(s.Length >= r.MinimumLength);
-                Assert.True(s.Length <= r.MinimumLength + r.MaximumLengthDiff);
+                if (s.Length < minLength) minLength = s.Length;
+                if (s.Length > maxLength) maxLength = s.Length;
             }
 
+            KeyAnalyzer.AnalysisResults r = KeyAnalyzer.Analyze(values, ignoreCase, minLength, maxLength);
+
+            Assert.All(values, s => Assert.InRange(s.Length, r.MinimumLength, int.MaxValue));
+            Assert.All(values, s => Assert.InRange(s.Length, 0, r.MinimumLength + r.MaximumLengthDiff));
+
             return r;
         }
 
@@ -55,42 +59,42 @@ namespace System.Collections.Frozen.Tests
             KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "É1" }, true);
             Assert.False(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.False(r.AllAscii);
+            Assert.False(r.AllAsciiIfIgnoreCase);
             Assert.Equal(0, r.HashIndex);
             Assert.Equal(1, r.HashCount);
 
             r = RunAnalysis(new[] { "É1", "T1" }, true);
             Assert.False(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.False(r.AllAscii);
+            Assert.False(r.AllAsciiIfIgnoreCase);
             Assert.Equal(0, r.HashIndex);
             Assert.Equal(1, r.HashCount);
 
             r = RunAnalysis(new[] { "ÉA1", "TA1", "ÉB1" }, true);
             Assert.False(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.False(r.AllAscii);
+            Assert.False(r.AllAsciiIfIgnoreCase);
             Assert.Equal(0, r.HashIndex);
             Assert.Equal(2, r.HashCount);
 
             r = RunAnalysis(new[] { "ABCDEÉ1ABCDEF", "ABCDETA1ABCDEF", "ABCDESB1ABCDEF" }, true);
             Assert.False(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.False(r.AllAscii);
+            Assert.False(r.AllAsciiIfIgnoreCase);
             Assert.Equal(5, r.HashIndex);
             Assert.Equal(1, r.HashCount);
 
             r = RunAnalysis(new[] { "ABCDEFÉ1ABCDEF", "ABCDEFTA1ABCDEF", "ABCDEFSB1ABCDEF" }, true);
             Assert.False(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.False(r.AllAscii);
+            Assert.False(r.AllAsciiIfIgnoreCase);
             Assert.Equal(6, r.HashIndex);
             Assert.Equal(1, r.HashCount);
 
             r = RunAnalysis(new[] { "ABCÉDEFÉ1ABCDEF", "ABCÉDEFTA1ABCDEF", "ABCÉDEFSB1ABCDEF" }, true);
             Assert.False(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.False(r.AllAscii);
+            Assert.False(r.AllAsciiIfIgnoreCase);
             Assert.Equal(7, r.HashIndex);
             Assert.Equal(1, r.HashCount);
         }
@@ -101,21 +105,21 @@ namespace System.Collections.Frozen.Tests
             KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "S1" }, true);
             Assert.False(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.True(r.AllAscii);
+            Assert.True(r.AllAsciiIfIgnoreCase);
             Assert.Equal(0, r.HashIndex);
             Assert.Equal(1, r.HashCount);
 
             r = RunAnalysis(new[] { "S1", "T1" }, true);
             Assert.False(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.True(r.AllAscii);
+            Assert.True(r.AllAsciiIfIgnoreCase);
             Assert.Equal(0, r.HashIndex);
             Assert.Equal(1, r.HashCount);
 
             r = RunAnalysis(new[] { "SA1", "TA1", "SB1" }, true);
             Assert.False(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.True(r.AllAscii);
+            Assert.True(r.AllAsciiIfIgnoreCase);
             Assert.Equal(0, r.HashIndex);
             Assert.Equal(2, r.HashCount);
         }
@@ -123,35 +127,35 @@ namespace System.Collections.Frozen.Tests
         [Fact]
         public static void RightHand()
         {
-            KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "1S", "1T" }, false);
+            KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "1T1", "1T" }, false);
             Assert.True(r.RightJustifiedSubstring);
             Assert.False(r.IgnoreCase);
-            Assert.True(r.AllAscii);
+            Assert.True(r.AllAsciiIfIgnoreCase);
             Assert.Equal(-1, r.HashIndex);
             Assert.Equal(1, r.HashCount);
 
-            r = RunAnalysis(new[] { "1AS", "1AT", "1BS" }, false);
+            r = RunAnalysis(new[] { "1ATA", "1ATB", "1BS" }, false);
             Assert.True(r.RightJustifiedSubstring);
             Assert.False(r.IgnoreCase);
-            Assert.True(r.AllAscii);
-            Assert.Equal(-2, r.HashIndex);
-            Assert.Equal(2, r.HashCount);
+            Assert.True(r.AllAsciiIfIgnoreCase);
+            Assert.Equal(-1, r.HashIndex);
+            Assert.Equal(1, r.HashCount);
         }
 
         [Fact]
         public static void RightHandCaseInsensitive()
         {
-            KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "1É", "1T" }, true);
+            KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "1ÉÉ", "1É" }, true);
             Assert.True(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.False(r.AllAscii);
-            Assert.Equal(-1, r.HashIndex);
+            Assert.False(r.AllAsciiIfIgnoreCase);
+            Assert.Equal(-2, r.HashIndex);
             Assert.Equal(1, r.HashCount);
 
-            r = RunAnalysis(new[] { "1AÉ", "1AT", "1BÉ" }, true);
+            r = RunAnalysis(new[] { "ÉA", "1AT", "1AÉT" }, true);
             Assert.True(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.False(r.AllAscii);
+            Assert.False(r.AllAsciiIfIgnoreCase);
             Assert.Equal(-2, r.HashIndex);
             Assert.Equal(2, r.HashCount);
         }
@@ -159,19 +163,19 @@ namespace System.Collections.Frozen.Tests
         [Fact]
         public static void RightHandCaseInsensitiveAscii()
         {
-            KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "1S", "1T" }, true);
+            KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "a1", "A1T" }, true);
             Assert.True(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.True(r.AllAscii);
+            Assert.True(r.AllAsciiIfIgnoreCase);
             Assert.Equal(-1, r.HashIndex);
             Assert.Equal(1, r.HashCount);
 
-            r = RunAnalysis(new[] { "1AS", "1AT", "1BS" }, true);
+            r = RunAnalysis(new[] { "bÉÉ", "caT", "cAÉT" }, true);
             Assert.True(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.True(r.AllAscii);
-            Assert.Equal(-2, r.HashIndex);
-            Assert.Equal(2, r.HashCount);
+            Assert.True(r.AllAsciiIfIgnoreCase);
+            Assert.Equal(-3, r.HashIndex);
+            Assert.Equal(1, r.HashCount);
         }
 
         [Fact]
@@ -180,7 +184,7 @@ namespace System.Collections.Frozen.Tests
             KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "ABC", "DBC", "ADC", "ABD", "ABDABD" }, false);
             Assert.False(r.SubstringHashing);
             Assert.False(r.IgnoreCase);
-            Assert.True(r.AllAscii);
+            Assert.True(r.AllAsciiIfIgnoreCase);
         }
 
         [Fact]
@@ -189,7 +193,7 @@ namespace System.Collections.Frozen.Tests
             KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "æbc", "DBC", "æDC", "æbd", "æbdæbd" }, true);
             Assert.False(r.SubstringHashing);
             Assert.True(r.IgnoreCase);
-            Assert.False(r.AllAscii);
+            Assert.False(r.AllAsciiIfIgnoreCase);
         }
 
         [Fact]
@@ -198,7 +202,7 @@ namespace System.Collections.Frozen.Tests
             KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "abc", "DBC", "aDC", "abd", "abdabd" }, true);
             Assert.False(r.SubstringHashing);
             Assert.True(r.IgnoreCase);
-            Assert.True(r.AllAscii);
+            Assert.True(r.AllAsciiIfIgnoreCase);
         }
 
         [Fact]