Lots of tweaks to FrozenDictionary/Set analysis and hashing (#86293)
author Stephen Toub <stoub@microsoft.com>
Mon, 22 May 2023 14:34:54 +0000 (10:34 -0400)
committer GitHub <noreply@github.com>
Mon, 22 May 2023 14:34:54 +0000 (10:34 -0400)
- Some minor internal refactorings, e.g. changed the internal KeyAnalyzer.Analyze from returning void with an out results parameter to simply returning those results, and did some internal renaming.
- We were previously computing the minimum/maximum string lengths in KeyAnalyzer.Analyze twice, once in UseSubstring and then again afterwards in Analyze.  We were also computing them in the length buckets analyzer.  Changed the code to compute the lengths just once at the beginning and then pass that information to everywhere that needs it.
- With that information passed in, AnalysisResults can be made readonly, rather than being mutated to store the min/max after construction.
- If the minimum length is 0, i.e. there's an empty string in the mix, TryUseSubstring can never find a usable substring, so skip the call to it entirely.
- In TryUseSubstring, for a given substring length, first check all the left justifications and then check all the right justifications, rather than intermingling them.  Left justifications are a bit cheaper to look up, and we avoid creating any objects related to right justification if a left-justified substring of that length is found first.
- We can also skip the right justification checks entirely if all of the input strings are the same length, as every right-justified substring then lines up with a left-justified one, so there's nothing right justification could find that left justification couldn't.
- When constructing the HashSets used to evaluate uniqueness, on more recent .NET versions we can presize them to avoid growth as all the items are added.
- Importantly, set a limit on the maximum substring length we'll consider.  This significantly curtails the worst-case analysis cost for very large inputs that don't yield any unique substrings.  It's certainly possible to construct cases where the limit causes us to miss a substring we otherwise would have found, but such cases are much rarer, and the longer the substring would have to be, the less we'd be saving on hashing costs anyway, which is the only thing substring hashing is avoiding.
- For hashing, create dedicated branchless hash code paths for each of lengths 0 through 4.
- When we're doing the ASCII check to see whether we can use an ASCII-optimized comparer, if OrdinalIgnoreCase was requested, we can also check whether the substrings contain any ASCII letters; if the substrings contain only ASCII non-letters, we can switch to being case-sensitive, as casing can't impact the comparisons.
- The downlevel ASCII check was erroneously treating 0x7f as non-ASCII. Fixed the boundary to be 0x80 instead of 0x7f.
- Changed GetHashCodeOrdinalIgnoreCase to delegate to GetHashCodeOrdinal after doing its non-ASCII casing work.
- On .NET 6+, we can use CollectionsMarshal.GetValueRefOrAddDefault to avoid some dictionary lookups (a sketch of the pattern follows this list).
- For the length-bucketing implementation, we can do a quick up-front check that rules out many inputs: just from the number of input strings and the min/max lengths, we may already know that some bucket would be forced to be too big (a sketch of that check also follows this list).
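
As a minimal sketch of the GetValueRefOrAddDefault pattern mentioned above (the LengthBucketsFrozenDictionary hunk below is truncated, so the actual call site isn't shown here), grouping strings into per-length buckets might look roughly like the following; the LengthGroupingSketch/GroupByLength names and the List<string> bucket shape are illustrative assumptions, not the commit's code:

using System.Collections.Generic;
using System.Runtime.InteropServices;

static class LengthGroupingSketch
{
    // Group keys by length with a single dictionary probe per key:
    // GetValueRefOrAddDefault (.NET 6+) returns a writable ref to the slot,
    // so there is no separate TryGetValue followed by an Add, and the key is
    // hashed and located only once per iteration.
    public static Dictionary<int, List<string>> GroupByLength(IEnumerable<string> keys)
    {
        var buckets = new Dictionary<int, List<string>>();
        foreach (string key in keys)
        {
            ref List<string>? bucket = ref CollectionsMarshal.GetValueRefOrAddDefault(buckets, key.Length, out _);
            bucket ??= new List<string>();
            bucket.Add(key);
        }
        return buckets;
    }
}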
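
And a sketch of the up-front pigeonhole check from the last bullet; the MightFitLengthBuckets/MaxPerLength names and the limit value are assumptions for illustration, not the commit's exact code:

static class LengthBucketCheckSketch
{
    // Sketch only: the keys can spread across at most (maxLength - minLength + 1)
    // distinct lengths. If there are more than MaxPerLength keys per possible length,
    // then by the pigeonhole principle at least one bucket would exceed the limit,
    // so length bucketing can be rejected without examining any individual key.
    public static bool MightFitLengthBuckets(int count, int minLength, int maxLength)
    {
        const int MaxPerLength = 5; // assumed per-bucket limit for illustration
        int possibleLengths = maxLength - minLength + 1;
        return count / possibleLengths <= MaxPerLength;
    }
}

For example, 100 keys with lengths between 5 and 8 can occupy at most 4 buckets, so at least one bucket would need 25 entries; with a small per-bucket limit, the bucketing strategy can be rejected immediately.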

src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenDictionary.cs
src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenSet.cs
src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/Hashing.cs
src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/KeyAnalyzer.cs
src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/LengthBucketsFrozenDictionary.cs
src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/LengthBucketsFrozenSet.cs
src/libraries/System.Collections.Immutable/tests/Frozen/KeyAnalyzerTests.cs

index 607284e..5369038 100644 (file)
@@ -193,109 +193,115 @@ namespace System.Collections.Frozen
         {
             IEqualityComparer<TKey> comparer = source.Comparer;
 
-            if (typeof(TKey).IsValueType)
+            // Optimize for value types when the default comparer is being used. In such a case, the implementation
+            // may use {Equality}Comparer<TKey>.Default.Compare/Equals/GetHashCode directly, with generic specialization enabling
+            // the Equals/GetHashCode methods to be devirtualized and possibly inlined.
+            if (typeof(TKey).IsValueType && ReferenceEquals(comparer, EqualityComparer<TKey>.Default))
             {
-                // Optimize for value types when the default comparer is being used. In such a case, the implementation
-                // may use {Equality}Comparer<TKey>.Default.Compare/Equals/GetHashCode directly, with generic specialization enabling
-                // the Equals/GetHashCode methods to be devirtualized and possibly inlined.
-                if (ReferenceEquals(comparer, EqualityComparer<TKey>.Default))
+                if (source.Count <= Constants.MaxItemsInSmallValueTypeFrozenCollection)
                 {
-                    if (source.Count <= Constants.MaxItemsInSmallValueTypeFrozenCollection)
+                    // If the key is a something we know we can efficiently compare, use a specialized implementation
+                    // that will enable quickly ruling out values outside of the range of keys stored.
+                    if (Constants.IsKnownComparable<TKey>())
                     {
-                        // If the key is a something we know we can efficiently compare, use a specialized implementation
-                        // that will enable quickly ruling out values outside of the range of keys stored.
-                        if (Constants.IsKnownComparable<TKey>())
-                        {
-                            return (FrozenDictionary<TKey, TValue>)(object)new SmallValueTypeComparableFrozenDictionary<TKey, TValue>(source);
-                        }
-
-                        // Otherwise, use an implementation optimized for a small number of value types using the default comparer.
-                        return (FrozenDictionary<TKey, TValue>)(object)new SmallValueTypeDefaultComparerFrozenDictionary<TKey, TValue>(source);
+                        return (FrozenDictionary<TKey, TValue>)(object)new SmallValueTypeComparableFrozenDictionary<TKey, TValue>(source);
                     }
 
-                    // Use a hash-based implementation.
+                    // Otherwise, use an implementation optimized for a small number of value types using the default comparer.
+                    return (FrozenDictionary<TKey, TValue>)(object)new SmallValueTypeDefaultComparerFrozenDictionary<TKey, TValue>(source);
+                }
 
-                    // For Int32 keys, we can reuse the key storage as the hash storage, saving on space and extra indirection.
-                    if (typeof(TKey) == typeof(int))
-                    {
-                        return (FrozenDictionary<TKey, TValue>)(object)new Int32FrozenDictionary<TValue>((Dictionary<int, TValue>)(object)source);
-                    }
+                // Use a hash-based implementation.
 
-                    // Fallback to an implementation usable with any value type and the default comparer.
-                    return new ValueTypeDefaultComparerFrozenDictionary<TKey, TValue>(source);
+                // For Int32 keys, we can reuse the key storage as the hash storage, saving on space and extra indirection.
+                if (typeof(TKey) == typeof(int))
+                {
+                    return (FrozenDictionary<TKey, TValue>)(object)new Int32FrozenDictionary<TValue>((Dictionary<int, TValue>)(object)source);
                 }
+
+                // Fallback to an implementation usable with any value type and the default comparer.
+                return new ValueTypeDefaultComparerFrozenDictionary<TKey, TValue>(source);
             }
-            else if (typeof(TKey) == typeof(string))
+
+            // Optimize for string keys with the default, Ordinal, or OrdinalIgnoreCase comparers.
+            // If the key is a string and the comparer is known to provide ordinal (case-sensitive or case-insensitive) semantics,
+            // we can use an implementation that's able to examine and optimize based on lengths and/or subsequences within those strings.
+            if (typeof(TKey) == typeof(string) &&
+                (ReferenceEquals(comparer, EqualityComparer<TKey>.Default) || ReferenceEquals(comparer, StringComparer.Ordinal) || ReferenceEquals(comparer, StringComparer.OrdinalIgnoreCase)))
             {
-                // If the key is a string and the comparer is known to provide ordinal (case-sensitive or case-insensitive) semantics,
-                // we can use an implementation that's able to examine and optimize based on lengths and/or subsequences within those strings.
-                if (ReferenceEquals(comparer, EqualityComparer<TKey>.Default) ||
-                    ReferenceEquals(comparer, StringComparer.Ordinal) ||
-                    ReferenceEquals(comparer, StringComparer.OrdinalIgnoreCase))
-                {
-                    Dictionary<string, TValue> stringEntries = (Dictionary<string, TValue>)(object)source;
-                    IEqualityComparer<string> stringComparer = (IEqualityComparer<string>)(object)comparer;
+                Dictionary<string, TValue> stringEntries = (Dictionary<string, TValue>)(object)source;
+                IEqualityComparer<string> stringComparer = (IEqualityComparer<string>)(object)comparer;
 
-                    FrozenDictionary<string, TValue>? frozenDictionary = LengthBucketsFrozenDictionary<TValue>.CreateLengthBucketsFrozenDictionaryIfAppropriate(stringEntries, stringComparer);
-                    if (frozenDictionary is not null)
-                    {
-                        return (FrozenDictionary<TKey, TValue>)(object)frozenDictionary;
-                    }
+                // Calculate the minimum and maximum lengths of the strings in the dictionary. Several of the analyses need this.
+                int minLength = int.MaxValue, maxLength = 0;
+                foreach (KeyValuePair<string, TValue> kvp in stringEntries)
+                {
+                    if (kvp.Key.Length < minLength) minLength = kvp.Key.Length;
+                    if (kvp.Key.Length > maxLength) maxLength = kvp.Key.Length;
+                }
+                Debug.Assert(minLength >= 0 && maxLength >= minLength);
 
-                    string[] entries = (string[])(object)source.Keys.ToArray();
+                // Try to create an implementation that uses length buckets, where each bucket contains up to only a few strings of the same length.
+                FrozenDictionary<string, TValue>? frozenDictionary = LengthBucketsFrozenDictionary<TValue>.CreateLengthBucketsFrozenDictionaryIfAppropriate(stringEntries, stringComparer, minLength, maxLength);
+                if (frozenDictionary is not null)
+                {
+                    return (FrozenDictionary<TKey, TValue>)(object)frozenDictionary;
+                }
 
-                    KeyAnalyzer.Analyze(entries, ReferenceEquals(stringComparer, StringComparer.OrdinalIgnoreCase), out KeyAnalyzer.AnalysisResults results);
-                    if (results.SubstringHashing)
+                // Analyze the keys for unique substrings and create an implementation that minimizes the cost of hashing keys.
+                string[] entries = (string[])(object)source.Keys.ToArray();
+                KeyAnalyzer.AnalysisResults analysis = KeyAnalyzer.Analyze(entries, ReferenceEquals(stringComparer, StringComparer.OrdinalIgnoreCase), minLength, maxLength);
+                if (analysis.SubstringHashing)
+                {
+                    if (analysis.RightJustifiedSubstring)
                     {
-                        if (results.RightJustifiedSubstring)
+                        if (analysis.IgnoreCase)
                         {
-                            if (results.IgnoreCase)
-                            {
-                                frozenDictionary = results.AllAscii
-                                    ? new OrdinalStringFrozenDictionary_RightJustifiedCaseInsensitiveAsciiSubstring<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount)
-                                    : new OrdinalStringFrozenDictionary_RightJustifiedCaseInsensitiveSubstring<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount);
-                            }
-                            else
-                            {
-                                frozenDictionary = results.HashCount == 1
-                                    ? new OrdinalStringFrozenDictionary_RightJustifiedSingleChar<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex)
-                                    : new OrdinalStringFrozenDictionary_RightJustifiedSubstring<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount);
-                            }
+                            frozenDictionary = analysis.AllAsciiIfIgnoreCase
+                                ? new OrdinalStringFrozenDictionary_RightJustifiedCaseInsensitiveAsciiSubstring<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount)
+                                : new OrdinalStringFrozenDictionary_RightJustifiedCaseInsensitiveSubstring<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount);
                         }
                         else
                         {
-                            if (results.IgnoreCase)
-                            {
-                                frozenDictionary = results.AllAscii
-                                    ? new OrdinalStringFrozenDictionary_LeftJustifiedCaseInsensitiveAsciiSubstring<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount)
-                                    : new OrdinalStringFrozenDictionary_LeftJustifiedCaseInsensitiveSubstring<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount);
-                            }
-                            else
-                            {
-                                frozenDictionary = results.HashCount == 1
-                                    ? new OrdinalStringFrozenDictionary_LeftJustifiedSingleChar<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex)
-                                    : new OrdinalStringFrozenDictionary_LeftJustifiedSubstring<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount);
-                            }
+                            frozenDictionary = analysis.HashCount == 1
+                                ? new OrdinalStringFrozenDictionary_RightJustifiedSingleChar<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex)
+                                : new OrdinalStringFrozenDictionary_RightJustifiedSubstring<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount);
                         }
                     }
                     else
                     {
-                        if (results.IgnoreCase)
+                        if (analysis.IgnoreCase)
                         {
-                            frozenDictionary = results.AllAscii
-                                ? new OrdinalStringFrozenDictionary_FullCaseInsensitiveAscii<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff)
-                                : new OrdinalStringFrozenDictionary_FullCaseInsensitive<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff);
+                            frozenDictionary = analysis.AllAsciiIfIgnoreCase
+                                ? new OrdinalStringFrozenDictionary_LeftJustifiedCaseInsensitiveAsciiSubstring<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount)
+                                : new OrdinalStringFrozenDictionary_LeftJustifiedCaseInsensitiveSubstring<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount);
                         }
                         else
                         {
-                            frozenDictionary = new OrdinalStringFrozenDictionary_Full<TValue>(stringEntries, entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff);
+                            frozenDictionary = analysis.HashCount == 1
+                                ? new OrdinalStringFrozenDictionary_LeftJustifiedSingleChar<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex)
+                                : new OrdinalStringFrozenDictionary_LeftJustifiedSubstring<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount);
                         }
                     }
-
-                    return (FrozenDictionary<TKey, TValue>)(object)frozenDictionary;
                 }
+                else
+                {
+                    if (analysis.IgnoreCase)
+                    {
+                        frozenDictionary = analysis.AllAsciiIfIgnoreCase
+                            ? new OrdinalStringFrozenDictionary_FullCaseInsensitiveAscii<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff)
+                            : new OrdinalStringFrozenDictionary_FullCaseInsensitive<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff);
+                    }
+                    else
+                    {
+                        frozenDictionary = new OrdinalStringFrozenDictionary_Full<TValue>(stringEntries, entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff);
+                    }
+                }
+
+                return (FrozenDictionary<TKey, TValue>)(object)frozenDictionary;
             }
 
+            // Optimize for very small numbers of items by using a specialized implementation that just does a linear search.
             if (source.Count <= Constants.MaxItemsInSmallFrozenCollection)
             {
                 // Use the specialized dictionary for low item counts.
index 7237e79..31214ff 100644 (file)
@@ -5,8 +5,6 @@ using System.Collections.Generic;
 using System.Collections.Immutable;
 using System.Diagnostics;
 using System.Diagnostics.CodeAnalysis;
-using System.Linq;
-using System.Numerics;
 using System.Runtime.InteropServices;
 
 namespace System.Collections.Frozen
@@ -125,115 +123,117 @@ namespace System.Collections.Frozen
         {
             IEqualityComparer<T> comparer = source.Comparer;
 
-            if (typeof(T).IsValueType)
+            // Optimize for value types when the default comparer is being used. In such a case, the implementation
+            // may use {Equality}Comparer<T>.Default.Compare/Equals/GetHashCode directly, with generic specialization enabling
+            // the Equals/GetHashCode methods to be devirtualized and possibly inlined.
+            if (typeof(T).IsValueType && ReferenceEquals(comparer, EqualityComparer<T>.Default))
             {
-                // Optimize for value types when the default comparer is being used. In such a case, the implementation
-                // may use {Equality}Comparer<T>.Default.Compare/Equals/GetHashCode directly, with generic specialization enabling
-                // the Equals/GetHashCode methods to be devirtualized and possibly inlined.
-                if (ReferenceEquals(comparer, EqualityComparer<T>.Default))
+                if (source.Count <= Constants.MaxItemsInSmallValueTypeFrozenCollection)
                 {
-                    if (source.Count <= Constants.MaxItemsInSmallValueTypeFrozenCollection)
+                    // If the type is a something we know we can efficiently compare, use a specialized implementation
+                    // that will enable quickly ruling out values outside of the range of keys stored.
+                    if (Constants.IsKnownComparable<T>())
                     {
-                        // If the type is a something we know we can efficiently compare, use a specialized implementation
-                        // that will enable quickly ruling out values outside of the range of keys stored.
-                        if (Constants.IsKnownComparable<T>())
-                        {
-                            return (FrozenSet<T>)(object)new SmallValueTypeComparableFrozenSet<T>(source);
-                        }
-
-                        // Otherwise, use an implementation optimized for a small number of value types using the default comparer.
-                        return (FrozenSet<T>)(object)new SmallValueTypeDefaultComparerFrozenSet<T>(source);
+                        return (FrozenSet<T>)(object)new SmallValueTypeComparableFrozenSet<T>(source);
                     }
 
-                    // Use a hash-based implementation.
+                    // Otherwise, use an implementation optimized for a small number of value types using the default comparer.
+                    return (FrozenSet<T>)(object)new SmallValueTypeDefaultComparerFrozenSet<T>(source);
+                }
 
-                    // For Int32 values, we can reuse the item storage as the hash storage, saving on space and extra indirection.
-                    if (typeof(T) == typeof(int))
-                    {
-                        return (FrozenSet<T>)(object)new Int32FrozenSet((HashSet<int>)(object)source);
-                    }
+                // Use a hash-based implementation.
 
-                    // Fallback to an implementation usable with any value type and the default comparer.
-                    return new ValueTypeDefaultComparerFrozenSet<T>(source);
+                // For Int32 values, we can reuse the item storage as the hash storage, saving on space and extra indirection.
+                if (typeof(T) == typeof(int))
+                {
+                    return (FrozenSet<T>)(object)new Int32FrozenSet((HashSet<int>)(object)source);
                 }
+
+                // Fallback to an implementation usable with any value type and the default comparer.
+                return new ValueTypeDefaultComparerFrozenSet<T>(source);
             }
-            else if (typeof(T) == typeof(string))
+
+            // Optimize for strings when the default, Ordinal, or OrdinalIgnoreCase comparer is being used.
+            // Null is rare as a value in the set and we don't optimize for it.  This enables the ordinal string
+            // implementation to fast-path out on null inputs rather than having to accommodate null inputs.
+            if (typeof(T) == typeof(string) &&
+                !source.Contains(default!) &&
+                (ReferenceEquals(comparer, EqualityComparer<T>.Default) || ReferenceEquals(comparer, StringComparer.Ordinal) || ReferenceEquals(comparer, StringComparer.OrdinalIgnoreCase)))
             {
-                // Null is rare as a value in the set and we don't optimize for it.  This enables the ordinal string
-                // implementation to fast-path out on null inputs rather than having to accommodate null inputs.
-                if (!source.Contains(default!))
+                HashSet<string> stringValues = (HashSet<string>)(object)source;
+                var entries = new string[stringValues.Count];
+                stringValues.CopyTo(entries);
+                IEqualityComparer<string> stringComparer = (IEqualityComparer<string>)(object)comparer;
+
+                // Calculate the minimum and maximum lengths of the strings in the set. Several of the analyses need this.
+                int minLength = int.MaxValue, maxLength = 0;
+                foreach (string s in entries)
                 {
-                    // If the value is a string and the comparer is known to provide ordinal (case-sensitive or case-insensitive) semantics,
-                    // we can use an implementation that's able to examine and optimize based on lengths and/or subsequences within those strings.
-                    if (ReferenceEquals(comparer, EqualityComparer<T>.Default) ||
-                        ReferenceEquals(comparer, StringComparer.Ordinal) ||
-                        ReferenceEquals(comparer, StringComparer.OrdinalIgnoreCase))
-                    {
-                        HashSet<string> stringValues = (HashSet<string>)(object)source;
-                        var entries = new string[stringValues.Count];
-                        stringValues.CopyTo(entries);
+                    if (s.Length < minLength) minLength = s.Length;
+                    if (s.Length > maxLength) maxLength = s.Length;
+                }
+                Debug.Assert(minLength >= 0 && maxLength >= minLength);
 
-                        IEqualityComparer<string> stringComparer = (IEqualityComparer<string>)(object)comparer;
+                // Try to create an implementation that uses length buckets, where each bucket contains up to only a few strings of the same length.
+                FrozenSet<string>? frozenSet = LengthBucketsFrozenSet.CreateLengthBucketsFrozenSetIfAppropriate(entries, stringComparer, minLength, maxLength);
+                if (frozenSet is not null)
+                {
+                    return (FrozenSet<T>)(object)frozenSet;
+                }
 
-                        FrozenSet<string>? frozenSet = LengthBucketsFrozenSet.CreateLengthBucketsFrozenSetIfAppropriate(entries, stringComparer);
-                        if (frozenSet is not null)
+                // Analyze the values for unique substrings and create an implementation that minimizes the cost of hashing keys.
+                KeyAnalyzer.AnalysisResults analysis = KeyAnalyzer.Analyze(entries, ReferenceEquals(stringComparer, StringComparer.OrdinalIgnoreCase), minLength, maxLength);
+                if (analysis.SubstringHashing)
+                {
+                    if (analysis.RightJustifiedSubstring)
+                    {
+                        if (analysis.IgnoreCase)
                         {
-                            return (FrozenSet<T>)(object)frozenSet;
+                            frozenSet = analysis.AllAsciiIfIgnoreCase
+                                ? new OrdinalStringFrozenSet_RightJustifiedCaseInsensitiveAsciiSubstring(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount)
+                                : new OrdinalStringFrozenSet_RightJustifiedCaseInsensitiveSubstring(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount);
                         }
-
-                        KeyAnalyzer.Analyze(entries, ReferenceEquals(stringComparer, StringComparer.OrdinalIgnoreCase), out KeyAnalyzer.AnalysisResults results);
-                        if (results.SubstringHashing)
+                        else
                         {
-                            if (results.RightJustifiedSubstring)
-                            {
-                                if (results.IgnoreCase)
-                                {
-                                    frozenSet = results.AllAscii
-                                        ? new OrdinalStringFrozenSet_RightJustifiedCaseInsensitiveAsciiSubstring(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount)
-                                        : new OrdinalStringFrozenSet_RightJustifiedCaseInsensitiveSubstring(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount);
-                                }
-                                else
-                                {
-                                    frozenSet = results.HashCount == 1
-                                        ? new OrdinalStringFrozenSet_RightJustifiedSingleChar(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex)
-                                        : new OrdinalStringFrozenSet_RightJustifiedSubstring(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount);
-                                }
-                            }
-                            else
-                            {
-                                if (results.IgnoreCase)
-                                {
-                                    frozenSet = results.AllAscii
-                                        ? new OrdinalStringFrozenSet_LeftJustifiedCaseInsensitiveAsciiSubstring(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount)
-                                        : new OrdinalStringFrozenSet_LeftJustifiedCaseInsensitiveSubstring(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount);
-                                }
-                                else
-                                {
-                                    frozenSet = results.HashCount == 1
-                                        ? new OrdinalStringFrozenSet_LeftJustifiedSingleChar(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex)
-                                        : new OrdinalStringFrozenSet_LeftJustifiedSubstring(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff, results.HashIndex, results.HashCount);
-                                }
-                            }
+                            frozenSet = analysis.HashCount == 1
+                                ? new OrdinalStringFrozenSet_RightJustifiedSingleChar(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex)
+                                : new OrdinalStringFrozenSet_RightJustifiedSubstring(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount);
+                        }
+                    }
+                    else
+                    {
+                        if (analysis.IgnoreCase)
+                        {
+                            frozenSet = analysis.AllAsciiIfIgnoreCase
+                                ? new OrdinalStringFrozenSet_LeftJustifiedCaseInsensitiveAsciiSubstring(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount)
+                                : new OrdinalStringFrozenSet_LeftJustifiedCaseInsensitiveSubstring(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount);
                         }
                         else
                         {
-                            if (results.IgnoreCase)
-                            {
-                                frozenSet = results.AllAscii
-                                    ? new OrdinalStringFrozenSet_FullCaseInsensitiveAscii(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff)
-                                    : new OrdinalStringFrozenSet_FullCaseInsensitive(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff);
-                            }
-                            else
-                            {
-                                frozenSet = new OrdinalStringFrozenSet_Full(entries, stringComparer, results.MinimumLength, results.MaximumLengthDiff);
-                            }
+                            frozenSet = analysis.HashCount == 1
+                                ? new OrdinalStringFrozenSet_LeftJustifiedSingleChar(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex)
+                                : new OrdinalStringFrozenSet_LeftJustifiedSubstring(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff, analysis.HashIndex, analysis.HashCount);
                         }
-
-                        return (FrozenSet<T>)(object)frozenSet;
                     }
                 }
+                else
+                {
+                    if (analysis.IgnoreCase)
+                    {
+                        frozenSet = analysis.AllAsciiIfIgnoreCase
+                            ? new OrdinalStringFrozenSet_FullCaseInsensitiveAscii(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff)
+                            : new OrdinalStringFrozenSet_FullCaseInsensitive(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff);
+                    }
+                    else
+                    {
+                        frozenSet = new OrdinalStringFrozenSet_Full(entries, stringComparer, analysis.MinimumLength, analysis.MaximumLengthDiff);
+                    }
+                }
+
+                return (FrozenSet<T>)(object)frozenSet;
             }
 
+            // Optimize for very small numbers of items by using a specialized implementation that just does a linear search.
             if (source.Count <= Constants.MaxItemsInSmallFrozenCollection)
             {
                 // use the specialized set for low item counts
index 03fc4a9..4db4d2f 100644 (file)
@@ -12,64 +12,127 @@ namespace System.Collections.Frozen
         // TODO https://github.com/dotnet/runtime/issues/77679:
         // Replace these once non-randomized implementations are available.
 
+        // Lengths 0 to 4 are unrolled manually due to their commonality, especially
+        // with the substring-based dictionary/sets that use substrings with length <= 4.
+
+        private const uint Hash1Start = (5381 << 16) + 5381;
+        private const uint Factor = 1_566_083_941;
+
         public static unsafe int GetHashCodeOrdinal(ReadOnlySpan<char> s)
         {
             int length = s.Length;
             fixed (char* src = &MemoryMarshal.GetReference(s))
             {
-                uint hash1 = (5381 << 16) + 5381;
-                uint hash2 = hash1;
-
-                uint* ptrUInt32 = (uint*)src;
-                while (length > 3)
-                {
-                    hash1 = BitOperations.RotateLeft(hash1, 5) + hash1 ^ ptrUInt32[0];
-                    hash2 = BitOperations.RotateLeft(hash2, 5) + hash2 ^ ptrUInt32[1];
-                    ptrUInt32 += 2;
-                    length -= 4;
-                }
-
-                char* ptrChar = (char*)ptrUInt32;
-                while (length-- > 0)
+                uint hash1, hash2;
+                switch (length)
                 {
-                    hash2 = BitOperations.RotateLeft(hash2, 5) + hash2 ^ *ptrChar++;
+                    case 0:
+                        return (int)(Hash1Start + unchecked(Hash1Start * Factor));
+
+                    case 1:
+                        hash2 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ src[0];
+                        return (int)(Hash1Start + (hash2 * Factor));
+
+                    case 2:
+                        hash2 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ src[0];
+                        hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ src[1];
+                        return (int)(Hash1Start + (hash2 * Factor));
+
+                    case 3:
+                        hash2 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ src[0];
+                        hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ src[1];
+                        hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ src[2];
+                        return (int)(Hash1Start + (hash2 * Factor));
+
+                    case 4:
+                        hash1 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ ((uint*)src)[0];
+                        hash2 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ ((uint*)src)[1];
+                        return (int)(hash1 + (hash2 * Factor));
+
+                    default:
+                        hash1 = Hash1Start;
+                        hash2 = hash1;
+
+                        uint* ptrUInt32 = (uint*)src;
+                        while (length >= 4)
+                        {
+                            hash1 = (BitOperations.RotateLeft(hash1, 5) + hash1) ^ ptrUInt32[0];
+                            hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ ptrUInt32[1];
+                            ptrUInt32 += 2;
+                            length -= 4;
+                        }
+
+                        char* ptrChar = (char*)ptrUInt32;
+                        while (length-- > 0)
+                        {
+                            hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ *ptrChar++;
+                        }
+
+                        return (int)(hash1 + (hash2 * Factor));
                 }
-
-                return (int)(hash1 + (hash2 * 1_566_083_941));
             }
         }
 
         // useful if the string only contains ASCII characters
         public static unsafe int GetHashCodeOrdinalIgnoreCaseAscii(ReadOnlySpan<char> s)
         {
+            // We "normalize to lowercase" every char by ORing with 0x20. This casts
+            // a very wide net because it will change, e.g., '^' to '~'. But that should
+            // be ok because we expect this to be very rare in practice.
+            const uint LowercaseChar = 0x20u;
+            const uint LowercaseUInt32 = 0x0020_0020u;
+
             int length = s.Length;
             fixed (char* src = &MemoryMarshal.GetReference(s))
             {
-                uint hash1 = (5381 << 16) + 5381;
-                uint hash2 = hash1;
-
-                // We "normalize to lowercase" every char by ORing with 0x0020. This casts
-                // a very wide net because it will change, e.g., '^' to '~'. But that should
-                // be ok because we expect this to be very rare in practice.
-                const uint NormalizeToLowercase = 0x0020_0020u; // valid both for big-endian and for little-endian
-
-                uint* ptrUInt32 = (uint*)src;
-                while (length > 3)
+                uint hash1, hash2;
+                switch (length)
                 {
-                    hash1 = BitOperations.RotateLeft(hash1, 5) + hash1 ^ (ptrUInt32[0] | NormalizeToLowercase);
-                    hash2 = BitOperations.RotateLeft(hash2, 5) + hash2 ^ (ptrUInt32[1] | NormalizeToLowercase);
-                    ptrUInt32 += 2;
-                    length -= 4;
+                    case 0:
+                        return (int)(Hash1Start + unchecked(Hash1Start * Factor));
+
+                    case 1:
+                        hash2 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ (src[0] | LowercaseChar);
+                        return (int)(Hash1Start + (hash2 * Factor));
+
+                    case 2:
+                        hash2 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ (src[0] | LowercaseChar);
+                        hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ (src[1] | LowercaseChar);
+                        return (int)(Hash1Start + (hash2 * Factor));
+
+                    case 3:
+                        hash2 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ (src[0] | LowercaseChar);
+                        hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ (src[1] | LowercaseChar);
+                        hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ (src[2] | LowercaseChar);
+                        return (int)(Hash1Start + (hash2 * Factor));
+
+                    case 4:
+                        hash1 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ (((uint*)src)[0] | LowercaseUInt32);
+                        hash2 = (BitOperations.RotateLeft(Hash1Start, 5) + Hash1Start) ^ (((uint*)src)[1] | LowercaseUInt32);
+                        return (int)(hash1 + (hash2 * Factor));
+
+                    default:
+                        hash1 = Hash1Start;
+                        hash2 = hash1;
+
+                        uint* ptrUInt32 = (uint*)src;
+                        while (length >= 4)
+                        {
+                            hash1 = (BitOperations.RotateLeft(hash1, 5) + hash1) ^ (ptrUInt32[0] | LowercaseUInt32);
+                            hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ (ptrUInt32[1] | LowercaseUInt32);
+                            ptrUInt32 += 2;
+                            length -= 4;
+                        }
+
+                        char* ptrChar = (char*)ptrUInt32;
+                        while (length-- > 0)
+                        {
+                            hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ (*ptrChar | LowercaseUInt32);
+                            ptrChar++;
+                        }
+
+                        return (int)(hash1 + (hash2 * Factor));
                 }
-
-                char* ptrChar = (char*)ptrUInt32;
-                while (length-- > 0)
-                {
-                    hash2 = BitOperations.RotateLeft(hash2, 5) + hash2 ^ (*ptrChar | NormalizeToLowercase);
-                    ptrChar++;
-                }
-
-                return (int)(hash1 + (hash2 * 1_566_083_941));
             }
         }
 
@@ -83,34 +146,14 @@ namespace System.Collections.Frozen
                 (rentedArray = ArrayPool<char>.Shared.Rent(length));
 
             length = s.ToUpperInvariant(scratch); // NOTE: this really should be the (non-existent) ToUpperOrdinal
-
-            uint hash1 = (5381 << 16) + 5381;
-            uint hash2 = hash1;
-
-            fixed (char* src = &MemoryMarshal.GetReference(scratch))
-            {
-                uint* ptrUInt32 = (uint*)src;
-                while (length > 3)
-                {
-                    hash1 = (BitOperations.RotateLeft(hash1, 5) + hash1) ^ ptrUInt32[0];
-                    hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ ptrUInt32[1];
-                    ptrUInt32 += 2;
-                    length -= 4;
-                }
-
-                char* ptrChar = (char*)ptrUInt32;
-                while (length-- > 0)
-                {
-                    hash2 = BitOperations.RotateLeft(hash2, 5) + hash2 ^ *ptrChar++;
-                }
-            }
+            int hash = GetHashCodeOrdinal(scratch.Slice(0, length));
 
             if (rentedArray is not null)
             {
                 ArrayPool<char>.Shared.Return(rentedArray);
             }
 
-            return (int)(hash1 + (hash2 * 1_566_083_941));
+            return hash;
         }
     }
 }
index 1060df2..ee80e6a 100644 (file)
@@ -1,8 +1,12 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+using System.Buffers;
 using System.Collections.Generic;
+using System.Diagnostics;
+#if !NET8_0_OR_GREATER
 using System.Runtime.CompilerServices;
+#endif
 
 namespace System.Collections.Frozen
 {
@@ -24,119 +28,152 @@ namespace System.Collections.Frozen
         /// In whatever slice we end up with, if all the characters involved in the slice are ASCII and we're doing case-insensitive
         /// operations, then we can select an ASCII-specific case-insensitive comparer which yields faster overall performance.
         /// </remarks>
-        public static void Analyze(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, out AnalysisResults results)
+        public static AnalysisResults Analyze(
+            ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength)
         {
-            // First, try to pick a substring comparer.
-            // if we can't find a good substring comparer, fallback to a full string comparer.
-            if (!UseSubstring(uniqueStrings, ignoreCase, out results))
-            {
-                UseFullString(uniqueStrings, ignoreCase, out results);
-            }
+            Debug.Assert(!uniqueStrings.IsEmpty);
 
-            // Calculate the trivial rejection boundaries.
-            int min = int.MaxValue, max = 0;
-            foreach (string s in uniqueStrings)
+            // Try to pick a substring comparer. If we can't find a good substring comparer, fallback to a full string comparer.
+            AnalysisResults results;
+            if (minLength == 0 || !TryUseSubstring(uniqueStrings, ignoreCase, minLength, maxLength, out results))
             {
-                if (s.Length < min)
-                {
-                    min = s.Length;
-                }
-
-                if (s.Length > max)
-                {
-                    max = s.Length;
-                }
+                results = CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, 0, 0, static (s, _, _) => s.AsSpan());
             }
 
-            results.MinimumLength = min;
-            results.MaximumLengthDiff = max - min;
+            return results;
         }
 
-        private static bool UseSubstring(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, out AnalysisResults results)
+        /// <summary>Try to find the minimal unique substring index/length to use for comparisons.</summary>
+        private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results)
         {
             const double SufficientUniquenessFactor = 0.95; // 95% is good enough
-
-            // What is the shortest string? This represents the maximum substring length we consider
-            int maxSubstringLength = int.MaxValue;
-            foreach (string s in uniqueStrings)
-            {
-                if (s.Length < maxSubstringLength)
-                {
-                    maxSubstringLength = s.Length;
-                }
-            }
+            const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... it's not worth the increase in algorithmic complexity to analyze longer substrings
 
             SubstringComparer leftComparer = ignoreCase ? new LeftJustifiedCaseInsensitiveSubstringComparer() : new LeftJustifiedSubstringComparer();
-            SubstringComparer rightComparer = ignoreCase ? new RightJustifiedCaseInsensitiveSubstringComparer() : new RightJustifiedSubstringComparer();
+            HashSet<string> leftSet = new HashSet<string>(
+#if NET6_0_OR_GREATER
+                uniqueStrings.Length,
+#endif
+                leftComparer);
+
+            HashSet<string>? rightSet = null;
+            SubstringComparer? rightComparer = null;
 
-            // try to find the minimal unique substring to use for comparisons
-            var leftSet = new HashSet<string>(leftComparer);
-            var rightSet = new HashSet<string>(rightComparer);
+            // For each substring length...
+            int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit);
             for (int count = 1; count <= maxSubstringLength; count++)
             {
-                for (int index = 0; index <= maxSubstringLength - count; index++)
+                leftComparer.Count = count;
+
+                // For each index, get a uniqueness factor for the left-justified substrings.
+                // If any is above our threshold, we're done.
+                for (int index = 0; index <= minLength - count; index++)
                 {
                     leftComparer.Index = index;
-                    leftComparer.Count = count;
-
                     double factor = GetUniquenessFactor(leftSet, uniqueStrings);
                     if (factor >= SufficientUniquenessFactor)
                     {
-                        bool allAscii = true;
-                        foreach (string s in uniqueStrings)
-                        {
-                            if (!IsAllAscii(s.AsSpan(leftComparer.Index, leftComparer.Count)))
-                            {
-                                allAscii = false;
-                                break;
-                            }
-                        }
-
-                        results = new(allAscii, ignoreCase, 0, 0, leftComparer.Index, leftComparer.Count);
+                        results = CreateAnalysisResults(
+                            uniqueStrings, ignoreCase, minLength, maxLength, index, count,
+                            static (string s, int index, int count) => s.AsSpan(index, count));
                         return true;
                     }
+                }
 
-                    rightComparer.Index = -index - count;
+                // There were no left-justified substrings of this length available.
+                // If all of the strings are of the same length, then just checking left-justification is sufficient.
+                // But if any strings are of different lengths, then we'll get different alignments for left- vs
+                // right-justified substrings, and so we also check right-justification.
+                if (minLength != maxLength)
+                {
+                    // Lazily-initialize the right-comparer/set state, as it's often not needed.
+                    if (rightComparer is null)
+                    {
+                        rightComparer = ignoreCase ? new RightJustifiedCaseInsensitiveSubstringComparer() : new RightJustifiedSubstringComparer();
+                        rightSet = new HashSet<string>(
+#if NET6_0_OR_GREATER
+                            uniqueStrings.Length,
+#endif
+                            rightComparer);
+                    }
                     rightComparer.Count = count;
+                    Debug.Assert(rightSet is not null);
 
-                    factor = GetUniquenessFactor(rightSet, uniqueStrings);
-                    if (factor >= SufficientUniquenessFactor)
+                    // For each index, get a uniqueness factor for the right-justified substrings.
+                    // If any is above our threshold, we're done.
+                    for (int index = 0; index <= minLength - count; index++)
                     {
-                        bool allAscii = true;
-                        foreach (string s in uniqueStrings)
+                        // Get a uniqueness factor for the right-justified substrings.
+                        // If it's above our threshold, we're done.
+                        rightComparer.Index = -index - count;
+                        double factor = GetUniquenessFactor(rightSet, uniqueStrings);
+                        if (factor >= SufficientUniquenessFactor)
                         {
-                            if (!IsAllAscii(s.AsSpan(s.Length + rightComparer.Index, rightComparer.Count)))
-                            {
-                                allAscii = false;
-                                break;
-                            }
+                            results = CreateAnalysisResults(
+                                uniqueStrings, ignoreCase, minLength, maxLength, rightComparer.Index, count,
+                                static (string s, int index, int count) => s.AsSpan(s.Length + index, count));
+                            return true;
                         }
-
-                        results = new(allAscii, ignoreCase, 0, 0, rightComparer.Index, rightComparer.Count);
-                        return true;
                     }
                 }
             }
 
+            // Could not find a substring index/length that was good enough.
             results = default;
             return false;
         }
 
-        private static void UseFullString(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, out AnalysisResults results)
+        private static AnalysisResults CreateAnalysisResults(
+            ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, int index, int count, GetSpan getSubstringSpan)
         {
-            bool allAscii = true;
-            foreach (string s in uniqueStrings)
+            // Start off by assuming all strings are ASCII
+            bool allAsciiIfIgnoreCase = true;
+
+            // If we're case-sensitive, it doesn't matter if the strings are ASCII or not.
+            // But if we're case-insensitive, we can switch to a faster comparer if all the
+            // substrings are ASCII, so we check each.
+            if (ignoreCase)
             {
-                if (!IsAllAscii(s.AsSpan()))
+                // Further, if the ASCII substrings don't contain any letters, then we can
+                // actually perform the comparison as case-sensitive even if case-insensitive
+                // was requested, as there's nothing that would compare equally to the substring
+                // other than the substring itself.
+                bool canSwitchIgnoreCaseToCaseSensitive = ignoreCase;
+
+                foreach (string s in uniqueStrings)
+                {
+                    // Get the span for the substring.
+                    ReadOnlySpan<char> substring = getSubstringSpan(s, index, count);
+
+                    // If the substring isn't ASCII, bail out to return the results.
+                    if (!IsAllAscii(substring))
+                    {
+                        allAsciiIfIgnoreCase = false;
+                        canSwitchIgnoreCaseToCaseSensitive = false;
+                        break;
+                    }
+
+                    // All substrings so far are still ASCII only.  If this one contains any ASCII
+                    // letters, mark that we can't switch to case-sensitive.
+                    if (canSwitchIgnoreCaseToCaseSensitive && ContainsAnyLetters(substring))
+                    {
+                        canSwitchIgnoreCaseToCaseSensitive = false;
+                    }
+                }
+
+                // If we can switch to case-sensitive, do so.
+                if (canSwitchIgnoreCaseToCaseSensitive)
                 {
-                    allAscii = false;
-                    break;
+                    ignoreCase = false;
                 }
             }
 
-            results = new(allAscii, ignoreCase, 0, 0, 0, 0);
+            // Return the analysis results.
+            return new AnalysisResults(ignoreCase, allAsciiIfIgnoreCase, index, count, minLength, maxLength);
         }
 
+        private delegate ReadOnlySpan<char> GetSpan(string s, int index, int count);
+
         internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s)
         {
 #if NET8_0_OR_GREATER
@@ -147,7 +184,7 @@ namespace System.Collections.Frozen
                 uint* ptrUInt32 = (uint*)src;
                 int length = s.Length;
 
-                while (length > 3)
+                while (length >= 4)
                 {
                     if (!AllCharsInUInt32AreAscii(ptrUInt32[0] | ptrUInt32[1]))
                     {
@@ -162,7 +199,7 @@ namespace System.Collections.Frozen
                 while (length-- > 0)
                 {
                     char ch = *ptrChar++;
-                    if (ch >= 0x7f)
+                    if (ch >= 0x80)
                     {
                         return false;
                     }
@@ -176,6 +213,28 @@ namespace System.Collections.Frozen
 #endif
         }
 
+#if NET8_0_OR_GREATER
+        private static readonly SearchValues<char> s_asciiLetters = SearchValues.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
+#endif
+        private static bool ContainsAnyLetters(ReadOnlySpan<char> s)
+        {
+            Debug.Assert(IsAllAscii(s));
+
+#if NET8_0_OR_GREATER
+            return s.IndexOfAny(s_asciiLetters) >= 0;
+#else
+            foreach (char c in s)
+            {
+                Debug.Assert(c <= 0x7f);
+                if ((uint)((c | 0x20) - 'a') <= (uint)('z' - 'a'))
+                {
+                    return true;
+                }
+            }
+            return false;
+#endif
+        }
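
The non-NET8 branch above relies on the classic ASCII folding trick; a standalone restatement for clarity (helper name is illustrative):

    // For ASCII input, OR-ing with 0x20 maps 'A'..'Z' onto 'a'..'z' and leaves
    // 'a'..'z' unchanged, so one unsigned range test detects letters of either
    // case without a per-case branch.
    static bool IsAsciiLetterSketch(char c) =>
        (uint)((c | 0x20) - 'a') <= (uint)('z' - 'a');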
+
         private static double GetUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings)
         {
             set.Clear();
@@ -187,30 +246,24 @@ namespace System.Collections.Frozen
             return set.Count / (double)uniqueStrings.Length;
         }
 
-        internal struct AnalysisResults
+        internal readonly struct AnalysisResults
         {
-            public AnalysisResults(
-                bool allAscii,
-                bool ignoreCase,
-                int minimumLength,
-                int maximumLengthDiff,
-                int hashIndex,
-                int hashCount)
+            public AnalysisResults(bool ignoreCase, bool allAsciiIfIgnoreCase, int hashIndex, int hashCount, int minLength, int maxLength)
             {
-                AllAscii = allAscii;
                 IgnoreCase = ignoreCase;
-                MinimumLength = minimumLength;
-                MaximumLengthDiff = maximumLengthDiff;
+                AllAsciiIfIgnoreCase = allAsciiIfIgnoreCase;
                 HashIndex = hashIndex;
                 HashCount = hashCount;
+                MinimumLength = minLength;
+                MaximumLengthDiff = maxLength - minLength;
             }
 
-            public bool AllAscii { get; }
             public bool IgnoreCase { get; }
-            public int MinimumLength { get; set; }
-            public int MaximumLengthDiff { get; set; }
+            public bool AllAsciiIfIgnoreCase { get; }
             public int HashIndex { get; }
             public int HashCount { get; }
+            public int MinimumLength { get; }
+            public int MaximumLengthDiff { get; }
 
             public bool SubstringHashing => HashCount != 0;
             public bool RightJustifiedSubstring => HashIndex < 0;
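
Judging from the two properties just above and the expectations in the tests later in this diff, a negative HashIndex encodes a start offset measured from the end of the key. A hedged sketch (illustrative helper, applicable when SubstringHashing is true) of how a caller would locate the hashed substring:

    // Assumption: left-justified substrings start at HashIndex; right-justified
    // ones (HashIndex < 0) start at s.Length + HashIndex, e.g. HashIndex == -2
    // with HashCount == 1 selects the second-to-last character.
    static ReadOnlySpan<char> GetHashSpanSketch(string s, KeyAnalyzer.AnalysisResults r) =>
        r.RightJustifiedSubstring
            ? s.AsSpan(s.Length + r.HashIndex, r.HashCount)
            : s.AsSpan(r.HashIndex, r.HashCount);
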
index d8c3e2f..c48b7ae 100644 (file)
@@ -2,9 +2,9 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 using System.Collections.Generic;
-using System.Collections.Immutable;
 using System.Diagnostics;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 
 namespace System.Collections.Frozen
 {
@@ -35,25 +35,36 @@ namespace System.Collections.Frozen
             _ignoreCase = ReferenceEquals(comparer, StringComparer.OrdinalIgnoreCase);
         }
 
-        internal static LengthBucketsFrozenDictionary<TValue>? CreateLengthBucketsFrozenDictionaryIfAppropriate(Dictionary<string, TValue> source, IEqualityComparer<string> comparer)
+        internal static LengthBucketsFrozenDictionary<TValue>? CreateLengthBucketsFrozenDictionaryIfAppropriate(
+            Dictionary<string, TValue> source, IEqualityComparer<string> comparer, int minLength, int maxLength)
         {
             Debug.Assert(source.Count != 0);
             Debug.Assert(comparer == EqualityComparer<string>.Default || comparer == StringComparer.Ordinal || comparer == StringComparer.OrdinalIgnoreCase);
+            Debug.Assert(minLength >= 0 && maxLength >= minLength);
+
+            // If, without even looking at the keys, we know that some bucket must exceed the max per-bucket
+            // limit (pigeonhole principle), we can exit early without doing any further work.
+            int spread = maxLength - minLength + 1;
+            if (source.Count / spread > MaxPerLength)
+            {
+                return null;
+            }
 
             // Iterate through all of the inputs, bucketing them based on the length of the string.
             var groupedByLength = new Dictionary<int, List<KeyValuePair<string, TValue>>>();
-            int minLength = int.MaxValue, maxLength = int.MinValue;
             foreach (KeyValuePair<string, TValue> pair in source)
             {
                 string s = pair.Key;
+                Debug.Assert(s.Length >= minLength && s.Length <= maxLength);
 
-                if (s.Length < minLength) minLength = s.Length;
-                if (s.Length > maxLength) maxLength = s.Length;
-
+#if NET6_0_OR_GREATER
+                List<KeyValuePair<string, TValue>> list = CollectionsMarshal.GetValueRefOrAddDefault(groupedByLength, s.Length, out _) ??= new List<KeyValuePair<string, TValue>>(MaxPerLength);
+#else
                 if (!groupedByLength.TryGetValue(s.Length, out List<KeyValuePair<string, TValue>>? list))
                 {
                     groupedByLength[s.Length] = list = new List<KeyValuePair<string, TValue>>(MaxPerLength);
                 }
+#endif
 
                 // If we've already hit the max per-bucket limit, bail.
                 if (list.Count == MaxPerLength)
@@ -65,15 +76,14 @@ namespace System.Collections.Frozen
             }
 
             // If there would be too much empty space in the lookup array, bail.
-            int spread = maxLength - minLength + 1;
             if (groupedByLength.Count / (double)spread < EmptyLengthsRatio)
             {
                 return null;
             }
 
-            string[] keys = new string[source.Count];
-            TValue[] values = new TValue[keys.Length];
-            var lengthBuckets = new KeyValuePair<string, int>[maxLength - minLength + 1][];
+            var keys = new string[source.Count];
+            var values = new TValue[keys.Length];
+            var lengthBuckets = new KeyValuePair<string, int>[spread][];
 
             // Iterate through each bucket, filling the keys/values arrays, and creating a lookup array such that
             // given a string length we can index into that array to find the array of string,int pairs: the string
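
The early exit added in this file (and mirrored in the set below) is pigeonhole arithmetic; a hedged sketch with hypothetical numbers, since MaxPerLength's value isn't shown in this diff:

    // If even the floor of the average bucket size exceeds the per-bucket cap,
    // some bucket must overflow, so bail before iterating the keys at all.
    static bool CannotBucket(int count, int minLength, int maxLength, int maxPerLength)
    {
        int spread = maxLength - minLength + 1;   // number of possible length buckets
        return count / spread > maxPerLength;     // floor(average) already over the cap
    }
    // e.g. 1_000 keys with lengths in [3, 12] and a hypothetical cap of 5:
    // spread == 10 and 1_000 / 10 == 100 > 5, so bucketing is ruled out immediately.
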
index 123df43..880ef00 100644 (file)
@@ -3,6 +3,7 @@
 
 using System.Collections.Generic;
 using System.Diagnostics;
+using System.Runtime.InteropServices;
 
 namespace System.Collections.Frozen
 {
@@ -31,25 +32,35 @@ namespace System.Collections.Frozen
             _ignoreCase = ReferenceEquals(comparer, StringComparer.OrdinalIgnoreCase);
         }
 
-        internal static LengthBucketsFrozenSet? CreateLengthBucketsFrozenSetIfAppropriate(string[] entries, IEqualityComparer<string> comparer)
+        internal static LengthBucketsFrozenSet? CreateLengthBucketsFrozenSetIfAppropriate(string[] entries, IEqualityComparer<string> comparer, int minLength, int maxLength)
         {
             Debug.Assert(entries.Length != 0);
             Debug.Assert(comparer == EqualityComparer<string>.Default || comparer == StringComparer.Ordinal || comparer == StringComparer.OrdinalIgnoreCase);
+            Debug.Assert(minLength >= 0 && maxLength >= minLength);
+
+            // If, without even looking at the keys, we know that some bucket must exceed the max per-bucket
+            // limit (pigeonhole principle), we can exit early without doing any further work.
+            int spread = maxLength - minLength + 1;
+            if (entries.Length / spread > MaxPerLength)
+            {
+                return null;
+            }
 
             // Iterate through all of the inputs, bucketing them based on the length of the string.
             var groupedByLength = new Dictionary<int, List<string>>();
-            int minLength = int.MaxValue, maxLength = int.MinValue;
             foreach (string s in entries)
             {
                 Debug.Assert(s is not null, "This implementation should not be used with null source values.");
+                Debug.Assert(s.Length >= minLength && s.Length <= maxLength);
 
-                if (s.Length < minLength) minLength = s.Length;
-                if (s.Length > maxLength) maxLength = s.Length;
-
+#if NET6_0_OR_GREATER
+                List<string> list = CollectionsMarshal.GetValueRefOrAddDefault(groupedByLength, s.Length, out _) ??= new List<string>(MaxPerLength);
+#else
                 if (!groupedByLength.TryGetValue(s.Length, out List<string>? list))
                 {
                     groupedByLength[s.Length] = list = new List<string>(MaxPerLength);
                 }
+#endif
 
                 // If we've already hit the max per-bucket limit, bail.
                 if (list.Count == MaxPerLength)
@@ -61,13 +72,12 @@ namespace System.Collections.Frozen
             }
 
             // If there would be too much empty space in the lookup array, bail.
-            int spread = maxLength - minLength + 1;
             if (groupedByLength.Count / (double)spread < EmptyLengthsRatio)
             {
                 return null;
             }
 
-            var lengthBuckets = new KeyValuePair<string, int>[maxLength - minLength + 1][];
+            var lengthBuckets = new KeyValuePair<string, int>[spread][];
 
             // Iterate through each bucket, filling the items array, and creating a lookup array such that
             // given a string length we can index into that array to find the array of string,int pairs: the string
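
Both files now use the same .NET 6+ pattern to fold the lookup and insert into a single hash probe via CollectionsMarshal.GetValueRefOrAddDefault. A minimal self-contained sketch of that pattern (hypothetical names, not the code above):

    using System.Collections.Generic;
    using System.Runtime.InteropServices;

    // GetValueRefOrAddDefault returns a writable ref to the value slot for the key,
    // adding a default (null) entry if the key was absent, so ??= can create and
    // store the bucket list with one dictionary lookup instead of two.
    static List<string> GetOrAddBucket(Dictionary<int, List<string>> buckets, int length, int capacity)
    {
        ref List<string>? bucket = ref CollectionsMarshal.GetValueRefOrAddDefault(buckets, length, out _);
        return bucket ??= new List<string>(capacity);
    }
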
index c329ae4..6ddec56 100644 (file)
@@ -10,14 +10,18 @@ namespace System.Collections.Frozen.Tests
     {
         private static KeyAnalyzer.AnalysisResults RunAnalysis(string[] values, bool ignoreCase)
         {
-            KeyAnalyzer.Analyze(values, ignoreCase, out KeyAnalyzer.AnalysisResults r);
-
+            int minLength = int.MaxValue, maxLength = 0;
             foreach (string s in values)
             {
-                Assert.True(s.Length >= r.MinimumLength);
-                Assert.True(s.Length <= r.MinimumLength + r.MaximumLengthDiff);
+                if (s.Length < minLength) minLength = s.Length;
+                if (s.Length > maxLength) maxLength = s.Length;
             }
 
+            KeyAnalyzer.AnalysisResults r = KeyAnalyzer.Analyze(values, ignoreCase, minLength, maxLength);
+
+            Assert.All(values, s => Assert.InRange(s.Length, r.MinimumLength, int.MaxValue));
+            Assert.All(values, s => Assert.InRange(s.Length, 0, r.MinimumLength + r.MaximumLengthDiff));
+
             return r;
         }
 
@@ -55,42 +59,42 @@ namespace System.Collections.Frozen.Tests
             KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "É1" }, true);
             Assert.False(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.False(r.AllAscii);
+            Assert.False(r.AllAsciiIfIgnoreCase);
             Assert.Equal(0, r.HashIndex);
             Assert.Equal(1, r.HashCount);
 
             r = RunAnalysis(new[] { "É1", "T1" }, true);
             Assert.False(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.False(r.AllAscii);
+            Assert.False(r.AllAsciiIfIgnoreCase);
             Assert.Equal(0, r.HashIndex);
             Assert.Equal(1, r.HashCount);
 
             r = RunAnalysis(new[] { "ÉA1", "TA1", "ÉB1" }, true);
             Assert.False(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.False(r.AllAscii);
+            Assert.False(r.AllAsciiIfIgnoreCase);
             Assert.Equal(0, r.HashIndex);
             Assert.Equal(2, r.HashCount);
 
             r = RunAnalysis(new[] { "ABCDEÉ1ABCDEF", "ABCDETA1ABCDEF", "ABCDESB1ABCDEF" }, true);
             Assert.False(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.False(r.AllAscii);
+            Assert.False(r.AllAsciiIfIgnoreCase);
             Assert.Equal(5, r.HashIndex);
             Assert.Equal(1, r.HashCount);
 
             r = RunAnalysis(new[] { "ABCDEFÉ1ABCDEF", "ABCDEFTA1ABCDEF", "ABCDEFSB1ABCDEF" }, true);
             Assert.False(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.False(r.AllAscii);
+            Assert.False(r.AllAsciiIfIgnoreCase);
             Assert.Equal(6, r.HashIndex);
             Assert.Equal(1, r.HashCount);
 
             r = RunAnalysis(new[] { "ABCÉDEFÉ1ABCDEF", "ABCÉDEFTA1ABCDEF", "ABCÉDEFSB1ABCDEF" }, true);
             Assert.False(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.False(r.AllAscii);
+            Assert.False(r.AllAsciiIfIgnoreCase);
             Assert.Equal(7, r.HashIndex);
             Assert.Equal(1, r.HashCount);
         }
@@ -101,21 +105,21 @@ namespace System.Collections.Frozen.Tests
             KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "S1" }, true);
             Assert.False(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.True(r.AllAscii);
+            Assert.True(r.AllAsciiIfIgnoreCase);
             Assert.Equal(0, r.HashIndex);
             Assert.Equal(1, r.HashCount);
 
             r = RunAnalysis(new[] { "S1", "T1" }, true);
             Assert.False(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.True(r.AllAscii);
+            Assert.True(r.AllAsciiIfIgnoreCase);
             Assert.Equal(0, r.HashIndex);
             Assert.Equal(1, r.HashCount);
 
             r = RunAnalysis(new[] { "SA1", "TA1", "SB1" }, true);
             Assert.False(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.True(r.AllAscii);
+            Assert.True(r.AllAsciiIfIgnoreCase);
             Assert.Equal(0, r.HashIndex);
             Assert.Equal(2, r.HashCount);
         }
@@ -123,35 +127,35 @@ namespace System.Collections.Frozen.Tests
         [Fact]
         public static void RightHand()
         {
-            KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "1S", "1T" }, false);
+            KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "1T1", "1T" }, false);
             Assert.True(r.RightJustifiedSubstring);
             Assert.False(r.IgnoreCase);
-            Assert.True(r.AllAscii);
+            Assert.True(r.AllAsciiIfIgnoreCase);
             Assert.Equal(-1, r.HashIndex);
             Assert.Equal(1, r.HashCount);
 
-            r = RunAnalysis(new[] { "1AS", "1AT", "1BS" }, false);
+            r = RunAnalysis(new[] { "1ATA", "1ATB", "1BS" }, false);
             Assert.True(r.RightJustifiedSubstring);
             Assert.False(r.IgnoreCase);
-            Assert.True(r.AllAscii);
-            Assert.Equal(-2, r.HashIndex);
-            Assert.Equal(2, r.HashCount);
+            Assert.True(r.AllAsciiIfIgnoreCase);
+            Assert.Equal(-1, r.HashIndex);
+            Assert.Equal(1, r.HashCount);
         }
 
         [Fact]
         public static void RightHandCaseInsensitive()
         {
-            KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "1É", "1T" }, true);
+            KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "1ÉÉ", "1É" }, true);
             Assert.True(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.False(r.AllAscii);
-            Assert.Equal(-1, r.HashIndex);
+            Assert.False(r.AllAsciiIfIgnoreCase);
+            Assert.Equal(-2, r.HashIndex);
             Assert.Equal(1, r.HashCount);
 
-            r = RunAnalysis(new[] { "1AÉ", "1AT", "1BÉ" }, true);
+            r = RunAnalysis(new[] { "ÉA", "1AT", "1AÉT" }, true);
             Assert.True(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.False(r.AllAscii);
+            Assert.False(r.AllAsciiIfIgnoreCase);
             Assert.Equal(-2, r.HashIndex);
             Assert.Equal(2, r.HashCount);
         }
@@ -159,19 +163,19 @@ namespace System.Collections.Frozen.Tests
         [Fact]
         public static void RightHandCaseInsensitiveAscii()
         {
-            KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "1S", "1T" }, true);
+            KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "a1", "A1T" }, true);
             Assert.True(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.True(r.AllAscii);
+            Assert.True(r.AllAsciiIfIgnoreCase);
             Assert.Equal(-1, r.HashIndex);
             Assert.Equal(1, r.HashCount);
 
-            r = RunAnalysis(new[] { "1AS", "1AT", "1BS" }, true);
+            r = RunAnalysis(new[] { "bÉÉ", "caT", "cAÉT" }, true);
             Assert.True(r.RightJustifiedSubstring);
             Assert.True(r.IgnoreCase);
-            Assert.True(r.AllAscii);
-            Assert.Equal(-2, r.HashIndex);
-            Assert.Equal(2, r.HashCount);
+            Assert.True(r.AllAsciiIfIgnoreCase);
+            Assert.Equal(-3, r.HashIndex);
+            Assert.Equal(1, r.HashCount);
         }
 
         [Fact]
@@ -180,7 +184,7 @@ namespace System.Collections.Frozen.Tests
             KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "ABC", "DBC", "ADC", "ABD", "ABDABD" }, false);
             Assert.False(r.SubstringHashing);
             Assert.False(r.IgnoreCase);
-            Assert.True(r.AllAscii);
+            Assert.True(r.AllAsciiIfIgnoreCase);
         }
 
         [Fact]
@@ -189,7 +193,7 @@ namespace System.Collections.Frozen.Tests
             KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "æbc", "DBC", "æDC", "æbd", "æbdæbd" }, true);
             Assert.False(r.SubstringHashing);
             Assert.True(r.IgnoreCase);
-            Assert.False(r.AllAscii);
+            Assert.False(r.AllAsciiIfIgnoreCase);
         }
 
         [Fact]
@@ -198,7 +202,7 @@ namespace System.Collections.Frozen.Tests
             KeyAnalyzer.AnalysisResults r = RunAnalysis(new[] { "abc", "DBC", "aDC", "abd", "abdabd" }, true);
             Assert.False(r.SubstringHashing);
             Assert.True(r.IgnoreCase);
-            Assert.True(r.AllAscii);
+            Assert.True(r.AllAsciiIfIgnoreCase);
         }
 
         [Fact]