Faster optimized frozen dictionary creation (3/n) (#87688)
authorAdam Sitnik <adam.sitnik@gmail.com>
Thu, 22 Jun 2023 14:52:50 +0000 (16:52 +0200)
committerGitHub <noreply@github.com>
Thu, 22 Jun 2023 14:52:50 +0000 (16:52 +0200)
* avoid the need of having `Action<int, int> storeDestIndexFromSrcIndex` by writing the destination indexes to the provided buffer with hashcodes and moving the responsibility to the caller (1-4% gain)

* For cases where the key is an integer and we know the input is already unique (because it comes from a dictionary or a hash set) there is no need to create another hash set

Also, in cases where simply all hash codes are unique, we can iterate over a span rather than a hash set

+9% gain for scenarios where the key was an integer (time), 10-20% allocations drop
up to +5% gain where string keys turned out to have unique hash codes

Co-authored-by: Stephen Toub <stoub@microsoft.com>
src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/FrozenHashTable.cs
src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/Int32/Int32FrozenDictionary.cs
src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/Int32/Int32FrozenSet.cs
src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/ItemsFrozenSet.cs
src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/KeysAndValuesFrozenDictionary.cs
src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/OrdinalStringFrozenDictionary.cs
src/libraries/System.Collections.Immutable/src/System/Collections/Frozen/String/OrdinalStringFrozenSet.cs

index 15ba747..363ab51 100644 (file)
@@ -36,23 +36,21 @@ namespace System.Collections.Frozen
         }
 
         /// <summary>Initializes a frozen hash table.</summary>
-        /// <param name="hashCodes">Pre-calculated hash codes.</param>
-        /// <param name="storeDestIndexFromSrcIndex">A delegate that assigns the index to a specific entry. It's passed the destination and source indices.</param>
+        /// <param name="hashCodes">Pre-calculated hash codes. When the method finishes, each element holds the destination index assigned to that entry.</param>
         /// <param name="optimizeForReading">true to spend additional effort tuning for subsequent read speed on the table; false to prioritize construction time.</param>
+        /// <param name="hashCodesAreUnique">true when the input hash codes are already unique (e.g. because they come from a dictionary or set of integers).</param>
         /// <remarks>
-        /// This method will iterate through the incoming entries and will invoke the hasher on each once.
         /// It will then determine the optimal number of hash buckets to allocate and will populate the
-        /// bucket table. In the process of doing so, it calls out to the <paramref name="storeDestIndexFromSrcIndex"/> to indicate
-        /// the resulting index for that entry. <see cref="FindMatchingEntries(int, out int, out int)"/>
-        /// then uses this index to reference individual entries by indexing into <see cref="HashCodes"/>.
+        /// bucket table. The caller is responsible for consuming the values written to <paramref name="hashCodes"/> and updating the destination (if desired).
+        /// <see cref="FindMatchingEntries(int, out int, out int)"/> then uses this index to reference individual entries by indexing into <see cref="HashCodes"/>.
         /// </remarks>
         /// <returns>A frozen hash table.</returns>
-        public static FrozenHashTable Create(ReadOnlySpan<int> hashCodes, Action<int, int> storeDestIndexFromSrcIndex, bool optimizeForReading = true)
+        public static FrozenHashTable Create(Span<int> hashCodes, bool optimizeForReading = true, bool hashCodesAreUnique = false)
         {
             // Determine how many buckets to use.  This might be fewer than the number of entries
             // if any entries have identical hashcodes (not just different hashcodes that might
             // map to the same bucket).
-            int numBuckets = CalcNumBuckets(hashCodes, optimizeForReading);
+            int numBuckets = CalcNumBuckets(hashCodes, optimizeForReading, hashCodesAreUnique);
             ulong fastModMultiplier = HashHelpers.GetFastModMultiplier((uint)numBuckets);
 
             // Create two spans:
@@ -100,8 +98,10 @@ namespace System.Collections.Frozen
                 bucketStart = count;
                 while (index >= 0)
                 {
-                    hashtableHashcodes[count] = hashCodes[index];
-                    storeDestIndexFromSrcIndex(count, index);
+                    ref int hashCode = ref hashCodes[index];
+                    hashtableHashcodes[count] = hashCode;
+                    // We have used the hash code for the last time; now we reuse the buffer to store the destination index.
+                    hashCode = count;
                     count++;
                     bucketCount++;
 
@@ -144,9 +144,10 @@ namespace System.Collections.Frozen
         /// sizes, starting at the exact number of hash codes and incrementing the bucket count by 1 per trial,
         /// this is a trade-off between speed of determining a good number of buckets and maximal density.
         /// </remarks>
-        private static int CalcNumBuckets(ReadOnlySpan<int> hashCodes, bool optimizeForReading)
+        private static int CalcNumBuckets(ReadOnlySpan<int> hashCodes, bool optimizeForReading, bool hashCodesAreUnique)
         {
             Debug.Assert(hashCodes.Length != 0);
+            Debug.Assert(!hashCodesAreUnique || new HashSet<int>(hashCodes.ToArray()).Count == hashCodes.Length);
 
             const double AcceptableCollisionRate = 0.05;  // What is a satisfactory rate of hash collisions?
             const int LargeInputSizeThreshold = 1000;     // What is the limit for an input to be considered "small"?
@@ -159,38 +160,44 @@ namespace System.Collections.Frozen
             }
 
             // Filter out duplicate codes, since no increase in buckets will avoid collisions from duplicate input hash codes.
-            var codes =
+            HashSet<int>? codes = null;
+            int uniqueCodesCount = hashCodes.Length;
+            if (!hashCodesAreUnique)
+            {
+                codes =
 #if NETCOREAPP2_0_OR_GREATER
-                new HashSet<int>(hashCodes.Length);
+                    new HashSet<int>(hashCodes.Length);
 #else
-                new HashSet<int>();
+                    new HashSet<int>();
 #endif
-            foreach (int hashCode in hashCodes)
-            {
-                codes.Add(hashCode);
+                foreach (int hashCode in hashCodes)
+                {
+                    codes.Add(hashCode);
+                }
+                uniqueCodesCount = codes.Count;
             }
-            Debug.Assert(codes.Count != 0);
+            Debug.Assert(uniqueCodesCount != 0);
 
             // In our precomputed primes table, find the index of the smallest prime that's at least as large as our number of
             // hash codes. If there are more codes than in our precomputed primes table, which accommodates millions of values,
             // give up and just use the next prime.
             ReadOnlySpan<int> primes = HashHelpers.Primes;
             int minPrimeIndexInclusive = 0;
-            while ((uint)minPrimeIndexInclusive < (uint)primes.Length && codes.Count > primes[minPrimeIndexInclusive])
+            while ((uint)minPrimeIndexInclusive < (uint)primes.Length && uniqueCodesCount > primes[minPrimeIndexInclusive])
             {
                 minPrimeIndexInclusive++;
             }
 
             if (minPrimeIndexInclusive >= primes.Length)
             {
-                return HashHelpers.GetPrime(codes.Count);
+                return HashHelpers.GetPrime(uniqueCodesCount);
             }
 
             // Determine the largest number of buckets we're willing to use, based on a multiple of the number of inputs.
             // For smaller inputs, we allow for a larger multiplier.
             int maxNumBuckets =
-                codes.Count *
-                (codes.Count >= LargeInputSizeThreshold ? MaxLargeBucketTableMultiplier : MaxSmallBucketTableMultiplier);
+                uniqueCodesCount *
+                (uniqueCodesCount >= LargeInputSizeThreshold ? MaxLargeBucketTableMultiplier : MaxSmallBucketTableMultiplier);
 
             // Find the index of the smallest prime that accommodates our max buckets.
             int maxPrimeIndexExclusive = minPrimeIndexInclusive;
@@ -209,7 +216,7 @@ namespace System.Collections.Frozen
             int[] seenBuckets = ArrayPool<int>.Shared.Rent((maxNumBuckets / BitsPerInt32) + 1);
 
             int bestNumBuckets = maxNumBuckets;
-            int bestNumCollisions = codes.Count;
+            int bestNumCollisions = uniqueCodesCount;
 
             // Iterate through each available prime between the min and max discovered. For each, compute
             // the collision ratio.
@@ -222,22 +229,48 @@ namespace System.Collections.Frozen
                 // Determine the bucket for each hash code and mark it as seen. If it was already seen,
                 // track it as a collision.
                 int numCollisions = 0;
-                foreach (int code in codes)
+
+                if (codes is not null && uniqueCodesCount != hashCodes.Length)
                 {
-                    uint bucketNum = (uint)code % (uint)numBuckets;
-                    if ((seenBuckets[bucketNum / BitsPerInt32] & (1 << (int)bucketNum)) != 0)
+                    foreach (int code in codes)
                     {
-                        numCollisions++;
-                        if (numCollisions >= bestNumCollisions)
+                        uint bucketNum = (uint)code % (uint)numBuckets;
+                        if ((seenBuckets[bucketNum / BitsPerInt32] & (1 << (int)bucketNum)) != 0)
+                        {
+                            numCollisions++;
+                            if (numCollisions >= bestNumCollisions)
+                            {
+                                // If we've already hit the previously known best number of collisions,
+                                // there's no point in continuing as worst case we'd just use that.
+                                break;
+                            }
+                        }
+                        else
                         {
-                            // If we've already hit the previously known best number of collisions,
-                            // there's no point in continuing as worst case we'd just use that.
-                            break;
+                            seenBuckets[bucketNum / BitsPerInt32] |= 1 << (int)bucketNum;
                         }
                     }
-                    else
+                }
+                else
+                {
+                    // All of the hash codes in hashCodes are unique. In such a scenario, it's faster to iterate over a span.
+                    foreach (int code in hashCodes)
                     {
-                        seenBuckets[bucketNum / BitsPerInt32] |= 1 << (int)bucketNum;
+                        uint bucketNum = (uint)code % (uint)numBuckets;
+                        if ((seenBuckets[bucketNum / BitsPerInt32] & (1 << (int)bucketNum)) != 0)
+                        {
+                            numCollisions++;
+                            if (numCollisions >= bestNumCollisions)
+                            {
+                                // If we've already hit the previously known best number of collisions,
+                                // there's no point in continuing as worst case we'd just use that.
+                                break;
+                            }
+                        }
+                        else
+                        {
+                            seenBuckets[bucketNum / BitsPerInt32] |= 1 << (int)bucketNum;
+                        }
                     }
                 }
 
@@ -247,7 +280,7 @@ namespace System.Collections.Frozen
                 {
                     bestNumBuckets = numBuckets;
 
-                    if (numCollisions / (double)codes.Count <= AcceptableCollisionRate)
+                    if (numCollisions / (double)uniqueCodesCount <= AcceptableCollisionRate)
                     {
                         break;
                     }
index e987158..f371992 100644 (file)
@@ -35,9 +35,14 @@ namespace System.Collections.Frozen
                 hashCodes[i] = entries[i].Key;
             }
 
-            _hashTable = FrozenHashTable.Create(
-                hashCodes,
-                (destIndex, srcIndex) => _values[destIndex] = entries[srcIndex].Value);
+            _hashTable = FrozenHashTable.Create(hashCodes, hashCodesAreUnique: true);
+
+            for (int srcIndex = 0; srcIndex < hashCodes.Length; srcIndex++)
+            {
+                int destIndex = hashCodes[srcIndex];
+
+                _values[destIndex] = entries[srcIndex].Value;
+            }
 
             ArrayPool<int>.Shared.Return(arrayPoolHashCodes);
         }
index 4317b31..22e7f09 100644 (file)
@@ -25,9 +25,7 @@ namespace System.Collections.Frozen
             int[] entries = ArrayPool<int>.Shared.Rent(count);
             source.CopyTo(entries);
 
-            _hashTable = FrozenHashTable.Create(
-                new ReadOnlySpan<int>(entries, 0, count),
-                static delegate { });
+            _hashTable = FrozenHashTable.Create(new Span<int>(entries, 0, count), hashCodesAreUnique: true);
 
             ArrayPool<int>.Shared.Return(entries);
         }
index db94a1a..6037a13 100644 (file)
@@ -30,10 +30,14 @@ namespace System.Collections.Frozen
                 hashCodes[i] = entries[i] is T t ? Comparer.GetHashCode(t) : 0;
             }
 
-            _hashTable = FrozenHashTable.Create(
-                hashCodes,
-                (destIndex, srcIndex) => _items[destIndex] = entries[srcIndex],
-                optimizeForReading);
+            _hashTable = FrozenHashTable.Create(hashCodes, optimizeForReading);
+
+            for (int srcIndex = 0; srcIndex < hashCodes.Length; srcIndex++)
+            {
+                int destIndex = hashCodes[srcIndex];
+
+                _items[destIndex] = entries[srcIndex];
+            }
 
             ArrayPool<int>.Shared.Return(arrayPoolHashCodes);
         }
index ab0cba7..ce2f3ec 100644 (file)
@@ -32,14 +32,15 @@ namespace System.Collections.Frozen
                 hashCodes[i] = Comparer.GetHashCode(entries[i].Key);
             }
 
-            _hashTable = FrozenHashTable.Create(
-                hashCodes,
-                (destIndex, srcIndex) =>
-                {
-                    _keys[destIndex] = entries[srcIndex].Key;
-                    _values[destIndex] = entries[srcIndex].Value;
-                },
-                optimizeForReading);
+            _hashTable = FrozenHashTable.Create(hashCodes, optimizeForReading);
+
+            for (int srcIndex = 0; srcIndex < hashCodes.Length; srcIndex++)
+            {
+                int destIndex = hashCodes[srcIndex];
+
+                _keys[destIndex] = entries[srcIndex].Key;
+                _values[destIndex] = entries[srcIndex].Value;
+            }
 
             ArrayPool<int>.Shared.Return(arrayPoolHashCodes);
         }
index 2cb519e..e510954 100644 (file)
@@ -47,13 +47,15 @@ namespace System.Collections.Frozen
                 hashCodes[i] = GetHashCode(keys[i]);
             }
 
-            _hashTable = FrozenHashTable.Create(
-                hashCodes,
-                (destIndex, srcIndex) =>
-                {
-                    _keys[destIndex] = keys[srcIndex];
-                    _values[destIndex] = values[srcIndex];
-                });
+            _hashTable = FrozenHashTable.Create(hashCodes);
+
+            for (int srcIndex = 0; srcIndex < hashCodes.Length; srcIndex++)
+            {
+                int destIndex = hashCodes[srcIndex];
+
+                _keys[destIndex] = keys[srcIndex];
+                _values[destIndex] = values[srcIndex];
+            }
 
             ArrayPool<int>.Shared.Return(arrayPoolHashCodes);
         }
index 1ee4b27..62ce56e 100644 (file)
@@ -38,9 +38,14 @@ namespace System.Collections.Frozen
                 hashCodes[i] = GetHashCode(entries[i]);
             }
 
-            _hashTable = FrozenHashTable.Create(
-                hashCodes,
-                (destIndex, srcIndex) => _items[destIndex] = entries[srcIndex]);
+            _hashTable = FrozenHashTable.Create(hashCodes);
+
+            for (int srcIndex = 0; srcIndex < hashCodes.Length; srcIndex++)
+            {
+                int destIndex = hashCodes[srcIndex];
+
+                _items[destIndex] = entries[srcIndex];
+            }
 
             ArrayPool<int>.Shared.Return(arrayPoolHashCodes);
         }