src/jit/smallhash.h

   1 // Licensed to the .NET Foundation under one or more agreements.
   2 // The .NET Foundation licenses this file to you under the MIT license.
   3 // See the LICENSE file in the project root for more information.
   4
   5 #ifndef _SMALLHASHTABLE_H_
   6 #define _SMALLHASHTABLE_H_
   7
   8 // Since compiler depends on valuenum which depends on smallhash, forward declare
   9 // a wrapper for comp->compGetMem here (implemented in compiler.hpp) that can be used below.
  10 class Compiler;
  11 void* compGetMem(Compiler* comp, size_t sz);
  12
  13 // genLog2 is defined in compiler.hpp
  14 unsigned genLog2(unsigned value);
  15
  16 //------------------------------------------------------------------------
  17 // HashTableInfo: a concept that provides equality and hashing methods for
  18 //                a particular key type. Used by HashTableBase and its
  19 //                subclasses.
  20 template <typename TKey>
  21 struct HashTableInfo
  22 {
  23     // static bool Equals(const TKey& x, const TKey& y);
  24     // static unsigned GetHashCode(const TKey& key);
  25 };
  26
  27 //------------------------------------------------------------------------
  28 // HashTableInfo<TKey*>: specialized version of HashTableInfo for pointer-
  29 //                       typed keys.
  30 template <typename TKey>
  31 struct HashTableInfo<TKey*>
  32 {
  33     static bool Equals(const TKey* x, const TKey* y)
  34     {
  35         return x == y;
  36     }
  37
  38     static unsigned GetHashCode(const TKey* key)
  39     {
  40         // Shift off bits that are not likely to be significant
  41         size_t keyval = reinterpret_cast<size_t>(key) >> ConstLog2<__alignof(TKey)>::value;
  42
  43         // Truncate and return the result
  44         return static_cast<unsigned>(keyval);
  45     }
  46 };
  47
  48 //------------------------------------------------------------------------
  49 // HashTableInfo<int>: specialized version of HashTableInfo for int-
  50 //                     typed keys.
  51 template <>
  52 struct HashTableInfo<int>
  53 {
  54     static bool Equals(int x, int y)
  55     {
  56         return x == y;
  57     }
  58
  59     static unsigned GetHashCode(int key)
  60     {
  61         // Cast and return the key
  62         return static_cast<unsigned>(key);
  63     }
  64 };
  65
  66 //------------------------------------------------------------------------
  67 // HashTableInfo<unsigned>: specialized version of HashTableInfo for unsigned-
  68 //                          typed keys.
  69 template <>
  70 struct HashTableInfo<unsigned>
  71 {
  72     static bool Equals(unsigned x, unsigned y)
  73     {
  74         return x == y;
  75     }
  76
  77     static unsigned GetHashCode(unsigned key)
  78     {
  79         // Return the key itself
  80         return key;
  81     }
  82 };
  83
  84 //------------------------------------------------------------------------
  85 // HashTableBase: base type for HashTable and SmallHashTable. This class
  86 //                provides the vast majority of the implementation. The
  87 //                subclasses differ in the storage they use at the time of
  88 //                construction: HashTable allocates the initial bucket
  89 //                array on the heap; SmallHashTable contains a small inline
  90 //                array.
  91 //
  92 // This implementation is based on the ideas presented in Herlihy, Shavit,
  93 // and Tzafrir '08 (http://mcg.cs.tau.ac.il/papers/disc2008-hopscotch.pdf),
  94 // though it does not currently implement the hopscotch algorithm.
  95 //
  96 // The approach taken is intended to perform well in both space and speed.
  97 // This approach is a hybrid of separate chaining and open addressing with
  98 // linear probing: collisions are resolved using a bucket chain, but that
  99 // chain is stored in the bucket array itself.
 100 //
 101 // Resolving collisions using a bucket chain avoids the primary clustering
 102 // issue common in linearly-probed open addressed hash tables, while using
 103 // buckets as chain nodes avoids the allocaiton traffic typical of chained
 104 // tables. Applying the hopscotch algorithm in the aforementioned paper
 105 // could further improve performance by optimizing access patterns for
 106 // better cache usage.
 107 //
 108 // Template parameters:
 109 //    TKey     - The type of the table's keys.
 110 //    TValue   - The type of the table's values.
 111 //    TKeyInfo - A type that conforms to the HashTableInfo<TKey> concept.
 112 template <typename TKey, typename TValue, typename TKeyInfo = HashTableInfo<TKey>>
 113 class HashTableBase
 114 {
 115     friend class KeyValuePair;
 116     friend class Iterator;
 117
 118     enum : unsigned
 119     {
 120         InitialNumBuckets = 8
 121     };
 122
 123 protected:
 124     //------------------------------------------------------------------------
 125     // HashTableBase::Bucket: provides storage for the key-value pairs that
 126     //                        make up the contents of the table.
 127     //
 128     // The "home" bucket for a particular key is the bucket indexed by the
 129     // key's hash code modulo the size of the bucket array (the "home index").
 130     //
 131     // The home bucket is always considered to be part of the chain that it
 132     // roots, even if it is also part of the chain rooted at a different
 133     // bucket. `m_firstOffset` indicates the offset of the first non-home
 134     // bucket in the home bucket's chain. If the `m_firstOffset` of a bucket
 135     // is 0, the chain rooted at that bucket is empty.
 136     //
 137     // The index of the next bucket in a chain is calculated by adding the
 138     // value in `m_nextOffset` to the index of the current bucket. If
 139     // `m_nextOffset` is 0, the current bucket is the end of its chain. Each
 140     // bucket in a chain must be occupied (i.e. `m_isFull` will be true).
 141     struct Bucket
 142     {
 143         bool m_isFull; // True if the bucket is occupied; false otherwise.
 144
 145         unsigned m_firstOffset; // The offset to the first node in the chain for this bucket index.
 146         unsigned m_nextOffset;  // The offset to the next node in the chain for this bucket index.
 147
 148         unsigned m_hash;  // The hash code for the element stored in this bucket.
 149         TKey     m_key;   // The key for the element stored in this bucket.
 150         TValue   m_value; // The value for the element stored in this bucket.
 151     };
 152
 153 private:
 154     Compiler* m_compiler;       // The compiler context to use for allocations.
 155     Bucket*   m_buckets;        // The bucket array.
 156     unsigned  m_numBuckets;     // The number of buckets in the bucket array.
 157     unsigned  m_numFullBuckets; // The number of occupied buckets.
 158
 159     //------------------------------------------------------------------------
 160     // HashTableBase::Insert: inserts a key-value pair into a bucket array.
 161     //
 162     // Arguments:
 163     //    buckets    - The bucket array in which to insert the key-value pair.
 164     //    numBuckets - The number of buckets in the bucket array.
 165     //    hash       - The hash code of the key to insert.
 166     //    key        - The key to insert.
 167     //    value      - The value to insert.
 168     //
 169     // Returns:
 170     //    True if the key-value pair was successfully inserted; false
 171     //    otherwise.
 172     static bool Insert(Bucket* buckets, unsigned numBuckets, unsigned hash, const TKey& key, const TValue& value)
 173     {
 174         const unsigned mask      = numBuckets - 1;
 175         unsigned       homeIndex = hash & mask;
 176
 177         Bucket* home = &buckets[homeIndex];
 178         if (!home->m_isFull)
 179         {
 180             // The home bucket is empty; use it.
 181             //
 182             // Note that `m_firstOffset` does not need to be updated: whether or not it is non-zero,
 183             // it is already correct, since we're inserting at the head of the list. `m_nextOffset`
 184             // must be 0, however, since this node should not be part of a list.
 185             assert(home->m_nextOffset == 0);
 186
 187             home->m_isFull = true;
 188             home->m_hash   = hash;
 189             home->m_key    = key;
 190             home->m_value  = value;
 191             return true;
 192         }
 193
 194         // If the home bucket is full, probe to find the next empty bucket.
 195         unsigned precedingIndexInChain = homeIndex;
 196         unsigned nextIndexInChain      = (homeIndex + home->m_firstOffset) & mask;
 197         for (unsigned j = 1; j < numBuckets; j++)
 198         {
 199             unsigned bucketIndex = (homeIndex + j) & mask;
 200             Bucket*  bucket      = &buckets[bucketIndex];
 201             if (bucketIndex == nextIndexInChain)
 202             {
 203                 assert(bucket->m_isFull);
 204                 precedingIndexInChain = bucketIndex;
 205                 nextIndexInChain      = (bucketIndex + bucket->m_nextOffset) & mask;
 206             }
 207             else if (!bucket->m_isFull)
 208             {
 209                 bucket->m_isFull = true;
 210                 if (precedingIndexInChain == nextIndexInChain)
 211                 {
 212                     bucket->m_nextOffset = 0;
 213                 }
 214                 else
 215                 {
 216                     assert(((nextIndexInChain - bucketIndex) & mask) > 0);
 217                     bucket->m_nextOffset = (nextIndexInChain - bucketIndex) & mask;
 218                 }
 219
 220                 unsigned offset = (bucketIndex - precedingIndexInChain) & mask;
 221                 assert(offset != 0);
 222
 223                 if (precedingIndexInChain == homeIndex)
 224                 {
 225                     buckets[precedingIndexInChain].m_firstOffset = offset;
 226                 }
 227                 else
 228                 {
 229                     buckets[precedingIndexInChain].m_nextOffset = offset;
 230                 }
 231
 232                 bucket->m_hash  = hash;
 233                 bucket->m_key   = key;
 234                 bucket->m_value = value;
 235                 return true;
 236             }
 237         }
 238
 239         // No more free buckets.
 240         return false;
 241     }
 242
 243     //------------------------------------------------------------------------
 244     // HashTableBase::TryGetBucket: attempts to get the bucket that holds a
 245     //                              particular key.
 246     //
 247     // Arguments:
 248     //    hash           - The hash code of the key to find.
 249     //    key            - The key to find.
 250     //    precedingIndex - An output parameter that will hold the index of the
 251     //                     preceding bucket in the chain for the key. May be
 252     //                     equal to `bucketIndex` if the key is stored in its
 253     //                     home bucket.
 254     //    bucketIndex    - An output parameter that will hold the index of the
 255     //                     bucket that stores the key.
 256     //
 257     // Returns:
 258     //    True if the key was successfully found; false otherwise.
 259     bool TryGetBucket(unsigned hash, const TKey& key, unsigned* precedingIndex, unsigned* bucketIndex) const
 260     {
 261         if (m_numBuckets == 0)
 262         {
 263             return false;
 264         }
 265
 266         const unsigned mask  = m_numBuckets - 1;
 267         unsigned       index = hash & mask;
 268
 269         Bucket* bucket = &m_buckets[index];
 270         if (bucket->m_isFull && bucket->m_hash == hash && TKeyInfo::Equals(bucket->m_key, key))
 271         {
 272             *precedingIndex = index;
 273             *bucketIndex    = index;
 274             return true;
 275         }
 276
 277         for (unsigned offset = bucket->m_firstOffset; offset != 0; offset = bucket->m_nextOffset)
 278         {
 279             unsigned precedingIndexInChain = index;
 280
 281             index  = (index + offset) & mask;
 282             bucket = &m_buckets[index];
 283
 284             assert(bucket->m_isFull);
 285             if (bucket->m_hash == hash && TKeyInfo::Equals(bucket->m_key, key))
 286             {
 287                 *precedingIndex = precedingIndexInChain;
 288                 *bucketIndex    = index;
 289                 return true;
 290             }
 291         }
 292
 293         return false;
 294     }
 295
 296     //------------------------------------------------------------------------
 297     // HashTableBase::Resize: allocates a new bucket array twice the size of
 298     //                        the current array and copies the key-value pairs
 299     //                        from the current bucket array into the new array.
 300     void Resize()
 301     {
 302         Bucket* currentBuckets = m_buckets;
 303
 304         unsigned newNumBuckets = m_numBuckets == 0 ? InitialNumBuckets : m_numBuckets * 2;
 305         size_t   allocSize     = sizeof(Bucket) * newNumBuckets;
 306         assert((sizeof(Bucket) * m_numBuckets) < allocSize);
 307
 308         auto* newBuckets = reinterpret_cast<Bucket*>(compGetMem(m_compiler, allocSize));
 309         memset(newBuckets, 0, allocSize);
 310
 311         for (unsigned currentIndex = 0; currentIndex < m_numBuckets; currentIndex++)
 312         {
 313             Bucket* currentBucket = &currentBuckets[currentIndex];
 314             if (!currentBucket->m_isFull)
 315             {
 316                 continue;
 317             }
 318
 319             bool inserted =
 320                 Insert(newBuckets, newNumBuckets, currentBucket->m_hash, currentBucket->m_key, currentBucket->m_value);
 321             (assert(inserted), (void)inserted);
 322         }
 323
 324         m_numBuckets = newNumBuckets;
 325         m_buckets    = newBuckets;
 326     }
 327
 328 protected:
 329     HashTableBase(Compiler* compiler, Bucket* buckets, unsigned numBuckets)
 330         : m_compiler(compiler), m_buckets(buckets), m_numBuckets(numBuckets), m_numFullBuckets(0)
 331     {
 332         assert(compiler != nullptr);
 333
 334         if (numBuckets > 0)
 335         {
 336             assert((numBuckets & (numBuckets - 1)) == 0); // Size must be a power of 2
 337             assert(m_buckets != nullptr);
 338
 339             memset(m_buckets, 0, sizeof(Bucket) * numBuckets);
 340         }
 341     }
 342
 343 public:
 344 #ifdef DEBUG
 345     class Iterator;
 346
 347     class KeyValuePair final
 348     {
 349         friend class HashTableBase<TKey, TValue, TKeyInfo>::Iterator;
 350
 351         Bucket* m_bucket;
 352
 353         KeyValuePair(Bucket* bucket) : m_bucket(bucket)
 354         {
 355             assert(m_bucket != nullptr);
 356         }
 357
 358     public:
 359         KeyValuePair() : m_bucket(nullptr)
 360         {
 361         }
 362
 363         inline TKey& Key()
 364         {
 365             return m_bucket->m_key;
 366         }
 367
 368         inline TValue& Value()
 369         {
 370             return m_bucket->m_value;
 371         }
 372     };
 373
 374     // NOTE: HashTableBase only provides iterators in debug builds because the order in which
 375     // the iterator type produces values is undefined (e.g. it is not related to the order in
 376     // which key-value pairs were inserted).
 377     class Iterator final
 378     {
 379         friend class HashTableBase<TKey, TValue, TKeyInfo>;
 380
 381         Bucket*  m_buckets;
 382         unsigned m_numBuckets;
 383         unsigned m_index;
 384
 385         Iterator(Bucket* buckets, unsigned numBuckets, unsigned index)
 386             : m_buckets(buckets), m_numBuckets(numBuckets), m_index(index)
 387         {
 388             assert((buckets != nullptr) || (numBuckets == 0));
 389             assert(index <= numBuckets);
 390
 391             // Advance to the first occupied bucket
 392             while (m_index != m_numBuckets && !m_buckets[m_index].m_isFull)
 393             {
 394                 m_index++;
 395             }
 396         }
 397
 398     public:
 399         Iterator() : m_buckets(nullptr), m_numBuckets(0), m_index(0)
 400         {
 401         }
 402
 403         KeyValuePair operator*() const
 404         {
 405             if (m_index >= m_numBuckets)
 406             {
 407                 return KeyValuePair();
 408             }
 409
 410             Bucket* bucket = &m_buckets[m_index];
 411             assert(bucket->m_isFull);
 412             return KeyValuePair(bucket);
 413         }
 414
 415         KeyValuePair operator->() const
 416         {
 417             return this->operator*();
 418         }
 419
 420         bool operator==(const Iterator& other) const
 421         {
 422             return (m_buckets == other.m_buckets) && (m_index == other.m_index);
 423         }
 424
 425         bool operator!=(const Iterator& other) const
 426         {
 427             return (m_buckets != other.m_buckets) || (m_index != other.m_index);
 428         }
 429
 430         Iterator& operator++()
 431         {
 432             do
 433             {
 434                 m_index++;
 435             } while (m_index != m_numBuckets && !m_buckets[m_index].m_isFull);
 436
 437             return *this;
 438         }
 439     };
 440
 441     Iterator begin() const
 442     {
 443         return Iterator(m_buckets, m_numBuckets, 0);
 444     }
 445
 446     Iterator end() const
 447     {
 448         return Iterator(m_buckets, m_numBuckets, m_numBuckets);
 449     }
 450 #endif // DEBUG
 451
 452     unsigned Count() const
 453     {
 454         return m_numFullBuckets;
 455     }
 456
 457     void Clear()
 458     {
 459         if (m_numBuckets > 0)
 460         {
 461             memset(m_buckets, 0, sizeof(Bucket) * m_numBuckets);
 462             m_numFullBuckets = 0;
 463         }
 464     }
 465
 466     //------------------------------------------------------------------------
 467     // HashTableBase::AddOrUpdate: adds a key-value pair to the hash table if
 468     //                             the key does not already exist in the
 469     //                             table, or updates the value if the key
 470     //                             already exists.
 471     //
 472     // Arguments:
 473     //    key   - The key for which to add or update a value.
 474     //    value - The value.
 475     //
 476     // Returns:
 477     //    True if the value was added; false if it was updated.
 478     bool AddOrUpdate(const TKey& key, const TValue& value)
 479     {
 480         unsigned hash = TKeyInfo::GetHashCode(key);
 481
 482         unsigned unused, index;
 483         if (TryGetBucket(hash, key, &unused, &index))
 484         {
 485             m_buckets[index].m_value = value;
 486             return false;
 487         }
 488
 489         // If the load is greater than 0.8, resize the table before inserting.
 490         if ((m_numFullBuckets * 5) >= (m_numBuckets * 4))
 491         {
 492             Resize();
 493         }
 494
 495         bool inserted = Insert(m_buckets, m_numBuckets, hash, key, value);
 496         (assert(inserted), (void)inserted);
 497
 498         m_numFullBuckets++;
 499
 500         return true;
 501     }
 502
 503     //------------------------------------------------------------------------
 504     // HashTableBase::TryRemove: removes a key from the hash table and returns
 505     //                           its value if the key exists in the table.
 506     //
 507     // Arguments:
 508     //    key   - The key to remove from the table.
 509     //    value - An output parameter that will hold the value for the removed
 510     //            key.
 511     //
 512     // Returns:
 513     //    True if the key was removed from the table; false otherwise.
 514     bool TryRemove(const TKey& key, TValue* value)
 515     {
 516         unsigned hash = TKeyInfo::GetHashCode(key);
 517
 518         unsigned precedingIndexInChain, bucketIndex;
 519         if (!TryGetBucket(hash, key, &precedingIndexInChain, &bucketIndex))
 520         {
 521             return false;
 522         }
 523
 524         Bucket* bucket = &m_buckets[bucketIndex];
 525
 526         if (precedingIndexInChain != bucketIndex)
 527         {
 528             const unsigned mask      = m_numBuckets - 1;
 529             unsigned       homeIndex = hash & mask;
 530
 531             unsigned nextOffset;
 532             if (bucket->m_nextOffset == 0)
 533             {
 534                 nextOffset = 0;
 535             }
 536             else
 537             {
 538                 unsigned nextIndexInChain = (bucketIndex + bucket->m_nextOffset) & mask;
 539                 nextOffset                = (nextIndexInChain - precedingIndexInChain) & mask;
 540             }
 541
 542             if (precedingIndexInChain == homeIndex)
 543             {
 544                 m_buckets[precedingIndexInChain].m_firstOffset = nextOffset;
 545             }
 546             else
 547             {
 548                 m_buckets[precedingIndexInChain].m_nextOffset = nextOffset;
 549             }
 550         }
 551
 552         bucket->m_isFull     = false;
 553         bucket->m_nextOffset = 0;
 554
 555         m_numFullBuckets--;
 556
 557         *value = bucket->m_value;
 558         return true;
 559     }
 560
 561     //------------------------------------------------------------------------
 562     // HashTableBase::TryGetValue: retrieves the value for a key if the key
 563     //                             exists in the table.
 564     //
 565     // Arguments:
 566     //    key   - The key to find from the table.
 567     //    value - An output parameter that will hold the value for the key.
 568     //
 569     // Returns:
 570     //    True if the key was found in the table; false otherwise.
 571     bool TryGetValue(const TKey& key, TValue* value) const
 572     {
 573         unsigned unused, index;
 574         if (!TryGetBucket(TKeyInfo::GetHashCode(key), key, &unused, &index))
 575         {
 576             return false;
 577         }
 578
 579         *value = m_buckets[index].m_value;
 580         return true;
 581     }
 582
 583     //------------------------------------------------------------------------
 584     // HashTableBase::Contains: returns true if a key exists in the table and
 585     //                          false otherwise.
 586     //
 587     // Arguments:
 588     //    key   - The key to find from the table.
 589     //
 590     // Returns:
 591     //    True if the key was found in the table; false otherwise.
 592     bool Contains(const TKey& key) const
 593     {
 594         unsigned unused, index;
 595         return TryGetBucket(TKeyInfo::GetHashCode(key), key, &unused, &index);
 596     }
 597 };
 598
 599 //------------------------------------------------------------------------
 600 // HashTable: a simple subclass of `HashTableBase` that always uses heap
 601 //            storage for its bucket array.
 602 template <typename TKey, typename TValue, typename TKeyInfo = HashTableInfo<TKey>>
 603 class HashTable final : public HashTableBase<TKey, TValue, TKeyInfo>
 604 {
 605     typedef HashTableBase<TKey, TValue, TKeyInfo> TBase;
 606
 607     static unsigned RoundUp(unsigned initialSize)
 608     {
 609         return 1 << genLog2(initialSize);
 610     }
 611
 612 public:
 613     HashTable(Compiler* compiler) : TBase(compiler, nullptr, 0)
 614     {
 615     }
 616
 617     HashTable(Compiler* compiler, unsigned initialSize)
 618         : TBase(compiler,
 619                 reinterpret_cast<typename TBase::Bucket*>(
 620                     compGetMem(compiler, RoundUp(initialSize) * sizeof(typename TBase::Bucket))),
 621                 RoundUp(initialSize))
 622     {
 623     }
 624 };
 625
 626 //------------------------------------------------------------------------
 627 // SmallHashTable: an alternative to `HashTable` that stores the initial
 628 //                 bucket array inline. Most useful for situations where
 629 //                 the number of key-value pairs that will be stored in
 630 //                 the map at any given time falls below a certain
 631 //                 threshold. Switches to heap storage once the initial
 632 //                 inline storage is exhausted.
 633 template <typename TKey, typename TValue, unsigned NumInlineBuckets = 8, typename TKeyInfo = HashTableInfo<TKey>>
 634 class SmallHashTable final : public HashTableBase<TKey, TValue, TKeyInfo>
 635 {
 636     typedef HashTableBase<TKey, TValue, TKeyInfo> TBase;
 637
 638     enum : unsigned
 639     {
 640         RoundedNumInlineBuckets = 1 << ConstLog2<NumInlineBuckets>::value
 641     };
 642
 643     typename TBase::Bucket m_inlineBuckets[RoundedNumInlineBuckets];
 644
 645 public:
 646     SmallHashTable(Compiler* compiler) : TBase(compiler, m_inlineBuckets, RoundedNumInlineBuckets)
 647     {
 648     }
 649 };
 650
 651 #endif // _SMALLHASHTABLE_H_