src/third_party/tcmalloc/vendor/src/thread_cache.h

   1 // Copyright (c) 2008, Google Inc.
   2 // All rights reserved.
   3 //
   4 // Redistribution and use in source and binary forms, with or without
   5 // modification, are permitted provided that the following conditions are
   6 // met:
   7 //
   8 //     * Redistributions of source code must retain the above copyright
   9 // notice, this list of conditions and the following disclaimer.
  10 //     * Redistributions in binary form must reproduce the above
  11 // copyright notice, this list of conditions and the following disclaimer
  12 // in the documentation and/or other materials provided with the
  13 // distribution.
  14 //     * Neither the name of Google Inc. nor the names of its
  15 // contributors may be used to endorse or promote products derived from
  16 // this software without specific prior written permission.
  17 //
  18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29
  30 // ---
  31 // Author: Sanjay Ghemawat <opensource@google.com>
  32
  33 #ifndef TCMALLOC_THREAD_CACHE_H_
  34 #define TCMALLOC_THREAD_CACHE_H_
  35
  36 #include <config.h>
  37 #ifdef HAVE_PTHREAD
  38 #include <pthread.h>                    // for pthread_t, pthread_key_t
  39 #endif
  40 #include <stddef.h>                     // for size_t, NULL
  41 #ifdef HAVE_STDINT_H
  42 #include <stdint.h>                     // for uint32_t, uint64_t
  43 #endif
  44 #include <sys/types.h>                  // for ssize_t
  45 #include "common.h"
  46 #include "linked_list.h"
  47 #include "maybe_threads.h"
  48 #include "page_heap_allocator.h"
  49 #include "sampler.h"
  50 #include "static_vars.h"
  51
  52 #include "common.h"            // for SizeMap, kMaxSize, etc
  53 #include "internal_logging.h"  // for ASSERT, etc
  54 #include "linked_list.h"       // for SLL_Pop, SLL_PopRange, etc
  55 #include "page_heap_allocator.h"  // for PageHeapAllocator
  56 #include "sampler.h"           // for Sampler
  57 #include "static_vars.h"       // for Static
  58
  59 namespace tcmalloc {
  60
  61 // Even if we have support for thread-local storage in the compiler
  62 // and linker, the OS may not support it.  We need to check that at
  63 // runtime.  Right now, we have to keep a manual set of "bad" OSes.
  64 #if defined(HAVE_TLS)
  65 extern bool kernel_supports_tls;   // defined in thread_cache.cc
  66 void CheckIfKernelSupportsTLS();
  67 inline bool KernelSupportsTLS() {
  68   return kernel_supports_tls;
  69 }
  70 #endif    // HAVE_TLS
  71
  72 //-------------------------------------------------------------------
  73 // Data kept per thread
  74 //-------------------------------------------------------------------
  75
  76 class ThreadCache {
  77  public:
  78   // All ThreadCache objects are kept in a linked list (for stats collection)
  79   ThreadCache* next_;
  80   ThreadCache* prev_;
  81
  82   void Init(pthread_t tid);
  83   void Cleanup();
  84
  85   // Accessors (mostly just for printing stats)
  86   int freelist_length(size_t cl) const { return list_[cl].length(); }
  87
  88   // Total byte size in cache
  89   size_t Size() const { return size_; }
  90
  91   // Allocate an object of the given size and class. The size given
  92   // must be the same as the size of the class in the size map.
  93   void* Allocate(size_t size, size_t cl);
  94   void Deallocate(void* ptr, size_t size_class);
  95
  96   void Scavenge();
  97
  98   int GetSamplePeriod();
  99
 100   // Record allocation of "k" bytes.  Return true iff allocation
 101   // should be sampled
 102   bool SampleAllocation(size_t k);
 103
 104   static void         InitModule();
 105   static void         InitTSD();
 106   static ThreadCache* GetThreadHeap();
 107   static ThreadCache* GetCache();
 108   static ThreadCache* GetCacheIfPresent();
 109   static ThreadCache* CreateCacheIfNecessary();
 110   static void         BecomeIdle();
 111
 112   // Return the number of thread heaps in use.
 113   static inline int HeapsInUse();
 114
 115   // Writes to total_bytes the total number of bytes used by all thread heaps.
 116   // class_count must be an array of size kNumClasses.  Writes the number of
 117   // items on the corresponding freelist.  class_count may be NULL.
 118   // The storage of both parameters must be zero intialized.
 119   // REQUIRES: Static::pageheap_lock is held.
 120   static void GetThreadStats(uint64_t* total_bytes, uint64_t* class_count);
 121
 122   // Sets the total thread cache size to new_size, recomputing the
 123   // individual thread cache sizes as necessary.
 124   // REQUIRES: Static::pageheap lock is held.
 125   static void set_overall_thread_cache_size(size_t new_size);
 126   static size_t overall_thread_cache_size() {
 127     return overall_thread_cache_size_;
 128   }
 129
 130  private:
 131   class FreeList {
 132    private:
 133     void*    list_;       // Linked list of nodes
 134
 135 #ifdef _LP64
 136     // On 64-bit hardware, manipulating 16-bit values may be slightly slow.
 137     uint32_t length_;      // Current length.
 138     uint32_t lowater_;     // Low water mark for list length.
 139     uint32_t max_length_;  // Dynamic max list length based on usage.
 140     // Tracks the number of times a deallocation has caused
 141     // length_ > max_length_.  After the kMaxOverages'th time, max_length_
 142     // shrinks and length_overages_ is reset to zero.
 143     uint32_t length_overages_;
 144 #else
 145     // If we aren't using 64-bit pointers then pack these into less space.
 146     uint16_t length_;
 147     uint16_t lowater_;
 148     uint16_t max_length_;
 149     uint16_t length_overages_;
 150 #endif
 151
 152    public:
 153     void Init() {
 154       list_ = NULL;
 155       length_ = 0;
 156       lowater_ = 0;
 157       max_length_ = 1;
 158       length_overages_ = 0;
 159     }
 160
 161     // Return current length of list
 162     size_t length() const {
 163       return length_;
 164     }
 165
 166     // Return the maximum length of the list.
 167     size_t max_length() const {
 168       return max_length_;
 169     }
 170
 171     // Set the maximum length of the list.  If 'new_max' > length(), the
 172     // client is responsible for removing objects from the list.
 173     void set_max_length(size_t new_max) {
 174       max_length_ = new_max;
 175     }
 176
 177     // Return the number of times that length() has gone over max_length().
 178     size_t length_overages() const {
 179       return length_overages_;
 180     }
 181
 182     void set_length_overages(size_t new_count) {
 183       length_overages_ = new_count;
 184     }
 185
 186     // Is list empty?
 187     bool empty() const {
 188       return list_ == NULL;
 189     }
 190
 191     // Low-water mark management
 192     int lowwatermark() const { return lowater_; }
 193     void clear_lowwatermark() { lowater_ = length_; }
 194
 195     void Push(void* ptr) {
 196       SLL_Push(&list_, ptr);
 197       length_++;
 198     }
 199
 200     void* Pop() {
 201       ASSERT(list_ != NULL);
 202       length_--;
 203       if (length_ < lowater_) lowater_ = length_;
 204       return SLL_Pop(&list_);
 205     }
 206
 207     void* Next() {
 208       return SLL_Next(&list_);
 209     }
 210
 211     void PushRange(int N, void *start, void *end) {
 212       SLL_PushRange(&list_, start, end);
 213       length_ += N;
 214     }
 215
 216     void PopRange(int N, void **start, void **end) {
 217       SLL_PopRange(&list_, N, start, end);
 218       ASSERT(length_ >= N);
 219       length_ -= N;
 220       if (length_ < lowater_) lowater_ = length_;
 221     }
 222   };
 223
 224   // Gets and returns an object from the central cache, and, if possible,
 225   // also adds some objects of that size class to this thread cache.
 226   void* FetchFromCentralCache(size_t cl, size_t byte_size);
 227
 228   // Releases some number of items from src.  Adjusts the list's max_length
 229   // to eventually converge on num_objects_to_move(cl).
 230   void ListTooLong(FreeList* src, size_t cl);
 231
 232   // Releases N items from this thread cache.
 233   void ReleaseToCentralCache(FreeList* src, size_t cl, int N);
 234
 235   // Increase max_size_ by reducing unclaimed_cache_space_ or by
 236   // reducing the max_size_ of some other thread.  In both cases,
 237   // the delta is kStealAmount.
 238   void IncreaseCacheLimit();
 239   // Same as above but requires Static::pageheap_lock() is held.
 240   void IncreaseCacheLimitLocked();
 241
 242   // If TLS is available, we also store a copy of the per-thread object
 243   // in a __thread variable since __thread variables are faster to read
 244   // than pthread_getspecific().  We still need pthread_setspecific()
 245   // because __thread variables provide no way to run cleanup code when
 246   // a thread is destroyed.
 247   // We also give a hint to the compiler to use the "initial exec" TLS
 248   // model.  This is faster than the default TLS model, at the cost that
 249   // you cannot dlopen this library.  (To see the difference, look at
 250   // the CPU use of __tls_get_addr with and without this attribute.)
 251   // Since we don't really use dlopen in google code -- and using dlopen
 252   // on a malloc replacement is asking for trouble in any case -- that's
 253   // a good tradeoff for us.
 254 #ifdef HAVE_TLS
 255   static __thread ThreadCache* threadlocal_heap_
 256 # ifdef HAVE___ATTRIBUTE__
 257    __attribute__ ((tls_model ("initial-exec")))
 258 # endif
 259    ;
 260 #endif
 261
 262   // Thread-specific key.  Initialization here is somewhat tricky
 263   // because some Linux startup code invokes malloc() before it
 264   // is in a good enough state to handle pthread_keycreate().
 265   // Therefore, we use TSD keys only after tsd_inited is set to true.
 266   // Until then, we use a slow path to get the heap object.
 267   static bool tsd_inited_;
 268   static pthread_key_t heap_key_;
 269
 270   // Linked list of heap objects.  Protected by Static::pageheap_lock.
 271   static ThreadCache* thread_heaps_;
 272   static int thread_heap_count_;
 273
 274   // A pointer to one of the objects in thread_heaps_.  Represents
 275   // the next ThreadCache from which a thread over its max_size_ should
 276   // steal memory limit.  Round-robin through all of the objects in
 277   // thread_heaps_.  Protected by Static::pageheap_lock.
 278   static ThreadCache* next_memory_steal_;
 279
 280   // Overall thread cache size.  Protected by Static::pageheap_lock.
 281   static size_t overall_thread_cache_size_;
 282
 283   // Global per-thread cache size.  Writes are protected by
 284   // Static::pageheap_lock.  Reads are done without any locking, which should be
 285   // fine as long as size_t can be written atomically and we don't place
 286   // invariants between this variable and other pieces of state.
 287   static volatile size_t per_thread_cache_size_;
 288
 289   // Represents overall_thread_cache_size_ minus the sum of max_size_
 290   // across all ThreadCaches.  Protected by Static::pageheap_lock.
 291   static ssize_t unclaimed_cache_space_;
 292
 293   // This class is laid out with the most frequently used fields
 294   // first so that hot elements are placed on the same cache line.
 295
 296   size_t        size_;                  // Combined size of data
 297   size_t        max_size_;              // size_ > max_size_ --> Scavenge()
 298
 299   // We sample allocations, biased by the size of the allocation
 300   Sampler       sampler_;               // A sampler
 301
 302   FreeList      list_[kNumClasses];     // Array indexed by size-class
 303
 304   pthread_t     tid_;                   // Which thread owns it
 305   bool          in_setspecific_;        // In call to pthread_setspecific?
 306
 307   // Allocate a new heap. REQUIRES: Static::pageheap_lock is held.
 308   static ThreadCache* NewHeap(pthread_t tid);
 309
 310   // Use only as pthread thread-specific destructor function.
 311   static void DestroyThreadCache(void* ptr);
 312
 313   static void DeleteCache(ThreadCache* heap);
 314   static void RecomputePerThreadCacheSize();
 315
 316   // Ensure that this class is cacheline-aligned. This is critical for
 317   // performance, as false sharing would negate many of the benefits
 318   // of a per-thread cache.
 319 } CACHELINE_ALIGNED;
 320
 321 // Allocator for thread heaps
 322 // This is logically part of the ThreadCache class, but MSVC, at
 323 // least, does not like using ThreadCache as a template argument
 324 // before the class is fully defined.  So we put it outside the class.
 325 extern PageHeapAllocator<ThreadCache> threadcache_allocator;
 326
 327 inline int ThreadCache::HeapsInUse() {
 328   return threadcache_allocator.inuse();
 329 }
 330
 331 inline bool ThreadCache::SampleAllocation(size_t k) {
 332   return sampler_.SampleAllocation(k);
 333 }
 334
 335 inline void* ThreadCache::Allocate(size_t size, size_t cl) {
 336   ASSERT(size <= kMaxSize);
 337   ASSERT(size == Static::sizemap()->ByteSizeForClass(cl));
 338
 339   FreeList* list = &list_[cl];
 340   if (list->empty()) {
 341     return FetchFromCentralCache(cl, size);
 342   }
 343   size_ -= size;
 344   return list->Pop();
 345 }
 346
 347 inline void ThreadCache::Deallocate(void* ptr, size_t cl) {
 348   FreeList* list = &list_[cl];
 349   size_ += Static::sizemap()->ByteSizeForClass(cl);
 350   ssize_t size_headroom = max_size_ - size_ - 1;
 351
 352   // This catches back-to-back frees of allocs in the same size
 353   // class. A more comprehensive (and expensive) test would be to walk
 354   // the entire freelist. But this might be enough to find some bugs.
 355   ASSERT(ptr != list->Next());
 356
 357   list->Push(ptr);
 358   ssize_t list_headroom =
 359       static_cast<ssize_t>(list->max_length()) - list->length();
 360
 361   // There are two relatively uncommon things that require further work.
 362   // In the common case we're done, and in that case we need a single branch
 363   // because of the bitwise-or trick that follows.
 364   if ((list_headroom | size_headroom) < 0) {
 365     if (list_headroom < 0) {
 366       ListTooLong(list, cl);
 367     }
 368     if (size_ >= max_size_) Scavenge();
 369   }
 370 }
 371
 372 inline ThreadCache* ThreadCache::GetThreadHeap() {
 373 #ifdef HAVE_TLS
 374   // __thread is faster, but only when the kernel supports it
 375   if (KernelSupportsTLS())
 376     return threadlocal_heap_;
 377 #endif
 378   return reinterpret_cast<ThreadCache *>(
 379       perftools_pthread_getspecific(heap_key_));
 380 }
 381
 382 inline ThreadCache* ThreadCache::GetCache() {
 383   ThreadCache* ptr = NULL;
 384   if (!tsd_inited_) {
 385     InitModule();
 386   } else {
 387     ptr = GetThreadHeap();
 388   }
 389   if (ptr == NULL) ptr = CreateCacheIfNecessary();
 390   return ptr;
 391 }
 392
 393 // In deletion paths, we do not try to create a thread-cache.  This is
 394 // because we may be in the thread destruction code and may have
 395 // already cleaned up the cache for this thread.
 396 inline ThreadCache* ThreadCache::GetCacheIfPresent() {
 397   if (!tsd_inited_) return NULL;
 398   return GetThreadHeap();
 399 }
 400
 401 }  // namespace tcmalloc
 402
 403 #endif  // TCMALLOC_THREAD_CACHE_H_