src/cpp/thread_manager/thread_manager.cc

   1 /*
   2  *
   3  * Copyright 2016 gRPC authors.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  *
  17  */
  18
  19 #include "src/cpp/thread_manager/thread_manager.h"
  20
  21 #include <climits>
  22
  23 #include <grpc/support/log.h>
  24 #include "src/core/lib/gprpp/thd.h"
  25 #include "src/core/lib/iomgr/exec_ctx.h"
  26
  27 namespace grpc {
  28
  29 ThreadManager::WorkerThread::WorkerThread(ThreadManager* thd_mgr)
  30     : thd_mgr_(thd_mgr) {
  31   // Make thread creation exclusive with respect to its join happening in
  32   // ~WorkerThread().
  33   thd_ = grpc_core::Thread(
  34       "grpcpp_sync_server",
  35       [](void* th) { static_cast<ThreadManager::WorkerThread*>(th)->Run(); },
  36       this, &created_);
  37   if (!created_) {
  38     gpr_log(GPR_ERROR, "Could not create grpc_sync_server worker-thread");
  39   }
  40 }
  41
  42 void ThreadManager::WorkerThread::Run() {
  43   thd_mgr_->MainWorkLoop();
  44   thd_mgr_->MarkAsCompleted(this);
  45 }
  46
  47 ThreadManager::WorkerThread::~WorkerThread() {
  48   // Don't join until the thread is fully constructed.
  49   thd_.Join();
  50 }
  51
  52 ThreadManager::ThreadManager(const char* name,
  53                              grpc_resource_quota* resource_quota,
  54                              int min_pollers, int max_pollers)
  55     : shutdown_(false),
  56       num_pollers_(0),
  57       min_pollers_(min_pollers),
  58       max_pollers_(max_pollers == -1 ? INT_MAX : max_pollers),
  59       num_threads_(0),
  60       max_active_threads_sofar_(0) {
  61   resource_user_ = grpc_resource_user_create(resource_quota, name);
  62 }
  63
  64 ThreadManager::~ThreadManager() {
  65   {
  66     grpc_core::MutexLock lock(&mu_);
  67     GPR_ASSERT(num_threads_ == 0);
  68   }
  69
  70   grpc_core::ExecCtx exec_ctx;  // grpc_resource_user_unref needs an exec_ctx
  71   grpc_resource_user_unref(resource_user_);
  72   CleanupCompletedThreads();
  73 }
  74
  75 void ThreadManager::Wait() {
  76   grpc_core::MutexLock lock(&mu_);
  77   while (num_threads_ != 0) {
  78     shutdown_cv_.Wait(&mu_);
  79   }
  80 }
  81
  82 void ThreadManager::Shutdown() {
  83   grpc_core::MutexLock lock(&mu_);
  84   shutdown_ = true;
  85 }
  86
  87 bool ThreadManager::IsShutdown() {
  88   grpc_core::MutexLock lock(&mu_);
  89   return shutdown_;
  90 }
  91
  92 int ThreadManager::GetMaxActiveThreadsSoFar() {
  93   grpc_core::MutexLock list_lock(&list_mu_);
  94   return max_active_threads_sofar_;
  95 }
  96
  97 void ThreadManager::MarkAsCompleted(WorkerThread* thd) {
  98   {
  99     grpc_core::MutexLock list_lock(&list_mu_);
 100     completed_threads_.push_back(thd);
 101   }
 102
 103   {
 104     grpc_core::MutexLock lock(&mu_);
 105     num_threads_--;
 106     if (num_threads_ == 0) {
 107       shutdown_cv_.Signal();
 108     }
 109   }
 110
 111   // Give a thread back to the resource quota
 112   grpc_resource_user_free_threads(resource_user_, 1);
 113 }
 114
 115 void ThreadManager::CleanupCompletedThreads() {
 116   std::list<WorkerThread*> completed_threads;
 117   {
 118     // swap out the completed threads list: allows other threads to clean up
 119     // more quickly
 120     grpc_core::MutexLock lock(&list_mu_);
 121     completed_threads.swap(completed_threads_);
 122   }
 123   for (auto thd : completed_threads) delete thd;
 124 }
 125
 126 void ThreadManager::Initialize() {
 127   if (!grpc_resource_user_allocate_threads(resource_user_, min_pollers_)) {
 128     gpr_log(GPR_ERROR,
 129             "No thread quota available to even create the minimum required "
 130             "polling threads (i.e %d). Unable to start the thread manager",
 131             min_pollers_);
 132     abort();
 133   }
 134
 135   {
 136     grpc_core::MutexLock lock(&mu_);
 137     num_pollers_ = min_pollers_;
 138     num_threads_ = min_pollers_;
 139     max_active_threads_sofar_ = min_pollers_;
 140   }
 141
 142   for (int i = 0; i < min_pollers_; i++) {
 143     WorkerThread* worker = new WorkerThread(this);
 144     GPR_ASSERT(worker->created());  // Must be able to create the minimum
 145     worker->Start();
 146   }
 147 }
 148
 149 void ThreadManager::MainWorkLoop() {
 150   while (true) {
 151     void* tag;
 152     bool ok;
 153     WorkStatus work_status = PollForWork(&tag, &ok);
 154
 155     grpc_core::LockableAndReleasableMutexLock lock(&mu_);
 156     // Reduce the number of pollers by 1 and check what happened with the poll
 157     num_pollers_--;
 158     bool done = false;
 159     switch (work_status) {
 160       case TIMEOUT:
 161         // If we timed out and we have more pollers than we need (or we are
 162         // shutdown), finish this thread
 163         if (shutdown_ || num_pollers_ > max_pollers_) done = true;
 164         break;
 165       case SHUTDOWN:
 166         // If the thread manager is shutdown, finish this thread
 167         done = true;
 168         break;
 169       case WORK_FOUND:
 170         // If we got work and there are now insufficient pollers and there is
 171         // quota available to create a new thread, start a new poller thread
 172         bool resource_exhausted = false;
 173         if (!shutdown_ && num_pollers_ < min_pollers_) {
 174           if (grpc_resource_user_allocate_threads(resource_user_, 1)) {
 175             // We can allocate a new poller thread
 176             num_pollers_++;
 177             num_threads_++;
 178             if (num_threads_ > max_active_threads_sofar_) {
 179               max_active_threads_sofar_ = num_threads_;
 180             }
 181             // Drop lock before spawning thread to avoid contention
 182             lock.Release();
 183             WorkerThread* worker = new WorkerThread(this);
 184             if (worker->created()) {
 185               worker->Start();
 186             } else {
 187               // Get lock again to undo changes to poller/thread counters.
 188               grpc_core::MutexLock failure_lock(&mu_);
 189               num_pollers_--;
 190               num_threads_--;
 191               resource_exhausted = true;
 192               delete worker;
 193             }
 194           } else if (num_pollers_ > 0) {
 195             // There is still at least some thread polling, so we can go on
 196             // even though we are below the number of pollers that we would
 197             // like to have (min_pollers_)
 198             lock.Release();
 199           } else {
 200             // There are no pollers to spare and we couldn't allocate
 201             // a new thread, so resources are exhausted!
 202             lock.Release();
 203             resource_exhausted = true;
 204           }
 205         } else {
 206           // There are a sufficient number of pollers available so we can do
 207           // the work and continue polling with our existing poller threads
 208           lock.Release();
 209         }
 210         // Lock is always released at this point - do the application work
 211         // or return resource exhausted if there is new work but we couldn't
 212         // get a thread in which to do it.
 213         DoWork(tag, ok, !resource_exhausted);
 214         // Take the lock again to check post conditions
 215         lock.Lock();
 216         // If we're shutdown, we should finish at this point.
 217         if (shutdown_) done = true;
 218         break;
 219     }
 220     // If we decided to finish the thread, break out of the while loop
 221     if (done) break;
 222
 223     // Otherwise go back to polling as long as it doesn't exceed max_pollers_
 224     //
 225     // **WARNING**:
 226     // There is a possibility of threads thrashing here (i.e excessive thread
 227     // shutdowns and creations than the ideal case). This happens if max_poller_
 228     // count is small and the rate of incoming requests is also small. In such
 229     // scenarios we can possibly configure max_pollers_ to a higher value and/or
 230     // increase the cq timeout.
 231     //
 232     // However, not doing this check here and unconditionally incrementing
 233     // num_pollers (and hoping that the system will eventually settle down) has
 234     // far worse consequences i.e huge number of threads getting created to the
 235     // point of thread-exhaustion. For example: if the incoming request rate is
 236     // very high, all the polling threads will return very quickly from
 237     // PollForWork() with WORK_FOUND. They all briefly decrement num_pollers_
 238     // counter thereby possibly - and briefly - making it go below min_pollers;
 239     // This will most likely result in the creation of a new poller since
 240     // num_pollers_ dipped below min_pollers_.
 241     //
 242     // Now, If we didn't do the max_poller_ check here, all these threads will
 243     // go back to doing PollForWork() and the whole cycle repeats (with a new
 244     // thread being added in each cycle). Once the total number of threads in
 245     // the system crosses a certain threshold (around ~1500), there is heavy
 246     // contention on mutexes (the mu_ here or the mutexes in gRPC core like the
 247     // pollset mutex) that makes DoWork() take longer to finish thereby causing
 248     // new poller threads to be created even faster. This results in a thread
 249     // avalanche.
 250     if (num_pollers_ < max_pollers_) {
 251       num_pollers_++;
 252     } else {
 253       break;
 254     }
 255   };
 256
 257   // This thread is exiting. Do some cleanup work i.e delete already completed
 258   // worker threads
 259   CleanupCompletedThreads();
 260
 261   // If we are here, either ThreadManager is shutting down or it already has
 262   // enough threads.
 263 }
 264
 265 }  // namespace grpc