2 Copyright (c) 2003-2014 Erwin Coumans http://bullet.googlecode.com
4 This software is provided 'as-is', without any express or implied warranty.
5 In no event will the authors be held liable for any damages arising from the use of this software.
6 Permission is granted to anyone to use this software for any purpose,
7 including commercial applications, and to alter it and redistribute it freely,
8 subject to the following restrictions:
10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
12 3. This notice may not be removed or altered from any source distribution.
15 #include "btThreads.h"
16 #include "btQuickprof.h"
17 #include <algorithm> // for min and max
19 #if BT_USE_OPENMP && BT_THREADSAFE
23 #endif // #if BT_USE_OPENMP && BT_THREADSAFE
25 #if BT_USE_PPL && BT_THREADSAFE
27 // use Microsoft Parallel Patterns Library (installed with Visual Studio 2010 and later)
28 #include <ppl.h> // if you get a compile error here, check whether your version of Visual Studio includes PPL
29 // Visual Studio 2010 and later should come with it
30 #include <concrtrm.h> // for GetProcessorCount()
32 #endif // #if BT_USE_PPL && BT_THREADSAFE
34 #if BT_USE_TBB && BT_THREADSAFE
36 // use Intel Threading Building Blocks for thread management
37 #define __TBB_NO_IMPLICIT_LINKAGE 1
39 #include <tbb/task_scheduler_init.h>
40 #include <tbb/parallel_for.h>
41 #include <tbb/blocked_range.h>
43 #endif // #if BT_USE_TBB && BT_THREADSAFE
47 // Lightweight spin-mutex based on atomics
48 // Using ordinary system-provided mutexes like Windows critical sections was noticeably slower
// presumably because when it fails to lock at first it would sleep the thread and trigger costly context switching.
53 #if __cplusplus >= 201103L
55 // for anything claiming full C++11 compliance, use C++11 atomics
56 // on GCC or Clang you need to compile with -std=c++11
57 #define USE_CPP11_ATOMICS 1
59 #elif defined(_MSC_VER)
61 // on MSVC, use intrinsics instead
62 #define USE_MSVC_INTRINSICS 1
64 #elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
66 // available since GCC 4.7 and some versions of clang
67 // todo: check for clang
68 #define USE_GCC_BUILTIN_ATOMICS 1
70 #elif defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)
72 // available since GCC 4.1
73 #define USE_GCC_BUILTIN_ATOMICS_OLD 1
82 #define THREAD_LOCAL_STATIC thread_local static
84 bool btSpinMutex::tryLock()
86 std::atomic<int>* aDest = reinterpret_cast<std::atomic<int>*>(&mLock);
88 return std::atomic_compare_exchange_weak_explicit(aDest, &expected, int(1), std::memory_order_acq_rel, std::memory_order_acquire);
91 void btSpinMutex::lock()
93 // note: this lock does not sleep the thread.
100 void btSpinMutex::unlock()
102 std::atomic<int>* aDest = reinterpret_cast<std::atomic<int>*>(&mLock);
103 std::atomic_store_explicit(aDest, int(0), std::memory_order_release);
106 #elif USE_MSVC_INTRINSICS
108 #define WIN32_LEAN_AND_MEAN
113 #define THREAD_LOCAL_STATIC __declspec(thread) static
115 bool btSpinMutex::tryLock()
117 volatile long* aDest = reinterpret_cast<long*>(&mLock);
118 return (0 == _InterlockedCompareExchange(aDest, 1, 0));
121 void btSpinMutex::lock()
123 // note: this lock does not sleep the thread
130 void btSpinMutex::unlock()
132 volatile long* aDest = reinterpret_cast<long*>(&mLock);
133 _InterlockedExchange(aDest, 0);
136 #elif USE_GCC_BUILTIN_ATOMICS
138 #define THREAD_LOCAL_STATIC static __thread
140 bool btSpinMutex::tryLock()
144 const int memOrderSuccess = __ATOMIC_ACQ_REL;
145 const int memOrderFail = __ATOMIC_ACQUIRE;
146 return __atomic_compare_exchange_n(&mLock, &expected, int(1), weak, memOrderSuccess, memOrderFail);
149 void btSpinMutex::lock()
151 // note: this lock does not sleep the thread
158 void btSpinMutex::unlock()
160 __atomic_store_n(&mLock, int(0), __ATOMIC_RELEASE);
163 #elif USE_GCC_BUILTIN_ATOMICS_OLD
165 #define THREAD_LOCAL_STATIC static __thread
167 bool btSpinMutex::tryLock()
169 return __sync_bool_compare_and_swap(&mLock, int(0), int(1));
172 void btSpinMutex::lock()
174 // note: this lock does not sleep the thread
181 void btSpinMutex::unlock()
184 __sync_fetch_and_and(&mLock, int(0));
187 #else //#elif USE_MSVC_INTRINSICS
189 #error "no threading primitives defined -- unknown platform"
191 #endif //#else //#elif USE_MSVC_INTRINSICS
193 #else //#if BT_THREADSAFE
195 // These should not be called ever
196 void btSpinMutex::lock()
198 btAssert(!"unimplemented btSpinMutex::lock() called");
201 void btSpinMutex::unlock()
203 btAssert(!"unimplemented btSpinMutex::unlock() called");
206 bool btSpinMutex::tryLock()
208 btAssert(!"unimplemented btSpinMutex::tryLock() called");
212 #define THREAD_LOCAL_STATIC static
214 #endif // #else //#if BT_THREADSAFE
216 struct ThreadsafeCounter
218 unsigned int mCounter;
224 --mCounter; // first count should come back 0
227 unsigned int getNext()
229 // no need to optimize this with atomics, it is only called ONCE per thread!
232 if (mCounter >= BT_MAX_THREAD_COUNT)
234 btAssert(!"thread counter exceeded");
235 // wrap back to the first worker index
238 unsigned int val = mCounter;
// The currently-active task scheduler (NULL until btSetTaskScheduler() is called).
static btITaskScheduler* gBtTaskScheduler=0;
static int gThreadsRunningCounter = 0; // useful for detecting if we are trying to do nested parallel-for calls
static btSpinMutex gThreadsRunningCounterMutex;  // guards gThreadsRunningCounter
static ThreadsafeCounter gThreadCounter;  // hands out unique per-thread indexes
250 // BT_DETECT_BAD_THREAD_INDEX tries to detect when there are multiple threads assigned the same thread index.
252 // BT_DETECT_BAD_THREAD_INDEX is a developer option to test if
253 // certain assumptions about how the task scheduler manages its threads
255 // The main assumption is:
256 // - when the threadpool is resized, the task scheduler either
257 // 1. destroys all worker threads and creates all new ones in the correct number, OR
258 // 2. never destroys a worker thread
260 // We make that assumption because we can't easily enumerate the worker threads of a task scheduler
261 // to assign nice sequential thread-indexes. We also do not get notified if a worker thread is destroyed,
262 // so we can't tell when a thread-index is no longer being used.
263 // We allocate thread-indexes as needed with a sequential global thread counter.
265 // Our simple thread-counting scheme falls apart if the task scheduler destroys some threads but
// continues to re-use other threads and the application repeatedly resizes the thread pool of the task scheduler.
268 // In order to prevent the thread-counter from exceeding the global max (BT_MAX_THREAD_COUNT), we
269 // wrap the thread counter back to 1. This should only happen if the worker threads have all been
270 // destroyed and re-created.
272 // BT_DETECT_BAD_THREAD_INDEX only works for Win32 right now,
273 // but could be adapted to work with pthreads
274 #define BT_DETECT_BAD_THREAD_INDEX 0
#if BT_DETECT_BAD_THREAD_INDEX

typedef DWORD ThreadId_t;
const static ThreadId_t kInvalidThreadId = 0;
// OS thread id recorded for each assigned thread-index, used to detect
// two live threads sharing the same index.
ThreadId_t gDebugThreadIds[BT_MAX_THREAD_COUNT];

// Returns the OS identifier of the calling thread (Win32 only).
static ThreadId_t getDebugThreadId()
{
	return GetCurrentThreadId();
}

#endif  // #if BT_DETECT_BAD_THREAD_INDEX
289 // return a unique index per thread, main thread is 0, worker threads are in [1, BT_MAX_THREAD_COUNT)
290 unsigned int btGetCurrentThreadIndex()
292 const unsigned int kNullIndex = ~0U;
293 THREAD_LOCAL_STATIC unsigned int sThreadIndex = kNullIndex;
294 if (sThreadIndex == kNullIndex)
296 sThreadIndex = gThreadCounter.getNext();
297 btAssert(sThreadIndex < BT_MAX_THREAD_COUNT);
299 #if BT_DETECT_BAD_THREAD_INDEX
300 if (gBtTaskScheduler && sThreadIndex > 0)
302 ThreadId_t tid = getDebugThreadId();
304 if (gDebugThreadIds[sThreadIndex] == kInvalidThreadId)
307 gDebugThreadIds[sThreadIndex] = tid;
311 if (gDebugThreadIds[sThreadIndex] != tid)
313 // this could indicate the task scheduler is breaking our assumptions about
314 // how threads are managed when threadpool is resized
315 btAssert(!"there are 2 or more threads with the same thread-index!");
320 #endif // #if BT_DETECT_BAD_THREAD_INDEX
324 bool btIsMainThread()
326 return btGetCurrentThreadIndex() == 0;
329 void btResetThreadIndexCounter()
331 // for when all current worker threads are destroyed
332 btAssert(btIsMainThread());
333 gThreadCounter.mCounter = 0;
336 btITaskScheduler::btITaskScheduler(const char* name)
339 m_savedThreadCounter = 0;
343 void btITaskScheduler::activate()
345 // gThreadCounter is used to assign a thread-index to each worker thread in a task scheduler.
346 // The main thread is always thread-index 0, and worker threads are numbered from 1 to 63 (BT_MAX_THREAD_COUNT-1)
347 // The thread-indexes need to be unique amongst the threads that can be running simultaneously.
348 // Since only one task scheduler can be used at a time, it is OK for a pair of threads that belong to different
349 // task schedulers to share the same thread index because they can't be running at the same time.
350 // So each task scheduler needs to keep its own thread counter value
353 gThreadCounter.mCounter = m_savedThreadCounter; // restore saved thread counter
358 void btITaskScheduler::deactivate()
362 m_savedThreadCounter = gThreadCounter.mCounter; // save thread counter
367 void btPushThreadsAreRunning()
369 gThreadsRunningCounterMutex.lock();
370 gThreadsRunningCounter++;
371 gThreadsRunningCounterMutex.unlock();
374 void btPopThreadsAreRunning()
376 gThreadsRunningCounterMutex.lock();
377 gThreadsRunningCounter--;
378 gThreadsRunningCounterMutex.unlock();
381 bool btThreadsAreRunning()
383 return gThreadsRunningCounter != 0;
386 void btSetTaskScheduler(btITaskScheduler* ts)
388 int threadId = btGetCurrentThreadIndex(); // make sure we call this on main thread at least once before any workers run
391 btAssert(!"btSetTaskScheduler must be called from the main thread!");
394 if (gBtTaskScheduler)
396 // deactivate old task scheduler
397 gBtTaskScheduler->deactivate();
399 gBtTaskScheduler = ts;
402 // activate new task scheduler
407 btITaskScheduler* btGetTaskScheduler()
409 return gBtTaskScheduler;
412 void btParallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body)
416 #if BT_DETECT_BAD_THREAD_INDEX
417 if (!btThreadsAreRunning())
419 // clear out thread ids
420 for (int i = 0; i < BT_MAX_THREAD_COUNT; ++i)
422 gDebugThreadIds[i] = kInvalidThreadId;
425 #endif // #if BT_DETECT_BAD_THREAD_INDEX
427 btAssert(gBtTaskScheduler != NULL); // call btSetTaskScheduler() with a valid task scheduler first!
428 gBtTaskScheduler->parallelFor(iBegin, iEnd, grainSize, body);
430 #else // #if BT_THREADSAFE
432 // non-parallel version of btParallelFor
433 btAssert(!"called btParallelFor in non-threadsafe build. enable BT_THREADSAFE");
434 body.forLoop(iBegin, iEnd);
436 #endif // #if BT_THREADSAFE
439 btScalar btParallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body)
443 #if BT_DETECT_BAD_THREAD_INDEX
444 if (!btThreadsAreRunning())
446 // clear out thread ids
447 for (int i = 0; i < BT_MAX_THREAD_COUNT; ++i)
449 gDebugThreadIds[i] = kInvalidThreadId;
452 #endif // #if BT_DETECT_BAD_THREAD_INDEX
454 btAssert(gBtTaskScheduler != NULL); // call btSetTaskScheduler() with a valid task scheduler first!
455 return gBtTaskScheduler->parallelSum(iBegin, iEnd, grainSize, body);
457 #else // #if BT_THREADSAFE
459 // non-parallel version of btParallelSum
460 btAssert(!"called btParallelFor in non-threadsafe build. enable BT_THREADSAFE");
461 return body.sumLoop(iBegin, iEnd);
463 #endif //#else // #if BT_THREADSAFE
467 /// btTaskSchedulerSequential -- non-threaded implementation of task scheduler
468 /// (really just useful for testing performance of single threaded vs multi)
470 class btTaskSchedulerSequential : public btITaskScheduler
473 btTaskSchedulerSequential() : btITaskScheduler("Sequential") {}
474 virtual int getMaxNumThreads() const BT_OVERRIDE { return 1; }
475 virtual int getNumThreads() const BT_OVERRIDE { return 1; }
476 virtual void setNumThreads(int numThreads) BT_OVERRIDE {}
477 virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
479 BT_PROFILE("parallelFor_sequential");
480 body.forLoop(iBegin, iEnd);
482 virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
484 BT_PROFILE("parallelSum_sequential");
485 return body.sumLoop(iBegin, iEnd);
#if BT_USE_OPENMP && BT_THREADSAFE
///
/// btTaskSchedulerOpenMP -- wrapper around OpenMP task scheduler
///
class btTaskSchedulerOpenMP : public btITaskScheduler
{
	int m_numThreads;  // thread count requested via setNumThreads()

public:
	btTaskSchedulerOpenMP() : btITaskScheduler("OpenMP")
	{
		m_numThreads = 0;
	}
	virtual int getMaxNumThreads() const BT_OVERRIDE
	{
		return omp_get_max_threads();
	}
	virtual int getNumThreads() const BT_OVERRIDE
	{
		return m_numThreads;
	}
	virtual void setNumThreads(int numThreads) BT_OVERRIDE
	{
		// With OpenMP, because it is a standard with various implementations, we can't
		// know for sure if every implementation has the same behavior of destroying all
		// previous threads when resizing the threadpool
		m_numThreads = (std::max)(1, (std::min)(int(BT_MAX_THREAD_COUNT), numThreads));
		omp_set_num_threads(1);  // hopefully, all previous threads get destroyed here
		omp_set_num_threads(m_numThreads);
		m_savedThreadCounter = 0;
		if (m_isActive)
		{
			btResetThreadIndexCounter();
		}
	}
	virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelFor_OpenMP");
		btPushThreadsAreRunning();
#pragma omp parallel for schedule(static, 1)
		for (int i = iBegin; i < iEnd; i += grainSize)
		{
			BT_PROFILE("OpenMP_forJob");
			body.forLoop(i, (std::min)(i + grainSize, iEnd));
		}
		btPopThreadsAreRunning();
	}
	virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
	{
		// fixed: profile zone was mislabeled "parallelFor_OpenMP"
		BT_PROFILE("parallelSum_OpenMP");
		btPushThreadsAreRunning();
		btScalar sum = btScalar(0);
		// reduction clause written on a single line (the original used a broken line-continuation)
#pragma omp parallel for schedule(static, 1) reduction(+ : sum)
		for (int i = iBegin; i < iEnd; i += grainSize)
		{
			BT_PROFILE("OpenMP_sumJob");
			sum += body.sumLoop(i, (std::min)(i + grainSize, iEnd));
		}
		btPopThreadsAreRunning();
		return sum;
	}
};
#endif  // #if BT_USE_OPENMP && BT_THREADSAFE
#if BT_USE_TBB && BT_THREADSAFE
///
/// btTaskSchedulerTBB -- wrapper around Intel Threaded Building Blocks task scheduler
///
class btTaskSchedulerTBB : public btITaskScheduler
{
	int m_numThreads;                             // thread count requested via setNumThreads()
	tbb::task_scheduler_init* m_tbbSchedulerInit; // owned; recreated on every pool resize

public:
	btTaskSchedulerTBB() : btITaskScheduler("IntelTBB")
	{
		m_numThreads = 0;
		m_tbbSchedulerInit = NULL;
	}
	~btTaskSchedulerTBB()
	{
		if (m_tbbSchedulerInit)
		{
			delete m_tbbSchedulerInit;
			m_tbbSchedulerInit = NULL;
		}
	}

	virtual int getMaxNumThreads() const BT_OVERRIDE
	{
		return tbb::task_scheduler_init::default_num_threads();
	}
	virtual int getNumThreads() const BT_OVERRIDE
	{
		return m_numThreads;
	}
	virtual void setNumThreads(int numThreads) BT_OVERRIDE
	{
		m_numThreads = (std::max)(1, (std::min)(int(BT_MAX_THREAD_COUNT), numThreads));
		if (m_tbbSchedulerInit)
		{
			// destroys all previous threads
			delete m_tbbSchedulerInit;
			m_tbbSchedulerInit = NULL;
		}
		m_tbbSchedulerInit = new tbb::task_scheduler_init(m_numThreads);
		m_savedThreadCounter = 0;
		if (m_isActive)
		{
			btResetThreadIndexCounter();
		}
	}
	// Adapts btIParallelForBody to TBB's blocked_range functor interface.
	struct ForBodyAdapter
	{
		const btIParallelForBody* mBody;

		ForBodyAdapter(const btIParallelForBody* body) : mBody(body) {}
		void operator()(const tbb::blocked_range<int>& range) const
		{
			BT_PROFILE("TBB_forJob");
			mBody->forLoop(range.begin(), range.end());
		}
	};
	virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelFor_TBB");
		ForBodyAdapter tbbBody(&body);
		btPushThreadsAreRunning();
		tbb::parallel_for(tbb::blocked_range<int>(iBegin, iEnd, grainSize),
						  tbbBody,
						  tbb::simple_partitioner());
		btPopThreadsAreRunning();
	}
	// Adapts btIParallelSumBody to TBB's splittable reduction-body interface.
	struct SumBodyAdapter
	{
		const btIParallelSumBody* mBody;
		btScalar mSum;  // partial sum accumulated by this body instance

		SumBodyAdapter(const btIParallelSumBody* body) : mBody(body), mSum(btScalar(0)) {}
		SumBodyAdapter(const SumBodyAdapter& src, tbb::split) : mBody(src.mBody), mSum(btScalar(0)) {}
		void join(const SumBodyAdapter& src) { mSum += src.mSum; }
		void operator()(const tbb::blocked_range<int>& range)
		{
			BT_PROFILE("TBB_sumJob");
			mSum += mBody->sumLoop(range.begin(), range.end());
		}
	};
	virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelSum_TBB");
		SumBodyAdapter tbbBody(&body);
		btPushThreadsAreRunning();
		// deterministic_reduce gives run-to-run reproducible float sums
		tbb::parallel_deterministic_reduce(tbb::blocked_range<int>(iBegin, iEnd, grainSize), tbbBody);
		btPopThreadsAreRunning();
		return tbbBody.mSum;
	}
};
#endif  // #if BT_USE_TBB && BT_THREADSAFE
#if BT_USE_PPL && BT_THREADSAFE
///
/// btTaskSchedulerPPL -- wrapper around Microsoft Parallel Patterns Lib task scheduler
///
class btTaskSchedulerPPL : public btITaskScheduler
{
	int m_numThreads;                        // thread count requested via setNumThreads()
	concurrency::combinable<btScalar> m_sum; // for parallelSum

public:
	btTaskSchedulerPPL() : btITaskScheduler("PPL")
	{
		m_numThreads = 0;
	}
	virtual int getMaxNumThreads() const BT_OVERRIDE
	{
		return concurrency::GetProcessorCount();
	}
	virtual int getNumThreads() const BT_OVERRIDE
	{
		return m_numThreads;
	}
	virtual void setNumThreads(int numThreads) BT_OVERRIDE
	{
		// capping the thread count for PPL due to a thread-index issue
		const int maxThreadCount = (std::min)(int(BT_MAX_THREAD_COUNT), 31);
		m_numThreads = (std::max)(1, (std::min)(maxThreadCount, numThreads));
		using namespace concurrency;
		if (CurrentScheduler::Id() != -1)
		{
			CurrentScheduler::Detach();
		}
		SchedulerPolicy policy;
		{
			// PPL seems to destroy threads when threadpool is shrunk, but keeps reusing old threads
			// force it to destroy old threads
			policy.SetConcurrencyLimits(1, 1);
			CurrentScheduler::Create(policy);
			CurrentScheduler::Detach();
		}
		policy.SetConcurrencyLimits(m_numThreads, m_numThreads);
		CurrentScheduler::Create(policy);
		m_savedThreadCounter = 0;
		if (m_isActive)
		{
			btResetThreadIndexCounter();
		}
	}
	// Adapts btIParallelForBody to PPL's per-index functor: each invocation
	// processes one grain [i, min(i + grainSize, iEnd)).
	struct ForBodyAdapter
	{
		const btIParallelForBody* mBody;
		int mGrainSize;
		int mIndexEnd;

		ForBodyAdapter(const btIParallelForBody* body, int grainSize, int end) : mBody(body), mGrainSize(grainSize), mIndexEnd(end) {}
		void operator()(int i) const
		{
			BT_PROFILE("PPL_forJob");
			mBody->forLoop(i, (std::min)(i + mGrainSize, mIndexEnd));
		}
	};
	virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelFor_PPL");
		ForBodyAdapter pplBody(&body, grainSize, iEnd);
		btPushThreadsAreRunning();
		// note: MSVC 2010 doesn't support partitioner args, so avoid them
		concurrency::parallel_for(iBegin,
								  iEnd,
								  grainSize,
								  pplBody);
		btPopThreadsAreRunning();
	}
	// Adapts btIParallelSumBody: accumulates into a combinable's thread-local slot.
	struct SumBodyAdapter
	{
		const btIParallelSumBody* mBody;
		concurrency::combinable<btScalar>* mSum;
		int mGrainSize;
		int mIndexEnd;

		SumBodyAdapter(const btIParallelSumBody* body, concurrency::combinable<btScalar>* sum, int grainSize, int end) : mBody(body), mSum(sum), mGrainSize(grainSize), mIndexEnd(end) {}
		void operator()(int i) const
		{
			BT_PROFILE("PPL_sumJob");
			mSum->local() += mBody->sumLoop(i, (std::min)(i + mGrainSize, mIndexEnd));
		}
	};
	static btScalar sumFunc(btScalar a, btScalar b) { return a + b; }
	virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelSum_PPL");
		m_sum.clear();  // discard thread-local partial sums left over from a previous call
		SumBodyAdapter pplBody(&body, &m_sum, grainSize, iEnd);
		btPushThreadsAreRunning();
		// note: MSVC 2010 doesn't support partitioner args, so avoid them
		concurrency::parallel_for(iBegin,
								  iEnd,
								  grainSize,
								  pplBody);
		btPopThreadsAreRunning();
		return m_sum.combine(sumFunc);
	}
};
#endif  // #if BT_USE_PPL && BT_THREADSAFE
754 // create a non-threaded task scheduler (always available)
755 btITaskScheduler* btGetSequentialTaskScheduler()
757 static btTaskSchedulerSequential sTaskScheduler;
758 return &sTaskScheduler;
761 // create an OpenMP task scheduler (if available, otherwise returns null)
762 btITaskScheduler* btGetOpenMPTaskScheduler()
764 #if BT_USE_OPENMP && BT_THREADSAFE
765 static btTaskSchedulerOpenMP sTaskScheduler;
766 return &sTaskScheduler;
772 // create an Intel TBB task scheduler (if available, otherwise returns null)
773 btITaskScheduler* btGetTBBTaskScheduler()
775 #if BT_USE_TBB && BT_THREADSAFE
776 static btTaskSchedulerTBB sTaskScheduler;
777 return &sTaskScheduler;
783 // create a PPL task scheduler (if available, otherwise returns null)
784 btITaskScheduler* btGetPPLTaskScheduler()
786 #if BT_USE_PPL && BT_THREADSAFE
787 static btTaskSchedulerPPL sTaskScheduler;
788 return &sTaskScheduler;