2 Copyright (c) 2003-2014 Erwin Coumans http://bullet.googlecode.com
4 This software is provided 'as-is', without any express or implied warranty.
5 In no event will the authors be held liable for any damages arising from the use of this software.
6 Permission is granted to anyone to use this software for any purpose,
7 including commercial applications, and to alter it and redistribute it freely,
8 subject to the following restrictions:
10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
12 3. This notice may not be removed or altered from any source distribution.
15 #include "btThreads.h"
16 #include "btQuickprof.h"
17 #include <algorithm> // for min and max
19 #if BT_USE_OPENMP && BT_THREADSAFE
23 #endif // #if BT_USE_OPENMP && BT_THREADSAFE
25 #if BT_USE_PPL && BT_THREADSAFE
27 // use Microsoft Parallel Patterns Library (installed with Visual Studio 2010 and later)
28 #include <ppl.h> // if you get a compile error here, check whether your version of Visual Studio includes PPL
29 // Visual Studio 2010 and later should come with it
30 #include <concrtrm.h> // for GetProcessorCount()
32 #endif // #if BT_USE_PPL && BT_THREADSAFE
34 #if BT_USE_TBB && BT_THREADSAFE
36 // use Intel Threading Building Blocks for thread management
37 #define __TBB_NO_IMPLICIT_LINKAGE 1
39 #include <tbb/task_scheduler_init.h>
40 #include <tbb/parallel_for.h>
41 #include <tbb/blocked_range.h>
43 #endif // #if BT_USE_TBB && BT_THREADSAFE
47 // Lightweight spin-mutex based on atomics
48 // Using ordinary system-provided mutexes like Windows critical sections was noticeably slower
// presumably because when it fails to lock at first it would sleep the thread and trigger costly context switching.
53 #if __cplusplus >= 201103L
55 // for anything claiming full C++11 compliance, use C++11 atomics
56 // on GCC or Clang you need to compile with -std=c++11
57 #define USE_CPP11_ATOMICS 1
59 #elif defined(_MSC_VER)
61 // on MSVC, use intrinsics instead
62 #define USE_MSVC_INTRINSICS 1
64 #elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
66 // available since GCC 4.7 and some versions of clang
67 // todo: check for clang
68 #define USE_GCC_BUILTIN_ATOMICS 1
70 #elif defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)
72 // available since GCC 4.1
73 #define USE_GCC_BUILTIN_ATOMICS_OLD 1
82 #define THREAD_LOCAL_STATIC thread_local static
84 bool btSpinMutex::tryLock()
86 std::atomic<int>* aDest = reinterpret_cast<std::atomic<int>*>(&mLock);
88 return std::atomic_compare_exchange_weak_explicit(aDest, &expected, int(1), std::memory_order_acq_rel, std::memory_order_acquire);
91 void btSpinMutex::lock()
93 // note: this lock does not sleep the thread.
100 void btSpinMutex::unlock()
102 std::atomic<int>* aDest = reinterpret_cast<std::atomic<int>*>(&mLock);
103 std::atomic_store_explicit(aDest, int(0), std::memory_order_release);
106 #elif USE_MSVC_INTRINSICS
108 #define WIN32_LEAN_AND_MEAN
113 #define THREAD_LOCAL_STATIC __declspec(thread) static
115 bool btSpinMutex::tryLock()
117 volatile long* aDest = reinterpret_cast<long*>(&mLock);
118 return (0 == _InterlockedCompareExchange(aDest, 1, 0));
121 void btSpinMutex::lock()
123 // note: this lock does not sleep the thread
130 void btSpinMutex::unlock()
132 volatile long* aDest = reinterpret_cast<long*>(&mLock);
133 _InterlockedExchange(aDest, 0);
136 #elif USE_GCC_BUILTIN_ATOMICS
138 #define THREAD_LOCAL_STATIC static __thread
140 bool btSpinMutex::tryLock()
144 const int memOrderSuccess = __ATOMIC_ACQ_REL;
145 const int memOrderFail = __ATOMIC_ACQUIRE;
146 return __atomic_compare_exchange_n(&mLock, &expected, int(1), weak, memOrderSuccess, memOrderFail);
149 void btSpinMutex::lock()
151 // note: this lock does not sleep the thread
158 void btSpinMutex::unlock()
160 __atomic_store_n(&mLock, int(0), __ATOMIC_RELEASE);
163 #elif USE_GCC_BUILTIN_ATOMICS_OLD
165 #define THREAD_LOCAL_STATIC static __thread
167 bool btSpinMutex::tryLock()
169 return __sync_bool_compare_and_swap(&mLock, int(0), int(1));
172 void btSpinMutex::lock()
174 // note: this lock does not sleep the thread
181 void btSpinMutex::unlock()
184 __sync_fetch_and_and(&mLock, int(0));
187 #else //#elif USE_MSVC_INTRINSICS
189 #error "no threading primitives defined -- unknown platform"
191 #endif //#else //#elif USE_MSVC_INTRINSICS
193 #else //#if BT_THREADSAFE
195 // These should not be called ever
196 void btSpinMutex::lock()
198 btAssert(!"unimplemented btSpinMutex::lock() called");
201 void btSpinMutex::unlock()
203 btAssert(!"unimplemented btSpinMutex::unlock() called");
206 bool btSpinMutex::tryLock()
208 btAssert(!"unimplemented btSpinMutex::tryLock() called");
212 #define THREAD_LOCAL_STATIC static
214 #endif // #else //#if BT_THREADSAFE
216 struct ThreadsafeCounter
218 unsigned int mCounter;
224 --mCounter; // first count should come back 0
227 unsigned int getNext()
229 // no need to optimize this with atomics, it is only called ONCE per thread!
232 if (mCounter >= BT_MAX_THREAD_COUNT)
234 btAssert(!"thread counter exceeded");
235 // wrap back to the first worker index
238 unsigned int val = mCounter;
// The currently-active task scheduler (NULL until btSetTaskScheduler() is called).
static btITaskScheduler* gBtTaskScheduler=0;
static int gThreadsRunningCounter = 0; // useful for detecting if we are trying to do nested parallel-for calls
static btSpinMutex gThreadsRunningCounterMutex;  // guards gThreadsRunningCounter
static ThreadsafeCounter gThreadCounter;  // hands out unique per-thread indexes
250 // BT_DETECT_BAD_THREAD_INDEX tries to detect when there are multiple threads assigned the same thread index.
252 // BT_DETECT_BAD_THREAD_INDEX is a developer option to test if
253 // certain assumptions about how the task scheduler manages its threads
255 // The main assumption is:
256 // - when the threadpool is resized, the task scheduler either
257 // 1. destroys all worker threads and creates all new ones in the correct number, OR
258 // 2. never destroys a worker thread
260 // We make that assumption because we can't easily enumerate the worker threads of a task scheduler
261 // to assign nice sequential thread-indexes. We also do not get notified if a worker thread is destroyed,
262 // so we can't tell when a thread-index is no longer being used.
263 // We allocate thread-indexes as needed with a sequential global thread counter.
265 // Our simple thread-counting scheme falls apart if the task scheduler destroys some threads but
// continues to re-use other threads and the application repeatedly resizes the thread pool of the task scheduler.
268 // In order to prevent the thread-counter from exceeding the global max (BT_MAX_THREAD_COUNT), we
269 // wrap the thread counter back to 1. This should only happen if the worker threads have all been
270 // destroyed and re-created.
272 // BT_DETECT_BAD_THREAD_INDEX only works for Win32 right now,
273 // but could be adapted to work with pthreads
274 #define BT_DETECT_BAD_THREAD_INDEX 0
#if BT_DETECT_BAD_THREAD_INDEX

typedef DWORD ThreadId_t;
const static ThreadId_t kInvalidThreadId = 0;
// OS thread id recorded for each assigned thread-index, used to detect
// two live threads sharing the same index.
ThreadId_t gDebugThreadIds[BT_MAX_THREAD_COUNT];

// Returns the OS identifier of the calling thread (Win32 only).
static ThreadId_t getDebugThreadId()
{
	return GetCurrentThreadId();
}

#endif  // #if BT_DETECT_BAD_THREAD_INDEX
289 // return a unique index per thread, main thread is 0, worker threads are in [1, BT_MAX_THREAD_COUNT)
290 unsigned int btGetCurrentThreadIndex()
292 const unsigned int kNullIndex = ~0U;
293 THREAD_LOCAL_STATIC unsigned int sThreadIndex = kNullIndex;
294 if (sThreadIndex == kNullIndex)
296 sThreadIndex = gThreadCounter.getNext();
297 btAssert(sThreadIndex < BT_MAX_THREAD_COUNT);
299 #if BT_DETECT_BAD_THREAD_INDEX
300 if (gBtTaskScheduler && sThreadIndex > 0)
302 ThreadId_t tid = getDebugThreadId();
304 if (gDebugThreadIds[sThreadIndex] == kInvalidThreadId)
307 gDebugThreadIds[sThreadIndex] = tid;
311 if (gDebugThreadIds[sThreadIndex] != tid)
313 // this could indicate the task scheduler is breaking our assumptions about
314 // how threads are managed when threadpool is resized
315 btAssert(!"there are 2 or more threads with the same thread-index!");
320 #endif // #if BT_DETECT_BAD_THREAD_INDEX
324 bool btIsMainThread()
326 return btGetCurrentThreadIndex() == 0;
329 void btResetThreadIndexCounter()
331 // for when all current worker threads are destroyed
332 btAssert(btIsMainThread());
333 gThreadCounter.mCounter = 0;
336 btITaskScheduler::btITaskScheduler(const char* name)
339 m_savedThreadCounter = 0;
343 void btITaskScheduler::activate()
345 // gThreadCounter is used to assign a thread-index to each worker thread in a task scheduler.
346 // The main thread is always thread-index 0, and worker threads are numbered from 1 to 63 (BT_MAX_THREAD_COUNT-1)
347 // The thread-indexes need to be unique amongst the threads that can be running simultaneously.
348 // Since only one task scheduler can be used at a time, it is OK for a pair of threads that belong to different
349 // task schedulers to share the same thread index because they can't be running at the same time.
350 // So each task scheduler needs to keep its own thread counter value
353 gThreadCounter.mCounter = m_savedThreadCounter; // restore saved thread counter
358 void btITaskScheduler::deactivate()
362 m_savedThreadCounter = gThreadCounter.mCounter; // save thread counter
367 void btPushThreadsAreRunning()
369 gThreadsRunningCounterMutex.lock();
370 gThreadsRunningCounter++;
371 gThreadsRunningCounterMutex.unlock();
374 void btPopThreadsAreRunning()
376 gThreadsRunningCounterMutex.lock();
377 gThreadsRunningCounter--;
378 gThreadsRunningCounterMutex.unlock();
381 bool btThreadsAreRunning()
383 return gThreadsRunningCounter != 0;
386 void btSetTaskScheduler(btITaskScheduler* ts)
388 int threadId = btGetCurrentThreadIndex(); // make sure we call this on main thread at least once before any workers run
391 btAssert(!"btSetTaskScheduler must be called from the main thread!");
394 if (gBtTaskScheduler)
396 // deactivate old task scheduler
397 gBtTaskScheduler->deactivate();
399 gBtTaskScheduler = ts;
402 // activate new task scheduler
407 btITaskScheduler* btGetTaskScheduler()
409 return gBtTaskScheduler;
412 void btParallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body)
416 #if BT_DETECT_BAD_THREAD_INDEX
417 if (!btThreadsAreRunning())
419 // clear out thread ids
420 for (int i = 0; i < BT_MAX_THREAD_COUNT; ++i)
422 gDebugThreadIds[i] = kInvalidThreadId;
425 #endif // #if BT_DETECT_BAD_THREAD_INDEX
427 btAssert(gBtTaskScheduler != NULL); // call btSetTaskScheduler() with a valid task scheduler first!
428 gBtTaskScheduler->parallelFor(iBegin, iEnd, grainSize, body);
430 #else // #if BT_THREADSAFE
432 // non-parallel version of btParallelFor
433 btAssert(!"called btParallelFor in non-threadsafe build. enable BT_THREADSAFE");
434 body.forLoop(iBegin, iEnd);
436 #endif // #if BT_THREADSAFE
439 btScalar btParallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body)
443 #if BT_DETECT_BAD_THREAD_INDEX
444 if (!btThreadsAreRunning())
446 // clear out thread ids
447 for (int i = 0; i < BT_MAX_THREAD_COUNT; ++i)
449 gDebugThreadIds[i] = kInvalidThreadId;
452 #endif // #if BT_DETECT_BAD_THREAD_INDEX
454 btAssert(gBtTaskScheduler != NULL); // call btSetTaskScheduler() with a valid task scheduler first!
455 return gBtTaskScheduler->parallelSum(iBegin, iEnd, grainSize, body);
457 #else // #if BT_THREADSAFE
459 // non-parallel version of btParallelSum
460 btAssert(!"called btParallelFor in non-threadsafe build. enable BT_THREADSAFE");
461 return body.sumLoop(iBegin, iEnd);
463 #endif //#else // #if BT_THREADSAFE
467 /// btTaskSchedulerSequential -- non-threaded implementation of task scheduler
468 /// (really just useful for testing performance of single threaded vs multi)
470 class btTaskSchedulerSequential : public btITaskScheduler
473 btTaskSchedulerSequential() : btITaskScheduler("Sequential") {}
474 virtual int getMaxNumThreads() const BT_OVERRIDE { return 1; }
475 virtual int getNumThreads() const BT_OVERRIDE { return 1; }
476 virtual void setNumThreads(int numThreads) BT_OVERRIDE {}
477 virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
479 BT_PROFILE("parallelFor_sequential");
480 body.forLoop(iBegin, iEnd);
482 virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
484 BT_PROFILE("parallelSum_sequential");
485 return body.sumLoop(iBegin, iEnd);
#if BT_USE_OPENMP && BT_THREADSAFE
///
/// btTaskSchedulerOpenMP -- wrapper around OpenMP task scheduler
///
class btTaskSchedulerOpenMP : public btITaskScheduler
{
	int m_numThreads;  // thread count requested via setNumThreads()

public:
	btTaskSchedulerOpenMP() : btITaskScheduler("OpenMP")
	{
		m_numThreads = 0;
	}
	virtual int getMaxNumThreads() const BT_OVERRIDE
	{
		return omp_get_max_threads();
	}
	virtual int getNumThreads() const BT_OVERRIDE
	{
		return m_numThreads;
	}
	virtual void setNumThreads(int numThreads) BT_OVERRIDE
	{
		// With OpenMP, because it is a standard with various implementations, we can't
		// know for sure if every implementation has the same behavior of destroying all
		// previous threads when resizing the threadpool
		m_numThreads = (std::max)(1, (std::min)(int(BT_MAX_THREAD_COUNT), numThreads));
		omp_set_num_threads(1);  // hopefully, all previous threads get destroyed here
		omp_set_num_threads(m_numThreads);
		m_savedThreadCounter = 0;
		if (m_isActive)
		{
			btResetThreadIndexCounter();
		}
	}
	virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelFor_OpenMP");
		btPushThreadsAreRunning();
#pragma omp parallel for schedule(static, 1)
		for (int i = iBegin; i < iEnd; i += grainSize)
		{
			BT_PROFILE("OpenMP_forJob");
			body.forLoop(i, (std::min)(i + grainSize, iEnd));
		}
		btPopThreadsAreRunning();
	}
	virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
	{
		// fixed: profile zone was mislabeled "parallelFor_OpenMP"
		BT_PROFILE("parallelSum_OpenMP");
		btPushThreadsAreRunning();
		btScalar sum = btScalar(0);
		// reduction clause written on a single line (the original used a broken line-continuation)
#pragma omp parallel for schedule(static, 1) reduction(+ : sum)
		for (int i = iBegin; i < iEnd; i += grainSize)
		{
			BT_PROFILE("OpenMP_sumJob");
			sum += body.sumLoop(i, (std::min)(i + grainSize, iEnd));
		}
		btPopThreadsAreRunning();
		return sum;
	}
};
#endif  // #if BT_USE_OPENMP && BT_THREADSAFE
#if BT_USE_TBB && BT_THREADSAFE
///
/// btTaskSchedulerTBB -- wrapper around Intel Threaded Building Blocks task scheduler
///
class btTaskSchedulerTBB : public btITaskScheduler
{
	int m_numThreads;                             // thread count requested via setNumThreads()
	tbb::task_scheduler_init* m_tbbSchedulerInit; // owned; recreated on every pool resize

public:
	btTaskSchedulerTBB() : btITaskScheduler("IntelTBB")
	{
		m_numThreads = 0;
		m_tbbSchedulerInit = NULL;
	}
	~btTaskSchedulerTBB()
	{
		if (m_tbbSchedulerInit)
		{
			delete m_tbbSchedulerInit;
			m_tbbSchedulerInit = NULL;
		}
	}

	virtual int getMaxNumThreads() const BT_OVERRIDE
	{
		return tbb::task_scheduler_init::default_num_threads();
	}
	virtual int getNumThreads() const BT_OVERRIDE
	{
		return m_numThreads;
	}
	virtual void setNumThreads(int numThreads) BT_OVERRIDE
	{
		m_numThreads = (std::max)(1, (std::min)(int(BT_MAX_THREAD_COUNT), numThreads));
		if (m_tbbSchedulerInit)
		{
			// destroys all previous threads
			delete m_tbbSchedulerInit;
			m_tbbSchedulerInit = NULL;
		}
		m_tbbSchedulerInit = new tbb::task_scheduler_init(m_numThreads);
		m_savedThreadCounter = 0;
		if (m_isActive)
		{
			btResetThreadIndexCounter();
		}
	}
	// Adapts btIParallelForBody to TBB's blocked_range functor interface.
	struct ForBodyAdapter
	{
		const btIParallelForBody* mBody;

		ForBodyAdapter(const btIParallelForBody* body) : mBody(body) {}
		void operator()(const tbb::blocked_range<int>& range) const
		{
			BT_PROFILE("TBB_forJob");
			mBody->forLoop(range.begin(), range.end());
		}
	};
	virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelFor_TBB");
		ForBodyAdapter tbbBody(&body);
		btPushThreadsAreRunning();
		tbb::parallel_for(tbb::blocked_range<int>(iBegin, iEnd, grainSize),
						  tbbBody,
						  tbb::simple_partitioner());
		btPopThreadsAreRunning();
	}
	// Adapts btIParallelSumBody to TBB's splittable reduction-body interface.
	struct SumBodyAdapter
	{
		const btIParallelSumBody* mBody;
		btScalar mSum;  // partial sum accumulated by this body instance

		SumBodyAdapter(const btIParallelSumBody* body) : mBody(body), mSum(btScalar(0)) {}
		SumBodyAdapter(const SumBodyAdapter& src, tbb::split) : mBody(src.mBody), mSum(btScalar(0)) {}
		void join(const SumBodyAdapter& src) { mSum += src.mSum; }
		void operator()(const tbb::blocked_range<int>& range)
		{
			BT_PROFILE("TBB_sumJob");
			mSum += mBody->sumLoop(range.begin(), range.end());
		}
	};
	virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelSum_TBB");
		SumBodyAdapter tbbBody(&body);
		btPushThreadsAreRunning();
		// deterministic_reduce gives run-to-run reproducible float sums
		tbb::parallel_deterministic_reduce(tbb::blocked_range<int>(iBegin, iEnd, grainSize), tbbBody);
		btPopThreadsAreRunning();
		return tbbBody.mSum;
	}
};
#endif  // #if BT_USE_TBB && BT_THREADSAFE
#if BT_USE_PPL && BT_THREADSAFE
///
/// btTaskSchedulerPPL -- wrapper around Microsoft Parallel Patterns Lib task scheduler
///
class btTaskSchedulerPPL : public btITaskScheduler
{
	int m_numThreads;                        // thread count requested via setNumThreads()
	concurrency::combinable<btScalar> m_sum; // for parallelSum

public:
	btTaskSchedulerPPL() : btITaskScheduler("PPL")
	{
		m_numThreads = 0;
	}
	virtual int getMaxNumThreads() const BT_OVERRIDE
	{
		return concurrency::GetProcessorCount();
	}
	virtual int getNumThreads() const BT_OVERRIDE
	{
		return m_numThreads;
	}
	virtual void setNumThreads(int numThreads) BT_OVERRIDE
	{
		// capping the thread count for PPL due to a thread-index issue
		const int maxThreadCount = (std::min)(int(BT_MAX_THREAD_COUNT), 31);
		m_numThreads = (std::max)(1, (std::min)(maxThreadCount, numThreads));
		using namespace concurrency;
		if (CurrentScheduler::Id() != -1)
		{
			CurrentScheduler::Detach();
		}
		SchedulerPolicy policy;
		{
			// PPL seems to destroy threads when threadpool is shrunk, but keeps reusing old threads
			// force it to destroy old threads
			policy.SetConcurrencyLimits(1, 1);
			CurrentScheduler::Create(policy);
			CurrentScheduler::Detach();
		}
		policy.SetConcurrencyLimits(m_numThreads, m_numThreads);
		CurrentScheduler::Create(policy);
		m_savedThreadCounter = 0;
		if (m_isActive)
		{
			btResetThreadIndexCounter();
		}
	}
	// Adapts btIParallelForBody to PPL's per-index functor: each invocation
	// processes one grain [i, min(i + grainSize, iEnd)).
	struct ForBodyAdapter
	{
		const btIParallelForBody* mBody;
		int mGrainSize;
		int mIndexEnd;

		ForBodyAdapter(const btIParallelForBody* body, int grainSize, int end) : mBody(body), mGrainSize(grainSize), mIndexEnd(end) {}
		void operator()(int i) const
		{
			BT_PROFILE("PPL_forJob");
			mBody->forLoop(i, (std::min)(i + mGrainSize, mIndexEnd));
		}
	};
	virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelFor_PPL");
		ForBodyAdapter pplBody(&body, grainSize, iEnd);
		btPushThreadsAreRunning();
		// note: MSVC 2010 doesn't support partitioner args, so avoid them
		concurrency::parallel_for(iBegin,
								  iEnd,
								  grainSize,
								  pplBody);
		btPopThreadsAreRunning();
	}
	// Adapts btIParallelSumBody: accumulates into a combinable's thread-local slot.
	struct SumBodyAdapter
	{
		const btIParallelSumBody* mBody;
		concurrency::combinable<btScalar>* mSum;
		int mGrainSize;
		int mIndexEnd;

		SumBodyAdapter(const btIParallelSumBody* body, concurrency::combinable<btScalar>* sum, int grainSize, int end) : mBody(body), mSum(sum), mGrainSize(grainSize), mIndexEnd(end) {}
		void operator()(int i) const
		{
			BT_PROFILE("PPL_sumJob");
			mSum->local() += mBody->sumLoop(i, (std::min)(i + mGrainSize, mIndexEnd));
		}
	};
	static btScalar sumFunc(btScalar a, btScalar b) { return a + b; }
	virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelSum_PPL");
		m_sum.clear();  // discard thread-local partial sums left over from a previous call
		SumBodyAdapter pplBody(&body, &m_sum, grainSize, iEnd);
		btPushThreadsAreRunning();
		// note: MSVC 2010 doesn't support partitioner args, so avoid them
		concurrency::parallel_for(iBegin,
								  iEnd,
								  grainSize,
								  pplBody);
		btPopThreadsAreRunning();
		return m_sum.combine(sumFunc);
	}
};
#endif  // #if BT_USE_PPL && BT_THREADSAFE
754 // create a non-threaded task scheduler (always available)
755 btITaskScheduler* btGetSequentialTaskScheduler()
757 static btTaskSchedulerSequential sTaskScheduler;
758 return &sTaskScheduler;
761 // create an OpenMP task scheduler (if available, otherwise returns null)
762 btITaskScheduler* btGetOpenMPTaskScheduler()
764 #if BT_USE_OPENMP && BT_THREADSAFE
765 static btTaskSchedulerOpenMP sTaskScheduler;
766 return &sTaskScheduler;
772 // create an Intel TBB task scheduler (if available, otherwise returns null)
773 btITaskScheduler* btGetTBBTaskScheduler()
775 #if BT_USE_TBB && BT_THREADSAFE
776 static btTaskSchedulerTBB sTaskScheduler;
777 return &sTaskScheduler;
783 // create a PPL task scheduler (if available, otherwise returns null)
784 btITaskScheduler* btGetPPLTaskScheduler()
786 #if BT_USE_PPL && BT_THREADSAFE
787 static btTaskSchedulerPPL sTaskScheduler;
788 return &sTaskScheduler;