[dali_2.3.21] Merge branch 'devel/master'
[platform/core/uifw/dali-toolkit.git] / dali-physics / third-party / bullet3 / src / LinearMath / TaskScheduler / btTaskScheduler.cpp
1
2 #include "LinearMath/btMinMax.h"
3 #include "LinearMath/btAlignedObjectArray.h"
4 #include "LinearMath/btThreads.h"
5 #include "LinearMath/btQuickprof.h"
6 #include <stdio.h>
7 #include <algorithm>
8
9 #if BT_THREADSAFE
10
11 #include "btThreadSupportInterface.h"
12
13 #if defined(_WIN32)
14
15 #define WIN32_LEAN_AND_MEAN
16
17 #include <windows.h>
18
19 #endif
20
// 64-bit unsigned integer; in this file it holds microsecond timestamps read from btClock
typedef unsigned long long btU64;
// byte count used to align job memory and to size the padding member at the end of JobQueue
static const int kCacheLineSize = 64;
23
// Issue a CPU spin-wait hint; called from the busy-wait loops in this file.
// On Windows this is YieldProcessor(); on x86/x86-64 GCC and Clang builds it is the
// PAUSE instruction. On every other target it compiles to nothing.
void btSpinPause()
{
#if defined(_WIN32)
        YieldProcessor();
#elif (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || defined(__x86_64__))
        // compiler builtin, so no extra header is required
        __builtin_ia32_pause();
#endif
}
30
// Namespace-like wrapper for the states a worker thread records in ThreadLocalStorage::m_status.
struct WorkerThreadStatus
{
        enum Type
        {
                kInvalid = 0,
                kWaitingForWork = 1,  // set by the worker after it has drained the jobs it could find
                kWorking = 2,         // set by the worker right before it executes a job
                kSleeping = 3         // initial state; also set just before the worker function returns
        };
};
41
42 ATTRIBUTE_ALIGNED64(class)
43 WorkerThreadDirectives
44 {
45         static const int kMaxThreadCount = BT_MAX_THREAD_COUNT;
46         // directives for all worker threads packed into a single cacheline
47         char m_threadDirs[kMaxThreadCount];
48
49 public:
50         enum Type
51         {
52                 kInvalid,
53                 kGoToSleep,         // go to sleep
54                 kStayAwakeButIdle,  // wait for not checking job queue
55                 kScanForJobs,       // actively scan job queue for jobs
56         };
57         WorkerThreadDirectives()
58         {
59                 for (int i = 0; i < kMaxThreadCount; ++i)
60                 {
61                         m_threadDirs[i] = 0;
62                 }
63         }
64
65         Type getDirective(int threadId)
66         {
67                 btAssert(threadId < kMaxThreadCount);
68                 return static_cast<Type>(m_threadDirs[threadId]);
69         }
70
71         void setDirectiveByRange(int threadBegin, int threadEnd, Type dir)
72         {
73                 btAssert(threadBegin < threadEnd);
74                 btAssert(threadEnd <= kMaxThreadCount);
75                 char dirChar = static_cast<char>(dir);
76                 for (int i = threadBegin; i < threadEnd; ++i)
77                 {
78                         m_threadDirs[i] = dirChar;
79                 }
80         }
81 };
82
83 class JobQueue;
84
// Per-thread state block. A pointer to one of these is handed to WorkerThreadFunc as its
// user pointer; the scheduler keeps one entry per thread id (entry 0 belongs to the main thread).
ATTRIBUTE_ALIGNED64(struct)
ThreadLocalStorage
{
        // index of this entry; also the id passed to IJob::executeJob by the worker
        int m_threadId;
        // current worker state, written by the worker and read by the scheduler when waking workers
        WorkerThreadStatus::Type m_status;
        // jobs completed by this worker since the scheduler last reset the counter
        int m_numJobsFinished;
        // held by the worker while it drains jobs; the scheduler takes it to read/reset m_numJobsFinished
        btSpinMutex m_mutex;
        // partial sum accumulated by ParallelSumJob instances executed on this thread
        btScalar m_sumResult;
        // directive table shared by all threads
        WorkerThreadDirectives* m_directive;
        // queue this thread consumes from (the scheduler assigns NULL for the main thread)
        JobQueue* m_queue;
        // clock used to measure how long the worker has been idle
        btClock* m_clock;
        // idle time in microseconds after which the worker puts itself to sleep
        unsigned int m_cooldownTime;
};
98
// Interface for a unit of work stored in a JobQueue.
// NOTE(review): there is no virtual destructor; in this file jobs are placement-constructed
// into queue-owned memory and never deleted through this interface.
struct IJob
{
        // Runs the job; threadId identifies the thread executing it (0 is used for the main thread).
        virtual void executeJob(int threadId) = 0;
};
103
104 class ParallelForJob : public IJob
105 {
106         const btIParallelForBody* m_body;
107         int m_begin;
108         int m_end;
109
110 public:
111         ParallelForJob(int iBegin, int iEnd, const btIParallelForBody& body)
112         {
113                 m_body = &body;
114                 m_begin = iBegin;
115                 m_end = iEnd;
116         }
117         virtual void executeJob(int threadId) BT_OVERRIDE
118         {
119                 BT_PROFILE("executeJob");
120
121                 // call the functor body to do the work
122                 m_body->forLoop(m_begin, m_end);
123         }
124 };
125
126 class ParallelSumJob : public IJob
127 {
128         const btIParallelSumBody* m_body;
129         ThreadLocalStorage* m_threadLocalStoreArray;
130         int m_begin;
131         int m_end;
132
133 public:
134         ParallelSumJob(int iBegin, int iEnd, const btIParallelSumBody& body, ThreadLocalStorage* tls)
135         {
136                 m_body = &body;
137                 m_threadLocalStoreArray = tls;
138                 m_begin = iBegin;
139                 m_end = iEnd;
140         }
141         virtual void executeJob(int threadId) BT_OVERRIDE
142         {
143                 BT_PROFILE("executeJob");
144
145                 // call the functor body to do the work
146                 btScalar val = m_body->sumLoop(m_begin, m_end);
147 #if BT_PARALLEL_SUM_DETERMINISTISM
148                 // by truncating bits of the result, we can make the parallelSum deterministic (at the expense of precision)
149                 const float TRUNC_SCALE = float(1 << 19);
150                 val = floor(val * TRUNC_SCALE + 0.5f) / TRUNC_SCALE;  // truncate some bits
151 #endif
152                 m_threadLocalStoreArray[threadId].m_sumResult += val;
153         }
154 };
155
156 ATTRIBUTE_ALIGNED64(class)
157 JobQueue
158 {
159         btThreadSupportInterface* m_threadSupport;
160         btCriticalSection* m_queueLock;
161         btSpinMutex m_mutex;
162
163         btAlignedObjectArray<IJob*> m_jobQueue;
164         char* m_jobMem;
165         int m_jobMemSize;
166         bool m_queueIsEmpty;
167         int m_tailIndex;
168         int m_headIndex;
169         int m_allocSize;
170         bool m_useSpinMutex;
171         btAlignedObjectArray<JobQueue*> m_neighborContexts;
172         char m_cachePadding[kCacheLineSize];  // prevent false sharing
173
174         void freeJobMem()
175         {
176                 if (m_jobMem)
177                 {
178                         // free old
179                         btAlignedFree(m_jobMem);
180                         m_jobMem = NULL;
181                 }
182         }
183         void resizeJobMem(int newSize)
184         {
185                 if (newSize > m_jobMemSize)
186                 {
187                         freeJobMem();
188                         m_jobMem = static_cast<char*>(btAlignedAlloc(newSize, kCacheLineSize));
189                         m_jobMemSize = newSize;
190                 }
191         }
192
193 public:
194         JobQueue()
195         {
196                 m_jobMem = NULL;
197                 m_jobMemSize = 0;
198                 m_threadSupport = NULL;
199                 m_queueLock = NULL;
200                 m_headIndex = 0;
201                 m_tailIndex = 0;
202                 m_useSpinMutex = false;
203         }
204         ~JobQueue()
205         {
206                 exit();
207         }
208         void exit()
209         {
210                 freeJobMem();
211                 if (m_queueLock && m_threadSupport)
212                 {
213                         m_threadSupport->deleteCriticalSection(m_queueLock);
214                         m_queueLock = NULL;
215                         m_threadSupport = 0;
216                 }
217         }
218
219         void init(btThreadSupportInterface * threadSup, btAlignedObjectArray<JobQueue> * contextArray)
220         {
221                 m_threadSupport = threadSup;
222                 if (threadSup)
223                 {
224                         m_queueLock = m_threadSupport->createCriticalSection();
225                 }
226                 setupJobStealing(contextArray, contextArray->size());
227         }
228         void setupJobStealing(btAlignedObjectArray<JobQueue> * contextArray, int numActiveContexts)
229         {
230                 btAlignedObjectArray<JobQueue>& contexts = *contextArray;
231                 int selfIndex = 0;
232                 for (int i = 0; i < contexts.size(); ++i)
233                 {
234                         if (this == &contexts[i])
235                         {
236                                 selfIndex = i;
237                                 break;
238                         }
239                 }
240                 int numNeighbors = btMin(2, contexts.size() - 1);
241                 int neighborOffsets[] = {-1, 1, -2, 2, -3, 3};
242                 int numOffsets = sizeof(neighborOffsets) / sizeof(neighborOffsets[0]);
243                 m_neighborContexts.reserve(numNeighbors);
244                 m_neighborContexts.resizeNoInitialize(0);
245                 for (int i = 0; i < numOffsets && m_neighborContexts.size() < numNeighbors; i++)
246                 {
247                         int neighborIndex = selfIndex + neighborOffsets[i];
248                         if (neighborIndex >= 0 && neighborIndex < numActiveContexts)
249                         {
250                                 m_neighborContexts.push_back(&contexts[neighborIndex]);
251                         }
252                 }
253         }
254
255         bool isQueueEmpty() const { return m_queueIsEmpty; }
256         void lockQueue()
257         {
258                 if (m_useSpinMutex)
259                 {
260                         m_mutex.lock();
261                 }
262                 else
263                 {
264                         m_queueLock->lock();
265                 }
266         }
267         void unlockQueue()
268         {
269                 if (m_useSpinMutex)
270                 {
271                         m_mutex.unlock();
272                 }
273                 else
274                 {
275                         m_queueLock->unlock();
276                 }
277         }
278         void clearQueue(int jobCount, int jobSize)
279         {
280                 lockQueue();
281                 m_headIndex = 0;
282                 m_tailIndex = 0;
283                 m_allocSize = 0;
284                 m_queueIsEmpty = true;
285                 int jobBufSize = jobSize * jobCount;
286                 // make sure we have enough memory allocated to store jobs
287                 if (jobBufSize > m_jobMemSize)
288                 {
289                         resizeJobMem(jobBufSize);
290                 }
291                 // make sure job queue is big enough
292                 if (jobCount > m_jobQueue.capacity())
293                 {
294                         m_jobQueue.reserve(jobCount);
295                 }
296                 unlockQueue();
297                 m_jobQueue.resizeNoInitialize(0);
298         }
299         void* allocJobMem(int jobSize)
300         {
301                 btAssert(m_jobMemSize >= (m_allocSize + jobSize));
302                 void* jobMem = &m_jobMem[m_allocSize];
303                 m_allocSize += jobSize;
304                 return jobMem;
305         }
306         void submitJob(IJob * job)
307         {
308                 btAssert(reinterpret_cast<char*>(job) >= &m_jobMem[0] && reinterpret_cast<char*>(job) < &m_jobMem[0] + m_allocSize);
309                 m_jobQueue.push_back(job);
310                 lockQueue();
311                 m_tailIndex++;
312                 m_queueIsEmpty = false;
313                 unlockQueue();
314         }
315         IJob* consumeJobFromOwnQueue()
316         {
317                 if (m_queueIsEmpty)
318                 {
319                         // lock free path. even if this is taken erroneously it isn't harmful
320                         return NULL;
321                 }
322                 IJob* job = NULL;
323                 lockQueue();
324                 if (!m_queueIsEmpty)
325                 {
326                         job = m_jobQueue[m_headIndex++];
327                         btAssert(reinterpret_cast<char*>(job) >= &m_jobMem[0] && reinterpret_cast<char*>(job) < &m_jobMem[0] + m_allocSize);
328                         if (m_headIndex == m_tailIndex)
329                         {
330                                 m_queueIsEmpty = true;
331                         }
332                 }
333                 unlockQueue();
334                 return job;
335         }
336         IJob* consumeJob()
337         {
338                 if (IJob* job = consumeJobFromOwnQueue())
339                 {
340                         return job;
341                 }
342                 // own queue is empty, try to steal from neighbor
343                 for (int i = 0; i < m_neighborContexts.size(); ++i)
344                 {
345                         JobQueue* otherContext = m_neighborContexts[i];
346                         if (IJob* job = otherContext->consumeJobFromOwnQueue())
347                         {
348                                 return job;
349                         }
350                 }
351                 return NULL;
352         }
353 };
354
355 static void WorkerThreadFunc(void* userPtr)
356 {
357         BT_PROFILE("WorkerThreadFunc");
358         ThreadLocalStorage* localStorage = (ThreadLocalStorage*)userPtr;
359         JobQueue* jobQueue = localStorage->m_queue;
360
361         bool shouldSleep = false;
362         int threadId = localStorage->m_threadId;
363         while (!shouldSleep)
364         {
365                 // do work
366                 localStorage->m_mutex.lock();
367                 while (IJob* job = jobQueue->consumeJob())
368                 {
369                         localStorage->m_status = WorkerThreadStatus::kWorking;
370                         job->executeJob(threadId);
371                         localStorage->m_numJobsFinished++;
372                 }
373                 localStorage->m_status = WorkerThreadStatus::kWaitingForWork;
374                 localStorage->m_mutex.unlock();
375                 btU64 clockStart = localStorage->m_clock->getTimeMicroseconds();
376                 // while queue is empty,
377                 while (jobQueue->isQueueEmpty())
378                 {
379                         // todo: spin wait a bit to avoid hammering the empty queue
380                         btSpinPause();
381                         if (localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kGoToSleep)
382                         {
383                                 shouldSleep = true;
384                                 break;
385                         }
386                         // if jobs are incoming,
387                         if (localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kScanForJobs)
388                         {
389                                 clockStart = localStorage->m_clock->getTimeMicroseconds();  // reset clock
390                         }
391                         else
392                         {
393                                 for (int i = 0; i < 50; ++i)
394                                 {
395                                         btSpinPause();
396                                         btSpinPause();
397                                         btSpinPause();
398                                         btSpinPause();
399                                         if (localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kScanForJobs || !jobQueue->isQueueEmpty())
400                                         {
401                                                 break;
402                                         }
403                                 }
404                                 // if no jobs incoming and queue has been empty for the cooldown time, sleep
405                                 btU64 timeElapsed = localStorage->m_clock->getTimeMicroseconds() - clockStart;
406                                 if (timeElapsed > localStorage->m_cooldownTime)
407                                 {
408                                         shouldSleep = true;
409                                         break;
410                                 }
411                         }
412                 }
413         }
414         {
415                 BT_PROFILE("sleep");
416                 // go sleep
417                 localStorage->m_mutex.lock();
418                 localStorage->m_status = WorkerThreadStatus::kSleeping;
419                 localStorage->m_mutex.unlock();
420         }
421 }
422
423 class btTaskSchedulerDefault : public btITaskScheduler
424 {
425         btThreadSupportInterface* m_threadSupport;
426         WorkerThreadDirectives* m_workerDirective;
427         btAlignedObjectArray<JobQueue> m_jobQueues;
428         btAlignedObjectArray<JobQueue*> m_perThreadJobQueues;
429         btAlignedObjectArray<ThreadLocalStorage> m_threadLocalStorage;
430         btSpinMutex m_antiNestingLock;  // prevent nested parallel-for
431         btClock m_clock;
432         int m_numThreads;
433         int m_numWorkerThreads;
434         int m_numActiveJobQueues;
435         int m_maxNumThreads;
436         int m_numJobs;
437         static const int kFirstWorkerThreadId = 1;
438
439 public:
440         btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport")
441         {
442                 m_threadSupport = NULL;
443                 m_workerDirective = NULL;
444         }
445
446         virtual ~btTaskSchedulerDefault()
447         {
448                 waitForWorkersToSleep();
449
450                 for (int i = 0; i < m_jobQueues.size(); ++i)
451                 {
452                         m_jobQueues[i].exit();
453                 }
454
455                 if (m_threadSupport)
456                 {
457                         delete m_threadSupport;
458                         m_threadSupport = NULL;
459                 }
460                 if (m_workerDirective)
461                 {
462                         btAlignedFree(m_workerDirective);
463                         m_workerDirective = NULL;
464                 }
465         }
466
467         void init()
468         {
469                 btThreadSupportInterface::ConstructionInfo constructionInfo("TaskScheduler", WorkerThreadFunc);
470                 m_threadSupport = btThreadSupportInterface::create(constructionInfo);
471                 m_workerDirective = static_cast<WorkerThreadDirectives*>(btAlignedAlloc(sizeof(*m_workerDirective), 64));
472
473                 m_numWorkerThreads = m_threadSupport->getNumWorkerThreads();
474                 m_maxNumThreads = m_threadSupport->getNumWorkerThreads() + 1;
475                 m_numThreads = m_maxNumThreads;
476                 // ideal to have one job queue for each physical processor (except for the main thread which needs no queue)
477                 int numThreadsPerQueue = m_threadSupport->getLogicalToPhysicalCoreRatio();
478                 int numJobQueues = (numThreadsPerQueue == 1) ? (m_maxNumThreads - 1) : (m_maxNumThreads / numThreadsPerQueue);
479                 m_jobQueues.resize(numJobQueues);
480                 m_numActiveJobQueues = numJobQueues;
481                 for (int i = 0; i < m_jobQueues.size(); ++i)
482                 {
483                         m_jobQueues[i].init(m_threadSupport, &m_jobQueues);
484                 }
485                 m_perThreadJobQueues.resize(m_numThreads);
486                 for (int i = 0; i < m_numThreads; i++)
487                 {
488                         JobQueue* jq = NULL;
489                         // only worker threads get a job queue
490                         if (i > 0)
491                         {
492                                 if (numThreadsPerQueue == 1)
493                                 {
494                                         // one queue per worker thread
495                                         jq = &m_jobQueues[i - kFirstWorkerThreadId];
496                                 }
497                                 else
498                                 {
499                                         // 2 threads share each queue
500                                         jq = &m_jobQueues[i / numThreadsPerQueue];
501                                 }
502                         }
503                         m_perThreadJobQueues[i] = jq;
504                 }
505                 m_threadLocalStorage.resize(m_numThreads);
506                 for (int i = 0; i < m_numThreads; i++)
507                 {
508                         ThreadLocalStorage& storage = m_threadLocalStorage[i];
509                         storage.m_threadId = i;
510                         storage.m_directive = m_workerDirective;
511                         storage.m_status = WorkerThreadStatus::kSleeping;
512                         storage.m_cooldownTime = 100;  // 100 microseconds, threads go to sleep after this long if they have nothing to do
513                         storage.m_clock = &m_clock;
514                         storage.m_queue = m_perThreadJobQueues[i];
515                 }
516                 setWorkerDirectives(WorkerThreadDirectives::kGoToSleep);  // no work for them yet
517                 setNumThreads(m_threadSupport->getCacheFriendlyNumThreads());
518         }
519
520         void setWorkerDirectives(WorkerThreadDirectives::Type dir)
521         {
522                 m_workerDirective->setDirectiveByRange(kFirstWorkerThreadId, m_numThreads, dir);
523         }
524
525         virtual int getMaxNumThreads() const BT_OVERRIDE
526         {
527                 return m_maxNumThreads;
528         }
529
530         virtual int getNumThreads() const BT_OVERRIDE
531         {
532                 return m_numThreads;
533         }
534
535         virtual void setNumThreads(int numThreads) BT_OVERRIDE
536         {
537                 m_numThreads = btMax(btMin(numThreads, int(m_maxNumThreads)), 1);
538                 m_numWorkerThreads = m_numThreads - 1;
539                 m_numActiveJobQueues = 0;
540                 // if there is at least 1 worker,
541                 if (m_numWorkerThreads > 0)
542                 {
543                         // re-setup job stealing between queues to avoid attempting to steal from an inactive job queue
544                         JobQueue* lastActiveContext = m_perThreadJobQueues[m_numThreads - 1];
545                         int iLastActiveContext = lastActiveContext - &m_jobQueues[0];
546                         m_numActiveJobQueues = iLastActiveContext + 1;
547                         for (int i = 0; i < m_jobQueues.size(); ++i)
548                         {
549                                 m_jobQueues[i].setupJobStealing(&m_jobQueues, m_numActiveJobQueues);
550                         }
551                 }
552                 m_workerDirective->setDirectiveByRange(m_numThreads, BT_MAX_THREAD_COUNT, WorkerThreadDirectives::kGoToSleep);
553         }
554
555         void waitJobs()
556         {
557                 BT_PROFILE("waitJobs");
558                 // have the main thread work until the job queues are empty
559                 int numMainThreadJobsFinished = 0;
560                 for (int i = 0; i < m_numActiveJobQueues; ++i)
561                 {
562                         while (IJob* job = m_jobQueues[i].consumeJob())
563                         {
564                                 job->executeJob(0);
565                                 numMainThreadJobsFinished++;
566                         }
567                 }
568
569                 // done with jobs for now, tell workers to rest (but not sleep)
570                 setWorkerDirectives(WorkerThreadDirectives::kStayAwakeButIdle);
571
572                 btU64 clockStart = m_clock.getTimeMicroseconds();
573                 // wait for workers to finish any jobs in progress
574                 while (true)
575                 {
576                         int numWorkerJobsFinished = 0;
577                         for (int iThread = kFirstWorkerThreadId; iThread < m_numThreads; ++iThread)
578                         {
579                                 ThreadLocalStorage* storage = &m_threadLocalStorage[iThread];
580                                 storage->m_mutex.lock();
581                                 numWorkerJobsFinished += storage->m_numJobsFinished;
582                                 storage->m_mutex.unlock();
583                         }
584                         if (numWorkerJobsFinished + numMainThreadJobsFinished == m_numJobs)
585                         {
586                                 break;
587                         }
588                         btU64 timeElapsed = m_clock.getTimeMicroseconds() - clockStart;
589                         btAssert(timeElapsed < 1000);
590                         if (timeElapsed > 100000)
591                         {
592                                 break;
593                         }
594                         btSpinPause();
595                 }
596         }
597
598         void wakeWorkers(int numWorkersToWake)
599         {
600                 BT_PROFILE("wakeWorkers");
601                 btAssert(m_workerDirective->getDirective(1) == WorkerThreadDirectives::kScanForJobs);
602                 int numDesiredWorkers = btMin(numWorkersToWake, m_numWorkerThreads);
603                 int numActiveWorkers = 0;
604                 for (int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker)
605                 {
606                         // note this count of active workers is not necessarily totally reliable, because a worker thread could be
607                         // just about to put itself to sleep. So we may on occasion fail to wake up all the workers. It should be rare.
608                         ThreadLocalStorage& storage = m_threadLocalStorage[kFirstWorkerThreadId + iWorker];
609                         if (storage.m_status != WorkerThreadStatus::kSleeping)
610                         {
611                                 numActiveWorkers++;
612                         }
613                 }
614                 for (int iWorker = 0; iWorker < m_numWorkerThreads && numActiveWorkers < numDesiredWorkers; ++iWorker)
615                 {
616                         ThreadLocalStorage& storage = m_threadLocalStorage[kFirstWorkerThreadId + iWorker];
617                         if (storage.m_status == WorkerThreadStatus::kSleeping)
618                         {
619                                 m_threadSupport->runTask(iWorker, &storage);
620                                 numActiveWorkers++;
621                         }
622                 }
623         }
624
625         void waitForWorkersToSleep()
626         {
627                 BT_PROFILE("waitForWorkersToSleep");
628                 setWorkerDirectives(WorkerThreadDirectives::kGoToSleep);
629                 m_threadSupport->waitForAllTasks();
630                 for (int i = kFirstWorkerThreadId; i < m_numThreads; i++)
631                 {
632                         ThreadLocalStorage& storage = m_threadLocalStorage[i];
633                         btAssert(storage.m_status == WorkerThreadStatus::kSleeping);
634                 }
635         }
636
637         virtual void sleepWorkerThreadsHint() BT_OVERRIDE
638         {
639                 BT_PROFILE("sleepWorkerThreadsHint");
640                 // hint the task scheduler that we may not be using these threads for a little while
641                 setWorkerDirectives(WorkerThreadDirectives::kGoToSleep);
642         }
643
644         void prepareWorkerThreads()
645         {
646                 for (int i = kFirstWorkerThreadId; i < m_numThreads; ++i)
647                 {
648                         ThreadLocalStorage& storage = m_threadLocalStorage[i];
649                         storage.m_mutex.lock();
650                         storage.m_numJobsFinished = 0;
651                         storage.m_mutex.unlock();
652                 }
653                 setWorkerDirectives(WorkerThreadDirectives::kScanForJobs);
654         }
655
656         virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
657         {
658                 BT_PROFILE("parallelFor_ThreadSupport");
659                 btAssert(iEnd >= iBegin);
660                 btAssert(grainSize >= 1);
661                 int iterationCount = iEnd - iBegin;
662                 if (iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock())
663                 {
664                         typedef ParallelForJob JobType;
665                         int jobCount = (iterationCount + grainSize - 1) / grainSize;
666                         m_numJobs = jobCount;
667                         btAssert(jobCount >= 2);  // need more than one job for multithreading
668                         int jobSize = sizeof(JobType);
669
670                         for (int i = 0; i < m_numActiveJobQueues; ++i)
671                         {
672                                 m_jobQueues[i].clearQueue(jobCount, jobSize);
673                         }
674                         // prepare worker threads for incoming work
675                         prepareWorkerThreads();
676                         // submit all of the jobs
677                         int iJob = 0;
678                         int iThread = kFirstWorkerThreadId;  // first worker thread
679                         for (int i = iBegin; i < iEnd; i += grainSize)
680                         {
681                                 btAssert(iJob < jobCount);
682                                 int iE = btMin(i + grainSize, iEnd);
683                                 JobQueue* jq = m_perThreadJobQueues[iThread];
684                                 btAssert(jq);
685                                 btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues);
686                                 void* jobMem = jq->allocJobMem(jobSize);
687                                 JobType* job = new (jobMem) ParallelForJob(i, iE, body);  // placement new
688                                 jq->submitJob(job);
689                                 iJob++;
690                                 iThread++;
691                                 if (iThread >= m_numThreads)
692                                 {
693                                         iThread = kFirstWorkerThreadId;  // first worker thread
694                                 }
695                         }
696                         wakeWorkers(jobCount - 1);
697
698                         // put the main thread to work on emptying the job queue and then wait for all workers to finish
699                         waitJobs();
700                         m_antiNestingLock.unlock();
701                 }
702                 else
703                 {
704                         BT_PROFILE("parallelFor_mainThread");
705                         // just run on main thread
706                         body.forLoop(iBegin, iEnd);
707                 }
708         }
709         virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
710         {
711                 BT_PROFILE("parallelSum_ThreadSupport");
712                 btAssert(iEnd >= iBegin);
713                 btAssert(grainSize >= 1);
714                 int iterationCount = iEnd - iBegin;
715                 if (iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock())
716                 {
717                         typedef ParallelSumJob JobType;
718                         int jobCount = (iterationCount + grainSize - 1) / grainSize;
719                         m_numJobs = jobCount;
720                         btAssert(jobCount >= 2);  // need more than one job for multithreading
721                         int jobSize = sizeof(JobType);
722                         for (int i = 0; i < m_numActiveJobQueues; ++i)
723                         {
724                                 m_jobQueues[i].clearQueue(jobCount, jobSize);
725                         }
726
727                         // initialize summation
728                         for (int iThread = 0; iThread < m_numThreads; ++iThread)
729                         {
730                                 m_threadLocalStorage[iThread].m_sumResult = btScalar(0);
731                         }
732
733                         // prepare worker threads for incoming work
734                         prepareWorkerThreads();
735                         // submit all of the jobs
736                         int iJob = 0;
737                         int iThread = kFirstWorkerThreadId;  // first worker thread
738                         for (int i = iBegin; i < iEnd; i += grainSize)
739                         {
740                                 btAssert(iJob < jobCount);
741                                 int iE = btMin(i + grainSize, iEnd);
742                                 JobQueue* jq = m_perThreadJobQueues[iThread];
743                                 btAssert(jq);
744                                 btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues);
745                                 void* jobMem = jq->allocJobMem(jobSize);
746                                 JobType* job = new (jobMem) ParallelSumJob(i, iE, body, &m_threadLocalStorage[0]);  // placement new
747                                 jq->submitJob(job);
748                                 iJob++;
749                                 iThread++;
750                                 if (iThread >= m_numThreads)
751                                 {
752                                         iThread = kFirstWorkerThreadId;  // first worker thread
753                                 }
754                         }
755                         wakeWorkers(jobCount - 1);
756
757                         // put the main thread to work on emptying the job queue and then wait for all workers to finish
758                         waitJobs();
759
760                         // add up all the thread sums
761                         btScalar sum = btScalar(0);
762                         for (int iThread = 0; iThread < m_numThreads; ++iThread)
763                         {
764                                 sum += m_threadLocalStorage[iThread].m_sumResult;
765                         }
766                         m_antiNestingLock.unlock();
767                         return sum;
768                 }
769                 else
770                 {
771                         BT_PROFILE("parallelSum_mainThread");
772                         // just run on main thread
773                         return body.sumLoop(iBegin, iEnd);
774                 }
775         }
776 };
777
778 btITaskScheduler* btCreateDefaultTaskScheduler()
779 {
780         btTaskSchedulerDefault* ts = new btTaskSchedulerDefault();
781         ts->init();
782         return ts;
783 }
784
785 #else  // #if BT_THREADSAFE
786
// Variant compiled when BT_THREADSAFE is off: no default task scheduler exists in that
// configuration, so callers always receive NULL.
btITaskScheduler* btCreateDefaultTaskScheduler()
{
        return NULL;
}
791
792 #endif  // #else // #if BT_THREADSAFE