[dali_2.3.21] Merge branch 'devel/master'
[platform/core/uifw/dali-toolkit.git] / dali-physics / third-party / bullet3 / src / LinearMath / TaskScheduler / btThreadSupportWin32.cpp
1 /*
2 Bullet Continuous Collision Detection and Physics Library
3 Copyright (c) 2003-2018 Erwin Coumans  http://bulletphysics.com
4
5 This software is provided 'as-is', without any express or implied warranty.
6 In no event will the authors be held liable for any damages arising from the use of this software.
7 Permission is granted to anyone to use this software for any purpose,
8 including commercial applications, and to alter it and redistribute it freely,
9 subject to the following restrictions:
10
11 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13 3. This notice may not be removed or altered from any source distribution.
14 */
15
16 #if defined(_WIN32) && BT_THREADSAFE
17
18 #include "LinearMath/btScalar.h"
19 #include "LinearMath/btMinMax.h"
20 #include "LinearMath/btAlignedObjectArray.h"
21 #include "LinearMath/btThreads.h"
22 #include "btThreadSupportInterface.h"
23 #include <windows.h>
24 #include <stdio.h>
25
26 struct btProcessorInfo
27 {
28         int numLogicalProcessors;
29         int numCores;
30         int numNumaNodes;
31         int numL1Cache;
32         int numL2Cache;
33         int numL3Cache;
34         int numPhysicalPackages;
35         static const int maxNumTeamMasks = 32;
36         int numTeamMasks;
37         UINT64 processorTeamMasks[maxNumTeamMasks];
38 };
39
40 UINT64 getProcessorTeamMask(const btProcessorInfo& procInfo, int procId)
41 {
42         UINT64 procMask = UINT64(1) << procId;
43         for (int i = 0; i < procInfo.numTeamMasks; ++i)
44         {
45                 if (procMask & procInfo.processorTeamMasks[i])
46                 {
47                         return procInfo.processorTeamMasks[i];
48                 }
49         }
50         return 0;
51 }
52
53 int getProcessorTeamIndex(const btProcessorInfo& procInfo, int procId)
54 {
55         UINT64 procMask = UINT64(1) << procId;
56         for (int i = 0; i < procInfo.numTeamMasks; ++i)
57         {
58                 if (procMask & procInfo.processorTeamMasks[i])
59                 {
60                         return i;
61                 }
62         }
63         return -1;
64 }
65
66 int countSetBits(ULONG64 bits)
67 {
68         int count = 0;
69         while (bits)
70         {
71                 if (bits & 1)
72                 {
73                         count++;
74                 }
75                 bits >>= 1;
76         }
77         return count;
78 }
79
80 typedef BOOL(WINAPI* Pfn_GetLogicalProcessorInformation)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
81
82 void getProcessorInformation(btProcessorInfo* procInfo)
83 {
84         memset(procInfo, 0, sizeof(*procInfo));
85 #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) && !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
86         // Can't dlopen libraries on UWP.
87         return;
88 #else
89         Pfn_GetLogicalProcessorInformation getLogicalProcInfo =
90                 (Pfn_GetLogicalProcessorInformation)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
91         if (getLogicalProcInfo == NULL)
92         {
93                 // no info
94                 return;
95         }
96         PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buf = NULL;
97         DWORD bufSize = 0;
98         while (true)
99         {
100                 if (getLogicalProcInfo(buf, &bufSize))
101                 {
102                         break;
103                 }
104                 else
105                 {
106                         if (GetLastError() == ERROR_INSUFFICIENT_BUFFER)
107                         {
108                                 if (buf)
109                                 {
110                                         free(buf);
111                                 }
112                                 buf = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(bufSize);
113                         }
114                 }
115         }
116
117         int len = bufSize / sizeof(*buf);
118         for (int i = 0; i < len; ++i)
119         {
120                 PSYSTEM_LOGICAL_PROCESSOR_INFORMATION info = buf + i;
121                 switch (info->Relationship)
122                 {
123                         case RelationNumaNode:
124                                 procInfo->numNumaNodes++;
125                                 break;
126
127                         case RelationProcessorCore:
128                                 procInfo->numCores++;
129                                 procInfo->numLogicalProcessors += countSetBits(info->ProcessorMask);
130                                 break;
131
132                         case RelationCache:
133                                 if (info->Cache.Level == 1)
134                                 {
135                                         procInfo->numL1Cache++;
136                                 }
137                                 else if (info->Cache.Level == 2)
138                                 {
139                                         procInfo->numL2Cache++;
140                                 }
141                                 else if (info->Cache.Level == 3)
142                                 {
143                                         procInfo->numL3Cache++;
144                                         // processors that share L3 cache are considered to be on the same team
145                                         // because they can more easily work together on the same data.
146                                         // Large performance penalties will occur if 2 or more threads from different
147                                         // teams attempt to frequently read and modify the same cache lines.
148                                         //
149                                         // On the AMD Ryzen 7 CPU for example, the 8 cores on the CPU are split into
150                                         // 2 CCX units of 4 cores each. Each CCX has a separate L3 cache, so if both
151                                         // CCXs are operating on the same data, many cycles will be spent keeping the
152                                         // two caches coherent.
153                                         if (procInfo->numTeamMasks < btProcessorInfo::maxNumTeamMasks)
154                                         {
155                                                 procInfo->processorTeamMasks[procInfo->numTeamMasks] = info->ProcessorMask;
156                                                 procInfo->numTeamMasks++;
157                                         }
158                                 }
159                                 break;
160
161                         case RelationProcessorPackage:
162                                 procInfo->numPhysicalPackages++;
163                                 break;
164                 }
165         }
166         free(buf);
167 #endif
168 }
169
170 ///btThreadSupportWin32 helps to initialize/shutdown libspe2, start/stop SPU tasks and communication
171 class btThreadSupportWin32 : public btThreadSupportInterface
172 {
173 public:
174         struct btThreadStatus
175         {
176                 int m_taskId;
177                 int m_commandId;
178                 int m_status;
179
180                 ThreadFunc m_userThreadFunc;
181                 void* m_userPtr;  //for taskDesc etc
182
183                 void* m_threadHandle;  //this one is calling 'Win32ThreadFunc'
184
185                 void* m_eventStartHandle;
186                 char m_eventStartHandleName[32];
187
188                 void* m_eventCompleteHandle;
189                 char m_eventCompleteHandleName[32];
190         };
191
192 private:
193         btAlignedObjectArray<btThreadStatus> m_activeThreadStatus;
194         btAlignedObjectArray<void*> m_completeHandles;
195         int m_numThreads;
196         DWORD_PTR m_startedThreadMask;
197         btProcessorInfo m_processorInfo;
198
199         void startThreads(const ConstructionInfo& threadInfo);
200         void stopThreads();
201         int waitForResponse();
202
203 public:
204         btThreadSupportWin32(const ConstructionInfo& threadConstructionInfo);
205         virtual ~btThreadSupportWin32();
206
207         virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; }
208         virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return countSetBits(m_processorInfo.processorTeamMasks[0]); }
209         virtual int getLogicalToPhysicalCoreRatio() const BT_OVERRIDE { return m_processorInfo.numLogicalProcessors / m_processorInfo.numCores; }
210
211         virtual void runTask(int threadIndex, void* userData) BT_OVERRIDE;
212         virtual void waitForAllTasks() BT_OVERRIDE;
213
214         virtual btCriticalSection* createCriticalSection() BT_OVERRIDE;
215         virtual void deleteCriticalSection(btCriticalSection* criticalSection) BT_OVERRIDE;
216 };
217
218 btThreadSupportWin32::btThreadSupportWin32(const ConstructionInfo& threadConstructionInfo)
219 {
220         startThreads(threadConstructionInfo);
221 }
222
223 btThreadSupportWin32::~btThreadSupportWin32()
224 {
225         stopThreads();
226 }
227
228 DWORD WINAPI win32threadStartFunc(LPVOID lpParam)
229 {
230         btThreadSupportWin32::btThreadStatus* status = (btThreadSupportWin32::btThreadStatus*)lpParam;
231
232         while (1)
233         {
234                 WaitForSingleObject(status->m_eventStartHandle, INFINITE);
235                 void* userPtr = status->m_userPtr;
236
237                 if (userPtr)
238                 {
239                         btAssert(status->m_status);
240                         status->m_userThreadFunc(userPtr);
241                         status->m_status = 2;
242                         SetEvent(status->m_eventCompleteHandle);
243                 }
244                 else
245                 {
246                         //exit Thread
247                         status->m_status = 3;
248                         printf("Thread with taskId %i with handle %p exiting\n", status->m_taskId, status->m_threadHandle);
249                         SetEvent(status->m_eventCompleteHandle);
250                         break;
251                 }
252         }
253         printf("Thread TERMINATED\n");
254         return 0;
255 }
256
257 void btThreadSupportWin32::runTask(int threadIndex, void* userData)
258 {
259         btThreadStatus& threadStatus = m_activeThreadStatus[threadIndex];
260         btAssert(threadIndex >= 0);
261         btAssert(int(threadIndex) < m_activeThreadStatus.size());
262
263         threadStatus.m_commandId = 1;
264         threadStatus.m_status = 1;
265         threadStatus.m_userPtr = userData;
266         m_startedThreadMask |= DWORD_PTR(1) << threadIndex;
267
268         ///fire event to start new task
269         SetEvent(threadStatus.m_eventStartHandle);
270 }
271
272 int btThreadSupportWin32::waitForResponse()
273 {
274         btAssert(m_activeThreadStatus.size());
275
276         int last = -1;
277         DWORD res = WaitForMultipleObjects(m_completeHandles.size(), &m_completeHandles[0], FALSE, INFINITE);
278         btAssert(res != WAIT_FAILED);
279         last = res - WAIT_OBJECT_0;
280
281         btThreadStatus& threadStatus = m_activeThreadStatus[last];
282         btAssert(threadStatus.m_threadHandle);
283         btAssert(threadStatus.m_eventCompleteHandle);
284
285         //WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE);
286         btAssert(threadStatus.m_status > 1);
287         threadStatus.m_status = 0;
288
289         ///need to find an active spu
290         btAssert(last >= 0);
291         m_startedThreadMask &= ~(DWORD_PTR(1) << last);
292
293         return last;
294 }
295
296 void btThreadSupportWin32::waitForAllTasks()
297 {
298         while (m_startedThreadMask)
299         {
300                 waitForResponse();
301         }
302 }
303
304 void btThreadSupportWin32::startThreads(const ConstructionInfo& threadConstructionInfo)
305 {
306         static int uniqueId = 0;
307         uniqueId++;
308         btProcessorInfo& procInfo = m_processorInfo;
309         getProcessorInformation(&procInfo);
310         DWORD_PTR dwProcessAffinityMask = 0;
311         DWORD_PTR dwSystemAffinityMask = 0;
312         if (!GetProcessAffinityMask(GetCurrentProcess(), &dwProcessAffinityMask, &dwSystemAffinityMask))
313         {
314                 dwProcessAffinityMask = 0;
315         }
316         ///The number of threads should be equal to the number of available cores - 1
317         m_numThreads = btMin(procInfo.numLogicalProcessors, int(BT_MAX_THREAD_COUNT)) - 1;  // cap to max thread count (-1 because main thread already exists)
318
319         m_activeThreadStatus.resize(m_numThreads);
320         m_completeHandles.resize(m_numThreads);
321         m_startedThreadMask = 0;
322
323         // set main thread affinity
324         if (DWORD_PTR mask = dwProcessAffinityMask & getProcessorTeamMask(procInfo, 0))
325         {
326                 SetThreadAffinityMask(GetCurrentThread(), mask);
327                 SetThreadIdealProcessor(GetCurrentThread(), 0);
328         }
329
330         for (int i = 0; i < m_numThreads; i++)
331         {
332                 printf("starting thread %d\n", i);
333
334                 btThreadStatus& threadStatus = m_activeThreadStatus[i];
335
336                 LPSECURITY_ATTRIBUTES lpThreadAttributes = NULL;
337                 SIZE_T dwStackSize = threadConstructionInfo.m_threadStackSize;
338                 LPTHREAD_START_ROUTINE lpStartAddress = &win32threadStartFunc;
339                 LPVOID lpParameter = &threadStatus;
340                 DWORD dwCreationFlags = 0;
341                 LPDWORD lpThreadId = 0;
342
343                 threadStatus.m_userPtr = 0;
344
345                 sprintf(threadStatus.m_eventStartHandleName, "es%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i);
346                 threadStatus.m_eventStartHandle = CreateEventA(0, false, false, threadStatus.m_eventStartHandleName);
347
348                 sprintf(threadStatus.m_eventCompleteHandleName, "ec%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i);
349                 threadStatus.m_eventCompleteHandle = CreateEventA(0, false, false, threadStatus.m_eventCompleteHandleName);
350
351                 m_completeHandles[i] = threadStatus.m_eventCompleteHandle;
352
353                 HANDLE handle = CreateThread(lpThreadAttributes, dwStackSize, lpStartAddress, lpParameter, dwCreationFlags, lpThreadId);
354                 //SetThreadPriority( handle, THREAD_PRIORITY_HIGHEST );
355                 // highest priority -- can cause erratic performance when numThreads > numCores
356                 //                     we don't want worker threads to be higher priority than the main thread or the main thread could get
357                 //                     totally shut out and unable to tell the workers to stop
358                 //SetThreadPriority( handle, THREAD_PRIORITY_BELOW_NORMAL );
359
360                 {
361                         int processorId = i + 1;  // leave processor 0 for main thread
362                         DWORD_PTR teamMask = getProcessorTeamMask(procInfo, processorId);
363                         if (teamMask)
364                         {
365                                 // bind each thread to only execute on processors of it's assigned team
366                                 //  - for single-socket Intel x86 CPUs this has no effect (only a single, shared L3 cache so there is only 1 team)
367                                 //  - for multi-socket Intel this will keep threads from migrating from one socket to another
368                                 //  - for AMD Ryzen this will keep threads from migrating from one CCX to another
369                                 DWORD_PTR mask = teamMask & dwProcessAffinityMask;
370                                 if (mask)
371                                 {
372                                         SetThreadAffinityMask(handle, mask);
373                                 }
374                         }
375                         SetThreadIdealProcessor(handle, processorId);
376                 }
377
378                 threadStatus.m_taskId = i;
379                 threadStatus.m_commandId = 0;
380                 threadStatus.m_status = 0;
381                 threadStatus.m_threadHandle = handle;
382                 threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
383
384                 printf("started %s thread %d with threadHandle %p\n", threadConstructionInfo.m_uniqueName, i, handle);
385         }
386 }
387
388 ///tell the task scheduler we are done with the SPU tasks
389 void btThreadSupportWin32::stopThreads()
390 {
391         for (int i = 0; i < m_activeThreadStatus.size(); i++)
392         {
393                 btThreadStatus& threadStatus = m_activeThreadStatus[i];
394                 if (threadStatus.m_status > 0)
395                 {
396                         WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE);
397                 }
398
399                 threadStatus.m_userPtr = NULL;
400                 SetEvent(threadStatus.m_eventStartHandle);
401                 WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE);
402
403                 CloseHandle(threadStatus.m_eventCompleteHandle);
404                 CloseHandle(threadStatus.m_eventStartHandle);
405                 CloseHandle(threadStatus.m_threadHandle);
406         }
407
408         m_activeThreadStatus.clear();
409         m_completeHandles.clear();
410 }
411
412 class btWin32CriticalSection : public btCriticalSection
413 {
414 private:
415         CRITICAL_SECTION mCriticalSection;
416
417 public:
418         btWin32CriticalSection()
419         {
420                 InitializeCriticalSection(&mCriticalSection);
421         }
422
423         ~btWin32CriticalSection()
424         {
425                 DeleteCriticalSection(&mCriticalSection);
426         }
427
428         void lock()
429         {
430                 EnterCriticalSection(&mCriticalSection);
431         }
432
433         void unlock()
434         {
435                 LeaveCriticalSection(&mCriticalSection);
436         }
437 };
438
439 btCriticalSection* btThreadSupportWin32::createCriticalSection()
440 {
441         unsigned char* mem = (unsigned char*)btAlignedAlloc(sizeof(btWin32CriticalSection), 16);
442         btWin32CriticalSection* cs = new (mem) btWin32CriticalSection();
443         return cs;
444 }
445
446 void btThreadSupportWin32::deleteCriticalSection(btCriticalSection* criticalSection)
447 {
448         criticalSection->~btCriticalSection();
449         btAlignedFree(criticalSection);
450 }
451
452 btThreadSupportInterface* btThreadSupportInterface::create(const ConstructionInfo& info)
453 {
454         return new btThreadSupportWin32(info);
455 }
456
457 #endif  //defined(_WIN32) && BT_THREADSAFE