Bullet Collision Detection & Physics Library
btThreadSupportWin32.cpp
Go to the documentation of this file.
1/*
2Bullet Continuous Collision Detection and Physics Library
3Copyright (c) 2003-2018 Erwin Coumans http://bulletphysics.com
4
5This software is provided 'as-is', without any express or implied warranty.
6In no event will the authors be held liable for any damages arising from the use of this software.
7Permission is granted to anyone to use this software for any purpose,
8including commercial applications, and to alter it and redistribute it freely,
9subject to the following restrictions:
10
111. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
122. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
133. This notice may not be removed or altered from any source distribution.
14*/
15
16#if defined(_WIN32) && BT_THREADSAFE
17
18#include "LinearMath/btScalar.h"
19#include "LinearMath/btMinMax.h"
23#include <windows.h>
24#include <stdio.h>
25
26struct btProcessorInfo
27{
28 int numLogicalProcessors;
29 int numCores;
30 int numNumaNodes;
31 int numL1Cache;
32 int numL2Cache;
33 int numL3Cache;
34 int numPhysicalPackages;
35 static const int maxNumTeamMasks = 32;
36 int numTeamMasks;
37 UINT64 processorTeamMasks[maxNumTeamMasks];
38};
39
40UINT64 getProcessorTeamMask(const btProcessorInfo& procInfo, int procId)
41{
42 UINT64 procMask = UINT64(1) << procId;
43 for (int i = 0; i < procInfo.numTeamMasks; ++i)
44 {
45 if (procMask & procInfo.processorTeamMasks[i])
46 {
47 return procInfo.processorTeamMasks[i];
48 }
49 }
50 return 0;
51}
52
53int getProcessorTeamIndex(const btProcessorInfo& procInfo, int procId)
54{
55 UINT64 procMask = UINT64(1) << procId;
56 for (int i = 0; i < procInfo.numTeamMasks; ++i)
57 {
58 if (procMask & procInfo.processorTeamMasks[i])
59 {
60 return i;
61 }
62 }
63 return -1;
64}
65
66int countSetBits(ULONG64 bits)
67{
68 int count = 0;
69 while (bits)
70 {
71 if (bits & 1)
72 {
73 count++;
74 }
75 bits >>= 1;
76 }
77 return count;
78}
79
80typedef BOOL(WINAPI* Pfn_GetLogicalProcessorInformation)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
81
82void getProcessorInformation(btProcessorInfo* procInfo)
83{
84 memset(procInfo, 0, sizeof(*procInfo));
85#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) && !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
86 // Can't dlopen libraries on UWP.
87 return;
88#else
89 Pfn_GetLogicalProcessorInformation getLogicalProcInfo =
90 (Pfn_GetLogicalProcessorInformation)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
91 if (getLogicalProcInfo == NULL)
92 {
93 // no info
94 return;
95 }
96 PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buf = NULL;
97 DWORD bufSize = 0;
98 while (true)
99 {
100 if (getLogicalProcInfo(buf, &bufSize))
101 {
102 break;
103 }
104 else
105 {
106 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER)
107 {
108 if (buf)
109 {
110 free(buf);
111 }
112 buf = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(bufSize);
113 }
114 }
115 }
116
117 int len = bufSize / sizeof(*buf);
118 for (int i = 0; i < len; ++i)
119 {
120 PSYSTEM_LOGICAL_PROCESSOR_INFORMATION info = buf + i;
121 switch (info->Relationship)
122 {
123 case RelationNumaNode:
124 procInfo->numNumaNodes++;
125 break;
126
127 case RelationProcessorCore:
128 procInfo->numCores++;
129 procInfo->numLogicalProcessors += countSetBits(info->ProcessorMask);
130 break;
131
132 case RelationCache:
133 if (info->Cache.Level == 1)
134 {
135 procInfo->numL1Cache++;
136 }
137 else if (info->Cache.Level == 2)
138 {
139 procInfo->numL2Cache++;
140 }
141 else if (info->Cache.Level == 3)
142 {
143 procInfo->numL3Cache++;
144 // processors that share L3 cache are considered to be on the same team
145 // because they can more easily work together on the same data.
146 // Large performance penalties will occur if 2 or more threads from different
147 // teams attempt to frequently read and modify the same cache lines.
148 //
149 // On the AMD Ryzen 7 CPU for example, the 8 cores on the CPU are split into
150 // 2 CCX units of 4 cores each. Each CCX has a separate L3 cache, so if both
151 // CCXs are operating on the same data, many cycles will be spent keeping the
152 // two caches coherent.
153 if (procInfo->numTeamMasks < btProcessorInfo::maxNumTeamMasks)
154 {
155 procInfo->processorTeamMasks[procInfo->numTeamMasks] = info->ProcessorMask;
156 procInfo->numTeamMasks++;
157 }
158 }
159 break;
160
161 case RelationProcessorPackage:
162 procInfo->numPhysicalPackages++;
163 break;
164 }
165 }
166 free(buf);
167#endif
168}
169
171class btThreadSupportWin32 : public btThreadSupportInterface
172{
173public:
174 struct btThreadStatus
175 {
176 int m_taskId;
177 int m_commandId;
178 int m_status;
179
180 ThreadFunc m_userThreadFunc;
181 void* m_userPtr; //for taskDesc etc
182
183 void* m_threadHandle; //this one is calling 'Win32ThreadFunc'
184
185 void* m_eventStartHandle;
186 char m_eventStartHandleName[32];
187
188 void* m_eventCompleteHandle;
189 char m_eventCompleteHandleName[32];
190 };
191
192private:
193 btAlignedObjectArray<btThreadStatus> m_activeThreadStatus;
194 btAlignedObjectArray<void*> m_completeHandles;
195 int m_numThreads;
196 DWORD_PTR m_startedThreadMask;
197 btProcessorInfo m_processorInfo;
198
199 void startThreads(const ConstructionInfo& threadInfo);
200 void stopThreads();
201 int waitForResponse();
202
203public:
204 btThreadSupportWin32(const ConstructionInfo& threadConstructionInfo);
205 virtual ~btThreadSupportWin32();
206
207 virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; }
208 virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return countSetBits(m_processorInfo.processorTeamMasks[0]); }
209 virtual int getLogicalToPhysicalCoreRatio() const BT_OVERRIDE { return m_processorInfo.numLogicalProcessors / m_processorInfo.numCores; }
210
211 virtual void runTask(int threadIndex, void* userData) BT_OVERRIDE;
212 virtual void waitForAllTasks() BT_OVERRIDE;
213
214 virtual btCriticalSection* createCriticalSection() BT_OVERRIDE;
215 virtual void deleteCriticalSection(btCriticalSection* criticalSection) BT_OVERRIDE;
216};
217
218btThreadSupportWin32::btThreadSupportWin32(const ConstructionInfo& threadConstructionInfo)
219{
220 startThreads(threadConstructionInfo);
221}
222
223btThreadSupportWin32::~btThreadSupportWin32()
224{
225 stopThreads();
226}
227
228DWORD WINAPI win32threadStartFunc(LPVOID lpParam)
229{
230 btThreadSupportWin32::btThreadStatus* status = (btThreadSupportWin32::btThreadStatus*)lpParam;
231
232 while (1)
233 {
234 WaitForSingleObject(status->m_eventStartHandle, INFINITE);
235 void* userPtr = status->m_userPtr;
236
237 if (userPtr)
238 {
239 btAssert(status->m_status);
240 status->m_userThreadFunc(userPtr);
241 status->m_status = 2;
242 SetEvent(status->m_eventCompleteHandle);
243 }
244 else
245 {
246 //exit Thread
247 status->m_status = 3;
248 printf("Thread with taskId %i with handle %p exiting\n", status->m_taskId, status->m_threadHandle);
249 SetEvent(status->m_eventCompleteHandle);
250 break;
251 }
252 }
253 printf("Thread TERMINATED\n");
254 return 0;
255}
256
257void btThreadSupportWin32::runTask(int threadIndex, void* userData)
258{
259 btThreadStatus& threadStatus = m_activeThreadStatus[threadIndex];
260 btAssert(threadIndex >= 0);
261 btAssert(int(threadIndex) < m_activeThreadStatus.size());
262
263 threadStatus.m_commandId = 1;
264 threadStatus.m_status = 1;
265 threadStatus.m_userPtr = userData;
266 m_startedThreadMask |= DWORD_PTR(1) << threadIndex;
267
269 SetEvent(threadStatus.m_eventStartHandle);
270}
271
272int btThreadSupportWin32::waitForResponse()
273{
274 btAssert(m_activeThreadStatus.size());
275
276 int last = -1;
277 DWORD res = WaitForMultipleObjects(m_completeHandles.size(), &m_completeHandles[0], FALSE, INFINITE);
278 btAssert(res != WAIT_FAILED);
279 last = res - WAIT_OBJECT_0;
280
281 btThreadStatus& threadStatus = m_activeThreadStatus[last];
282 btAssert(threadStatus.m_threadHandle);
283 btAssert(threadStatus.m_eventCompleteHandle);
284
285 //WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE);
286 btAssert(threadStatus.m_status > 1);
287 threadStatus.m_status = 0;
288
290 btAssert(last >= 0);
291 m_startedThreadMask &= ~(DWORD_PTR(1) << last);
292
293 return last;
294}
295
296void btThreadSupportWin32::waitForAllTasks()
297{
298 while (m_startedThreadMask)
299 {
300 waitForResponse();
301 }
302}
303
304void btThreadSupportWin32::startThreads(const ConstructionInfo& threadConstructionInfo)
305{
306 static int uniqueId = 0;
307 uniqueId++;
308 btProcessorInfo& procInfo = m_processorInfo;
309 getProcessorInformation(&procInfo);
310 DWORD_PTR dwProcessAffinityMask = 0;
311 DWORD_PTR dwSystemAffinityMask = 0;
312 if (!GetProcessAffinityMask(GetCurrentProcess(), &dwProcessAffinityMask, &dwSystemAffinityMask))
313 {
314 dwProcessAffinityMask = 0;
315 }
317 m_numThreads = btMin(procInfo.numLogicalProcessors, int(BT_MAX_THREAD_COUNT)) - 1; // cap to max thread count (-1 because main thread already exists)
318
319 m_activeThreadStatus.resize(m_numThreads);
320 m_completeHandles.resize(m_numThreads);
321 m_startedThreadMask = 0;
322
323 // set main thread affinity
324 if (DWORD_PTR mask = dwProcessAffinityMask & getProcessorTeamMask(procInfo, 0))
325 {
326 SetThreadAffinityMask(GetCurrentThread(), mask);
327 SetThreadIdealProcessor(GetCurrentThread(), 0);
328 }
329
330 for (int i = 0; i < m_numThreads; i++)
331 {
332 printf("starting thread %d\n", i);
333
334 btThreadStatus& threadStatus = m_activeThreadStatus[i];
335
336 LPSECURITY_ATTRIBUTES lpThreadAttributes = NULL;
337 SIZE_T dwStackSize = threadConstructionInfo.m_threadStackSize;
338 LPTHREAD_START_ROUTINE lpStartAddress = &win32threadStartFunc;
339 LPVOID lpParameter = &threadStatus;
340 DWORD dwCreationFlags = 0;
341 LPDWORD lpThreadId = 0;
342
343 threadStatus.m_userPtr = 0;
344
345 sprintf(threadStatus.m_eventStartHandleName, "es%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i);
346 threadStatus.m_eventStartHandle = CreateEventA(0, false, false, threadStatus.m_eventStartHandleName);
347
348 sprintf(threadStatus.m_eventCompleteHandleName, "ec%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i);
349 threadStatus.m_eventCompleteHandle = CreateEventA(0, false, false, threadStatus.m_eventCompleteHandleName);
350
351 m_completeHandles[i] = threadStatus.m_eventCompleteHandle;
352
353 HANDLE handle = CreateThread(lpThreadAttributes, dwStackSize, lpStartAddress, lpParameter, dwCreationFlags, lpThreadId);
354 //SetThreadPriority( handle, THREAD_PRIORITY_HIGHEST );
355 // highest priority -- can cause erratic performance when numThreads > numCores
356 // we don't want worker threads to be higher priority than the main thread or the main thread could get
357 // totally shut out and unable to tell the workers to stop
358 //SetThreadPriority( handle, THREAD_PRIORITY_BELOW_NORMAL );
359
360 {
361 int processorId = i + 1; // leave processor 0 for main thread
362 DWORD_PTR teamMask = getProcessorTeamMask(procInfo, processorId);
363 if (teamMask)
364 {
365 // bind each thread to only execute on processors of it's assigned team
366 // - for single-socket Intel x86 CPUs this has no effect (only a single, shared L3 cache so there is only 1 team)
367 // - for multi-socket Intel this will keep threads from migrating from one socket to another
368 // - for AMD Ryzen this will keep threads from migrating from one CCX to another
369 DWORD_PTR mask = teamMask & dwProcessAffinityMask;
370 if (mask)
371 {
372 SetThreadAffinityMask(handle, mask);
373 }
374 }
375 SetThreadIdealProcessor(handle, processorId);
376 }
377
378 threadStatus.m_taskId = i;
379 threadStatus.m_commandId = 0;
380 threadStatus.m_status = 0;
381 threadStatus.m_threadHandle = handle;
382 threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
383
384 printf("started %s thread %d with threadHandle %p\n", threadConstructionInfo.m_uniqueName, i, handle);
385 }
386}
387
389void btThreadSupportWin32::stopThreads()
390{
391 for (int i = 0; i < m_activeThreadStatus.size(); i++)
392 {
393 btThreadStatus& threadStatus = m_activeThreadStatus[i];
394 if (threadStatus.m_status > 0)
395 {
396 WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE);
397 }
398
399 threadStatus.m_userPtr = NULL;
400 SetEvent(threadStatus.m_eventStartHandle);
401 WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE);
402
403 CloseHandle(threadStatus.m_eventCompleteHandle);
404 CloseHandle(threadStatus.m_eventStartHandle);
405 CloseHandle(threadStatus.m_threadHandle);
406 }
407
408 m_activeThreadStatus.clear();
409 m_completeHandles.clear();
410}
411
412class btWin32CriticalSection : public btCriticalSection
413{
414private:
415 CRITICAL_SECTION mCriticalSection;
416
417public:
418 btWin32CriticalSection()
419 {
420 InitializeCriticalSection(&mCriticalSection);
421 }
422
423 ~btWin32CriticalSection()
424 {
425 DeleteCriticalSection(&mCriticalSection);
426 }
427
428 void lock()
429 {
430 EnterCriticalSection(&mCriticalSection);
431 }
432
433 void unlock()
434 {
435 LeaveCriticalSection(&mCriticalSection);
436 }
437};
438
439btCriticalSection* btThreadSupportWin32::createCriticalSection()
440{
441 unsigned char* mem = (unsigned char*)btAlignedAlloc(sizeof(btWin32CriticalSection), 16);
442 btWin32CriticalSection* cs = new (mem) btWin32CriticalSection();
443 return cs;
444}
445
446void btThreadSupportWin32::deleteCriticalSection(btCriticalSection* criticalSection)
447{
448 criticalSection->~btCriticalSection();
449 btAlignedFree(criticalSection);
450}
451
453{
454 return new btThreadSupportWin32(info);
455}
456
457#endif //defined(_WIN32) && BT_THREADSAFE
#define btAlignedFree(ptr)
#define btAlignedAlloc(size, alignment)
const T & btMin(const T &a, const T &b)
Definition: btMinMax.h:21
static int uniqueId
Definition: btRigidBody.cpp:27
#define btAssert(x)
Definition: btScalar.h:153
#define BT_OVERRIDE
Definition: btThreads.h:26
const unsigned int BT_MAX_THREAD_COUNT
Definition: btThreads.h:31
The btAlignedObjectArray template class uses a subset of the stl::vector interface for its methods It...
int size() const
return the number of elements in the array
void resize(int newsize, const T &fillData=T())
void clear()
clear the array, deallocated memory. Generally it is better to use array.resize(0),...
virtual int getCacheFriendlyNumThreads() const =0
virtual int getLogicalToPhysicalCoreRatio() const =0
virtual void waitForAllTasks()=0
static btThreadSupportInterface * create(const ConstructionInfo &info)
virtual void runTask(int threadIndex, void *userData)=0
virtual int getNumWorkerThreads() const =0