// File-level switches for the GPU PGS contact solver. Most of them choose a
// host (CPU) code path over the corresponding OpenCL path inside solveContacts.

// When true, solveContacts sizes m_batchSizes for a single set of
// B3_MAX_NUM_BATCHES entries instead of one set per solver cell.
2 bool gUseLargeBatches = false;
// When true, solveContacts batches the contacts on the host instead of calling
// b3Solver::batchContacts.
3 bool gCpuBatchContacts = false;
// NOTE(review): presumably selects a host constraint solve — confirm against its users.
4 bool gCpuSolveConstraint = false;
// NOTE(review): presumably selects the host quickSort over the GPU radix sort — confirm.
5 bool gCpuRadixSort = false;
// When true, solveContacts fills the sort keys with SetSortDataCPU instead of
// launching m_setSortDataKernel.
6 bool gCpuSetSortData = false;
// When true, the determinism sort of the contacts is done on the host with
// quickSort(b3ContactCmp) instead of the GPU kernels.
7 bool gCpuSortContactsDeterminism = false;
// When true, the reordered contacts are copied back with copyFromOpenCLArray
// instead of m_copyConstraintKernel.
8 bool gUseCpuCopyConstraints = false;
// NOTE(review): presumably selects the host bound search / prefix scan — confirm.
9 bool gUseScanHost = false;
// When true, contacts are reordered by sort key on the host instead of with
// m_reorderContactKernel.
10 bool gReorderContactsOnCpu = false;
// Enables the contact sort at the start of solveContacts (GPU or CPU variant).
12 bool optionalSortContactsDeterminism = true;
14 #include "b3GpuPgsContactSolver.h"
15 #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
17 #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
18 #include "Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h"
19 #include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
21 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
22 #include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
// Paths of the OpenCL kernel source files. The constructor passes each one as
// the last argument of b3OpenCLUtils::compileCLProgramFromString, together with
// the corresponding source string (solverSetupCL, solveContactCL, ...).
25 #define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"
26 #define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl"
27 #define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl"
28 #define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl"
29 #define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl"
30 #define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl"
32 #include "kernels/solverSetup.h"
33 #include "kernels/solverSetup2.h"
34 #include "kernels/solveContact.h"
35 #include "kernels/solveFriction.h"
36 #include "kernels/batchingKernels.h"
37 #include "kernels/batchingKernelsNew.h"
// Private state of b3GpuPgsContactSolver: OpenCL handles, compiled kernels,
// helper objects and work buffers. The pointer members are allocated in the
// solver's constructor and released one by one in its destructor.
39 struct b3GpuBatchingPgsSolverInternalData
// OpenCL device and command queue used for every launch and buffer.
42 cl_device_id m_device;
43 cl_command_queue m_queue;
// Constraint array, created with pairCapacity elements; solveContacts uses it
// as the constraint output buffer.
47 b3OpenCLArray<b3GpuConstraint4>* m_contactCGPU;
// Per-cell arrays, both resized to B3_SOLVER_N_CELLS in the constructor.
48 b3OpenCLArray<unsigned int>* m_numConstraints;
49 b3OpenCLArray<unsigned int>* m_offsets;
// Helper solver object; solveContacts uses its buffers, sort/search/scan
// objects and several of its kernels.
51 b3Solver* m_solverGPU;
// Kernels compiled in the constructor ("CreateBatches", "CreateBatchesNew",
// "BatchSolveKernelContact", "solveSingleContactKernel", ...).
53 cl_kernel m_batchingKernel;
54 cl_kernel m_batchingKernelNew;
55 cl_kernel m_solveContactKernel;
56 cl_kernel m_solveSingleContactKernel;
57 cl_kernel m_solveSingleFrictionKernel;
58 cl_kernel m_solveFrictionKernel;
59 cl_kernel m_contactToConstraintKernel;
60 cl_kernel m_setSortDataKernel;
61 cl_kernel m_reorderContactKernel;
62 cl_kernel m_copyConstraintKernel;
// Kernels that write the sort keys for the four passes of the determinism sort.
64 cl_kernel m_setDeterminismSortDataBodyAKernel;
65 cl_kernel m_setDeterminismSortDataBodyBKernel;
66 cl_kernel m_setDeterminismSortDataChildShapeAKernel;
67 cl_kernel m_setDeterminismSortDataChildShapeBKernel;
// Sort, bound-search and prefix-scan helpers created in the constructor.
69 class b3RadixSort32CL* m_sort32;
70 class b3BoundSearchCL* m_search;
71 class b3PrefixScanCL* m_scan;
// m_sortDataBuffer is created with pairCapacity rounded via B3NEXTMULTIPLEOF(.., 512).
73 b3OpenCLArray<b3SortData>* m_sortDataBuffer;
74 b3OpenCLArray<b3Contact4>* m_contactBuffer;
// Wrappers that solveContacts points at the caller's cl_mem buffers with
// setFromOpenCLBuffer (bodies, inertias, contacts).
76 b3OpenCLArray<b3RigidBodyData>* m_bodyBufferGPU;
77 b3OpenCLArray<b3InertiaData>* m_inertiaBufferGPU;
78 b3OpenCLArray<b3Contact4>* m_pBufContactOutGPU;
// Scratch copy of the contacts and the key/value pairs used by the GPU
// determinism sort; both are resized to numContacts there.
80 b3OpenCLArray<b3Contact4>* m_pBufContactOutGPUCopy;
81 b3OpenCLArray<b3SortData>* m_contactKeyValues;
// Host-side arrays. NOTE(review): their users are not in this part of the file — confirm.
83 b3AlignedObjectArray<unsigned int> m_idxBuffer;
84 b3AlignedObjectArray<b3SortData> m_sortData;
85 b3AlignedObjectArray<b3Contact4> m_old;
// Host batch sizes filled by the CPU batching path in solveContacts, and the
// matching device array.
87 b3AlignedObjectArray<int> m_batchSizes;
88 b3OpenCLArray<int>* m_batchSizesGpu;
// Constructs the solver: allocates the internal data block, creates the device
// buffers and helper objects, and compiles all OpenCL programs and kernels.
//   ctx, device, q - OpenCL context, device and command queue to work with.
//   pairCapacity   - used to size m_sortDataBuffer (rounded with B3NEXTMULTIPLEOF)
//                    and m_contactCGPU.
// Every compile result is checked with b3Assert only.
91 b3GpuPgsContactSolver::b3GpuPgsContactSolver(cl_context ctx, cl_device_id device, cl_command_queue q, int pairCapacity)
94 m_data = new b3GpuBatchingPgsSolverInternalData;
95 m_data->m_context = ctx;
96 m_data->m_device = device;
98 m_data->m_pairCapacity = pairCapacity;
99 m_data->m_nIterations = 4;
// Buffers that start empty; solveContacts attaches or resizes them per call.
100 m_data->m_batchSizesGpu = new b3OpenCLArray<int>(ctx, q);
101 m_data->m_bodyBufferGPU = new b3OpenCLArray<b3RigidBodyData>(ctx, q);
102 m_data->m_inertiaBufferGPU = new b3OpenCLArray<b3InertiaData>(ctx, q);
103 m_data->m_pBufContactOutGPU = new b3OpenCLArray<b3Contact4>(ctx, q);
105 m_data->m_pBufContactOutGPUCopy = new b3OpenCLArray<b3Contact4>(ctx, q);
106 m_data->m_contactKeyValues = new b3OpenCLArray<b3SortData>(ctx, q);
// NOTE(review): the fixed 512 * 1024 is presumably a pair capacity for b3Solver — confirm.
108 m_data->m_solverGPU = new b3Solver(ctx, device, q, 512 * 1024);
// Sort / scan / search helpers; scan and search are sized with B3_SOLVER_N_CELLS.
110 m_data->m_sort32 = new b3RadixSort32CL(ctx, device, m_data->m_queue);
111 m_data->m_scan = new b3PrefixScanCL(ctx, device, m_data->m_queue, B3_SOLVER_N_CELLS);
112 m_data->m_search = new b3BoundSearchCL(ctx, device, m_data->m_queue, B3_SOLVER_N_CELLS);
114 const int sortSize = B3NEXTMULTIPLEOF(pairCapacity, 512);
116 m_data->m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx, m_data->m_queue, sortSize);
117 m_data->m_contactBuffer = new b3OpenCLArray<b3Contact4>(ctx, m_data->m_queue);
// Per-cell constraint counts and offsets: one entry per solver cell.
119 m_data->m_numConstraints = new b3OpenCLArray<unsigned int>(ctx, m_data->m_queue, B3_SOLVER_N_CELLS);
120 m_data->m_numConstraints->resize(B3_SOLVER_N_CELLS);
122 m_data->m_contactCGPU = new b3OpenCLArray<b3GpuConstraint4>(ctx, q, pairCapacity);
124 m_data->m_offsets = new b3OpenCLArray<unsigned int>(ctx, m_data->m_queue, B3_SOLVER_N_CELLS);
125 m_data->m_offsets->resize(B3_SOLVER_N_CELLS);
// No extra preprocessor macros are passed to the kernel compiler.
126 const char* additionalMacros = "";
127 //const char* srcFileNameForCaching="";
// Kernel source strings (the *CL symbols from the kernels/*.h headers).
130 const char* batchKernelSource = batchingKernelsCL;
131 const char* batchKernelNewSource = batchingKernelsNewCL;
132 const char* solverSetupSource = solverSetupCL;
133 const char* solverSetup2Source = solverSetup2CL;
134 const char* solveContactSource = solveContactCL;
135 const char* solveFrictionSource = solveFrictionCL;
// Build one program per source string, then create the kernels from them.
// NOTE(review): the cl_program handles below are locals and no clReleaseProgram
// call appears for them here — confirm whether they are meant to stay alive.
138 cl_program solveContactProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveContactSource, &pErrNum, additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
139 b3Assert(solveContactProg);
141 cl_program solveFrictionProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveFrictionSource, &pErrNum, additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
142 b3Assert(solveFrictionProg);
144 cl_program solverSetup2Prog = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetup2Source, &pErrNum, additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
146 b3Assert(solverSetup2Prog);
148 cl_program solverSetupProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetupSource, &pErrNum, additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
149 b3Assert(solverSetupProg);
// Solve kernels (batched and single-batch variants for contact and friction).
151 m_data->m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg, additionalMacros);
152 b3Assert(m_data->m_solveFrictionKernel);
154 m_data->m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg, additionalMacros);
155 b3Assert(m_data->m_solveContactKernel);
157 m_data->m_solveSingleContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "solveSingleContactKernel", &pErrNum, solveContactProg, additionalMacros);
158 b3Assert(m_data->m_solveSingleContactKernel);
160 m_data->m_solveSingleFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "solveSingleFrictionKernel", &pErrNum, solveFrictionProg, additionalMacros);
161 b3Assert(m_data->m_solveSingleFrictionKernel);
// Setup kernels from solverSetup / solverSetup2.
163 m_data->m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg, additionalMacros);
164 b3Assert(m_data->m_contactToConstraintKernel);
166 m_data->m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog, additionalMacros);
167 b3Assert(m_data->m_setSortDataKernel);
// Key-setup kernels for the determinism sort passes.
169 m_data->m_setDeterminismSortDataBodyAKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyA", &pErrNum, solverSetup2Prog, additionalMacros);
170 b3Assert(m_data->m_setDeterminismSortDataBodyAKernel);
172 m_data->m_setDeterminismSortDataBodyBKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyB", &pErrNum, solverSetup2Prog, additionalMacros);
173 b3Assert(m_data->m_setDeterminismSortDataBodyBKernel);
175 m_data->m_setDeterminismSortDataChildShapeAKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeA", &pErrNum, solverSetup2Prog, additionalMacros);
176 b3Assert(m_data->m_setDeterminismSortDataChildShapeAKernel);
178 m_data->m_setDeterminismSortDataChildShapeBKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeB", &pErrNum, solverSetup2Prog, additionalMacros);
179 b3Assert(m_data->m_setDeterminismSortDataChildShapeBKernel);
181 m_data->m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog, additionalMacros);
182 b3Assert(m_data->m_reorderContactKernel);
184 m_data->m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog, additionalMacros);
185 b3Assert(m_data->m_copyConstraintKernel);
// Batching kernels (old and new variant), each from its own program.
189 cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelSource, &pErrNum, additionalMacros, B3_BATCHING_PATH);
190 b3Assert(batchingProg);
192 m_data->m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg, additionalMacros);
193 b3Assert(m_data->m_batchingKernel);
197 cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelNewSource, &pErrNum, additionalMacros, B3_BATCHING_NEW_PATH);
198 b3Assert(batchingNewProg);
200 m_data->m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg, additionalMacros);
201 b3Assert(m_data->m_batchingKernelNew);
// Releases what the constructor created: the heap-allocated buffers and helper
// objects are deleted, then every cl_kernel is released with clReleaseKernel.
205 b3GpuPgsContactSolver::~b3GpuPgsContactSolver()
// Device buffers.
207 delete m_data->m_batchSizesGpu;
208 delete m_data->m_bodyBufferGPU;
209 delete m_data->m_inertiaBufferGPU;
210 delete m_data->m_pBufContactOutGPU;
211 delete m_data->m_pBufContactOutGPUCopy;
212 delete m_data->m_contactKeyValues;
214 delete m_data->m_contactCGPU;
215 delete m_data->m_numConstraints;
216 delete m_data->m_offsets;
217 delete m_data->m_sortDataBuffer;
218 delete m_data->m_contactBuffer;
// Sort / scan / search helpers and the helper solver.
220 delete m_data->m_sort32;
221 delete m_data->m_scan;
222 delete m_data->m_search;
223 delete m_data->m_solverGPU;
// One release per kernel created in the constructor.
225 clReleaseKernel(m_data->m_batchingKernel);
226 clReleaseKernel(m_data->m_batchingKernelNew);
227 clReleaseKernel(m_data->m_solveSingleContactKernel);
228 clReleaseKernel(m_data->m_solveSingleFrictionKernel);
229 clReleaseKernel(m_data->m_solveContactKernel);
230 clReleaseKernel(m_data->m_solveFrictionKernel);
232 clReleaseKernel(m_data->m_contactToConstraintKernel);
233 clReleaseKernel(m_data->m_setSortDataKernel);
234 clReleaseKernel(m_data->m_reorderContactKernel);
235 clReleaseKernel(m_data->m_copyConstraintKernel);
237 clReleaseKernel(m_data->m_setDeterminismSortDataBodyAKernel);
238 clReleaseKernel(m_data->m_setDeterminismSortDataBodyBKernel);
239 clReleaseKernel(m_data->m_setDeterminismSortDataChildShapeAKernel);
240 clReleaseKernel(m_data->m_setDeterminismSortDataChildShapeBKernel);
// Settings used by solveContacts when it sets up and batches the contact
// constraints.
245 struct b3ConstraintCfg
// Sets m_positionDrift = 0.005f, m_positionConstraintCoeff = 0.2f, m_dt = dt and
// m_staticIdx = 0.
// NOTE(review): m_enableParallelSolve and m_batchCellSize are not initialised
// here; solveContacts assigns both right after constructing the object, but any
// other user must do the same before reading them.
247 b3ConstraintCfg(float dt = 0.f) : m_positionDrift(0.005f), m_positionConstraintCoeff(0.2f), m_dt(dt), m_staticIdx(0) {}
249 float m_positionDrift;
250 float m_positionConstraintCoeff;
// Selects the parallel-solve branch in solveContacts.
252 bool m_enableParallelSolve;
// solveContacts uses 1.f / m_batchCellSize as the scale for the cell-index computation.
253 float m_batchCellSize;
// Solves the constraints batch by batch using host-side batch sizes.
// batchSizes holds B3_MAX_NUM_BATCHES entries per cell; for every iteration,
// cell and batch slot one kernel is launched over numInBatch work items, and
// offset is advanced by numInBatch after each launch. All contact launches
// (m_solveSingleContactKernel) run first, then the same loops are repeated with
// m_solveSingleFrictionKernel.
//   bodyBuf, shapeBuf, constraint - the three kernel buffers.
//   numIterations                 - number of passes for each of the two phases.
//   batchSizes                    - host array of batch sizes (see above).
// additionalData, n and maxNumBatches are not referenced by the launches below.
257 void b3GpuPgsContactSolver::solveContactConstraintBatchSizes(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
258 b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes) //const b3OpenCLArray<int>* gpuBatchSizes)
260 B3_PROFILE("solveContactConstraintBatchSizes");
// Number of cells described by batchSizes.
261 int numBatches = batchSizes->size() / B3_MAX_NUM_BATCHES;
// Phase 1: contact constraints.
262 for (int iter = 0; iter < numIterations; iter++)
264 for (int cellId = 0; cellId < numBatches; cellId++)
267 for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++)
269 int numInBatch = batchSizes->at(cellId * B3_MAX_NUM_BATCHES + ii);
274 b3LauncherCL launcher(m_data->m_queue, m_data->m_solveSingleContactKernel, "m_solveSingleContactKernel");
275 launcher.setBuffer(bodyBuf->getBufferCL());
276 launcher.setBuffer(shapeBuf->getBufferCL());
277 launcher.setBuffer(constraint->getBufferCL());
278 launcher.setConst(cellId);
279 launcher.setConst(offset);
280 launcher.setConst(numInBatch);
281 launcher.launch1D(numInBatch);
// The next batch starts right after this one.
282 offset += numInBatch;
// Phase 2: friction constraints, same traversal.
288 for (int iter = 0; iter < numIterations; iter++)
290 for (int cellId = 0; cellId < numBatches; cellId++)
293 for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++)
295 int numInBatch = batchSizes->at(cellId * B3_MAX_NUM_BATCHES + ii);
300 b3LauncherCL launcher(m_data->m_queue, m_data->m_solveSingleFrictionKernel, "m_solveSingleFrictionKernel");
301 launcher.setBuffer(bodyBuf->getBufferCL());
302 launcher.setBuffer(shapeBuf->getBufferCL());
303 launcher.setBuffer(constraint->getBufferCL());
304 launcher.setConst(cellId);
305 launcher.setConst(offset);
306 launcher.setConst(numInBatch);
307 launcher.launch1D(numInBatch);
308 offset += numInBatch;
// Runs the batched contact solve followed by the batched friction solve.
// For each of numIterations iterations and each of the B3_SOLVER_N_BATCHES
// batches, m_solveContactKernel is launched; after a clFinish the same loops
// launch m_solveFrictionKernel, followed by another clFinish.
//   bodyBuf, shapeBuf, constraint - first three kernel buffers; constraint
//                                   counts, offsets and batch sizes are taken
//                                   from m_data->m_solverGPU.
//   n             - stored in cdata.x; the setConst call for it is commented out.
//   maxNumBatches - forwarded to the kernels as cdata.y.
//   numIterations - number of passes for each of the two phases.
315 void b3GpuPgsContactSolver::solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
316 b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes) //,const b3OpenCLArray<int>* gpuBatchSizes)
320 b3Int4 cdata = b3MakeInt4(n, 0, 0, 0);
322 const int nn = B3_SOLVER_N_CELLS;
325 cdata.y = maxNumBatches; //250;
// Launch size: 64 work items per cell of one batch.
327 int numWorkItems = 64 * nn / B3_SOLVER_N_BATCHES;
// Debug-info buffers. NOTE(review): presumably only built in a debug configuration — confirm.
329 SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
330 adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems);
// Phase 1: contact constraints.
334 B3_PROFILE("m_batchSolveKernel iterations");
335 for (int iter = 0; iter < numIterations; iter++)
337 for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
340 memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems);
341 gpuDebugInfo.write(debugInfo, numWorkItems);
346 b3LauncherCL launcher(m_data->m_queue, m_data->m_solveContactKernel, "m_solveContactKernel");
349 b3BufferInfoCL bInfo[] = {
351 b3BufferInfoCL(bodyBuf->getBufferCL()),
352 b3BufferInfoCL(shapeBuf->getBufferCL()),
353 b3BufferInfoCL(constraint->getBufferCL()),
354 b3BufferInfoCL(m_data->m_solverGPU->m_numConstraints->getBufferCL()),
355 b3BufferInfoCL(m_data->m_solverGPU->m_offsets->getBufferCL())
358 b3BufferInfoCL(&gpuDebugInfo)
362 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
363 launcher.setBuffer(m_data->m_solverGPU->m_batchSizes.getBufferCL());
364 //launcher.setConst( cdata.x );
365 launcher.setConst(cdata.y);
366 launcher.setConst(cdata.z);
// Grid split counts passed to the kernel.
368 nSplit.x = B3_SOLVER_N_SPLIT_X;
369 nSplit.y = B3_SOLVER_N_SPLIT_Y;
370 nSplit.z = B3_SOLVER_N_SPLIT_Z;
372 launcher.setConst(nSplit);
373 launcher.launch1D(numWorkItems, 64);
// Alternative launch that restores the launcher arguments from a serialized
// file. NOTE(review): confirm which preprocessor branch enables this path.
376 const char* fileName = "m_batchSolveKernel.bin";
377 FILE* f = fopen(fileName, "rb");
381 if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
383 printf("error, cannot get file size\n");
387 unsigned char* buf = (unsigned char*)malloc(sizeInBytes);
388 fread(buf, sizeInBytes, 1, f);
389 int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context);
// The work-item count is stored right after the serialized arguments.
390 int num = *(int*)&buf[serializedBytes];
392 launcher.launch1D(num);
394 //this clFinish is for testing on errors
// Read back and print the non-zero debug counters.
402 gpuDebugInfo.read(debugInfo, numWorkItems);
404 for (int i = 0; i < numWorkItems; i++)
406 if (debugInfo[i].m_valInt2 > 0)
// NOTE(review): the format has a single %d but two arguments are passed, so
// the index i is printed and the m_valInt2 value is never shown.
408 printf("debugInfo[i].m_valInt2 = %d\n", i, debugInfo[i].m_valInt2);
411 if (debugInfo[i].m_valInt3 > 0)
// NOTE(review): same format/argument mismatch as above for m_valInt3.
413 printf("debugInfo[i].m_valInt3 = %d\n", i, debugInfo[i].m_valInt3);
// Wait for the contact launches before starting the friction phase.
420 clFinish(m_data->m_queue);
424 bool applyFriction = true;
// Phase 2: friction constraints, same traversal and arguments.
427 B3_PROFILE("m_batchSolveKernel iterations2");
428 for (int iter = 0; iter < numIterations; iter++)
430 for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
434 b3BufferInfoCL bInfo[] = {
435 b3BufferInfoCL(bodyBuf->getBufferCL()),
436 b3BufferInfoCL(shapeBuf->getBufferCL()),
437 b3BufferInfoCL(constraint->getBufferCL()),
438 b3BufferInfoCL(m_data->m_solverGPU->m_numConstraints->getBufferCL()),
439 b3BufferInfoCL(m_data->m_solverGPU->m_offsets->getBufferCL())
442 b3BufferInfoCL(&gpuDebugInfo)
445 b3LauncherCL launcher(m_data->m_queue, m_data->m_solveFrictionKernel, "m_solveFrictionKernel");
446 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
447 launcher.setBuffer(m_data->m_solverGPU->m_batchSizes.getBufferCL());
448 //launcher.setConst( cdata.x );
449 launcher.setConst(cdata.y);
450 launcher.setConst(cdata.z);
453 nSplit.x = B3_SOLVER_N_SPLIT_X;
454 nSplit.y = B3_SOLVER_N_SPLIT_Y;
455 nSplit.z = B3_SOLVER_N_SPLIT_Z;
457 launcher.setConst(nSplit);
// Same launch size as numWorkItems in the contact phase.
459 launcher.launch1D(64 * nn / B3_SOLVER_N_BATCHES, 64);
462 clFinish(m_data->m_queue);
470 static bool sortfnc(const b3SortData& a, const b3SortData& b)
472 return (a.m_key < b.m_key);
475 static bool b3ContactCmp(const b3Contact4& p, const b3Contact4& q)
477 return ((p.m_bodyAPtrAndSignBit < q.m_bodyAPtrAndSignBit) ||
478 ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit < q.m_bodyBPtrAndSignBit)) ||
479 ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA < q.m_childIndexA) ||
480 ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA < q.m_childIndexA) ||
481 ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA == q.m_childIndexA && p.m_childIndexB < q.m_childIndexB));
// Compile-time selection of how SetSortDataCPU derives the sort key:
// USE_SPATIAL_BATCHING picks the position-based cell index, otherwise one of
// the lookup tables below is used (4x4 when USE_4x4_GRID is set, else 8x8).
484 #define USE_SPATIAL_BATCHING 1
485 #define USE_4x4_GRID 1
// NOTE(review): this guard tests whether the macro is defined (#ifndef), while
// SetSortDataCPU tests its value (#if). Setting USE_SPATIAL_BATCHING to 0 would
// select the table path there but still leave the tables out here.
487 #ifndef USE_SPATIAL_BATCHING
// Indexed with aa + bb * 4 in SetSortDataCPU.
488 static const int gridTable4x4[] =
// Indexed with aa + bb * 8 in SetSortDataCPU.
494 static const int gridTable8x8[] =
496 0, 2, 3, 16, 17, 18, 19, 1,
497 66, 64, 80, 67, 82, 81, 65, 83,
498 131, 144, 128, 130, 147, 129, 145, 146,
499 208, 195, 194, 192, 193, 211, 210, 209,
500 21, 22, 23, 5, 4, 6, 7, 20,
501 86, 85, 69, 87, 70, 68, 84, 71,
502 151, 133, 149, 150, 135, 148, 132, 134,
// NOTE(review): every other row is a permutation of {base..base+3, base+16..base+19};
// by that pattern the 27 in this row looks like a typo for 215 — confirm.
503 197, 27, 214, 213, 212, 199, 198, 196
// Host implementation of the sort-key setup: writes one b3SortData entry per
// contact.
//   gContact     - contacts to classify; nContacts entries are read.
//   gBodies      - body array, indexed with the absolute value of a contact's body index.
//   gSortDataOut - output; .x receives the key (newIndex), .y the contact index.
//   nContacts    - number of contacts to process.
//   scale        - factor applied to the body position before truncation to int.
//   nSplit       - grid split counts; each coordinate is masked with (split - 1).
//   staticIdx    - body index treated as static, in addition to negative indices.
509 void SetSortDataCPU(b3Contact4* gContact, b3RigidBodyData* gBodies, b3SortData* gSortDataOut, int nContacts, float scale, const b3Int4& nSplit, int staticIdx)
511 for (int gIdx = 0; gIdx < nContacts; gIdx++)
// NOTE(review): always true inside this loop; the check looks like a leftover
// from the kernel version, where the index can exceed nContacts.
513 if (gIdx < nContacts)
515 int aPtrAndSignBit = gContact[gIdx].m_bodyAPtrAndSignBit;
516 int bPtrAndSignBit = gContact[gIdx].m_bodyBPtrAndSignBit;
// The sign only flags the body; the magnitude is the index.
518 int aIdx = abs(aPtrAndSignBit);
519 int bIdx = abs(bPtrAndSignBit);
521 bool aStatic = (aPtrAndSignBit < 0) || (aPtrAndSignBit == staticIdx);
523 #if USE_SPATIAL_BATCHING
// Use body B's position when body A is static.
524 int idx = (aStatic) ? bIdx : aIdx;
525 b3Vector3 p = gBodies[idx].m_pos;
// Shift negative coordinates by one before scaling and truncating, then wrap
// into the grid. NOTE(review): the mask only acts as a modulo when the split
// counts are powers of two — confirm.
526 int xIdx = (int)((p.x - ((p.x < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.x - 1);
527 int yIdx = (int)((p.y - ((p.y < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.y - 1);
528 int zIdx = (int)((p.z - ((p.z < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.z - 1);
// Linear cell index from the three grid coordinates.
530 int newIndex = (xIdx + yIdx * nSplit.x + zIdx * nSplit.x * nSplit.y);
532 #else //USE_SPATIAL_BATCHING
533 bool bStatic = (bPtrAndSignBit < 0) || (bPtrAndSignBit == staticIdx);
// Table-based key: 4x4 or 8x8 lookup depending on USE_4x4_GRID.
543 int gridIndex = aa + bb * 4;
544 int newIndex = gridTable4x4[gridIndex];
553 int gridIndex = aa + bb * 8;
554 int newIndex = gridTable8x8[gridIndex];
555 #endif //USE_4x4_GRID
556 #endif //USE_SPATIAL_BATCHING
558 gSortDataOut[gIdx].x = newIndex;
559 gSortDataOut[gIdx].y = gIdx;
// NOTE(review): presumably the out-of-range branch of the check above, which
// this loop can never reach — confirm.
563 gSortDataOut[gIdx].x = 0xffffffff;
568 void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const b3Config& config, int static0Index)
570 B3_PROFILE("solveContacts");
571 m_data->m_bodyBufferGPU->setFromOpenCLBuffer(bodyBuf, numBodies);
572 m_data->m_inertiaBufferGPU->setFromOpenCLBuffer(inertiaBuf, numBodies);
573 m_data->m_pBufContactOutGPU->setFromOpenCLBuffer(contactBuf, numContacts);
575 if (optionalSortContactsDeterminism)
577 if (!gCpuSortContactsDeterminism)
579 B3_PROFILE("GPU Sort contact constraints (determinism)");
581 m_data->m_pBufContactOutGPUCopy->resize(numContacts);
582 m_data->m_contactKeyValues->resize(numContacts);
584 m_data->m_pBufContactOutGPU->copyToCL(m_data->m_pBufContactOutGPUCopy->getBufferCL(), numContacts, 0, 0);
587 b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeBKernel, "m_setDeterminismSortDataChildShapeBKernel");
588 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
589 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
590 launcher.setConst(numContacts);
591 launcher.launch1D(numContacts, 64);
593 m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
595 b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeAKernel, "m_setDeterminismSortDataChildShapeAKernel");
596 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
597 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
598 launcher.setConst(numContacts);
599 launcher.launch1D(numContacts, 64);
601 m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
603 b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyBKernel, "m_setDeterminismSortDataBodyBKernel");
604 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
605 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
606 launcher.setConst(numContacts);
607 launcher.launch1D(numContacts, 64);
610 m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
613 b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyAKernel, "m_setDeterminismSortDataBodyAKernel");
614 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
615 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
616 launcher.setConst(numContacts);
617 launcher.launch1D(numContacts, 64);
620 m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
623 B3_PROFILE("gpu reorderContactKernel (determinism)");
626 cdata.x = numContacts;
628 //b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL())
629 // , b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
630 b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_reorderContactKernel, "m_reorderContactKernel");
631 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
632 launcher.setBuffer(m_data->m_pBufContactOutGPU->getBufferCL());
633 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
634 launcher.setConst(cdata);
635 launcher.launch1D(numContacts, 64);
640 B3_PROFILE("CPU Sort contact constraints (determinism)");
641 b3AlignedObjectArray<b3Contact4> cpuConstraints;
642 m_data->m_pBufContactOutGPU->copyToHost(cpuConstraints);
646 cpuConstraints.quickSort(b3ContactCmp);
648 for (int i = 0; i < cpuConstraints.size(); i++)
650 cpuConstraints[i].m_batchIdx = i;
653 m_data->m_pBufContactOutGPU->copyFromHost(cpuConstraints);
654 if (m_debugOutput == 100)
656 for (int i = 0; i < cpuConstraints.size(); i++)
658 printf("c[%d].m_bodyA = %d, m_bodyB = %d, batchId = %d\n", i, cpuConstraints[i].m_bodyAPtrAndSignBit, cpuConstraints[i].m_bodyBPtrAndSignBit, cpuConstraints[i].m_batchIdx);
666 int nContactOut = m_data->m_pBufContactOutGPU->size();
668 bool useSolver = true;
673 b3ConstraintCfg csCfg(dt);
674 csCfg.m_enableParallelSolve = true;
675 csCfg.m_batchCellSize = 6;
676 csCfg.m_staticIdx = static0Index;
678 b3OpenCLArray<b3RigidBodyData>* bodyBuf = m_data->m_bodyBufferGPU;
680 void* additionalData = 0; //m_data->m_frictionCGPU;
681 const b3OpenCLArray<b3InertiaData>* shapeBuf = m_data->m_inertiaBufferGPU;
682 b3OpenCLArray<b3GpuConstraint4>* contactConstraintOut = m_data->m_contactCGPU;
683 int nContacts = nContactOut;
685 int maxNumBatches = 0;
687 if (!gUseLargeBatches)
689 if (m_data->m_solverGPU->m_contactBuffer2)
691 m_data->m_solverGPU->m_contactBuffer2->resize(nContacts);
694 if (m_data->m_solverGPU->m_contactBuffer2 == 0)
696 m_data->m_solverGPU->m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(m_data->m_context, m_data->m_queue, nContacts);
697 m_data->m_solverGPU->m_contactBuffer2->resize(nContacts);
700 //clFinish(m_data->m_queue);
703 B3_PROFILE("batching");
704 //@todo: just reserve it, without copy of original contact (unless we use warmstarting)
706 //const b3OpenCLArray<b3RigidBodyData>* bodyNative = bodyBuf;
709 //b3OpenCLArray<b3RigidBodyData>* bodyNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, bodyBuf );
710 //b3OpenCLArray<b3Contact4>* contactNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, contactsIn );
712 const int sortAlignment = 512; // todo. get this out of sort
713 if (csCfg.m_enableParallelSolve)
715 int sortSize = B3NEXTMULTIPLEOF(nContacts, sortAlignment);
717 b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
718 b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
720 if (!gCpuSetSortData)
722 B3_PROFILE("GPU set cell idx");
731 b3Assert(sortSize % 64 == 0);
733 cdata.m_nContacts = nContacts;
734 cdata.m_staticIdx = csCfg.m_staticIdx;
735 cdata.m_scale = 1.f / csCfg.m_batchCellSize;
736 cdata.m_nSplit.x = B3_SOLVER_N_SPLIT_X;
737 cdata.m_nSplit.y = B3_SOLVER_N_SPLIT_Y;
738 cdata.m_nSplit.z = B3_SOLVER_N_SPLIT_Z;
740 m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
742 b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL()), b3BufferInfoCL(bodyBuf->getBufferCL()), b3BufferInfoCL(m_data->m_solverGPU->m_sortDataBuffer->getBufferCL())};
743 b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_setSortDataKernel, "m_setSortDataKernel");
744 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
745 launcher.setConst(cdata.m_nContacts);
746 launcher.setConst(cdata.m_scale);
747 launcher.setConst(cdata.m_nSplit);
748 launcher.setConst(cdata.m_staticIdx);
750 launcher.launch1D(sortSize, 64);
754 m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
755 b3AlignedObjectArray<b3SortData> sortDataCPU;
756 m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataCPU);
758 b3AlignedObjectArray<b3Contact4> contactCPU;
759 m_data->m_pBufContactOutGPU->copyToHost(contactCPU);
760 b3AlignedObjectArray<b3RigidBodyData> bodiesCPU;
761 bodyBuf->copyToHost(bodiesCPU);
762 float scale = 1.f / csCfg.m_batchCellSize;
764 nSplit.x = B3_SOLVER_N_SPLIT_X;
765 nSplit.y = B3_SOLVER_N_SPLIT_Y;
766 nSplit.z = B3_SOLVER_N_SPLIT_Z;
768 SetSortDataCPU(&contactCPU[0], &bodiesCPU[0], &sortDataCPU[0], nContacts, scale, nSplit, csCfg.m_staticIdx);
770 m_data->m_solverGPU->m_sortDataBuffer->copyFromHost(sortDataCPU);
774 { // 3. sort by cell idx
775 B3_PROFILE("gpuRadixSort");
776 //int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
778 //if( n <= 0xffff ) sortBit = 16;
779 //if( n <= 0xff ) sortBit = 8;
780 //adl::RadixSort<adl::TYPE_CL>::execute( data->m_sort, *data->m_sortDataBuffer, sortSize );
781 //adl::RadixSort32<adl::TYPE_CL>::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize );
782 b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer);
783 this->m_data->m_solverGPU->m_sort32->execute(keyValuesInOut);
787 b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer);
788 b3AlignedObjectArray<b3SortData> hostValues;
789 keyValuesInOut.copyToHost(hostValues);
790 hostValues.quickSort(sortfnc);
791 keyValuesInOut.copyFromHost(hostValues);
797 B3_PROFILE("cpuBoundSearch");
798 b3AlignedObjectArray<unsigned int> countsHost;
799 countsNative->copyToHost(countsHost);
801 b3AlignedObjectArray<b3SortData> sortDataHost;
802 m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost);
804 //m_data->m_solverGPU->m_search->executeHost(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT);
805 m_data->m_solverGPU->m_search->executeHost(sortDataHost, nContacts, countsHost, B3_SOLVER_N_CELLS, b3BoundSearchCL::COUNT);
807 countsNative->copyFromHost(countsHost);
809 //adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative,
810 // B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT );
813 //m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum );
814 b3AlignedObjectArray<unsigned int> offsetsHost;
815 offsetsHost.resize(offsetsNative->size());
817 m_data->m_solverGPU->m_scan->executeHost(countsHost, offsetsHost, B3_SOLVER_N_CELLS); //,&sum );
818 offsetsNative->copyFromHost(offsetsHost);
820 //printf("sum = %d\n",sum);
825 B3_PROFILE("gpuBoundSearch");
826 m_data->m_solverGPU->m_search->execute(*m_data->m_solverGPU->m_sortDataBuffer, nContacts, *countsNative, B3_SOLVER_N_CELLS, b3BoundSearchCL::COUNT);
827 m_data->m_solverGPU->m_scan->execute(*countsNative, *offsetsNative, B3_SOLVER_N_CELLS); //,&sum );
831 { // 5. sort constraints by cellIdx
832 if (gReorderContactsOnCpu)
834 B3_PROFILE("cpu m_reorderContactKernel");
835 b3AlignedObjectArray<b3SortData> sortDataHost;
836 m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost);
837 b3AlignedObjectArray<b3Contact4> inContacts;
838 b3AlignedObjectArray<b3Contact4> outContacts;
839 m_data->m_pBufContactOutGPU->copyToHost(inContacts);
840 outContacts.resize(inContacts.size());
841 for (int i = 0; i < nContacts; i++)
843 int srcIdx = sortDataHost[i].y;
844 outContacts[i] = inContacts[srcIdx];
846 m_data->m_solverGPU->m_contactBuffer2->copyFromHost(outContacts);
848 /* "void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n"
850 " int nContacts = cb.x;\n"
851 " int gIdx = GET_GLOBAL_IDX;\n"
852 " if( gIdx < nContacts )\n"
854 " int srcIdx = sortData[gIdx].y;\n"
855 " out[gIdx] = in[srcIdx];\n"
862 B3_PROFILE("gpu m_reorderContactKernel");
867 b3BufferInfoCL bInfo[] = {
868 b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL()),
869 b3BufferInfoCL(m_data->m_solverGPU->m_contactBuffer2->getBufferCL()), b3BufferInfoCL(m_data->m_solverGPU->m_sortDataBuffer->getBufferCL())};
871 b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_reorderContactKernel, "m_reorderContactKernel");
872 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
873 launcher.setConst(cdata);
874 launcher.launch1D(nContacts, 64);
880 //clFinish(m_data->m_queue);
883 // b3AlignedObjectArray<unsigned int> histogram;
884 // m_data->m_solverGPU->m_numConstraints->copyToHost(histogram);
890 if (gUseCpuCopyConstraints)
892 for (int i = 0; i < nContacts; i++)
894 m_data->m_pBufContactOutGPU->copyFromOpenCLArray(*m_data->m_solverGPU->m_contactBuffer2);
895 // m_data->m_solverGPU->m_contactBuffer2->getBufferCL();
896 // m_data->m_pBufContactOutGPU->getBufferCL()
901 B3_PROFILE("gpu m_copyConstraintKernel");
904 b3BufferInfoCL bInfo[] = {
905 b3BufferInfoCL(m_data->m_solverGPU->m_contactBuffer2->getBufferCL()),
906 b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL())};
908 b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_copyConstraintKernel, "m_copyConstraintKernel");
909 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
910 launcher.setConst(cdata);
911 launcher.launch1D(nContacts, 64);
912 //we use the clFinish for proper benchmark/profile
913 clFinish(m_data->m_queue);
917 // bool compareGPU = false;
920 if (!gCpuBatchContacts)
922 B3_PROFILE("gpu batchContacts");
923 maxNumBatches = 250; //250;
924 m_data->m_solverGPU->batchContacts(m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx);
925 clFinish(m_data->m_queue);
929 B3_PROFILE("cpu batchContacts");
930 static b3AlignedObjectArray<b3Contact4> cpuContacts;
931 b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2;
933 B3_PROFILE("copyToHost");
934 contactsIn->copyToHost(cpuContacts);
936 b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
937 b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
939 b3AlignedObjectArray<unsigned int> nNativeHost;
940 b3AlignedObjectArray<unsigned int> offsetsNativeHost;
943 B3_PROFILE("countsNative/offsetsNative copyToHost");
944 countsNative->copyToHost(nNativeHost);
945 offsetsNative->copyToHost(offsetsNativeHost);
948 int numNonzeroGrid = 0;
950 if (gUseLargeBatches)
952 m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES);
953 int totalNumConstraints = cpuContacts.size();
954 //int simdWidth =numBodies+1;//-1;//64;//-1;//32;
955 int numBatches = sortConstraintByBatch3(&cpuContacts[0], totalNumConstraints, totalNumConstraints + 1, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[0]); // on GPU
956 maxNumBatches = b3Max(numBatches, maxNumBatches);
957 static int globalMaxBatch = 0;
958 if (maxNumBatches > globalMaxBatch)
960 globalMaxBatch = maxNumBatches;
961 b3Printf("maxNumBatches = %d\n", maxNumBatches);
966 m_data->m_batchSizes.resize(B3_SOLVER_N_CELLS * B3_MAX_NUM_BATCHES);
967 B3_PROFILE("cpu batch grid");
968 for (int i = 0; i < B3_SOLVER_N_CELLS; i++)
970 int n = (nNativeHost)[i];
971 int offset = (offsetsNativeHost)[i];
975 int simdWidth = numBodies + 1; //-1;//64;//-1;//32;
976 int numBatches = sortConstraintByBatch3(&cpuContacts[0] + offset, n, simdWidth, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[i * B3_MAX_NUM_BATCHES]); // on GPU
977 maxNumBatches = b3Max(numBatches, maxNumBatches);
978 static int globalMaxBatch = 0;
979 if (maxNumBatches > globalMaxBatch)
981 globalMaxBatch = maxNumBatches;
982 b3Printf("maxNumBatches = %d\n", maxNumBatches);
984 //we use the clFinish for proper benchmark/profile
987 //clFinish(m_data->m_queue);
990 B3_PROFILE("m_contactBuffer->copyFromHost");
991 m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts);
998 //printf("maxNumBatches = %d\n", maxNumBatches);
1000 if (gUseLargeBatches)
1004 B3_PROFILE("cpu batchContacts");
1005 static b3AlignedObjectArray<b3Contact4> cpuContacts;
1006 // b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2;
1008 B3_PROFILE("copyToHost");
1009 m_data->m_pBufContactOutGPU->copyToHost(cpuContacts);
1011 // b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
1012 // b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
1014 // int numNonzeroGrid=0;
1017 m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES);
1018 int totalNumConstraints = cpuContacts.size();
1019 // int simdWidth =numBodies+1;//-1;//64;//-1;//32;
1020 int numBatches = sortConstraintByBatch3(&cpuContacts[0], totalNumConstraints, totalNumConstraints + 1, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[0]); // on GPU
1021 maxNumBatches = b3Max(numBatches, maxNumBatches);
1022 static int globalMaxBatch = 0;
1023 if (maxNumBatches > globalMaxBatch)
1025 globalMaxBatch = maxNumBatches;
1026 b3Printf("maxNumBatches = %d\n", maxNumBatches);
1030 B3_PROFILE("m_contactBuffer->copyFromHost");
1031 m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts);
1038 B3_PROFILE("gpu convertToConstraints");
1039 m_data->m_solverGPU->convertToConstraints(bodyBuf,
1040 shapeBuf, m_data->m_solverGPU->m_contactBuffer2,
1041 contactConstraintOut,
1042 additionalData, nContacts,
1043 (b3SolverBase::ConstraintCfg&)csCfg);
1044 clFinish(m_data->m_queue);
1051 m_data->m_solverGPU->m_nIterations = numIter; //10
1052 if (!gCpuSolveConstraint)
1054 B3_PROFILE("GPU solveContactConstraint");
1056 /*m_data->m_solverGPU->solveContactConstraint(
1057 m_data->m_bodyBufferGPU,
1058 m_data->m_inertiaBufferGPU,
1059 m_data->m_contactCGPU,0,
1064 //m_data->m_batchSizesGpu->copyFromHost(m_data->m_batchSizes);
1066 if (gUseLargeBatches)
1068 solveContactConstraintBatchSizes(m_data->m_bodyBufferGPU,
1069 m_data->m_inertiaBufferGPU,
1070 m_data->m_contactCGPU, 0,
1072 maxNumBatches, numIter, &m_data->m_batchSizes);
1076 solveContactConstraint(
1077 m_data->m_bodyBufferGPU,
1078 m_data->m_inertiaBufferGPU,
1079 m_data->m_contactCGPU, 0,
1081 maxNumBatches, numIter, &m_data->m_batchSizes); //m_data->m_batchSizesGpu);
1086 B3_PROFILE("Host solveContactConstraint");
1088 m_data->m_solverGPU->solveContactConstraintHost(m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU, 0, nContactOut, maxNumBatches, &m_data->m_batchSizes);
1095 B3_PROFILE("read body velocities back to CPU");
1096 //read body updated linear/angular velocities back to CPU
1097 m_data->m_bodyBufferGPU->read(
1098 m_data->m_bodyBufferCPU->m_ptr,numOfConvexRBodies);
1099 adl::DeviceUtils::waitForCompletion( m_data->m_deviceCL );
// b3GpuPgsContactSolver::batchContacts: takes an OpenCL contact array, the contact count,
// two OpenCL arrays of unsigned ints (`n`, `offsets`) and a static body index.
// NOTE(review): the parameter list has the same shape as the m_solverGPU->batchContacts(...)
// call made from the solve path (contacts, nContacts, m_numConstraints, m_offsets, staticIdx);
// presumably this is the matching entry point on this class - confirm against its body.
1105 void b3GpuPgsContactSolver::batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx)
// File-scope scratch arrays used by sortConstraintByBatch():
//   idxBuffer - work list of contact indices that is swept while batching
//   sortData  - (m_key = batch index, m_value = contact index) records that are quick-sorted
//   old       - copy of the contacts taken just before they are reordered
// NOTE(review): these are plain globals shared by every call; presumably not intended for
// concurrent use - confirm.
1109 b3AlignedObjectArray<unsigned int> idxBuffer;
1110 b3AlignedObjectArray<b3SortData> sortData;
1111 b3AlignedObjectArray<b3Contact4> old;
// Greedy CPU batching of the `n` contacts in `cs`, using the file-scope scratch arrays
// idxBuffer, sortData and old.
//
// Every contact's batch index is first reset to -1. The index work list is then swept:
// a contact is accepted into the current batch when neither of its bodies is blocked by a
// "used" flag (static bodies never block), it is tagged with the batch index and a
// (batch, index) record is written to sortData. Afterwards sortData is quick-sorted with
// sortfnc and the contacts are reordered from a snapshot kept in `old`.
//
// Parameters:
//   cs        - contacts to batch; their batch indices are written in place
//   n         - number of contacts in cs
//   simdWidth - number of accepted contacts at which the "used" flags are cleared
//   staticIdx - body index that is treated as static
//   numBodies - not referenced by the statements shown in this function
// NOTE(review): presumably returns the number of batches (see the "nBatches" debug
// comment near the end) - confirm against the return statement.
1113 inline int b3GpuPgsContactSolver::sortConstraintByBatch(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies)
1115 B3_PROFILE("sortConstraintByBatch");
1119 idxBuffer.resize(n);
// Source and destination work lists; both pointers start at the beginning of idxBuffer.
1122 unsigned int* idxSrc = &idxBuffer[0];
1123 unsigned int* idxDst = &idxBuffer[0];
1124 int nIdxSrc, nIdxDst;
// 256 one-bit "body in use" flags packed into 32-bit words. A body is mapped to a flag
// by the low 8 bits of its index (FLG_MASK), so different bodies can share a flag.
1126 const int N_FLG = 256;
1127 const int FLG_MASK = N_FLG - 1;
1128 unsigned int flg[N_FLG / 32];
// Mark every contact as not yet assigned to a batch.
1130 for (int i = 0; i < n; i++)
1131 cs[i].getBatchIdx() = -1;
1133 for (int i = 0; i < n; i++)
1140 B3_PROFILE("cpu batch innerloop");
1145 int nCurrentBatch = 0;
// Start the sweep with all flags cleared.
1148 for (int i = 0; i < N_FLG / 32; i++) flg[i] = 0;
// Visit every index currently in the source work list.
1150 for (int i = 0; i < nIdxSrc; i++)
1152 int idx = idxSrc[i];
1155 // check if it can go
1156 int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
1157 int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
// The magnitude is used as the body index; a negative value is treated as static below.
1159 int bodyA = abs(bodyAS);
1160 int bodyB = abs(bodyBS);
// Flag slot for each body: low 8 bits of the body index.
1162 int aIdx = bodyA & FLG_MASK;
1163 int bIdx = bodyB & FLG_MASK;
// Non-zero when the flag for that slot is already set.
// NOTE(review): `1 << (x & 31)` shifts a signed int and can reach bit 31; `1u` would
// avoid shifting into the sign bit.
1165 unsigned int aUnavailable = flg[aIdx / 32] & (1 << (aIdx & 31));
1166 unsigned int bUnavailable = flg[bIdx / 32] & (1 << (bIdx & 31));
// A body is static when its signed value is negative or equals staticIdx.
1168 bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
1169 bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
// A static body never blocks a contact.
1172 aUnavailable = !aIsStatic ? aUnavailable : 0; //
1173 bUnavailable = !bIsStatic ? bUnavailable : 0;
1175 if (aUnavailable == 0 && bUnavailable == 0) // ok
// Set the flag bit for body A's slot and for body B's slot.
1178 flg[aIdx / 32] |= (1 << (aIdx & 31));
1180 flg[bIdx / 32] |= (1 << (bIdx & 31));
// Tag the contact with the current batch and record (batch, index) for the later sort.
1182 cs[idx].getBatchIdx() = batchIdx;
1183 sortData[idx].m_key = batchIdx;
1184 sortData[idx].m_value = idx;
// When nCurrentBatch equals simdWidth, all flags are cleared again.
1188 if (nCurrentBatch == simdWidth)
1191 for (int i = 0; i < N_FLG / 32; i++) flg[i] = 0;
// Append this index to the destination list (presumably the path taken when the
// contact could not be placed in the current batch - confirm).
1197 idxDst[nIdxDst++] = idx;
// Exchange the roles of the two work lists and of their counts.
1200 b3Swap(idxSrc, idxDst);
1201 b3Swap(nIdxSrc, nIdxDst);
// Sort the (batch, index) records using the sortfnc comparator.
1206 B3_PROFILE("quickSort");
1207 sortData.quickSort(sortfnc);
// Snapshot the contacts into `old`, then walk sortData in its sorted order.
1211 B3_PROFILE("reorder");
1214 memcpy(&old[0], cs, sizeof(b3Contact4) * n);
1215 for (int i = 0; i < n; i++)
1217 int idx = sortData[i].m_value;
1223 // debugPrintf( "nBatches: %d\n", batchIdx );
// Sanity check: every contact must have received a batch index.
1224 for (int i = 0; i < n; i++)
1226 b3Assert(cs[i].getBatchIdx() != -1);
// File-scope scratch list for sortConstraintByBatch2(): the body indices already claimed
// by the batch being built. It is searched linearly for each candidate contact.
1232 b3AlignedObjectArray<int> bodyUsed2;
// Variant of the CPU batching routine that tracks the bodies used by the current batch in
// an explicit list (bodyUsed2) and keeps its scratch arrays in m_data.
//
// Every contact's batch index is reset to -1. The index list idxSrc is then swept
// repeatedly from the first unassigned position; a contact whose two bodies are both
// available is tagged with the current batch index and its index is swapped into the
// already-assigned prefix of idxSrc. Finally the contacts are copied to m_data->m_old and
// written back to `cs` in idxSrc order.
//
// Parameters:
//   cs             - contacts to batch; batch indices and order are changed in place
//   numConstraints - number of contacts in cs
//   simdWidth      - number of accepted contacts at which the used-body list is reset
//   staticIdx      - body index that is treated as static
//   numBodies      - not referenced by the statements shown in this function
// NOTE(review): presumably returns the number of batches (see the "nBatches" debug
// comment near the end) - confirm against the return statement.
1234 inline int b3GpuPgsContactSolver::sortConstraintByBatch2(b3Contact4* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies)
1236 B3_PROFILE("sortConstraintByBatch2");
// Room for two body entries (A and B) per contact in a batch of simdWidth contacts.
1238 bodyUsed2.resize(2 * simdWidth);
1240 for (int q = 0; q < 2 * simdWidth; q++)
// Number of entries of bodyUsed2 currently in use.
1243 int curBodyUsed = 0;
// Size the scratch arrays held in m_data to the number of contacts.
1247 m_data->m_sortData.resize(numConstraints);
1248 m_data->m_idxBuffer.resize(numConstraints);
1249 m_data->m_old.resize(numConstraints);
1251 unsigned int* idxSrc = &m_data->m_idxBuffer[0];
// Mark every contact as not yet assigned to a batch.
1254 for (int i = 0; i < numConstraints; i++)
1255 cs[i].getBatchIdx() = -1;
1257 for (int i = 0; i < numConstraints; i++)
// idxSrc[0 .. numValidConstraints) holds the indices that already have a batch.
1260 int numValidConstraints = 0;
1261 // int unprocessedConstraintIndex = 0;
1266 B3_PROFILE("cpu batch innerloop");
// Keep sweeping until every contact has been assigned.
1268 while (numValidConstraints < numConstraints)
1271 int nCurrentBatch = 0;
1273 for (int i = 0; i < curBodyUsed; i++)
// Only the not-yet-assigned tail of idxSrc is examined.
1277 for (int i = numValidConstraints; i < numConstraints; i++)
1279 int idx = idxSrc[i];
1280 b3Assert(idx < numConstraints);
1281 // check if it can go
1282 int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
1283 int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
// The magnitude is used as the body index.
1284 int bodyA = abs(bodyAS);
1285 int bodyB = abs(bodyBS);
// A body is static when its signed value is negative or equals staticIdx.
1286 bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
1287 bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
1288 int aUnavailable = 0;
1289 int bUnavailable = 0;
// Linear search of the used-body list for body A.
1292 for (int j = 0; j < curBodyUsed; j++)
1294 if (bodyA == bodyUsed2[j])
// Linear search of the used-body list for body B.
1304 for (int j = 0; j < curBodyUsed; j++)
1306 if (bodyB == bodyUsed2[j])
1314 if (aUnavailable == 0 && bUnavailable == 0) // ok
// Append the contact's bodies to the used-body list.
1318 bodyUsed2[curBodyUsed++] = bodyA;
1322 bodyUsed2[curBodyUsed++] = bodyB;
// Tag the contact with the current batch and record (batch, index) in m_sortData.
1325 cs[idx].getBatchIdx() = batchIdx;
1326 m_data->m_sortData[idx].m_key = batchIdx;
1327 m_data->m_sortData[idx].m_value = idx;
// Move the accepted index to the end of the assigned prefix of idxSrc.
1329 if (i != numValidConstraints)
1331 b3Swap(idxSrc[i], idxSrc[numValidConstraints]);
1334 numValidConstraints++;
// When nCurrentBatch equals simdWidth, the entries of the used-body list are walked again.
1337 if (nCurrentBatch == simdWidth)
1340 for (int i = 0; i < curBodyUsed; i++)
// The quickSort call is commented out in this variant; only the profile scope remains.
1353 B3_PROFILE("quickSort");
1354 //m_data->m_sortData.quickSort(sortfnc);
// Snapshot the contacts into m_old, then write them back in idxSrc order.
1358 B3_PROFILE("reorder");
1361 memcpy(&m_data->m_old[0], cs, sizeof(b3Contact4) * numConstraints);
1363 for (int i = 0; i < numConstraints; i++)
// m_sortData is indexed by contact index and stores that same index as m_value,
// so `idx` equals idxSrc[i] when the assert holds.
1365 b3Assert(m_data->m_sortData[idxSrc[i]].m_value == idxSrc[i]);
1366 int idx = m_data->m_sortData[idxSrc[i]].m_value;
1367 cs[i] = m_data->m_old[idx];
1372 // debugPrintf( "nBatches: %d\n", batchIdx );
// Sanity check: every contact must have received a batch index.
1373 for (int i = 0; i < numConstraints; i++)
1375 b3Assert(cs[i].getBatchIdx() != -1);
// File-scope scratch arrays for sortConstraintByBatch3():
//   bodyUsed - bitset with one bit per body, 32 bodies per int (sized numBodies / 32 + 1)
//   curUsed  - list of the body indices whose bits were set while building the current
//              batch, used to find the bodyUsed words that need clearing
1382 b3AlignedObjectArray<int> bodyUsed;
1383 b3AlignedObjectArray<int> curUsed;
1385 inline int b3GpuPgsContactSolver::sortConstraintByBatch3(b3Contact4* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies, int* batchSizes)
1387 B3_PROFILE("sortConstraintByBatch3");
1389 static int maxSwaps = 0;
1392 curUsed.resize(2 * simdWidth);
1394 static int maxNumConstraints = 0;
1395 if (maxNumConstraints < numConstraints)
1397 maxNumConstraints = numConstraints;
1398 //printf("maxNumConstraints = %d\n",maxNumConstraints );
1401 int numUsedArray = numBodies / 32 + 1;
1402 bodyUsed.resize(numUsedArray);
1404 for (int q = 0; q < numUsedArray; q++)
1407 int curBodyUsed = 0;
1411 m_data->m_sortData.resize(0);
1412 m_data->m_idxBuffer.resize(0);
1413 m_data->m_old.resize(0);
1416 for (int i = 0; i < numConstraints; i++)
1417 cs[i].getBatchIdx() = -1;
1420 int numValidConstraints = 0;
1421 // int unprocessedConstraintIndex = 0;
1426 B3_PROFILE("cpu batch innerloop");
1428 while (numValidConstraints < numConstraints)
1431 int nCurrentBatch = 0;
1432 batchSizes[batchIdx] = 0;
1435 for (int i = 0; i < curBodyUsed; i++)
1436 bodyUsed[curUsed[i] / 32] = 0;
1440 for (int i = numValidConstraints; i < numConstraints; i++)
1443 b3Assert(idx < numConstraints);
1444 // check if it can go
1445 int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
1446 int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
1447 int bodyA = abs(bodyAS);
1448 int bodyB = abs(bodyBS);
1449 bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
1450 bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
1451 int aUnavailable = 0;
1452 int bUnavailable = 0;
1455 aUnavailable = bodyUsed[bodyA / 32] & (1 << (bodyA & 31));
1460 bUnavailable = bodyUsed[bodyB / 32] & (1 << (bodyB & 31));
1463 if (aUnavailable == 0 && bUnavailable == 0) // ok
1467 bodyUsed[bodyA / 32] |= (1 << (bodyA & 31));
1468 curUsed[curBodyUsed++] = bodyA;
1472 bodyUsed[bodyB / 32] |= (1 << (bodyB & 31));
1473 curUsed[curBodyUsed++] = bodyB;
1476 cs[idx].getBatchIdx() = batchIdx;
1478 if (i != numValidConstraints)
1480 b3Swap(cs[i], cs[numValidConstraints]);
1484 numValidConstraints++;
1487 if (nCurrentBatch == simdWidth)
1489 batchSizes[batchIdx] += simdWidth;
1491 for (int i = 0; i < curBodyUsed; i++)
1492 bodyUsed[curUsed[i] / 32] = 0;
1499 if (batchIdx >= B3_MAX_NUM_BATCHES)
1501 b3Error("batchIdx>=B3_MAX_NUM_BATCHES");
1506 batchSizes[batchIdx] += nCurrentBatch;
1513 // debugPrintf( "nBatches: %d\n", batchIdx );
1514 for (int i = 0; i < numConstraints; i++)
1516 b3Assert(cs[i].getBatchIdx() != -1);
1520 batchSizes[batchIdx] = 0;
1522 if (maxSwaps < numSwaps)
1524 maxSwaps = numSwaps;
1525 //printf("maxSwaps = %d\n", maxSwaps);