[dali_2.3.21] Merge branch 'devel/master'
[platform/core/uifw/dali-toolkit.git] / dali-physics / third-party / bullet3 / src / Bullet3OpenCL / RigidBody / b3GpuPgsContactSolver.cpp
1
// Global switches for the GPU PGS contact solver.
// In the visible part of this file only gCpuSortContactsDeterminism and
// optionalSortContactsDeterminism are read (both in solveContacts). The remaining flags are
// presumably consulted further down in this file or from other translation units
// -- TODO confirm before renaming or removing any of them.
bool gUseLargeBatches = false;
bool gCpuBatchContacts = false;
bool gCpuSolveConstraint = false;
bool gCpuRadixSort = false;
bool gCpuSetSortData = false;
// When false, solveContacts takes the GPU path for the determinism sort of the contacts.
bool gCpuSortContactsDeterminism = false;
bool gUseCpuCopyConstraints = false;
bool gUseScanHost = false;
bool gReorderContactsOnCpu = false;

// When true, solveContacts runs the contact determinism sort before solving.
bool optionalSortContactsDeterminism = true;
13
14 #include "b3GpuPgsContactSolver.h"
15 #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
16
17 #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
18 #include "Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h"
19 #include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
20 #include <string.h>
21 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
22 #include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
23 #include "b3Solver.h"
24
// Source paths of the OpenCL kernel files. Each one is passed as the last argument of
// b3OpenCLUtils::compileCLProgramFromString in the constructor, next to the embedded source
// string (presumably used to identify/cache the compiled program -- verify in b3OpenCLUtils).
#define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"
#define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl"
#define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl"
#define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl"
#define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl"
#define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl"
31
32 #include "kernels/solverSetup.h"
33 #include "kernels/solverSetup2.h"
34 #include "kernels/solveContact.h"
35 #include "kernels/solveFriction.h"
36 #include "kernels/batchingKernels.h"
37 #include "kernels/batchingKernelsNew.h"
38
39 struct b3GpuBatchingPgsSolverInternalData
40 {
41         cl_context m_context;
42         cl_device_id m_device;
43         cl_command_queue m_queue;
44         int m_pairCapacity;
45         int m_nIterations;
46
47         b3OpenCLArray<b3GpuConstraint4>* m_contactCGPU;
48         b3OpenCLArray<unsigned int>* m_numConstraints;
49         b3OpenCLArray<unsigned int>* m_offsets;
50
51         b3Solver* m_solverGPU;
52
53         cl_kernel m_batchingKernel;
54         cl_kernel m_batchingKernelNew;
55         cl_kernel m_solveContactKernel;
56         cl_kernel m_solveSingleContactKernel;
57         cl_kernel m_solveSingleFrictionKernel;
58         cl_kernel m_solveFrictionKernel;
59         cl_kernel m_contactToConstraintKernel;
60         cl_kernel m_setSortDataKernel;
61         cl_kernel m_reorderContactKernel;
62         cl_kernel m_copyConstraintKernel;
63
64         cl_kernel m_setDeterminismSortDataBodyAKernel;
65         cl_kernel m_setDeterminismSortDataBodyBKernel;
66         cl_kernel m_setDeterminismSortDataChildShapeAKernel;
67         cl_kernel m_setDeterminismSortDataChildShapeBKernel;
68
69         class b3RadixSort32CL* m_sort32;
70         class b3BoundSearchCL* m_search;
71         class b3PrefixScanCL* m_scan;
72
73         b3OpenCLArray<b3SortData>* m_sortDataBuffer;
74         b3OpenCLArray<b3Contact4>* m_contactBuffer;
75
76         b3OpenCLArray<b3RigidBodyData>* m_bodyBufferGPU;
77         b3OpenCLArray<b3InertiaData>* m_inertiaBufferGPU;
78         b3OpenCLArray<b3Contact4>* m_pBufContactOutGPU;
79
80         b3OpenCLArray<b3Contact4>* m_pBufContactOutGPUCopy;
81         b3OpenCLArray<b3SortData>* m_contactKeyValues;
82
83         b3AlignedObjectArray<unsigned int> m_idxBuffer;
84         b3AlignedObjectArray<b3SortData> m_sortData;
85         b3AlignedObjectArray<b3Contact4> m_old;
86
87         b3AlignedObjectArray<int> m_batchSizes;
88         b3OpenCLArray<int>* m_batchSizesGpu;
89 };
90
91 b3GpuPgsContactSolver::b3GpuPgsContactSolver(cl_context ctx, cl_device_id device, cl_command_queue q, int pairCapacity)
92 {
93         m_debugOutput = 0;
94         m_data = new b3GpuBatchingPgsSolverInternalData;
95         m_data->m_context = ctx;
96         m_data->m_device = device;
97         m_data->m_queue = q;
98         m_data->m_pairCapacity = pairCapacity;
99         m_data->m_nIterations = 4;
100         m_data->m_batchSizesGpu = new b3OpenCLArray<int>(ctx, q);
101         m_data->m_bodyBufferGPU = new b3OpenCLArray<b3RigidBodyData>(ctx, q);
102         m_data->m_inertiaBufferGPU = new b3OpenCLArray<b3InertiaData>(ctx, q);
103         m_data->m_pBufContactOutGPU = new b3OpenCLArray<b3Contact4>(ctx, q);
104
105         m_data->m_pBufContactOutGPUCopy = new b3OpenCLArray<b3Contact4>(ctx, q);
106         m_data->m_contactKeyValues = new b3OpenCLArray<b3SortData>(ctx, q);
107
108         m_data->m_solverGPU = new b3Solver(ctx, device, q, 512 * 1024);
109
110         m_data->m_sort32 = new b3RadixSort32CL(ctx, device, m_data->m_queue);
111         m_data->m_scan = new b3PrefixScanCL(ctx, device, m_data->m_queue, B3_SOLVER_N_CELLS);
112         m_data->m_search = new b3BoundSearchCL(ctx, device, m_data->m_queue, B3_SOLVER_N_CELLS);
113
114         const int sortSize = B3NEXTMULTIPLEOF(pairCapacity, 512);
115
116         m_data->m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx, m_data->m_queue, sortSize);
117         m_data->m_contactBuffer = new b3OpenCLArray<b3Contact4>(ctx, m_data->m_queue);
118
119         m_data->m_numConstraints = new b3OpenCLArray<unsigned int>(ctx, m_data->m_queue, B3_SOLVER_N_CELLS);
120         m_data->m_numConstraints->resize(B3_SOLVER_N_CELLS);
121
122         m_data->m_contactCGPU = new b3OpenCLArray<b3GpuConstraint4>(ctx, q, pairCapacity);
123
124         m_data->m_offsets = new b3OpenCLArray<unsigned int>(ctx, m_data->m_queue, B3_SOLVER_N_CELLS);
125         m_data->m_offsets->resize(B3_SOLVER_N_CELLS);
126         const char* additionalMacros = "";
127         //const char* srcFileNameForCaching="";
128
129         cl_int pErrNum;
130         const char* batchKernelSource = batchingKernelsCL;
131         const char* batchKernelNewSource = batchingKernelsNewCL;
132         const char* solverSetupSource = solverSetupCL;
133         const char* solverSetup2Source = solverSetup2CL;
134         const char* solveContactSource = solveContactCL;
135         const char* solveFrictionSource = solveFrictionCL;
136
137         {
138                 cl_program solveContactProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveContactSource, &pErrNum, additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
139                 b3Assert(solveContactProg);
140
141                 cl_program solveFrictionProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveFrictionSource, &pErrNum, additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
142                 b3Assert(solveFrictionProg);
143
144                 cl_program solverSetup2Prog = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetup2Source, &pErrNum, additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
145
146                 b3Assert(solverSetup2Prog);
147
148                 cl_program solverSetupProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetupSource, &pErrNum, additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
149                 b3Assert(solverSetupProg);
150
151                 m_data->m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg, additionalMacros);
152                 b3Assert(m_data->m_solveFrictionKernel);
153
154                 m_data->m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg, additionalMacros);
155                 b3Assert(m_data->m_solveContactKernel);
156
157                 m_data->m_solveSingleContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "solveSingleContactKernel", &pErrNum, solveContactProg, additionalMacros);
158                 b3Assert(m_data->m_solveSingleContactKernel);
159
160                 m_data->m_solveSingleFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "solveSingleFrictionKernel", &pErrNum, solveFrictionProg, additionalMacros);
161                 b3Assert(m_data->m_solveSingleFrictionKernel);
162
163                 m_data->m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg, additionalMacros);
164                 b3Assert(m_data->m_contactToConstraintKernel);
165
166                 m_data->m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog, additionalMacros);
167                 b3Assert(m_data->m_setSortDataKernel);
168
169                 m_data->m_setDeterminismSortDataBodyAKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyA", &pErrNum, solverSetup2Prog, additionalMacros);
170                 b3Assert(m_data->m_setDeterminismSortDataBodyAKernel);
171
172                 m_data->m_setDeterminismSortDataBodyBKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyB", &pErrNum, solverSetup2Prog, additionalMacros);
173                 b3Assert(m_data->m_setDeterminismSortDataBodyBKernel);
174
175                 m_data->m_setDeterminismSortDataChildShapeAKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeA", &pErrNum, solverSetup2Prog, additionalMacros);
176                 b3Assert(m_data->m_setDeterminismSortDataChildShapeAKernel);
177
178                 m_data->m_setDeterminismSortDataChildShapeBKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeB", &pErrNum, solverSetup2Prog, additionalMacros);
179                 b3Assert(m_data->m_setDeterminismSortDataChildShapeBKernel);
180
181                 m_data->m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog, additionalMacros);
182                 b3Assert(m_data->m_reorderContactKernel);
183
184                 m_data->m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog, additionalMacros);
185                 b3Assert(m_data->m_copyConstraintKernel);
186         }
187
188         {
189                 cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelSource, &pErrNum, additionalMacros, B3_BATCHING_PATH);
190                 b3Assert(batchingProg);
191
192                 m_data->m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg, additionalMacros);
193                 b3Assert(m_data->m_batchingKernel);
194         }
195
196         {
197                 cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelNewSource, &pErrNum, additionalMacros, B3_BATCHING_NEW_PATH);
198                 b3Assert(batchingNewProg);
199
200                 m_data->m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg, additionalMacros);
201                 b3Assert(m_data->m_batchingKernelNew);
202         }
203 }
204
205 b3GpuPgsContactSolver::~b3GpuPgsContactSolver()
206 {
207         delete m_data->m_batchSizesGpu;
208         delete m_data->m_bodyBufferGPU;
209         delete m_data->m_inertiaBufferGPU;
210         delete m_data->m_pBufContactOutGPU;
211         delete m_data->m_pBufContactOutGPUCopy;
212         delete m_data->m_contactKeyValues;
213
214         delete m_data->m_contactCGPU;
215         delete m_data->m_numConstraints;
216         delete m_data->m_offsets;
217         delete m_data->m_sortDataBuffer;
218         delete m_data->m_contactBuffer;
219
220         delete m_data->m_sort32;
221         delete m_data->m_scan;
222         delete m_data->m_search;
223         delete m_data->m_solverGPU;
224
225         clReleaseKernel(m_data->m_batchingKernel);
226         clReleaseKernel(m_data->m_batchingKernelNew);
227         clReleaseKernel(m_data->m_solveSingleContactKernel);
228         clReleaseKernel(m_data->m_solveSingleFrictionKernel);
229         clReleaseKernel(m_data->m_solveContactKernel);
230         clReleaseKernel(m_data->m_solveFrictionKernel);
231
232         clReleaseKernel(m_data->m_contactToConstraintKernel);
233         clReleaseKernel(m_data->m_setSortDataKernel);
234         clReleaseKernel(m_data->m_reorderContactKernel);
235         clReleaseKernel(m_data->m_copyConstraintKernel);
236
237         clReleaseKernel(m_data->m_setDeterminismSortDataBodyAKernel);
238         clReleaseKernel(m_data->m_setDeterminismSortDataBodyBKernel);
239         clReleaseKernel(m_data->m_setDeterminismSortDataChildShapeAKernel);
240         clReleaseKernel(m_data->m_setDeterminismSortDataChildShapeBKernel);
241
242         delete m_data;
243 }
244
// Parameter bundle for constraint setup/solving.
struct b3ConstraintCfg
{
	// Builds a configuration for time step `dt`. Every member gets a defined value;
	// m_enableParallelSolve and m_batchCellSize used to be left uninitialised, so reading
	// them before an explicit assignment was undefined behaviour.
	b3ConstraintCfg(float dt = 0.f)
		: m_positionDrift(0.005f),
		  m_positionConstraintCoeff(0.2f),
		  m_dt(dt),
		  m_enableParallelSolve(false),
		  m_batchCellSize(0.f),
		  m_staticIdx(0)
	{
	}

	float m_positionDrift;
	float m_positionConstraintCoeff;
	float m_dt;
	bool m_enableParallelSolve;  // defaults to false; callers are expected to set it explicitly
	float m_batchCellSize;       // defaults to 0; callers are expected to set it explicitly
	int m_staticIdx;
};
256
257 void b3GpuPgsContactSolver::solveContactConstraintBatchSizes(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
258                                                                                                                          b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes)  //const b3OpenCLArray<int>* gpuBatchSizes)
259 {
260         B3_PROFILE("solveContactConstraintBatchSizes");
261         int numBatches = batchSizes->size() / B3_MAX_NUM_BATCHES;
262         for (int iter = 0; iter < numIterations; iter++)
263         {
264                 for (int cellId = 0; cellId < numBatches; cellId++)
265                 {
266                         int offset = 0;
267                         for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++)
268                         {
269                                 int numInBatch = batchSizes->at(cellId * B3_MAX_NUM_BATCHES + ii);
270                                 if (!numInBatch)
271                                         break;
272
273                                 {
274                                         b3LauncherCL launcher(m_data->m_queue, m_data->m_solveSingleContactKernel, "m_solveSingleContactKernel");
275                                         launcher.setBuffer(bodyBuf->getBufferCL());
276                                         launcher.setBuffer(shapeBuf->getBufferCL());
277                                         launcher.setBuffer(constraint->getBufferCL());
278                                         launcher.setConst(cellId);
279                                         launcher.setConst(offset);
280                                         launcher.setConst(numInBatch);
281                                         launcher.launch1D(numInBatch);
282                                         offset += numInBatch;
283                                 }
284                         }
285                 }
286         }
287
288         for (int iter = 0; iter < numIterations; iter++)
289         {
290                 for (int cellId = 0; cellId < numBatches; cellId++)
291                 {
292                         int offset = 0;
293                         for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++)
294                         {
295                                 int numInBatch = batchSizes->at(cellId * B3_MAX_NUM_BATCHES + ii);
296                                 if (!numInBatch)
297                                         break;
298
299                                 {
300                                         b3LauncherCL launcher(m_data->m_queue, m_data->m_solveSingleFrictionKernel, "m_solveSingleFrictionKernel");
301                                         launcher.setBuffer(bodyBuf->getBufferCL());
302                                         launcher.setBuffer(shapeBuf->getBufferCL());
303                                         launcher.setBuffer(constraint->getBufferCL());
304                                         launcher.setConst(cellId);
305                                         launcher.setConst(offset);
306                                         launcher.setConst(numInBatch);
307                                         launcher.launch1D(numInBatch);
308                                         offset += numInBatch;
309                                 }
310                         }
311                 }
312         }
313 }
314
315 void b3GpuPgsContactSolver::solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
316                                                                                                    b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes)  //,const b3OpenCLArray<int>* gpuBatchSizes)
317 {
318         //sort the contacts
319
320         b3Int4 cdata = b3MakeInt4(n, 0, 0, 0);
321         {
322                 const int nn = B3_SOLVER_N_CELLS;
323
324                 cdata.x = 0;
325                 cdata.y = maxNumBatches;  //250;
326
327                 int numWorkItems = 64 * nn / B3_SOLVER_N_BATCHES;
328 #ifdef DEBUG_ME
329                 SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
330                 adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems);
331 #endif
332
333                 {
334                         B3_PROFILE("m_batchSolveKernel iterations");
335                         for (int iter = 0; iter < numIterations; iter++)
336                         {
337                                 for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
338                                 {
339 #ifdef DEBUG_ME
340                                         memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems);
341                                         gpuDebugInfo.write(debugInfo, numWorkItems);
342 #endif
343
344                                         cdata.z = ib;
345
346                                         b3LauncherCL launcher(m_data->m_queue, m_data->m_solveContactKernel, "m_solveContactKernel");
347 #if 1
348
349                                         b3BufferInfoCL bInfo[] = {
350
351                                                 b3BufferInfoCL(bodyBuf->getBufferCL()),
352                                                 b3BufferInfoCL(shapeBuf->getBufferCL()),
353                                                 b3BufferInfoCL(constraint->getBufferCL()),
354                                                 b3BufferInfoCL(m_data->m_solverGPU->m_numConstraints->getBufferCL()),
355                                                 b3BufferInfoCL(m_data->m_solverGPU->m_offsets->getBufferCL())
356 #ifdef DEBUG_ME
357                                                         ,
358                                                 b3BufferInfoCL(&gpuDebugInfo)
359 #endif
360                                         };
361
362                                         launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
363                                         launcher.setBuffer(m_data->m_solverGPU->m_batchSizes.getBufferCL());
364                                         //launcher.setConst(  cdata.x );
365                                         launcher.setConst(cdata.y);
366                                         launcher.setConst(cdata.z);
367                                         b3Int4 nSplit;
368                                         nSplit.x = B3_SOLVER_N_SPLIT_X;
369                                         nSplit.y = B3_SOLVER_N_SPLIT_Y;
370                                         nSplit.z = B3_SOLVER_N_SPLIT_Z;
371
372                                         launcher.setConst(nSplit);
373                                         launcher.launch1D(numWorkItems, 64);
374
375 #else
376                                         const char* fileName = "m_batchSolveKernel.bin";
377                                         FILE* f = fopen(fileName, "rb");
378                                         if (f)
379                                         {
380                                                 int sizeInBytes = 0;
381                                                 if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
382                                                 {
383                                                         printf("error, cannot get file size\n");
384                                                         exit(0);
385                                                 }
386
387                                                 unsigned char* buf = (unsigned char*)malloc(sizeInBytes);
388                                                 fread(buf, sizeInBytes, 1, f);
389                                                 int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context);
390                                                 int num = *(int*)&buf[serializedBytes];
391
392                                                 launcher.launch1D(num);
393
394                                                 //this clFinish is for testing on errors
395                                                 clFinish(m_queue);
396                                         }
397
398 #endif
399
400 #ifdef DEBUG_ME
401                                         clFinish(m_queue);
402                                         gpuDebugInfo.read(debugInfo, numWorkItems);
403                                         clFinish(m_queue);
404                                         for (int i = 0; i < numWorkItems; i++)
405                                         {
406                                                 if (debugInfo[i].m_valInt2 > 0)
407                                                 {
408                                                         printf("debugInfo[i].m_valInt2 = %d\n", i, debugInfo[i].m_valInt2);
409                                                 }
410
411                                                 if (debugInfo[i].m_valInt3 > 0)
412                                                 {
413                                                         printf("debugInfo[i].m_valInt3 = %d\n", i, debugInfo[i].m_valInt3);
414                                                 }
415                                         }
416 #endif  //DEBUG_ME
417                                 }
418                         }
419
420                         clFinish(m_data->m_queue);
421                 }
422
423                 cdata.x = 1;
424                 bool applyFriction = true;
425                 if (applyFriction)
426                 {
427                         B3_PROFILE("m_batchSolveKernel iterations2");
428                         for (int iter = 0; iter < numIterations; iter++)
429                         {
430                                 for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
431                                 {
432                                         cdata.z = ib;
433
434                                         b3BufferInfoCL bInfo[] = {
435                                                 b3BufferInfoCL(bodyBuf->getBufferCL()),
436                                                 b3BufferInfoCL(shapeBuf->getBufferCL()),
437                                                 b3BufferInfoCL(constraint->getBufferCL()),
438                                                 b3BufferInfoCL(m_data->m_solverGPU->m_numConstraints->getBufferCL()),
439                                                 b3BufferInfoCL(m_data->m_solverGPU->m_offsets->getBufferCL())
440 #ifdef DEBUG_ME
441                                                         ,
442                                                 b3BufferInfoCL(&gpuDebugInfo)
443 #endif  //DEBUG_ME
444                                         };
445                                         b3LauncherCL launcher(m_data->m_queue, m_data->m_solveFrictionKernel, "m_solveFrictionKernel");
446                                         launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
447                                         launcher.setBuffer(m_data->m_solverGPU->m_batchSizes.getBufferCL());
448                                         //launcher.setConst(  cdata.x );
449                                         launcher.setConst(cdata.y);
450                                         launcher.setConst(cdata.z);
451
452                                         b3Int4 nSplit;
453                                         nSplit.x = B3_SOLVER_N_SPLIT_X;
454                                         nSplit.y = B3_SOLVER_N_SPLIT_Y;
455                                         nSplit.z = B3_SOLVER_N_SPLIT_Z;
456
457                                         launcher.setConst(nSplit);
458
459                                         launcher.launch1D(64 * nn / B3_SOLVER_N_BATCHES, 64);
460                                 }
461                         }
462                         clFinish(m_data->m_queue);
463                 }
464 #ifdef DEBUG_ME
465                 delete[] debugInfo;
466 #endif  //DEBUG_ME
467         }
468 }
469
470 static bool sortfnc(const b3SortData& a, const b3SortData& b)
471 {
472         return (a.m_key < b.m_key);
473 }
474
475 static bool b3ContactCmp(const b3Contact4& p, const b3Contact4& q)
476 {
477         return ((p.m_bodyAPtrAndSignBit < q.m_bodyAPtrAndSignBit) ||
478                         ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit < q.m_bodyBPtrAndSignBit)) ||
479                         ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA < q.m_childIndexA) ||
480                         ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA < q.m_childIndexA) ||
481                         ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA == q.m_childIndexA && p.m_childIndexB < q.m_childIndexB));
482 }
483
// Compile-time selection of how SetSortDataCPU derives the sort key:
//   USE_SPATIAL_BATCHING != 0 : key is the grid cell computed from a body position.
//   USE_SPATIAL_BATCHING == 0 : key is looked up in one of the tables below
//                               (4x4 table when USE_4x4_GRID != 0, 8x8 table otherwise).
#define USE_SPATIAL_BATCHING 1
#define USE_4x4_GRID 1

// Test the macro's value rather than whether it is defined: SetSortDataCPU selects its
// table-based path with "#if USE_SPATIAL_BATCHING", so the tables must exist exactly when
// that condition is false (including when the macro is set to 0).
#if !USE_SPATIAL_BATCHING
static const int gridTable4x4[] =
	{
		0, 1, 17, 16,
		1, 2, 18, 19,
		17, 18, 32, 3,
		16, 19, 3, 34};
static const int gridTable8x8[] =
	{
		0, 2, 3, 16, 17, 18, 19, 1,
		66, 64, 80, 67, 82, 81, 65, 83,
		131, 144, 128, 130, 147, 129, 145, 146,
		208, 195, 194, 192, 193, 211, 210, 209,
		21, 22, 23, 5, 4, 6, 7, 20,
		86, 85, 69, 87, 70, 68, 84, 71,
		151, 133, 149, 150, 135, 148, 132, 134,
		197, 27, 214, 213, 212, 199, 198, 196

};

#endif
508
509 void SetSortDataCPU(b3Contact4* gContact, b3RigidBodyData* gBodies, b3SortData* gSortDataOut, int nContacts, float scale, const b3Int4& nSplit, int staticIdx)
510 {
511         for (int gIdx = 0; gIdx < nContacts; gIdx++)
512         {
513                 if (gIdx < nContacts)
514                 {
515                         int aPtrAndSignBit = gContact[gIdx].m_bodyAPtrAndSignBit;
516                         int bPtrAndSignBit = gContact[gIdx].m_bodyBPtrAndSignBit;
517
518                         int aIdx = abs(aPtrAndSignBit);
519                         int bIdx = abs(bPtrAndSignBit);
520
521                         bool aStatic = (aPtrAndSignBit < 0) || (aPtrAndSignBit == staticIdx);
522
523 #if USE_SPATIAL_BATCHING
524                         int idx = (aStatic) ? bIdx : aIdx;
525                         b3Vector3 p = gBodies[idx].m_pos;
526                         int xIdx = (int)((p.x - ((p.x < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.x - 1);
527                         int yIdx = (int)((p.y - ((p.y < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.y - 1);
528                         int zIdx = (int)((p.z - ((p.z < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.z - 1);
529
530                         int newIndex = (xIdx + yIdx * nSplit.x + zIdx * nSplit.x * nSplit.y);
531
532 #else  //USE_SPATIAL_BATCHING
533                         bool bStatic = (bPtrAndSignBit < 0) || (bPtrAndSignBit == staticIdx);
534
535 #if USE_4x4_GRID
536                         int aa = aIdx & 3;
537                         int bb = bIdx & 3;
538                         if (aStatic)
539                                 aa = bb;
540                         if (bStatic)
541                                 bb = aa;
542
543                         int gridIndex = aa + bb * 4;
544                         int newIndex = gridTable4x4[gridIndex];
545 #else   //USE_4x4_GRID
546                         int aa = aIdx & 7;
547                         int bb = bIdx & 7;
548                         if (aStatic)
549                                 aa = bb;
550                         if (bStatic)
551                                 bb = aa;
552
553                         int gridIndex = aa + bb * 8;
554                         int newIndex = gridTable8x8[gridIndex];
555 #endif  //USE_4x4_GRID
556 #endif  //USE_SPATIAL_BATCHING
557
558                         gSortDataOut[gIdx].x = newIndex;
559                         gSortDataOut[gIdx].y = gIdx;
560                 }
561                 else
562                 {
563                         gSortDataOut[gIdx].x = 0xffffffff;
564                 }
565         }
566 }
567
568 void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const b3Config& config, int static0Index)
569 {
570         B3_PROFILE("solveContacts");
571         m_data->m_bodyBufferGPU->setFromOpenCLBuffer(bodyBuf, numBodies);
572         m_data->m_inertiaBufferGPU->setFromOpenCLBuffer(inertiaBuf, numBodies);
573         m_data->m_pBufContactOutGPU->setFromOpenCLBuffer(contactBuf, numContacts);
574
575         if (optionalSortContactsDeterminism)
576         {
577                 if (!gCpuSortContactsDeterminism)
578                 {
579                         B3_PROFILE("GPU Sort contact constraints (determinism)");
580
581                         m_data->m_pBufContactOutGPUCopy->resize(numContacts);
582                         m_data->m_contactKeyValues->resize(numContacts);
583
584                         m_data->m_pBufContactOutGPU->copyToCL(m_data->m_pBufContactOutGPUCopy->getBufferCL(), numContacts, 0, 0);
585
586                         {
587                                 b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeBKernel, "m_setDeterminismSortDataChildShapeBKernel");
588                                 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
589                                 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
590                                 launcher.setConst(numContacts);
591                                 launcher.launch1D(numContacts, 64);
592                         }
593                         m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
594                         {
595                                 b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeAKernel, "m_setDeterminismSortDataChildShapeAKernel");
596                                 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
597                                 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
598                                 launcher.setConst(numContacts);
599                                 launcher.launch1D(numContacts, 64);
600                         }
601                         m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
602                         {
603                                 b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyBKernel, "m_setDeterminismSortDataBodyBKernel");
604                                 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
605                                 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
606                                 launcher.setConst(numContacts);
607                                 launcher.launch1D(numContacts, 64);
608                         }
609
610                         m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
611
612                         {
613                                 b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyAKernel, "m_setDeterminismSortDataBodyAKernel");
614                                 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
615                                 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
616                                 launcher.setConst(numContacts);
617                                 launcher.launch1D(numContacts, 64);
618                         }
619
620                         m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
621
622                         {
623                                 B3_PROFILE("gpu reorderContactKernel (determinism)");
624
625                                 b3Int4 cdata;
626                                 cdata.x = numContacts;
627
628                                 //b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL())
629                                 //      , b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
630                                 b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_reorderContactKernel, "m_reorderContactKernel");
631                                 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
632                                 launcher.setBuffer(m_data->m_pBufContactOutGPU->getBufferCL());
633                                 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
634                                 launcher.setConst(cdata);
635                                 launcher.launch1D(numContacts, 64);
636                         }
637                 }
638                 else
639                 {
640                         B3_PROFILE("CPU Sort contact constraints (determinism)");
641                         b3AlignedObjectArray<b3Contact4> cpuConstraints;
642                         m_data->m_pBufContactOutGPU->copyToHost(cpuConstraints);
643                         bool sort = true;
644                         if (sort)
645                         {
646                                 cpuConstraints.quickSort(b3ContactCmp);
647
648                                 for (int i = 0; i < cpuConstraints.size(); i++)
649                                 {
650                                         cpuConstraints[i].m_batchIdx = i;
651                                 }
652                         }
653                         m_data->m_pBufContactOutGPU->copyFromHost(cpuConstraints);
654                         if (m_debugOutput == 100)
655                         {
656                                 for (int i = 0; i < cpuConstraints.size(); i++)
657                                 {
658                                         printf("c[%d].m_bodyA = %d, m_bodyB = %d, batchId = %d\n", i, cpuConstraints[i].m_bodyAPtrAndSignBit, cpuConstraints[i].m_bodyBPtrAndSignBit, cpuConstraints[i].m_batchIdx);
659                                 }
660                         }
661
662                         m_debugOutput++;
663                 }
664         }
665
666         int nContactOut = m_data->m_pBufContactOutGPU->size();
667
668         bool useSolver = true;
669
670         if (useSolver)
671         {
672                 float dt = 1. / 60.;
673                 b3ConstraintCfg csCfg(dt);
674                 csCfg.m_enableParallelSolve = true;
675                 csCfg.m_batchCellSize = 6;
676                 csCfg.m_staticIdx = static0Index;
677
678                 b3OpenCLArray<b3RigidBodyData>* bodyBuf = m_data->m_bodyBufferGPU;
679
680                 void* additionalData = 0;  //m_data->m_frictionCGPU;
681                 const b3OpenCLArray<b3InertiaData>* shapeBuf = m_data->m_inertiaBufferGPU;
682                 b3OpenCLArray<b3GpuConstraint4>* contactConstraintOut = m_data->m_contactCGPU;
683                 int nContacts = nContactOut;
684
685                 int maxNumBatches = 0;
686
687                 if (!gUseLargeBatches)
688                 {
689                         if (m_data->m_solverGPU->m_contactBuffer2)
690                         {
691                                 m_data->m_solverGPU->m_contactBuffer2->resize(nContacts);
692                         }
693
694                         if (m_data->m_solverGPU->m_contactBuffer2 == 0)
695                         {
696                                 m_data->m_solverGPU->m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(m_data->m_context, m_data->m_queue, nContacts);
697                                 m_data->m_solverGPU->m_contactBuffer2->resize(nContacts);
698                         }
699
700                         //clFinish(m_data->m_queue);
701
702                         {
703                                 B3_PROFILE("batching");
704                                 //@todo: just reserve it, without copy of original contact (unless we use warmstarting)
705
706                                 //const b3OpenCLArray<b3RigidBodyData>* bodyNative = bodyBuf;
707
708                                 {
709                                         //b3OpenCLArray<b3RigidBodyData>* bodyNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, bodyBuf );
710                                         //b3OpenCLArray<b3Contact4>* contactNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, contactsIn );
711
712                                         const int sortAlignment = 512;  // todo. get this out of sort
713                                         if (csCfg.m_enableParallelSolve)
714                                         {
715                                                 int sortSize = B3NEXTMULTIPLEOF(nContacts, sortAlignment);
716
717                                                 b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
718                                                 b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
719
720                                                 if (!gCpuSetSortData)
721                                                 {  //   2. set cell idx
722                                                         B3_PROFILE("GPU set cell idx");
723                                                         struct CB
724                                                         {
725                                                                 int m_nContacts;
726                                                                 int m_staticIdx;
727                                                                 float m_scale;
728                                                                 b3Int4 m_nSplit;
729                                                         };
730
731                                                         b3Assert(sortSize % 64 == 0);
732                                                         CB cdata;
733                                                         cdata.m_nContacts = nContacts;
734                                                         cdata.m_staticIdx = csCfg.m_staticIdx;
735                                                         cdata.m_scale = 1.f / csCfg.m_batchCellSize;
736                                                         cdata.m_nSplit.x = B3_SOLVER_N_SPLIT_X;
737                                                         cdata.m_nSplit.y = B3_SOLVER_N_SPLIT_Y;
738                                                         cdata.m_nSplit.z = B3_SOLVER_N_SPLIT_Z;
739
740                                                         m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
741
742                                                         b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL()), b3BufferInfoCL(bodyBuf->getBufferCL()), b3BufferInfoCL(m_data->m_solverGPU->m_sortDataBuffer->getBufferCL())};
743                                                         b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_setSortDataKernel, "m_setSortDataKernel");
744                                                         launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
745                                                         launcher.setConst(cdata.m_nContacts);
746                                                         launcher.setConst(cdata.m_scale);
747                                                         launcher.setConst(cdata.m_nSplit);
748                                                         launcher.setConst(cdata.m_staticIdx);
749
750                                                         launcher.launch1D(sortSize, 64);
751                                                 }
752                                                 else
753                                                 {
754                                                         m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
755                                                         b3AlignedObjectArray<b3SortData> sortDataCPU;
756                                                         m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataCPU);
757
758                                                         b3AlignedObjectArray<b3Contact4> contactCPU;
759                                                         m_data->m_pBufContactOutGPU->copyToHost(contactCPU);
760                                                         b3AlignedObjectArray<b3RigidBodyData> bodiesCPU;
761                                                         bodyBuf->copyToHost(bodiesCPU);
762                                                         float scale = 1.f / csCfg.m_batchCellSize;
763                                                         b3Int4 nSplit;
764                                                         nSplit.x = B3_SOLVER_N_SPLIT_X;
765                                                         nSplit.y = B3_SOLVER_N_SPLIT_Y;
766                                                         nSplit.z = B3_SOLVER_N_SPLIT_Z;
767
768                                                         SetSortDataCPU(&contactCPU[0], &bodiesCPU[0], &sortDataCPU[0], nContacts, scale, nSplit, csCfg.m_staticIdx);
769
770                                                         m_data->m_solverGPU->m_sortDataBuffer->copyFromHost(sortDataCPU);
771                                                 }
772
773                                                 if (!gCpuRadixSort)
774                                                 {  //   3. sort by cell idx
775                                                         B3_PROFILE("gpuRadixSort");
776                                                         //int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
777                                                         //int sortBit = 32;
778                                                         //if( n <= 0xffff ) sortBit = 16;
779                                                         //if( n <= 0xff ) sortBit = 8;
780                                                         //adl::RadixSort<adl::TYPE_CL>::execute( data->m_sort, *data->m_sortDataBuffer, sortSize );
781                                                         //adl::RadixSort32<adl::TYPE_CL>::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize );
782                                                         b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer);
783                                                         this->m_data->m_solverGPU->m_sort32->execute(keyValuesInOut);
784                                                 }
785                                                 else
786                                                 {
787                                                         b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer);
788                                                         b3AlignedObjectArray<b3SortData> hostValues;
789                                                         keyValuesInOut.copyToHost(hostValues);
790                                                         hostValues.quickSort(sortfnc);
791                                                         keyValuesInOut.copyFromHost(hostValues);
792                                                 }
793
794                                                 if (gUseScanHost)
795                                                 {
796                                                         //      4. find entries
797                                                         B3_PROFILE("cpuBoundSearch");
798                                                         b3AlignedObjectArray<unsigned int> countsHost;
799                                                         countsNative->copyToHost(countsHost);
800
801                                                         b3AlignedObjectArray<b3SortData> sortDataHost;
802                                                         m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost);
803
804                                                         //m_data->m_solverGPU->m_search->executeHost(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT);
805                                                         m_data->m_solverGPU->m_search->executeHost(sortDataHost, nContacts, countsHost, B3_SOLVER_N_CELLS, b3BoundSearchCL::COUNT);
806
807                                                         countsNative->copyFromHost(countsHost);
808
809                                                         //adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative,
810                                                         //      B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT );
811
812                                                         //unsigned int sum;
813                                                         //m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum );
814                                                         b3AlignedObjectArray<unsigned int> offsetsHost;
815                                                         offsetsHost.resize(offsetsNative->size());
816
817                                                         m_data->m_solverGPU->m_scan->executeHost(countsHost, offsetsHost, B3_SOLVER_N_CELLS);  //,&sum );
818                                                         offsetsNative->copyFromHost(offsetsHost);
819
820                                                         //printf("sum = %d\n",sum);
821                                                 }
822                                                 else
823                                                 {
824                                                         //      4. find entries
825                                                         B3_PROFILE("gpuBoundSearch");
826                                                         m_data->m_solverGPU->m_search->execute(*m_data->m_solverGPU->m_sortDataBuffer, nContacts, *countsNative, B3_SOLVER_N_CELLS, b3BoundSearchCL::COUNT);
827                                                         m_data->m_solverGPU->m_scan->execute(*countsNative, *offsetsNative, B3_SOLVER_N_CELLS);  //,&sum );
828                                                 }
829
830                                                 if (nContacts)
831                                                 {  //   5. sort constraints by cellIdx
832                                                         if (gReorderContactsOnCpu)
833                                                         {
834                                                                 B3_PROFILE("cpu m_reorderContactKernel");
835                                                                 b3AlignedObjectArray<b3SortData> sortDataHost;
836                                                                 m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost);
837                                                                 b3AlignedObjectArray<b3Contact4> inContacts;
838                                                                 b3AlignedObjectArray<b3Contact4> outContacts;
839                                                                 m_data->m_pBufContactOutGPU->copyToHost(inContacts);
840                                                                 outContacts.resize(inContacts.size());
841                                                                 for (int i = 0; i < nContacts; i++)
842                                                                 {
843                                                                         int srcIdx = sortDataHost[i].y;
844                                                                         outContacts[i] = inContacts[srcIdx];
845                                                                 }
846                                                                 m_data->m_solverGPU->m_contactBuffer2->copyFromHost(outContacts);
847
848                                                                 /*                                                              "void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n"
849                                                                 "{\n"
850                                                                 "       int nContacts = cb.x;\n"
851                                                                 "       int gIdx = GET_GLOBAL_IDX;\n"
852                                                                 "       if( gIdx < nContacts )\n"
853                                                                 "       {\n"
854                                                                 "               int srcIdx = sortData[gIdx].y;\n"
855                                                                 "               out[gIdx] = in[srcIdx];\n"
856                                                                 "       }\n"
857                                                                 "}\n"
858                                                                 */
859                                                         }
860                                                         else
861                                                         {
862                                                                 B3_PROFILE("gpu m_reorderContactKernel");
863
864                                                                 b3Int4 cdata;
865                                                                 cdata.x = nContacts;
866
867                                                                 b3BufferInfoCL bInfo[] = {
868                                                                         b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL()),
869                                                                         b3BufferInfoCL(m_data->m_solverGPU->m_contactBuffer2->getBufferCL()), b3BufferInfoCL(m_data->m_solverGPU->m_sortDataBuffer->getBufferCL())};
870
871                                                                 b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_reorderContactKernel, "m_reorderContactKernel");
872                                                                 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
873                                                                 launcher.setConst(cdata);
874                                                                 launcher.launch1D(nContacts, 64);
875                                                         }
876                                                 }
877                                         }
878                                 }
879
880                                 //clFinish(m_data->m_queue);
881
882                                 //                              {
883                                 //                              b3AlignedObjectArray<unsigned int> histogram;
884                                 //                              m_data->m_solverGPU->m_numConstraints->copyToHost(histogram);
885                                 //                              printf(",,,\n");
886                                 //                              }
887
888                                 if (nContacts)
889                                 {
890                                         if (gUseCpuCopyConstraints)
891                                         {
892                                                 for (int i = 0; i < nContacts; i++)
893                                                 {
894                                                         m_data->m_pBufContactOutGPU->copyFromOpenCLArray(*m_data->m_solverGPU->m_contactBuffer2);
895                                                         //                                                      m_data->m_solverGPU->m_contactBuffer2->getBufferCL();
896                                                         //                                              m_data->m_pBufContactOutGPU->getBufferCL()
897                                                 }
898                                         }
899                                         else
900                                         {
901                                                 B3_PROFILE("gpu m_copyConstraintKernel");
902                                                 b3Int4 cdata;
903                                                 cdata.x = nContacts;
904                                                 b3BufferInfoCL bInfo[] = {
905                                                         b3BufferInfoCL(m_data->m_solverGPU->m_contactBuffer2->getBufferCL()),
906                                                         b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL())};
907
908                                                 b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_copyConstraintKernel, "m_copyConstraintKernel");
909                                                 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
910                                                 launcher.setConst(cdata);
911                                                 launcher.launch1D(nContacts, 64);
912                                                 //we use the clFinish for proper benchmark/profile
913                                                 clFinish(m_data->m_queue);
914                                         }
915                                 }
916
917                                 //                              bool compareGPU = false;
918                                 if (nContacts)
919                                 {
920                                         if (!gCpuBatchContacts)
921                                         {
922                                                 B3_PROFILE("gpu batchContacts");
923                                                 maxNumBatches = 250;  //250;
924                                                 m_data->m_solverGPU->batchContacts(m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx);
925                                                 clFinish(m_data->m_queue);
926                                         }
927                                         else
928                                         {
929                                                 B3_PROFILE("cpu batchContacts");
930                                                 static b3AlignedObjectArray<b3Contact4> cpuContacts;
931                                                 b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2;
932                                                 {
933                                                         B3_PROFILE("copyToHost");
934                                                         contactsIn->copyToHost(cpuContacts);
935                                                 }
936                                                 b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
937                                                 b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
938
939                                                 b3AlignedObjectArray<unsigned int> nNativeHost;
940                                                 b3AlignedObjectArray<unsigned int> offsetsNativeHost;
941
942                                                 {
943                                                         B3_PROFILE("countsNative/offsetsNative copyToHost");
944                                                         countsNative->copyToHost(nNativeHost);
945                                                         offsetsNative->copyToHost(offsetsNativeHost);
946                                                 }
947
948                                                 int numNonzeroGrid = 0;
949
950                                                 if (gUseLargeBatches)
951                                                 {
952                                                         m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES);
953                                                         int totalNumConstraints = cpuContacts.size();
954                                                         //int simdWidth =numBodies+1;//-1;//64;//-1;//32;
955                                                         int numBatches = sortConstraintByBatch3(&cpuContacts[0], totalNumConstraints, totalNumConstraints + 1, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[0]);  //     on GPU
956                                                         maxNumBatches = b3Max(numBatches, maxNumBatches);
957                                                         static int globalMaxBatch = 0;
958                                                         if (maxNumBatches > globalMaxBatch)
959                                                         {
960                                                                 globalMaxBatch = maxNumBatches;
961                                                                 b3Printf("maxNumBatches = %d\n", maxNumBatches);
962                                                         }
963                                                 }
964                                                 else
965                                                 {
966                                                         m_data->m_batchSizes.resize(B3_SOLVER_N_CELLS * B3_MAX_NUM_BATCHES);
967                                                         B3_PROFILE("cpu batch grid");
968                                                         for (int i = 0; i < B3_SOLVER_N_CELLS; i++)
969                                                         {
970                                                                 int n = (nNativeHost)[i];
971                                                                 int offset = (offsetsNativeHost)[i];
972                                                                 if (n)
973                                                                 {
974                                                                         numNonzeroGrid++;
975                                                                         int simdWidth = numBodies + 1;                                                                                                                                 //-1;//64;//-1;//32;
976                                                                         int numBatches = sortConstraintByBatch3(&cpuContacts[0] + offset, n, simdWidth, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[i * B3_MAX_NUM_BATCHES]);  //       on GPU
977                                                                         maxNumBatches = b3Max(numBatches, maxNumBatches);
978                                                                         static int globalMaxBatch = 0;
979                                                                         if (maxNumBatches > globalMaxBatch)
980                                                                         {
981                                                                                 globalMaxBatch = maxNumBatches;
982                                                                                 b3Printf("maxNumBatches = %d\n", maxNumBatches);
983                                                                         }
984                                                                         //we use the clFinish for proper benchmark/profile
985                                                                 }
986                                                         }
987                                                         //clFinish(m_data->m_queue);
988                                                 }
989                                                 {
990                                                         B3_PROFILE("m_contactBuffer->copyFromHost");
991                                                         m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts);
992                                                 }
993                                         }
994                                 }
995                         }
996                 }
997
998                 //printf("maxNumBatches = %d\n", maxNumBatches);
999
1000                 if (gUseLargeBatches)
1001                 {
1002                         if (nContacts)
1003                         {
1004                                 B3_PROFILE("cpu batchContacts");
1005                                 static b3AlignedObjectArray<b3Contact4> cpuContacts;
1006                                 //                              b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2;
1007                                 {
1008                                         B3_PROFILE("copyToHost");
1009                                         m_data->m_pBufContactOutGPU->copyToHost(cpuContacts);
1010                                 }
1011                                 //                              b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
1012                                 //                              b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
1013
1014                                 //                              int numNonzeroGrid=0;
1015
1016                                 {
1017                                         m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES);
1018                                         int totalNumConstraints = cpuContacts.size();
1019                                         //                              int simdWidth =numBodies+1;//-1;//64;//-1;//32;
1020                                         int numBatches = sortConstraintByBatch3(&cpuContacts[0], totalNumConstraints, totalNumConstraints + 1, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[0]);  //     on GPU
1021                                         maxNumBatches = b3Max(numBatches, maxNumBatches);
1022                                         static int globalMaxBatch = 0;
1023                                         if (maxNumBatches > globalMaxBatch)
1024                                         {
1025                                                 globalMaxBatch = maxNumBatches;
1026                                                 b3Printf("maxNumBatches = %d\n", maxNumBatches);
1027                                         }
1028                                 }
1029                                 {
1030                                         B3_PROFILE("m_contactBuffer->copyFromHost");
1031                                         m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts);
1032                                 }
1033                         }
1034                 }
1035
1036                 if (nContacts)
1037                 {
1038                         B3_PROFILE("gpu convertToConstraints");
1039                         m_data->m_solverGPU->convertToConstraints(bodyBuf,
1040                                                                                                           shapeBuf, m_data->m_solverGPU->m_contactBuffer2,
1041                                                                                                           contactConstraintOut,
1042                                                                                                           additionalData, nContacts,
1043                                                                                                           (b3SolverBase::ConstraintCfg&)csCfg);
1044                         clFinish(m_data->m_queue);
1045                 }
1046
1047                 if (1)
1048                 {
1049                         int numIter = 4;
1050
1051                         m_data->m_solverGPU->m_nIterations = numIter;  //10
1052                         if (!gCpuSolveConstraint)
1053                         {
1054                                 B3_PROFILE("GPU solveContactConstraint");
1055
1056                                 /*m_data->m_solverGPU->solveContactConstraint(
1057                                 m_data->m_bodyBufferGPU, 
1058                                 m_data->m_inertiaBufferGPU,
1059                                 m_data->m_contactCGPU,0,
1060                                 nContactOut ,
1061                                 maxNumBatches);
1062                                 */
1063
1064                                 //m_data->m_batchSizesGpu->copyFromHost(m_data->m_batchSizes);
1065
1066                                 if (gUseLargeBatches)
1067                                 {
1068                                         solveContactConstraintBatchSizes(m_data->m_bodyBufferGPU,
1069                                                                                                          m_data->m_inertiaBufferGPU,
1070                                                                                                          m_data->m_contactCGPU, 0,
1071                                                                                                          nContactOut,
1072                                                                                                          maxNumBatches, numIter, &m_data->m_batchSizes);
1073                                 }
1074                                 else
1075                                 {
1076                                         solveContactConstraint(
1077                                                 m_data->m_bodyBufferGPU,
1078                                                 m_data->m_inertiaBufferGPU,
1079                                                 m_data->m_contactCGPU, 0,
1080                                                 nContactOut,
1081                                                 maxNumBatches, numIter, &m_data->m_batchSizes);  //m_data->m_batchSizesGpu);
1082                                 }
1083                         }
1084                         else
1085                         {
1086                                 B3_PROFILE("Host solveContactConstraint");
1087
1088                                 m_data->m_solverGPU->solveContactConstraintHost(m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU, 0, nContactOut, maxNumBatches, &m_data->m_batchSizes);
1089                         }
1090                 }
1091
1092 #if 0
1093         if (0)
1094         {
1095             B3_PROFILE("read body velocities back to CPU");
1096             //read body updated linear/angular velocities back to CPU
1097             m_data->m_bodyBufferGPU->read(
1098                                                   m_data->m_bodyBufferCPU->m_ptr,numOfConvexRBodies);
1099             adl::DeviceUtils::waitForCompletion( m_data->m_deviceCL );
1100         }
1101 #endif
1102         }
1103 }
1104
1105 void b3GpuPgsContactSolver::batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx)
1106 {
1107 }
1108
1109 b3AlignedObjectArray<unsigned int> idxBuffer;
1110 b3AlignedObjectArray<b3SortData> sortData;
1111 b3AlignedObjectArray<b3Contact4> old;
1112
1113 inline int b3GpuPgsContactSolver::sortConstraintByBatch(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies)
1114 {
1115         B3_PROFILE("sortConstraintByBatch");
1116         int numIter = 0;
1117
1118         sortData.resize(n);
1119         idxBuffer.resize(n);
1120         old.resize(n);
1121
1122         unsigned int* idxSrc = &idxBuffer[0];
1123         unsigned int* idxDst = &idxBuffer[0];
1124         int nIdxSrc, nIdxDst;
1125
1126         const int N_FLG = 256;
1127         const int FLG_MASK = N_FLG - 1;
1128         unsigned int flg[N_FLG / 32];
1129 #if defined(_DEBUG)
1130         for (int i = 0; i < n; i++)
1131                 cs[i].getBatchIdx() = -1;
1132 #endif
1133         for (int i = 0; i < n; i++)
1134                 idxSrc[i] = i;
1135         nIdxSrc = n;
1136
1137         int batchIdx = 0;
1138
1139         {
1140                 B3_PROFILE("cpu batch innerloop");
1141                 while (nIdxSrc)
1142                 {
1143                         numIter++;
1144                         nIdxDst = 0;
1145                         int nCurrentBatch = 0;
1146
1147                         //      clear flag
1148                         for (int i = 0; i < N_FLG / 32; i++) flg[i] = 0;
1149
1150                         for (int i = 0; i < nIdxSrc; i++)
1151                         {
1152                                 int idx = idxSrc[i];
1153
1154                                 b3Assert(idx < n);
1155                                 //      check if it can go
1156                                 int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
1157                                 int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
1158
1159                                 int bodyA = abs(bodyAS);
1160                                 int bodyB = abs(bodyBS);
1161
1162                                 int aIdx = bodyA & FLG_MASK;
1163                                 int bIdx = bodyB & FLG_MASK;
1164
1165                                 unsigned int aUnavailable = flg[aIdx / 32] & (1 << (aIdx & 31));
1166                                 unsigned int bUnavailable = flg[bIdx / 32] & (1 << (bIdx & 31));
1167
1168                                 bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
1169                                 bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
1170
1171                                 //use inv_mass!
1172                                 aUnavailable = !aIsStatic ? aUnavailable : 0;  //
1173                                 bUnavailable = !bIsStatic ? bUnavailable : 0;
1174
1175                                 if (aUnavailable == 0 && bUnavailable == 0)  // ok
1176                                 {
1177                                         if (!aIsStatic)
1178                                                 flg[aIdx / 32] |= (1 << (aIdx & 31));
1179                                         if (!bIsStatic)
1180                                                 flg[bIdx / 32] |= (1 << (bIdx & 31));
1181
1182                                         cs[idx].getBatchIdx() = batchIdx;
1183                                         sortData[idx].m_key = batchIdx;
1184                                         sortData[idx].m_value = idx;
1185
1186                                         {
1187                                                 nCurrentBatch++;
1188                                                 if (nCurrentBatch == simdWidth)
1189                                                 {
1190                                                         nCurrentBatch = 0;
1191                                                         for (int i = 0; i < N_FLG / 32; i++) flg[i] = 0;
1192                                                 }
1193                                         }
1194                                 }
1195                                 else
1196                                 {
1197                                         idxDst[nIdxDst++] = idx;
1198                                 }
1199                         }
1200                         b3Swap(idxSrc, idxDst);
1201                         b3Swap(nIdxSrc, nIdxDst);
1202                         batchIdx++;
1203                 }
1204         }
1205         {
1206                 B3_PROFILE("quickSort");
1207                 sortData.quickSort(sortfnc);
1208         }
1209
1210         {
1211                 B3_PROFILE("reorder");
1212                 //      reorder
1213
1214                 memcpy(&old[0], cs, sizeof(b3Contact4) * n);
1215                 for (int i = 0; i < n; i++)
1216                 {
1217                         int idx = sortData[i].m_value;
1218                         cs[i] = old[idx];
1219                 }
1220         }
1221
1222 #if defined(_DEBUG)
1223         //              debugPrintf( "nBatches: %d\n", batchIdx );
1224         for (int i = 0; i < n; i++)
1225         {
1226                 b3Assert(cs[i].getBatchIdx() != -1);
1227         }
1228 #endif
1229         return batchIdx;
1230 }
1231
1232 b3AlignedObjectArray<int> bodyUsed2;
1233
1234 inline int b3GpuPgsContactSolver::sortConstraintByBatch2(b3Contact4* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies)
1235 {
1236         B3_PROFILE("sortConstraintByBatch2");
1237
1238         bodyUsed2.resize(2 * simdWidth);
1239
1240         for (int q = 0; q < 2 * simdWidth; q++)
1241                 bodyUsed2[q] = 0;
1242
1243         int curBodyUsed = 0;
1244
1245         int numIter = 0;
1246
1247         m_data->m_sortData.resize(numConstraints);
1248         m_data->m_idxBuffer.resize(numConstraints);
1249         m_data->m_old.resize(numConstraints);
1250
1251         unsigned int* idxSrc = &m_data->m_idxBuffer[0];
1252
1253 #if defined(_DEBUG)
1254         for (int i = 0; i < numConstraints; i++)
1255                 cs[i].getBatchIdx() = -1;
1256 #endif
1257         for (int i = 0; i < numConstraints; i++)
1258                 idxSrc[i] = i;
1259
1260         int numValidConstraints = 0;
1261         //      int unprocessedConstraintIndex = 0;
1262
1263         int batchIdx = 0;
1264
1265         {
1266                 B3_PROFILE("cpu batch innerloop");
1267
1268                 while (numValidConstraints < numConstraints)
1269                 {
1270                         numIter++;
1271                         int nCurrentBatch = 0;
1272                         //      clear flag
1273                         for (int i = 0; i < curBodyUsed; i++)
1274                                 bodyUsed2[i] = 0;
1275                         curBodyUsed = 0;
1276
1277                         for (int i = numValidConstraints; i < numConstraints; i++)
1278                         {
1279                                 int idx = idxSrc[i];
1280                                 b3Assert(idx < numConstraints);
1281                                 //      check if it can go
1282                                 int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
1283                                 int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
1284                                 int bodyA = abs(bodyAS);
1285                                 int bodyB = abs(bodyBS);
1286                                 bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
1287                                 bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
1288                                 int aUnavailable = 0;
1289                                 int bUnavailable = 0;
1290                                 if (!aIsStatic)
1291                                 {
1292                                         for (int j = 0; j < curBodyUsed; j++)
1293                                         {
1294                                                 if (bodyA == bodyUsed2[j])
1295                                                 {
1296                                                         aUnavailable = 1;
1297                                                         break;
1298                                                 }
1299                                         }
1300                                 }
1301                                 if (!aUnavailable)
1302                                         if (!bIsStatic)
1303                                         {
1304                                                 for (int j = 0; j < curBodyUsed; j++)
1305                                                 {
1306                                                         if (bodyB == bodyUsed2[j])
1307                                                         {
1308                                                                 bUnavailable = 1;
1309                                                                 break;
1310                                                         }
1311                                                 }
1312                                         }
1313
1314                                 if (aUnavailable == 0 && bUnavailable == 0)  // ok
1315                                 {
1316                                         if (!aIsStatic)
1317                                         {
1318                                                 bodyUsed2[curBodyUsed++] = bodyA;
1319                                         }
1320                                         if (!bIsStatic)
1321                                         {
1322                                                 bodyUsed2[curBodyUsed++] = bodyB;
1323                                         }
1324
1325                                         cs[idx].getBatchIdx() = batchIdx;
1326                                         m_data->m_sortData[idx].m_key = batchIdx;
1327                                         m_data->m_sortData[idx].m_value = idx;
1328
1329                                         if (i != numValidConstraints)
1330                                         {
1331                                                 b3Swap(idxSrc[i], idxSrc[numValidConstraints]);
1332                                         }
1333
1334                                         numValidConstraints++;
1335                                         {
1336                                                 nCurrentBatch++;
1337                                                 if (nCurrentBatch == simdWidth)
1338                                                 {
1339                                                         nCurrentBatch = 0;
1340                                                         for (int i = 0; i < curBodyUsed; i++)
1341                                                                 bodyUsed2[i] = 0;
1342
1343                                                         curBodyUsed = 0;
1344                                                 }
1345                                         }
1346                                 }
1347                         }
1348
1349                         batchIdx++;
1350                 }
1351         }
1352         {
1353                 B3_PROFILE("quickSort");
1354                 //m_data->m_sortData.quickSort(sortfnc);
1355         }
1356
1357         {
1358                 B3_PROFILE("reorder");
1359                 //      reorder
1360
1361                 memcpy(&m_data->m_old[0], cs, sizeof(b3Contact4) * numConstraints);
1362
1363                 for (int i = 0; i < numConstraints; i++)
1364                 {
1365                         b3Assert(m_data->m_sortData[idxSrc[i]].m_value == idxSrc[i]);
1366                         int idx = m_data->m_sortData[idxSrc[i]].m_value;
1367                         cs[i] = m_data->m_old[idx];
1368                 }
1369         }
1370
1371 #if defined(_DEBUG)
1372         //              debugPrintf( "nBatches: %d\n", batchIdx );
1373         for (int i = 0; i < numConstraints; i++)
1374         {
1375                 b3Assert(cs[i].getBatchIdx() != -1);
1376         }
1377 #endif
1378
1379         return batchIdx;
1380 }
1381
1382 b3AlignedObjectArray<int> bodyUsed;
1383 b3AlignedObjectArray<int> curUsed;
1384
1385 inline int b3GpuPgsContactSolver::sortConstraintByBatch3(b3Contact4* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies, int* batchSizes)
1386 {
1387         B3_PROFILE("sortConstraintByBatch3");
1388
1389         static int maxSwaps = 0;
1390         int numSwaps = 0;
1391
1392         curUsed.resize(2 * simdWidth);
1393
1394         static int maxNumConstraints = 0;
1395         if (maxNumConstraints < numConstraints)
1396         {
1397                 maxNumConstraints = numConstraints;
1398                 //printf("maxNumConstraints  = %d\n",maxNumConstraints );
1399         }
1400
1401         int numUsedArray = numBodies / 32 + 1;
1402         bodyUsed.resize(numUsedArray);
1403
1404         for (int q = 0; q < numUsedArray; q++)
1405                 bodyUsed[q] = 0;
1406
1407         int curBodyUsed = 0;
1408
1409         int numIter = 0;
1410
1411         m_data->m_sortData.resize(0);
1412         m_data->m_idxBuffer.resize(0);
1413         m_data->m_old.resize(0);
1414
1415 #if defined(_DEBUG)
1416         for (int i = 0; i < numConstraints; i++)
1417                 cs[i].getBatchIdx() = -1;
1418 #endif
1419
1420         int numValidConstraints = 0;
1421         //      int unprocessedConstraintIndex = 0;
1422
1423         int batchIdx = 0;
1424
1425         {
1426                 B3_PROFILE("cpu batch innerloop");
1427
1428                 while (numValidConstraints < numConstraints)
1429                 {
1430                         numIter++;
1431                         int nCurrentBatch = 0;
1432                         batchSizes[batchIdx] = 0;
1433
1434                         //      clear flag
1435                         for (int i = 0; i < curBodyUsed; i++)
1436                                 bodyUsed[curUsed[i] / 32] = 0;
1437
1438                         curBodyUsed = 0;
1439
1440                         for (int i = numValidConstraints; i < numConstraints; i++)
1441                         {
1442                                 int idx = i;
1443                                 b3Assert(idx < numConstraints);
1444                                 //      check if it can go
1445                                 int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
1446                                 int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
1447                                 int bodyA = abs(bodyAS);
1448                                 int bodyB = abs(bodyBS);
1449                                 bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
1450                                 bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
1451                                 int aUnavailable = 0;
1452                                 int bUnavailable = 0;
1453                                 if (!aIsStatic)
1454                                 {
1455                                         aUnavailable = bodyUsed[bodyA / 32] & (1 << (bodyA & 31));
1456                                 }
1457                                 if (!aUnavailable)
1458                                         if (!bIsStatic)
1459                                         {
1460                                                 bUnavailable = bodyUsed[bodyB / 32] & (1 << (bodyB & 31));
1461                                         }
1462
1463                                 if (aUnavailable == 0 && bUnavailable == 0)  // ok
1464                                 {
1465                                         if (!aIsStatic)
1466                                         {
1467                                                 bodyUsed[bodyA / 32] |= (1 << (bodyA & 31));
1468                                                 curUsed[curBodyUsed++] = bodyA;
1469                                         }
1470                                         if (!bIsStatic)
1471                                         {
1472                                                 bodyUsed[bodyB / 32] |= (1 << (bodyB & 31));
1473                                                 curUsed[curBodyUsed++] = bodyB;
1474                                         }
1475
1476                                         cs[idx].getBatchIdx() = batchIdx;
1477
1478                                         if (i != numValidConstraints)
1479                                         {
1480                                                 b3Swap(cs[i], cs[numValidConstraints]);
1481                                                 numSwaps++;
1482                                         }
1483
1484                                         numValidConstraints++;
1485                                         {
1486                                                 nCurrentBatch++;
1487                                                 if (nCurrentBatch == simdWidth)
1488                                                 {
1489                                                         batchSizes[batchIdx] += simdWidth;
1490                                                         nCurrentBatch = 0;
1491                                                         for (int i = 0; i < curBodyUsed; i++)
1492                                                                 bodyUsed[curUsed[i] / 32] = 0;
1493                                                         curBodyUsed = 0;
1494                                                 }
1495                                         }
1496                                 }
1497                         }
1498
1499                         if (batchIdx >= B3_MAX_NUM_BATCHES)
1500                         {
1501                                 b3Error("batchIdx>=B3_MAX_NUM_BATCHES");
1502                                 b3Assert(0);
1503                                 break;
1504                         }
1505
1506                         batchSizes[batchIdx] += nCurrentBatch;
1507
1508                         batchIdx++;
1509                 }
1510         }
1511
1512 #if defined(_DEBUG)
1513         //              debugPrintf( "nBatches: %d\n", batchIdx );
1514         for (int i = 0; i < numConstraints; i++)
1515         {
1516                 b3Assert(cs[i].getBatchIdx() != -1);
1517         }
1518 #endif
1519
1520         batchSizes[batchIdx] = 0;
1521
1522         if (maxSwaps < numSwaps)
1523         {
1524                 maxSwaps = numSwaps;
1525                 //printf("maxSwaps = %d\n", maxSwaps);
1526         }
1527
1528         return batchIdx;
1529 }