[dali_2.3.21] Merge branch 'devel/master'
[platform/core/uifw/dali-toolkit.git] / dali-physics / third-party / bullet3 / src / Bullet3OpenCL / RigidBody / b3Solver.cpp
1 /*
2 Copyright (c) 2012 Advanced Micro Devices, Inc.  
3
4 This software is provided 'as-is', without any express or implied warranty.
5 In no event will the authors be held liable for any damages arising from the use of this software.
6 Permission is granted to anyone to use this software for any purpose, 
7 including commercial applications, and to alter it and redistribute it freely, 
8 subject to the following restrictions:
9
10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
12 3. This notice may not be removed or altered from any source distribution.
13 */
14 //Originally written by Takahiro Harada
15
16 #include "b3Solver.h"
17
/// Global switch for the batching kernel variant. According to the original note, the
/// "new" batching kernel is a rewrite that uses just a single thread of the warp and
/// exists for experiments.
/// NOTE(review): not read anywhere in the visible part of this file -- confirm where it is consumed.
bool useNewBatchingKernel = true;
/// Global switch; by its name it selects CPU-side contact-to-constraint conversion.
/// NOTE(review): not read anywhere in the visible part of this file -- confirm where it is consumed.
bool gConvertConstraintOnCpu = false;
21
// Source-tree paths of the OpenCL kernel files. The b3Solver constructor passes each one
// as the trailing argument of b3OpenCLUtils::compileCLProgramFromString, next to the
// matching embedded source string.
// NOTE(review): presumably used as the file name for the compiled-program cache -- confirm in b3OpenCLUtils.
#define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"
#define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl"
#define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl"
#define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl"
#define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl"
#define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl"
28
29 #include "Bullet3Dynamics/shared/b3ConvertConstraint4.h"
30
31 #include "kernels/solverSetup.h"
32 #include "kernels/solverSetup2.h"
33
34 #include "kernels/solveContact.h"
35 #include "kernels/solveFriction.h"
36
37 #include "kernels/batchingKernels.h"
38 #include "kernels/batchingKernelsNew.h"
39
40 #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
41 #include "Bullet3Common/b3Vector3.h"
42
/// Plain record of sixteen integer slots followed by four float slots.
/// NOTE(review): not referenced in the visible part of this file; presumably a debug
/// read-back record for the solver kernels -- confirm against the kernel sources.
/// The member order is the memory layout, so it must not be rearranged.
struct SolverDebugInfo
{
        // Integer slots 0..15, in layout order.
        int m_valInt0, m_valInt1, m_valInt2, m_valInt3;
        int m_valInt4, m_valInt5, m_valInt6, m_valInt7;
        int m_valInt8, m_valInt9, m_valInt10, m_valInt11;
        int m_valInt12, m_valInt13, m_valInt14, m_valInt15;

        // Float slots 0..3, in layout order.
        float m_val0, m_val1, m_val2, m_val3;
};
70
/// Holder type whose only content is the nested ParallelSolveData record.
class SolverDeviceInl
{
public:
        /// Pair of OpenCL array pointers carrying the same names as b3Solver's
        /// m_numConstraints / m_offsets buffers.
        /// NOTE(review): nothing in the visible part of this file fills or reads this
        /// record -- confirm ownership and usage with the callers.
        struct ParallelSolveData
        {
                b3OpenCLArray<unsigned int>* m_numConstraints;
                b3OpenCLArray<unsigned int>* m_offsets;
        };
};
80
81 b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity)
82         : m_context(ctx),
83           m_device(device),
84           m_queue(queue),
85           m_batchSizes(ctx, queue),
86           m_nIterations(4)
87 {
88         m_sort32 = new b3RadixSort32CL(ctx, device, queue);
89         m_scan = new b3PrefixScanCL(ctx, device, queue, B3_SOLVER_N_CELLS);
90         m_search = new b3BoundSearchCL(ctx, device, queue, B3_SOLVER_N_CELLS);
91
92         const int sortSize = B3NEXTMULTIPLEOF(pairCapacity, 512);
93
94         m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx, queue, sortSize);
95         m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(ctx, queue);
96
97         m_numConstraints = new b3OpenCLArray<unsigned int>(ctx, queue, B3_SOLVER_N_CELLS);
98         m_numConstraints->resize(B3_SOLVER_N_CELLS);
99
100         m_offsets = new b3OpenCLArray<unsigned int>(ctx, queue, B3_SOLVER_N_CELLS);
101         m_offsets->resize(B3_SOLVER_N_CELLS);
102         const char* additionalMacros = "";
103         //      const char* srcFileNameForCaching="";
104
105         cl_int pErrNum;
106         const char* batchKernelSource = batchingKernelsCL;
107         const char* batchKernelNewSource = batchingKernelsNewCL;
108
109         const char* solverSetupSource = solverSetupCL;
110         const char* solverSetup2Source = solverSetup2CL;
111         const char* solveContactSource = solveContactCL;
112         const char* solveFrictionSource = solveFrictionCL;
113
114         {
115                 cl_program solveContactProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveContactSource, &pErrNum, additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
116                 b3Assert(solveContactProg);
117
118                 cl_program solveFrictionProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveFrictionSource, &pErrNum, additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
119                 b3Assert(solveFrictionProg);
120
121                 cl_program solverSetup2Prog = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetup2Source, &pErrNum, additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
122                 b3Assert(solverSetup2Prog);
123
124                 cl_program solverSetupProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetupSource, &pErrNum, additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
125                 b3Assert(solverSetupProg);
126
127                 m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg, additionalMacros);
128                 b3Assert(m_solveFrictionKernel);
129
130                 m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg, additionalMacros);
131                 b3Assert(m_solveContactKernel);
132
133                 m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg, additionalMacros);
134                 b3Assert(m_contactToConstraintKernel);
135
136                 m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog, additionalMacros);
137                 b3Assert(m_setSortDataKernel);
138
139                 m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog, additionalMacros);
140                 b3Assert(m_reorderContactKernel);
141
142                 m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog, additionalMacros);
143                 b3Assert(m_copyConstraintKernel);
144         }
145
146         {
147                 cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelSource, &pErrNum, additionalMacros, B3_BATCHING_PATH);
148                 //cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, 0, &pErrNum,additionalMacros, B3_BATCHING_PATH,true);
149                 b3Assert(batchingProg);
150
151                 m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg, additionalMacros);
152                 b3Assert(m_batchingKernel);
153         }
154         {
155                 cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelNewSource, &pErrNum, additionalMacros, B3_BATCHING_NEW_PATH);
156                 b3Assert(batchingNewProg);
157
158                 m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg, additionalMacros);
159                 //m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesBruteForce", &pErrNum, batchingNewProg,additionalMacros );
160                 b3Assert(m_batchingKernelNew);
161         }
162 }
163
164 b3Solver::~b3Solver()
165 {
166         delete m_offsets;
167         delete m_numConstraints;
168         delete m_sortDataBuffer;
169         delete m_contactBuffer2;
170
171         delete m_sort32;
172         delete m_scan;
173         delete m_search;
174
175         clReleaseKernel(m_batchingKernel);
176         clReleaseKernel(m_batchingKernelNew);
177
178         clReleaseKernel(m_solveContactKernel);
179         clReleaseKernel(m_solveFrictionKernel);
180
181         clReleaseKernel(m_contactToConstraintKernel);
182         clReleaseKernel(m_setSortDataKernel);
183         clReleaseKernel(m_reorderContactKernel);
184         clReleaseKernel(m_copyConstraintKernel);
185 }
186
187 template <bool JACOBI>
188 static __inline void solveContact(b3GpuConstraint4& cs,
189                                                                   const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
190                                                                   const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
191                                                                   float maxRambdaDt[4], float minRambdaDt[4])
192 {
193         b3Vector3 dLinVelA;
194         dLinVelA.setZero();
195         b3Vector3 dAngVelA;
196         dAngVelA.setZero();
197         b3Vector3 dLinVelB;
198         dLinVelB.setZero();
199         b3Vector3 dAngVelB;
200         dAngVelB.setZero();
201
202         for (int ic = 0; ic < 4; ic++)
203         {
204                 //      dont necessary because this makes change to 0
205                 if (cs.m_jacCoeffInv[ic] == 0.f) continue;
206
207                 {
208                         b3Vector3 angular0, angular1, linear;
209                         b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA;
210                         b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB;
211                         setLinearAndAngular((const b3Vector3&)cs.m_linear, (const b3Vector3&)r0, (const b3Vector3&)r1, &linear, &angular0, &angular1);
212
213                         float rambdaDt = calcRelVel((const b3Vector3&)cs.m_linear, (const b3Vector3&)-cs.m_linear, angular0, angular1,
214                                                                                 linVelA, angVelA, linVelB, angVelB) +
215                                                          cs.m_b[ic];
216                         rambdaDt *= cs.m_jacCoeffInv[ic];
217
218                         {
219                                 float prevSum = cs.m_appliedRambdaDt[ic];
220                                 float updated = prevSum;
221                                 updated += rambdaDt;
222                                 updated = b3Max(updated, minRambdaDt[ic]);
223                                 updated = b3Min(updated, maxRambdaDt[ic]);
224                                 rambdaDt = updated - prevSum;
225                                 cs.m_appliedRambdaDt[ic] = updated;
226                         }
227
228                         b3Vector3 linImp0 = invMassA * linear * rambdaDt;
229                         b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt;
230                         b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt;
231                         b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt;
232 #ifdef _WIN32
233                         b3Assert(_finite(linImp0.getX()));
234                         b3Assert(_finite(linImp1.getX()));
235 #endif
236                         if (JACOBI)
237                         {
238                                 dLinVelA += linImp0;
239                                 dAngVelA += angImp0;
240                                 dLinVelB += linImp1;
241                                 dAngVelB += angImp1;
242                         }
243                         else
244                         {
245                                 linVelA += linImp0;
246                                 angVelA += angImp0;
247                                 linVelB += linImp1;
248                                 angVelB += angImp1;
249                         }
250                 }
251         }
252
253         if (JACOBI)
254         {
255                 linVelA += dLinVelA;
256                 angVelA += dAngVelA;
257                 linVelB += dLinVelB;
258                 angVelB += dAngVelB;
259         }
260 }
261
262 static __inline void solveFriction(b3GpuConstraint4& cs,
263                                                                    const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
264                                                                    const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
265                                                                    float maxRambdaDt[4], float minRambdaDt[4])
266 {
267         if (cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[0] == 0) return;
268         const b3Vector3& center = (const b3Vector3&)cs.m_center;
269
270         b3Vector3 n = -(const b3Vector3&)cs.m_linear;
271
272         b3Vector3 tangent[2];
273 #if 1
274         b3PlaneSpace1(n, tangent[0], tangent[1]);
275 #else
276         b3Vector3 r = cs.m_worldPos[0] - center;
277         tangent[0] = cross3(n, r);
278         tangent[1] = cross3(tangent[0], n);
279         tangent[0] = normalize3(tangent[0]);
280         tangent[1] = normalize3(tangent[1]);
281 #endif
282
283         b3Vector3 angular0, angular1, linear;
284         b3Vector3 r0 = center - posA;
285         b3Vector3 r1 = center - posB;
286         for (int i = 0; i < 2; i++)
287         {
288                 setLinearAndAngular(tangent[i], r0, r1, &linear, &angular0, &angular1);
289                 float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
290                                                                         linVelA, angVelA, linVelB, angVelB);
291                 rambdaDt *= cs.m_fJacCoeffInv[i];
292
293                 {
294                         float prevSum = cs.m_fAppliedRambdaDt[i];
295                         float updated = prevSum;
296                         updated += rambdaDt;
297                         updated = b3Max(updated, minRambdaDt[i]);
298                         updated = b3Min(updated, maxRambdaDt[i]);
299                         rambdaDt = updated - prevSum;
300                         cs.m_fAppliedRambdaDt[i] = updated;
301                 }
302
303                 b3Vector3 linImp0 = invMassA * linear * rambdaDt;
304                 b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt;
305                 b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt;
306                 b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt;
307 #ifdef _WIN32
308                 b3Assert(_finite(linImp0.getX()));
309                 b3Assert(_finite(linImp1.getX()));
310 #endif
311                 linVelA += linImp0;
312                 angVelA += angImp0;
313                 linVelB += linImp1;
314                 angVelB += angImp1;
315         }
316
317         {  //   angular damping for point constraint
318                 b3Vector3 ab = (posB - posA).normalized();
319                 b3Vector3 ac = (center - posA).normalized();
320                 if (b3Dot(ab, ac) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
321                 {
322                         float angNA = b3Dot(n, angVelA);
323                         float angNB = b3Dot(n, angVelB);
324
325                         angVelA -= (angNA * 0.1f) * n;
326                         angVelB -= (angNB * 0.1f) * n;
327                 }
328         }
329 }
330 /*
331  b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
332         b3AlignedObjectArray<b3InertiaData>& m_shapes;
333         b3AlignedObjectArray<b3GpuConstraint4>& m_constraints;
334         b3AlignedObjectArray<int>* m_batchSizes;
335         int m_cellIndex;
336         int m_curWgidx;
337         int m_start;
338         int m_nConstraints;
339         bool m_solveFriction;
340         int m_maxNumBatches;
341  */
342
343 struct SolveTask  // : public ThreadPool::Task
344 {
345         SolveTask(b3AlignedObjectArray<b3RigidBodyData>& bodies, b3AlignedObjectArray<b3InertiaData>& shapes, b3AlignedObjectArray<b3GpuConstraint4>& constraints,
346                           int start, int nConstraints, int maxNumBatches, b3AlignedObjectArray<int>* wgUsedBodies, int curWgidx, b3AlignedObjectArray<int>* batchSizes, int cellIndex)
347                 : m_bodies(bodies), m_shapes(shapes), m_constraints(constraints), m_batchSizes(batchSizes), m_cellIndex(cellIndex), m_curWgidx(curWgidx), m_start(start), m_nConstraints(nConstraints), m_solveFriction(true), m_maxNumBatches(maxNumBatches)
348         {
349         }
350
351         unsigned short int getType() { return 0; }
352
353         void run(int tIdx)
354         {
355                 int offset = 0;
356                 for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++)
357                 {
358                         int numInBatch = m_batchSizes->at(m_cellIndex * B3_MAX_NUM_BATCHES + ii);
359                         if (!numInBatch)
360                                 break;
361
362                         for (int jj = 0; jj < numInBatch; jj++)
363                         {
364                                 int i = m_start + offset + jj;
365                                 int batchId = m_constraints[i].m_batchIdx;
366                                 b3Assert(batchId == ii);
367                                 float frictionCoeff = m_constraints[i].getFrictionCoeff();
368                                 int aIdx = (int)m_constraints[i].m_bodyA;
369                                 int bIdx = (int)m_constraints[i].m_bodyB;
370                                 //                              int localBatch = m_constraints[i].m_batchIdx;
371                                 b3RigidBodyData& bodyA = m_bodies[aIdx];
372                                 b3RigidBodyData& bodyB = m_bodies[bIdx];
373
374                                 if (!m_solveFriction)
375                                 {
376                                         float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
377                                         float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f};
378
379                                         solveContact<false>(m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3&)m_shapes[aIdx].m_invInertiaWorld,
380                                                                                 (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3&)m_shapes[bIdx].m_invInertiaWorld,
381                                                                                 maxRambdaDt, minRambdaDt);
382                                 }
383                                 else
384                                 {
385                                         float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
386                                         float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f};
387                                         float sum = 0;
388                                         for (int j = 0; j < 4; j++)
389                                         {
390                                                 sum += m_constraints[i].m_appliedRambdaDt[j];
391                                         }
392                                         frictionCoeff = 0.7f;
393                                         for (int j = 0; j < 4; j++)
394                                         {
395                                                 maxRambdaDt[j] = frictionCoeff * sum;
396                                                 minRambdaDt[j] = -maxRambdaDt[j];
397                                         }
398                                         solveFriction(m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3&)m_shapes[aIdx].m_invInertiaWorld,
399                                                                   (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3&)m_shapes[bIdx].m_invInertiaWorld,
400                                                                   maxRambdaDt, minRambdaDt);
401                                 }
402                         }
403                         offset += numInBatch;
404                 }
405                 /*              for (int bb=0;bb<m_maxNumBatches;bb++)
406                 {
407                         //for(int ic=m_nConstraints-1; ic>=0; ic--)
408                         for(int ic=0; ic<m_nConstraints; ic++)
409                         {
410                                 
411                                 int i = m_start + ic;
412                                 if (m_constraints[i].m_batchIdx != bb)
413                                         continue;
414
415                                 float frictionCoeff = m_constraints[i].getFrictionCoeff();
416                                 int aIdx = (int)m_constraints[i].m_bodyA;
417                                 int bIdx = (int)m_constraints[i].m_bodyB;
418                                 int localBatch = m_constraints[i].m_batchIdx;
419                                 b3RigidBodyData& bodyA = m_bodies[aIdx];
420                                 b3RigidBodyData& bodyB = m_bodies[bIdx];
421
422                                 if( !m_solveFriction )
423                                 {
424                                         float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
425                                         float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
426
427                                         solveContact<false>( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld, 
428                                                         (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld,
429                                                 maxRambdaDt, minRambdaDt );
430                                 }
431                                 else
432                                 {
433                                         float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
434                                         float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
435                                         float sum = 0;
436                                         for(int j=0; j<4; j++)
437                                         {
438                                                 sum +=m_constraints[i].m_appliedRambdaDt[j];
439                                         }
440                                         frictionCoeff = 0.7f;
441                                         for(int j=0; j<4; j++)
442                                         {
443                                                 maxRambdaDt[j] = frictionCoeff*sum;
444                                                 minRambdaDt[j] = -maxRambdaDt[j];
445                                         }
446                                         solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld, 
447                                                 (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld,
448                                                 maxRambdaDt, minRambdaDt );
449                         
450                                 }
451                         }
452                 }
453                 */
454         }
455
456         b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
457         b3AlignedObjectArray<b3InertiaData>& m_shapes;
458         b3AlignedObjectArray<b3GpuConstraint4>& m_constraints;
459         b3AlignedObjectArray<int>* m_batchSizes;
460         int m_cellIndex;
461         int m_curWgidx;
462         int m_start;
463         int m_nConstraints;
464         bool m_solveFriction;
465         int m_maxNumBatches;
466 };
467
468 void b3Solver::solveContactConstraintHost(b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf,
469                                                                                   b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, b3AlignedObjectArray<int>* batchSizes)
470 {
471 #if 0
472         {       
473                 int nSplitX = B3_SOLVER_N_SPLIT_X;
474                 int nSplitY = B3_SOLVER_N_SPLIT_Y;
475                 int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;
476                 for (int z=0;z<4;z++)
477                 {
478                         for (int y=0;y<4;y++)
479                         {
480                                 for (int x=0;x<4;x++)
481                                 {
482                                         int newIndex = (x+y*nSplitX+z*nSplitX*nSplitY);
483                                 //      printf("newIndex=%d\n",newIndex);
484
485                                         int zIdx = newIndex/(nSplitX*nSplitY);
486                                         int remain = newIndex%(nSplitX*nSplitY);
487                                         int yIdx = remain/nSplitX;
488                                         int xIdx = remain%nSplitX;
489                                 //      printf("newIndex=%d\n",newIndex);
490                                 }
491                         }
492                 }
493
494                 //for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
495                 for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
496                 {
497                         for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
498                         {
499                                 int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
500                                 int remain= (wgIdx%((nSplitX*nSplitY)/4));
501                                 int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
502                                 int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
503                                 
504                                 /*int zIdx = newIndex/(nSplitX*nSplitY);
505                                 int remain = newIndex%(nSplitX*nSplitY);
506                                 int yIdx = remain/nSplitX;
507                                 int xIdx = remain%nSplitX;
508                                 */
509                                 int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
510                         //      printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
511                         }
512                 }
513         }
514 #endif
515
516         b3AlignedObjectArray<b3RigidBodyData> bodyNative;
517         bodyBuf->copyToHost(bodyNative);
518         b3AlignedObjectArray<b3InertiaData> shapeNative;
519         shapeBuf->copyToHost(shapeNative);
520         b3AlignedObjectArray<b3GpuConstraint4> constraintNative;
521         constraint->copyToHost(constraintNative);
522
523         b3AlignedObjectArray<unsigned int> numConstraintsHost;
524         m_numConstraints->copyToHost(numConstraintsHost);
525
526         //printf("------------------------\n");
527         b3AlignedObjectArray<unsigned int> offsetsHost;
528         m_offsets->copyToHost(offsetsHost);
529         static int frame = 0;
530         bool useBatches = true;
531         if (useBatches)
532         {
533                 for (int iter = 0; iter < m_nIterations; iter++)
534                 {
535                         for (int cellBatch = 0; cellBatch < B3_SOLVER_N_BATCHES; cellBatch++)
536                         {
537                                 int nSplitX = B3_SOLVER_N_SPLIT_X;
538                                 int nSplitY = B3_SOLVER_N_SPLIT_Y;
539                                 int numWorkgroups = B3_SOLVER_N_CELLS / B3_SOLVER_N_BATCHES;
540                                 //printf("cell Batch %d\n",cellBatch);
541                                 b3AlignedObjectArray<int> usedBodies[B3_SOLVER_N_CELLS];
542                                 for (int i = 0; i < B3_SOLVER_N_CELLS; i++)
543                                 {
544                                         usedBodies[i].resize(0);
545                                 }
546
547                                 //for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
548                                 for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++)
549                                 {
550                                         int zIdx = (wgIdx / ((nSplitX * nSplitY) / 4)) * 2 + ((cellBatch & 4) >> 2);
551                                         int remain = (wgIdx % ((nSplitX * nSplitY) / 4));
552                                         int yIdx = (remain / (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1);
553                                         int xIdx = (remain % (nSplitX / 2)) * 2 + (cellBatch & 1);
554                                         int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY);
555
556                                         if (numConstraintsHost[cellIdx] == 0)
557                                                 continue;
558
559                                         //printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
560                                         //printf("cell %d has %d constraints\n", cellIdx,numConstraintsHost[cellIdx]);
561                                         if (zIdx)
562                                         {
563                                                 //printf("?\n");
564                                         }
565
566                                         if (iter == 0)
567                                         {
568                                                 //printf("frame=%d, Cell xIdx=%x, yIdx=%d ",frame, xIdx,yIdx);
569                                                 //printf("cellBatch=%d, wgIdx=%d, #constraints in cell=%d\n",cellBatch,wgIdx,numConstraintsHost[cellIdx]);
570                                         }
571                                         const int start = offsetsHost[cellIdx];
572                                         int numConstraintsInCell = numConstraintsHost[cellIdx];
573                                         //                              const int end = start + numConstraintsInCell;
574
575                                         SolveTask task(bodyNative, shapeNative, constraintNative, start, numConstraintsInCell, maxNumBatches, usedBodies, wgIdx, batchSizes, cellIdx);
576                                         task.m_solveFriction = false;
577                                         task.run(0);
578                                 }
579                         }
580                 }
581
582                 for (int iter = 0; iter < m_nIterations; iter++)
583                 {
584                         for (int cellBatch = 0; cellBatch < B3_SOLVER_N_BATCHES; cellBatch++)
585                         {
586                                 int nSplitX = B3_SOLVER_N_SPLIT_X;
587                                 int nSplitY = B3_SOLVER_N_SPLIT_Y;
588
589                                 int numWorkgroups = B3_SOLVER_N_CELLS / B3_SOLVER_N_BATCHES;
590
591                                 for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++)
592                                 {
593                                         int zIdx = (wgIdx / ((nSplitX * nSplitY) / 4)) * 2 + ((cellBatch & 4) >> 2);
594                                         int remain = (wgIdx % ((nSplitX * nSplitY) / 4));
595                                         int yIdx = (remain / (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1);
596                                         int xIdx = (remain % (nSplitX / 2)) * 2 + (cellBatch & 1);
597
598                                         int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY);
599
600                                         if (numConstraintsHost[cellIdx] == 0)
601                                                 continue;
602
603                                         //printf("yIdx=%d\n",yIdx);
604
605                                         const int start = offsetsHost[cellIdx];
606                                         int numConstraintsInCell = numConstraintsHost[cellIdx];
607                                         //                              const int end = start + numConstraintsInCell;
608
609                                         SolveTask task(bodyNative, shapeNative, constraintNative, start, numConstraintsInCell, maxNumBatches, 0, 0, batchSizes, cellIdx);
610                                         task.m_solveFriction = true;
611                                         task.run(0);
612                                 }
613                         }
614                 }
615         }
616         else
617         {
618                 for (int iter = 0; iter < m_nIterations; iter++)
619                 {
620                         SolveTask task(bodyNative, shapeNative, constraintNative, 0, n, maxNumBatches, 0, 0, 0, 0);
621                         task.m_solveFriction = false;
622                         task.run(0);
623                 }
624
625                 for (int iter = 0; iter < m_nIterations; iter++)
626                 {
627                         SolveTask task(bodyNative, shapeNative, constraintNative, 0, n, maxNumBatches, 0, 0, 0, 0);
628                         task.m_solveFriction = true;
629                         task.run(0);
630                 }
631         }
632
633         bodyBuf->copyFromHost(bodyNative);
634         shapeBuf->copyFromHost(shapeNative);
635         constraint->copyFromHost(constraintNative);
636         frame++;
637 }
638
639 void checkConstraintBatch(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
640                                                   const b3OpenCLArray<b3InertiaData>* shapeBuf,
641                                                   b3OpenCLArray<b3GpuConstraint4>* constraint,
642                                                   b3OpenCLArray<unsigned int>* m_numConstraints,
643                                                   b3OpenCLArray<unsigned int>* m_offsets,
644                                                   int batchId)
645 {
646         //                                              b3BufferInfoCL( m_numConstraints->getBufferCL() ),
647         //                                              b3BufferInfoCL( m_offsets->getBufferCL() )
648
649         int cellBatch = batchId;
650         const int nn = B3_SOLVER_N_CELLS;
651         //      int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES;
652
653         b3AlignedObjectArray<unsigned int> gN;
654         m_numConstraints->copyToHost(gN);
655         b3AlignedObjectArray<unsigned int> gOffsets;
656         m_offsets->copyToHost(gOffsets);
657         int nSplitX = B3_SOLVER_N_SPLIT_X;
658         int nSplitY = B3_SOLVER_N_SPLIT_Y;
659
660         //      int bIdx = batchId;
661
662         b3AlignedObjectArray<b3GpuConstraint4> cpuConstraints;
663         constraint->copyToHost(cpuConstraints);
664
665         printf("batch = %d\n", batchId);
666
667         int numWorkgroups = nn / B3_SOLVER_N_BATCHES;
668         b3AlignedObjectArray<int> usedBodies;
669
670         for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++)
671         {
672                 printf("wgIdx = %d           ", wgIdx);
673
674                 int zIdx = (wgIdx / ((nSplitX * nSplitY)) / 2) * 2 + ((cellBatch & 4) >> 2);
675                 int remain = wgIdx % ((nSplitX * nSplitY));
676                 int yIdx = (remain % (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1);
677                 int xIdx = (remain / (nSplitX / 2)) * 2 + (cellBatch & 1);
678
679                 int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY);
680                 printf("cellIdx=%d\n", cellIdx);
681                 if (gN[cellIdx] == 0)
682                         continue;
683
684                 const int start = gOffsets[cellIdx];
685                 const int end = start + gN[cellIdx];
686
687                 for (int c = start; c < end; c++)
688                 {
689                         b3GpuConstraint4& constraint = cpuConstraints[c];
690                         //printf("constraint (%d,%d)\n", constraint.m_bodyA,constraint.m_bodyB);
691                         if (usedBodies.findLinearSearch(constraint.m_bodyA) < usedBodies.size())
692                         {
693                                 printf("error?\n");
694                         }
695                         if (usedBodies.findLinearSearch(constraint.m_bodyB) < usedBodies.size())
696                         {
697                                 printf("error?\n");
698                         }
699                 }
700
701                 for (int c = start; c < end; c++)
702                 {
703                         b3GpuConstraint4& constraint = cpuConstraints[c];
704                         usedBodies.push_back(constraint.m_bodyA);
705                         usedBodies.push_back(constraint.m_bodyB);
706                 }
707         }
708 }
709
710 static bool verify = false;
711
712 void b3Solver::solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
713                                                                           b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches)
714 {
715         b3Int4 cdata = b3MakeInt4(n, 0, 0, 0);
716         {
717                 const int nn = B3_SOLVER_N_CELLS;
718
719                 cdata.x = 0;
720                 cdata.y = maxNumBatches;  //250;
721
722                 int numWorkItems = 64 * nn / B3_SOLVER_N_BATCHES;
723 #ifdef DEBUG_ME
724                 SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
725                 adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems);
726 #endif
727
728                 {
729                         B3_PROFILE("m_batchSolveKernel iterations");
730                         for (int iter = 0; iter < m_nIterations; iter++)
731                         {
732                                 for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
733                                 {
734                                         if (verify)
735                                         {
736                                                 checkConstraintBatch(bodyBuf, shapeBuf, constraint, m_numConstraints, m_offsets, ib);
737                                         }
738
739 #ifdef DEBUG_ME
740                                         memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems);
741                                         gpuDebugInfo.write(debugInfo, numWorkItems);
742 #endif
743
744                                         cdata.z = ib;
745
746                                         b3LauncherCL launcher(m_queue, m_solveContactKernel, "m_solveContactKernel");
747 #if 1
748
749                                         b3BufferInfoCL bInfo[] = {
750
751                                                 b3BufferInfoCL(bodyBuf->getBufferCL()),
752                                                 b3BufferInfoCL(shapeBuf->getBufferCL()),
753                                                 b3BufferInfoCL(constraint->getBufferCL()),
754                                                 b3BufferInfoCL(m_numConstraints->getBufferCL()),
755                                                 b3BufferInfoCL(m_offsets->getBufferCL())
756 #ifdef DEBUG_ME
757                                                         ,
758                                                 b3BufferInfoCL(&gpuDebugInfo)
759 #endif
760                                         };
761
762                                         launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
763                                         //launcher.setConst(  cdata.x );
764                                         launcher.setConst(cdata.y);
765                                         launcher.setConst(cdata.z);
766                                         b3Int4 nSplit;
767                                         nSplit.x = B3_SOLVER_N_SPLIT_X;
768                                         nSplit.y = B3_SOLVER_N_SPLIT_Y;
769                                         nSplit.z = B3_SOLVER_N_SPLIT_Z;
770
771                                         launcher.setConst(nSplit);
772                                         launcher.launch1D(numWorkItems, 64);
773
774 #else
775                                         const char* fileName = "m_batchSolveKernel.bin";
776                                         FILE* f = fopen(fileName, "rb");
777                                         if (f)
778                                         {
779                                                 int sizeInBytes = 0;
780                                                 if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
781                                                 {
782                                                         printf("error, cannot get file size\n");
783                                                         exit(0);
784                                                 }
785
786                                                 unsigned char* buf = (unsigned char*)malloc(sizeInBytes);
787                                                 fread(buf, sizeInBytes, 1, f);
788                                                 int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context);
789                                                 int num = *(int*)&buf[serializedBytes];
790
791                                                 launcher.launch1D(num);
792
793                                                 //this clFinish is for testing on errors
794                                                 clFinish(m_queue);
795                                         }
796
797 #endif
798
799 #ifdef DEBUG_ME
800                                         clFinish(m_queue);
801                                         gpuDebugInfo.read(debugInfo, numWorkItems);
802                                         clFinish(m_queue);
803                                         for (int i = 0; i < numWorkItems; i++)
804                                         {
805                                                 if (debugInfo[i].m_valInt2 > 0)
806                                                 {
807                                                         printf("debugInfo[i].m_valInt2 = %d\n", i, debugInfo[i].m_valInt2);
808                                                 }
809
810                                                 if (debugInfo[i].m_valInt3 > 0)
811                                                 {
812                                                         printf("debugInfo[i].m_valInt3 = %d\n", i, debugInfo[i].m_valInt3);
813                                                 }
814                                         }
815 #endif  //DEBUG_ME
816                                 }
817                         }
818
819                         clFinish(m_queue);
820                 }
821
822                 cdata.x = 1;
823                 bool applyFriction = true;
824                 if (applyFriction)
825                 {
826                         B3_PROFILE("m_batchSolveKernel iterations2");
827                         for (int iter = 0; iter < m_nIterations; iter++)
828                         {
829                                 for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
830                                 {
831                                         cdata.z = ib;
832
833                                         b3BufferInfoCL bInfo[] = {
834                                                 b3BufferInfoCL(bodyBuf->getBufferCL()),
835                                                 b3BufferInfoCL(shapeBuf->getBufferCL()),
836                                                 b3BufferInfoCL(constraint->getBufferCL()),
837                                                 b3BufferInfoCL(m_numConstraints->getBufferCL()),
838                                                 b3BufferInfoCL(m_offsets->getBufferCL())
839 #ifdef DEBUG_ME
840                                                         ,
841                                                 b3BufferInfoCL(&gpuDebugInfo)
842 #endif  //DEBUG_ME
843                                         };
844                                         b3LauncherCL launcher(m_queue, m_solveFrictionKernel, "m_solveFrictionKernel");
845                                         launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
846                                         //launcher.setConst(  cdata.x );
847                                         launcher.setConst(cdata.y);
848                                         launcher.setConst(cdata.z);
849                                         b3Int4 nSplit;
850                                         nSplit.x = B3_SOLVER_N_SPLIT_X;
851                                         nSplit.y = B3_SOLVER_N_SPLIT_Y;
852                                         nSplit.z = B3_SOLVER_N_SPLIT_Z;
853
854                                         launcher.setConst(nSplit);
855
856                                         launcher.launch1D(64 * nn / B3_SOLVER_N_BATCHES, 64);
857                                 }
858                         }
859                         clFinish(m_queue);
860                 }
861 #ifdef DEBUG_ME
862                 delete[] debugInfo;
863 #endif  //DEBUG_ME
864         }
865 }
866
867 void b3Solver::convertToConstraints(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
868                                                                         const b3OpenCLArray<b3InertiaData>* shapeBuf,
869                                                                         b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData,
870                                                                         int nContacts, const ConstraintCfg& cfg)
871 {
872         //      b3OpenCLArray<b3GpuConstraint4>* constraintNative =0;
873         contactCOut->resize(nContacts);
874         struct CB
875         {
876                 int m_nContacts;
877                 float m_dt;
878                 float m_positionDrift;
879                 float m_positionConstraintCoeff;
880         };
881
882         {
883                 CB cdata;
884                 cdata.m_nContacts = nContacts;
885                 cdata.m_dt = cfg.m_dt;
886                 cdata.m_positionDrift = cfg.m_positionDrift;
887                 cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff;
888
889                 if (gConvertConstraintOnCpu)
890                 {
891                         b3AlignedObjectArray<b3RigidBodyData> gBodies;
892                         bodyBuf->copyToHost(gBodies);
893
894                         b3AlignedObjectArray<b3Contact4> gContact;
895                         contactsIn->copyToHost(gContact);
896
897                         b3AlignedObjectArray<b3InertiaData> gShapes;
898                         shapeBuf->copyToHost(gShapes);
899
900                         b3AlignedObjectArray<b3GpuConstraint4> gConstraintOut;
901                         gConstraintOut.resize(nContacts);
902
903                         B3_PROFILE("cpu contactToConstraintKernel");
904                         for (int gIdx = 0; gIdx < nContacts; gIdx++)
905                         {
906                                 int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);
907                                 int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);
908
909                                 b3Float4 posA = gBodies[aIdx].m_pos;
910                                 b3Float4 linVelA = gBodies[aIdx].m_linVel;
911                                 b3Float4 angVelA = gBodies[aIdx].m_angVel;
912                                 float invMassA = gBodies[aIdx].m_invMass;
913                                 b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia;
914
915                                 b3Float4 posB = gBodies[bIdx].m_pos;
916                                 b3Float4 linVelB = gBodies[bIdx].m_linVel;
917                                 b3Float4 angVelB = gBodies[bIdx].m_angVel;
918                                 float invMassB = gBodies[bIdx].m_invMass;
919                                 b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia;
920
921                                 b3ContactConstraint4_t cs;
922
923                                 setConstraint4(posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,
924                                                            &gContact[gIdx], cdata.m_dt, cdata.m_positionDrift, cdata.m_positionConstraintCoeff,
925                                                            &cs);
926
927                                 cs.m_batchIdx = gContact[gIdx].m_batchIdx;
928
929                                 gConstraintOut[gIdx] = (b3GpuConstraint4&)cs;
930                         }
931
932                         contactCOut->copyFromHost(gConstraintOut);
933                 }
934                 else
935                 {
936                         B3_PROFILE("gpu m_contactToConstraintKernel");
937
938                         b3BufferInfoCL bInfo[] = {b3BufferInfoCL(contactsIn->getBufferCL()), b3BufferInfoCL(bodyBuf->getBufferCL()), b3BufferInfoCL(shapeBuf->getBufferCL()),
939                                                                           b3BufferInfoCL(contactCOut->getBufferCL())};
940                         b3LauncherCL launcher(m_queue, m_contactToConstraintKernel, "m_contactToConstraintKernel");
941                         launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
942                         //launcher.setConst(  cdata );
943
944                         launcher.setConst(cdata.m_nContacts);
945                         launcher.setConst(cdata.m_dt);
946                         launcher.setConst(cdata.m_positionDrift);
947                         launcher.setConst(cdata.m_positionConstraintCoeff);
948
949                         launcher.launch1D(nContacts, 64);
950                         clFinish(m_queue);
951                 }
952         }
953 }
954
955 /*
956 void b3Solver::sortContacts(  const b3OpenCLArray<b3RigidBodyData>* bodyBuf, 
957                         b3OpenCLArray<b3Contact4>* contactsIn, void* additionalData, 
958                         int nContacts, const b3Solver::ConstraintCfg& cfg )
959 {
960         
961         
962
963         const int sortAlignment = 512; // todo. get this out of sort
964         if( cfg.m_enableParallelSolve )
965         {
966                 
967
968                 int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );
969
970                 b3OpenCLArray<unsigned int>* countsNative = m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost );
971                 b3OpenCLArray<unsigned int>* offsetsNative = m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost );
972
973                 {       //      2. set cell idx
974                         struct CB
975                         {
976                                 int m_nContacts;
977                                 int m_staticIdx;
978                                 float m_scale;
979                                 int m_nSplit;
980                         };
981
982                         b3Assert( sortSize%64 == 0 );
983                         CB cdata;
984                         cdata.m_nContacts = nContacts;
985                         cdata.m_staticIdx = cfg.m_staticIdx;
986                         cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent);
987                         cdata.m_nSplit = B3_SOLVER_N_SPLIT;
988
989                         
990                         b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
991                         b3LauncherCL launcher( m_queue, m_setSortDataKernel );
992                         launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
993                         launcher.setConst(  cdata );
994                         launcher.launch1D( sortSize, 64 );
995                 }
996
997                 {       //      3. sort by cell idx
998                         int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
999                         int sortBit = 32;
1000                         //if( n <= 0xffff ) sortBit = 16;
1001                         //if( n <= 0xff ) sortBit = 8;
1002                         m_sort32->execute(*m_sortDataBuffer,sortSize);
1003                 }
1004                 {       //      4. find entries
1005                         m_search->execute( *m_sortDataBuffer, nContacts, *countsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, b3BoundSearchCL::COUNT);
1006
1007                         m_scan->execute( *countsNative, *offsetsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT );
1008                 }
1009
1010                 {       //      5. sort constraints by cellIdx
1011                         //      todo. preallocate this
1012 //                      b3Assert( contactsIn->getType() == TYPE_HOST );
1013 //                      b3OpenCLArray<b3Contact4>* out = BufferUtils::map<TYPE_CL, false>( data->m_device, contactsIn );        //      copying contacts to this buffer
1014
1015                         {
1016                                 
1017
1018                                 b3Int4 cdata; cdata.x = nContacts;
1019                                 b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( m_contactBuffer->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
1020                                 b3LauncherCL launcher( m_queue, m_reorderContactKernel );
1021                                 launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
1022                                 launcher.setConst(  cdata );
1023                                 launcher.launch1D( nContacts, 64 );
1024                         }
1025 //                      BufferUtils::unmap<true>( out, contactsIn, nContacts );
1026                 }
1027         }
1028
1029         
1030 }
1031
1032 */
1033 void b3Solver::batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* nNative, b3OpenCLArray<unsigned int>* offsetsNative, int staticIdx)
1034 {
1035         int numWorkItems = 64 * B3_SOLVER_N_CELLS;
1036         {
1037                 B3_PROFILE("batch generation");
1038
1039                 b3Int4 cdata;
1040                 cdata.x = nContacts;
1041                 cdata.y = 0;
1042                 cdata.z = staticIdx;
1043
1044 #ifdef BATCH_DEBUG
1045                 SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
1046                 adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems);
1047                 memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems);
1048                 gpuDebugInfo.write(debugInfo, numWorkItems);
1049 #endif
1050
1051 #if 0
1052                 b3BufferInfoCL bInfo[] = { 
1053                         b3BufferInfoCL( contacts->getBufferCL() ), 
1054                         b3BufferInfoCL(  m_contactBuffer2->getBufferCL()),
1055                         b3BufferInfoCL( nNative->getBufferCL() ), 
1056                         b3BufferInfoCL( offsetsNative->getBufferCL() ),
1057 #ifdef BATCH_DEBUG
1058                         ,       b3BufferInfoCL(&gpuDebugInfo)
1059 #endif
1060                 };
1061 #endif
1062
1063                 {
1064                         m_batchSizes.resize(nNative->size());
1065                         B3_PROFILE("batchingKernel");
1066                         //b3LauncherCL launcher( m_queue, m_batchingKernel);
1067                         cl_kernel k = useNewBatchingKernel ? m_batchingKernelNew : m_batchingKernel;
1068
1069                         b3LauncherCL launcher(m_queue, k, "*batchingKernel");
1070                         if (!useNewBatchingKernel)
1071                         {
1072                                 launcher.setBuffer(contacts->getBufferCL());
1073                         }
1074                         launcher.setBuffer(m_contactBuffer2->getBufferCL());
1075                         launcher.setBuffer(nNative->getBufferCL());
1076                         launcher.setBuffer(offsetsNative->getBufferCL());
1077
1078                         launcher.setBuffer(m_batchSizes.getBufferCL());
1079
1080                         //launcher.setConst(  cdata );
1081                         launcher.setConst(staticIdx);
1082
1083                         launcher.launch1D(numWorkItems, 64);
1084                         //clFinish(m_queue);
1085                         //b3AlignedObjectArray<int> batchSizesCPU;
1086                         //m_batchSizes.copyToHost(batchSizesCPU);
1087                         //printf(".\n");
1088                 }
1089
1090 #ifdef BATCH_DEBUG
1091                 aaaa
1092                         b3Contact4* hostContacts = new b3Contact4[nContacts];
1093                 m_contactBuffer->read(hostContacts, nContacts);
1094                 clFinish(m_queue);
1095
1096                 gpuDebugInfo.read(debugInfo, numWorkItems);
1097                 clFinish(m_queue);
1098
1099                 for (int i = 0; i < numWorkItems; i++)
1100                 {
1101                         if (debugInfo[i].m_valInt1 > 0)
1102                         {
1103                                 printf("catch\n");
1104                         }
1105                         if (debugInfo[i].m_valInt2 > 0)
1106                         {
1107                                 printf("catch22\n");
1108                         }
1109
1110                         if (debugInfo[i].m_valInt3 > 0)
1111                         {
1112                                 printf("catch666\n");
1113                         }
1114
1115                         if (debugInfo[i].m_valInt4 > 0)
1116                         {
1117                                 printf("catch777\n");
1118                         }
1119                 }
1120                 delete[] debugInfo;
1121 #endif  //BATCH_DEBUG
1122         }
1123
1124         //      copy buffer to buffer
1125         //b3Assert(m_contactBuffer->size()==nContacts);
1126         //contacts->copyFromOpenCLArray( *m_contactBuffer);
1127         //clFinish(m_queue);//needed?
1128 }