[dali_2.3.21] Merge branch 'devel/master'
[platform/core/uifw/dali-toolkit.git] / dali-physics / third-party / bullet3 / src / Bullet3OpenCL / RigidBody / b3GpuPgsConstraintSolver.cpp
1
2 /*
3 Copyright (c) 2013 Advanced Micro Devices, Inc.  
4
5 This software is provided 'as-is', without any express or implied warranty.
6 In no event will the authors be held liable for any damages arising from the use of this software.
7 Permission is granted to anyone to use this software for any purpose, 
8 including commercial applications, and to alter it and redistribute it freely, 
9 subject to the following restrictions:
10
11 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13 3. This notice may not be removed or altered from any source distribution.
14 */
15 //Originally written by Erwin Coumans
16
17 bool useGpuInitSolverBodies = true;
18 bool useGpuInfo1 = true;
19 bool useGpuInfo2 = true;
20 bool useGpuSolveJointConstraintRows = true;
21 bool useGpuWriteBackVelocities = true;
22 bool gpuBreakConstraints = true;
23
24 #include "b3GpuPgsConstraintSolver.h"
25
26 #include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
27
28 #include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h"
29 #include <new>
30 #include "Bullet3Common/b3AlignedObjectArray.h"
31 #include <string.h>  //for memset
32 #include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
33 #include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
34 #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
35
36 #include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
37
38 #include "Bullet3OpenCL/RigidBody/kernels/jointSolver.h"  //solveConstraintRowsCL
39 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
40
41 #define B3_JOINT_SOLVER_PATH "src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl"
42
43 struct b3GpuPgsJacobiSolverInternalData
44 {
45         cl_context m_context;
46         cl_device_id m_device;
47         cl_command_queue m_queue;
48
49         b3PrefixScanCL* m_prefixScan;
50
51         cl_kernel m_solveJointConstraintRowsKernels;
52         cl_kernel m_initSolverBodiesKernel;
53         cl_kernel m_getInfo1Kernel;
54         cl_kernel m_initBatchConstraintsKernel;
55         cl_kernel m_getInfo2Kernel;
56         cl_kernel m_writeBackVelocitiesKernel;
57         cl_kernel m_breakViolatedConstraintsKernel;
58
59         b3OpenCLArray<unsigned int>* m_gpuConstraintRowOffsets;
60
61         b3OpenCLArray<b3GpuSolverBody>* m_gpuSolverBodies;
62         b3OpenCLArray<b3BatchConstraint>* m_gpuBatchConstraints;
63         b3OpenCLArray<b3GpuSolverConstraint>* m_gpuConstraintRows;
64         b3OpenCLArray<unsigned int>* m_gpuConstraintInfo1;
65
66         //      b3AlignedObjectArray<b3GpuSolverBody>           m_cpuSolverBodies;
67         b3AlignedObjectArray<b3BatchConstraint> m_cpuBatchConstraints;
68         b3AlignedObjectArray<b3GpuSolverConstraint> m_cpuConstraintRows;
69         b3AlignedObjectArray<unsigned int> m_cpuConstraintInfo1;
70         b3AlignedObjectArray<unsigned int> m_cpuConstraintRowOffsets;
71
72         b3AlignedObjectArray<b3RigidBodyData> m_cpuBodies;
73         b3AlignedObjectArray<b3InertiaData> m_cpuInertias;
74
75         b3AlignedObjectArray<b3GpuGenericConstraint> m_cpuConstraints;
76
77         b3AlignedObjectArray<int> m_batchSizes;
78 };
79
80 /*
81 static b3Transform      getWorldTransform(b3RigidBodyData* rb)
82 {
83         b3Transform newTrans;
84         newTrans.setOrigin(rb->m_pos);
85         newTrans.setRotation(rb->m_quat);
86         return newTrans;
87 }
88
89 static const b3Matrix3x3&       getInvInertiaTensorWorld(b3InertiaData* inertia)
90 {
91         return inertia->m_invInertiaWorld;
92 }
93
94 */
95
96 static const b3Vector3& getLinearVelocity(b3RigidBodyData* rb)
97 {
98         return rb->m_linVel;
99 }
100
101 static const b3Vector3& getAngularVelocity(b3RigidBodyData* rb)
102 {
103         return rb->m_angVel;
104 }
105
106 b3Vector3 getVelocityInLocalPoint(b3RigidBodyData* rb, const b3Vector3& rel_pos)
107 {
108         //we also calculate lin/ang velocity for kinematic objects
109         return getLinearVelocity(rb) + getAngularVelocity(rb).cross(rel_pos);
110 }
111
112 b3GpuPgsConstraintSolver::b3GpuPgsConstraintSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, bool usePgs)
113 {
114         m_usePgs = usePgs;
115         m_gpuData = new b3GpuPgsJacobiSolverInternalData();
116         m_gpuData->m_context = ctx;
117         m_gpuData->m_device = device;
118         m_gpuData->m_queue = queue;
119
120         m_gpuData->m_prefixScan = new b3PrefixScanCL(ctx, device, queue);
121
122         m_gpuData->m_gpuConstraintRowOffsets = new b3OpenCLArray<unsigned int>(m_gpuData->m_context, m_gpuData->m_queue);
123
124         m_gpuData->m_gpuSolverBodies = new b3OpenCLArray<b3GpuSolverBody>(m_gpuData->m_context, m_gpuData->m_queue);
125         m_gpuData->m_gpuBatchConstraints = new b3OpenCLArray<b3BatchConstraint>(m_gpuData->m_context, m_gpuData->m_queue);
126         m_gpuData->m_gpuConstraintRows = new b3OpenCLArray<b3GpuSolverConstraint>(m_gpuData->m_context, m_gpuData->m_queue);
127         m_gpuData->m_gpuConstraintInfo1 = new b3OpenCLArray<unsigned int>(m_gpuData->m_context, m_gpuData->m_queue);
128         cl_int errNum = 0;
129
130         {
131                 cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, &errNum, "", B3_JOINT_SOLVER_PATH);
132                 //cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_gpuData->m_context,m_gpuData->m_device,0,&errNum,"",B3_JOINT_SOLVER_PATH,true);
133                 b3Assert(errNum == CL_SUCCESS);
134                 m_gpuData->m_solveJointConstraintRowsKernels = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "solveJointConstraintRows", &errNum, prog);
135                 b3Assert(errNum == CL_SUCCESS);
136                 m_gpuData->m_initSolverBodiesKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "initSolverBodies", &errNum, prog);
137                 b3Assert(errNum == CL_SUCCESS);
138                 m_gpuData->m_getInfo1Kernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "getInfo1Kernel", &errNum, prog);
139                 b3Assert(errNum == CL_SUCCESS);
140                 m_gpuData->m_initBatchConstraintsKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "initBatchConstraintsKernel", &errNum, prog);
141                 b3Assert(errNum == CL_SUCCESS);
142                 m_gpuData->m_getInfo2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "getInfo2Kernel", &errNum, prog);
143                 b3Assert(errNum == CL_SUCCESS);
144                 m_gpuData->m_writeBackVelocitiesKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "writeBackVelocitiesKernel", &errNum, prog);
145                 b3Assert(errNum == CL_SUCCESS);
146                 m_gpuData->m_breakViolatedConstraintsKernel = b3OpenCLUtils::compileCLKernelFromString(m_gpuData->m_context, m_gpuData->m_device, solveConstraintRowsCL, "breakViolatedConstraintsKernel", &errNum, prog);
147                 b3Assert(errNum == CL_SUCCESS);
148
149                 clReleaseProgram(prog);
150         }
151 }
152
153 b3GpuPgsConstraintSolver::~b3GpuPgsConstraintSolver()
154 {
155         clReleaseKernel(m_gpuData->m_solveJointConstraintRowsKernels);
156         clReleaseKernel(m_gpuData->m_initSolverBodiesKernel);
157         clReleaseKernel(m_gpuData->m_getInfo1Kernel);
158         clReleaseKernel(m_gpuData->m_initBatchConstraintsKernel);
159         clReleaseKernel(m_gpuData->m_getInfo2Kernel);
160         clReleaseKernel(m_gpuData->m_writeBackVelocitiesKernel);
161         clReleaseKernel(m_gpuData->m_breakViolatedConstraintsKernel);
162
163         delete m_gpuData->m_prefixScan;
164         delete m_gpuData->m_gpuConstraintRowOffsets;
165         delete m_gpuData->m_gpuSolverBodies;
166         delete m_gpuData->m_gpuBatchConstraints;
167         delete m_gpuData->m_gpuConstraintRows;
168         delete m_gpuData->m_gpuConstraintInfo1;
169
170         delete m_gpuData;
171 }
172
173 struct b3BatchConstraint
174 {
175         int m_bodyAPtrAndSignBit;
176         int m_bodyBPtrAndSignBit;
177         int m_originalConstraintIndex;
178         int m_batchId;
179 };
180
181 static b3AlignedObjectArray<b3BatchConstraint> batchConstraints;
182
183 void b3GpuPgsConstraintSolver::recomputeBatches()
184 {
185         m_gpuData->m_batchSizes.clear();
186 }
187
188 b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlySetup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal)
189 {
190         B3_PROFILE("GPU solveGroupCacheFriendlySetup");
191         batchConstraints.resize(numConstraints);
192         m_gpuData->m_gpuBatchConstraints->resize(numConstraints);
193         m_staticIdx = -1;
194         m_maxOverrideNumSolverIterations = 0;
195
196         /*      m_gpuData->m_gpuBodies->resize(numBodies);
197         m_gpuData->m_gpuBodies->copyFromHostPointer(bodies,numBodies);
198
199         b3OpenCLArray<b3InertiaData> gpuInertias(m_gpuData->m_context,m_gpuData->m_queue);
200         gpuInertias.resize(numBodies);
201         gpuInertias.copyFromHostPointer(inertias,numBodies);
202         */
203
204         m_gpuData->m_gpuSolverBodies->resize(numBodies);
205
206         m_tmpSolverBodyPool.resize(numBodies);
207         {
208                 if (useGpuInitSolverBodies)
209                 {
210                         B3_PROFILE("m_initSolverBodiesKernel");
211
212                         b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_initSolverBodiesKernel, "m_initSolverBodiesKernel");
213                         launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL());
214                         launcher.setBuffer(gpuBodies->getBufferCL());
215                         launcher.setConst(numBodies);
216                         launcher.launch1D(numBodies);
217                         clFinish(m_gpuData->m_queue);
218
219                         //                      m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool);
220                 }
221                 else
222                 {
223                         gpuBodies->copyToHost(m_gpuData->m_cpuBodies);
224                         for (int i = 0; i < numBodies; i++)
225                         {
226                                 b3RigidBodyData& body = m_gpuData->m_cpuBodies[i];
227                                 b3GpuSolverBody& solverBody = m_tmpSolverBodyPool[i];
228                                 initSolverBody(i, &solverBody, &body);
229                                 solverBody.m_originalBodyIndex = i;
230                         }
231                         m_gpuData->m_gpuSolverBodies->copyFromHost(m_tmpSolverBodyPool);
232                 }
233         }
234
235         //      int totalBodies = 0;
236         int totalNumRows = 0;
237         //b3RigidBody* rb0=0,*rb1=0;
238         //if (1)
239         {
240                 {
241                         //                      int i;
242
243                         m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints);
244
245                         //                      b3OpenCLArray<b3GpuGenericConstraint> gpuConstraints(m_gpuData->m_context,m_gpuData->m_queue);
246
247                         if (useGpuInfo1)
248                         {
249                                 B3_PROFILE("info1 and init batchConstraint");
250
251                                 m_gpuData->m_gpuConstraintInfo1->resize(numConstraints);
252
253                                 if (1)
254                                 {
255                                         B3_PROFILE("getInfo1Kernel");
256
257                                         b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_getInfo1Kernel, "m_getInfo1Kernel");
258                                         launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL());
259                                         launcher.setBuffer(gpuConstraints->getBufferCL());
260                                         launcher.setConst(numConstraints);
261                                         launcher.launch1D(numConstraints);
262                                         clFinish(m_gpuData->m_queue);
263                                 }
264
265                                 if (m_gpuData->m_batchSizes.size() == 0)
266                                 {
267                                         B3_PROFILE("initBatchConstraintsKernel");
268
269                                         m_gpuData->m_gpuConstraintRowOffsets->resize(numConstraints);
270                                         unsigned int total = 0;
271                                         m_gpuData->m_prefixScan->execute(*m_gpuData->m_gpuConstraintInfo1, *m_gpuData->m_gpuConstraintRowOffsets, numConstraints, &total);
272                                         unsigned int lastElem = m_gpuData->m_gpuConstraintInfo1->at(numConstraints - 1);
273                                         totalNumRows = total + lastElem;
274
275                                         {
276                                                 B3_PROFILE("init batch constraints");
277                                                 b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_initBatchConstraintsKernel, "m_initBatchConstraintsKernel");
278                                                 launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL());
279                                                 launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL());
280                                                 launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL());
281                                                 launcher.setBuffer(gpuConstraints->getBufferCL());
282                                                 launcher.setBuffer(gpuBodies->getBufferCL());
283                                                 launcher.setConst(numConstraints);
284                                                 launcher.launch1D(numConstraints);
285                                                 clFinish(m_gpuData->m_queue);
286                                         }
287                                         //assume the batching happens on CPU, so copy the data
288                                         m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints);
289                                 }
290                         }
291                         else
292                         {
293                                 totalNumRows = 0;
294                                 gpuConstraints->copyToHost(m_gpuData->m_cpuConstraints);
295                                 //calculate the total number of contraint rows
296                                 for (int i = 0; i < numConstraints; i++)
297                                 {
298                                         unsigned int& info1 = m_tmpConstraintSizesPool[i];
299                                         //                                      unsigned int info1;
300                                         if (m_gpuData->m_cpuConstraints[i].isEnabled())
301                                         {
302                                                 m_gpuData->m_cpuConstraints[i].getInfo1(&info1, &m_gpuData->m_cpuBodies[0]);
303                                         }
304                                         else
305                                         {
306                                                 info1 = 0;
307                                         }
308
309                                         totalNumRows += info1;
310                                 }
311
312                                 m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints);
313                                 m_gpuData->m_gpuConstraintInfo1->copyFromHost(m_tmpConstraintSizesPool);
314                         }
315                         m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows);
316                         m_gpuData->m_gpuConstraintRows->resize(totalNumRows);
317
318                         //                      b3GpuConstraintArray            verify;
319
320                         if (useGpuInfo2)
321                         {
322                                 {
323                                         B3_PROFILE("getInfo2Kernel");
324                                         b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_getInfo2Kernel, "m_getInfo2Kernel");
325                                         launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL());
326                                         launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL());
327                                         launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL());
328                                         launcher.setBuffer(gpuConstraints->getBufferCL());
329                                         launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL());
330                                         launcher.setBuffer(gpuBodies->getBufferCL());
331                                         launcher.setBuffer(gpuInertias->getBufferCL());
332                                         launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL());
333                                         launcher.setConst(infoGlobal.m_timeStep);
334                                         launcher.setConst(infoGlobal.m_erp);
335                                         launcher.setConst(infoGlobal.m_globalCfm);
336                                         launcher.setConst(infoGlobal.m_damping);
337                                         launcher.setConst(infoGlobal.m_numIterations);
338                                         launcher.setConst(numConstraints);
339                                         launcher.launch1D(numConstraints);
340                                         clFinish(m_gpuData->m_queue);
341
342                                         if (m_gpuData->m_batchSizes.size() == 0)
343                                                 m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints);
344                                         //m_gpuData->m_gpuConstraintRows->copyToHost(verify);
345                                         //m_gpuData->m_gpuConstraintRows->copyToHost(m_tmpSolverNonContactConstraintPool);
346                                 }
347                         }
348                         else
349                         {
350                                 gpuInertias->copyToHost(m_gpuData->m_cpuInertias);
351
352                                 ///setup the b3SolverConstraints
353
354                                 for (int i = 0; i < numConstraints; i++)
355                                 {
356                                         const int& info1 = m_tmpConstraintSizesPool[i];
357
358                                         if (info1)
359                                         {
360                                                 int constraintIndex = batchConstraints[i].m_originalConstraintIndex;
361                                                 int constraintRowOffset = m_gpuData->m_cpuConstraintRowOffsets[constraintIndex];
362
363                                                 b3GpuSolverConstraint* currentConstraintRow = &m_tmpSolverNonContactConstraintPool[constraintRowOffset];
364                                                 b3GpuGenericConstraint& constraint = m_gpuData->m_cpuConstraints[i];
365
366                                                 b3RigidBodyData& rbA = m_gpuData->m_cpuBodies[constraint.getRigidBodyA()];
367                                                 //b3RigidBody& rbA = constraint.getRigidBodyA();
368                                                 //                              b3RigidBody& rbB = constraint.getRigidBodyB();
369                                                 b3RigidBodyData& rbB = m_gpuData->m_cpuBodies[constraint.getRigidBodyB()];
370
371                                                 int solverBodyIdA = constraint.getRigidBodyA();  //getOrInitSolverBody(constraint.getRigidBodyA(),bodies,inertias);
372                                                 int solverBodyIdB = constraint.getRigidBodyB();  //getOrInitSolverBody(constraint.getRigidBodyB(),bodies,inertias);
373
374                                                 b3GpuSolverBody* bodyAPtr = &m_tmpSolverBodyPool[solverBodyIdA];
375                                                 b3GpuSolverBody* bodyBPtr = &m_tmpSolverBodyPool[solverBodyIdB];
376
377                                                 if (rbA.m_invMass)
378                                                 {
379                                                         batchConstraints[i].m_bodyAPtrAndSignBit = solverBodyIdA;
380                                                 }
381                                                 else
382                                                 {
383                                                         if (!solverBodyIdA)
384                                                                 m_staticIdx = 0;
385                                                         batchConstraints[i].m_bodyAPtrAndSignBit = -solverBodyIdA;
386                                                 }
387
388                                                 if (rbB.m_invMass)
389                                                 {
390                                                         batchConstraints[i].m_bodyBPtrAndSignBit = solverBodyIdB;
391                                                 }
392                                                 else
393                                                 {
394                                                         if (!solverBodyIdB)
395                                                                 m_staticIdx = 0;
396                                                         batchConstraints[i].m_bodyBPtrAndSignBit = -solverBodyIdB;
397                                                 }
398
399                                                 int overrideNumSolverIterations = 0;  //constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;
400                                                 if (overrideNumSolverIterations > m_maxOverrideNumSolverIterations)
401                                                         m_maxOverrideNumSolverIterations = overrideNumSolverIterations;
402
403                                                 int j;
404                                                 for (j = 0; j < info1; j++)
405                                                 {
406                                                         memset(&currentConstraintRow[j], 0, sizeof(b3GpuSolverConstraint));
407                                                         currentConstraintRow[j].m_angularComponentA.setValue(0, 0, 0);
408                                                         currentConstraintRow[j].m_angularComponentB.setValue(0, 0, 0);
409                                                         currentConstraintRow[j].m_appliedImpulse = 0.f;
410                                                         currentConstraintRow[j].m_appliedPushImpulse = 0.f;
411                                                         currentConstraintRow[j].m_cfm = 0.f;
412                                                         currentConstraintRow[j].m_contactNormal.setValue(0, 0, 0);
413                                                         currentConstraintRow[j].m_friction = 0.f;
414                                                         currentConstraintRow[j].m_frictionIndex = 0;
415                                                         currentConstraintRow[j].m_jacDiagABInv = 0.f;
416                                                         currentConstraintRow[j].m_lowerLimit = 0.f;
417                                                         currentConstraintRow[j].m_upperLimit = 0.f;
418
419                                                         currentConstraintRow[j].m_originalContactPoint = 0;
420                                                         currentConstraintRow[j].m_overrideNumSolverIterations = 0;
421                                                         currentConstraintRow[j].m_relpos1CrossNormal.setValue(0, 0, 0);
422                                                         currentConstraintRow[j].m_relpos2CrossNormal.setValue(0, 0, 0);
423                                                         currentConstraintRow[j].m_rhs = 0.f;
424                                                         currentConstraintRow[j].m_rhsPenetration = 0.f;
425                                                         currentConstraintRow[j].m_solverBodyIdA = 0;
426                                                         currentConstraintRow[j].m_solverBodyIdB = 0;
427
428                                                         currentConstraintRow[j].m_lowerLimit = -B3_INFINITY;
429                                                         currentConstraintRow[j].m_upperLimit = B3_INFINITY;
430                                                         currentConstraintRow[j].m_appliedImpulse = 0.f;
431                                                         currentConstraintRow[j].m_appliedPushImpulse = 0.f;
432                                                         currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA;
433                                                         currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;
434                                                         currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations;
435                                                 }
436
437                                                 bodyAPtr->internalGetDeltaLinearVelocity().setValue(0.f, 0.f, 0.f);
438                                                 bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f, 0.f, 0.f);
439                                                 bodyAPtr->internalGetPushVelocity().setValue(0.f, 0.f, 0.f);
440                                                 bodyAPtr->internalGetTurnVelocity().setValue(0.f, 0.f, 0.f);
441                                                 bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f, 0.f, 0.f);
442                                                 bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f, 0.f, 0.f);
443                                                 bodyBPtr->internalGetPushVelocity().setValue(0.f, 0.f, 0.f);
444                                                 bodyBPtr->internalGetTurnVelocity().setValue(0.f, 0.f, 0.f);
445
446                                                 b3GpuConstraintInfo2 info2;
447                                                 info2.fps = 1.f / infoGlobal.m_timeStep;
448                                                 info2.erp = infoGlobal.m_erp;
449                                                 info2.m_J1linearAxis = currentConstraintRow->m_contactNormal;
450                                                 info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal;
451                                                 info2.m_J2linearAxis = 0;
452                                                 info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal;
453                                                 info2.rowskip = sizeof(b3GpuSolverConstraint) / sizeof(b3Scalar);  //check this
454                                                 ///the size of b3GpuSolverConstraint needs be a multiple of b3Scalar
455                                                 b3Assert(info2.rowskip * sizeof(b3Scalar) == sizeof(b3GpuSolverConstraint));
456                                                 info2.m_constraintError = &currentConstraintRow->m_rhs;
457                                                 currentConstraintRow->m_cfm = infoGlobal.m_globalCfm;
458                                                 info2.m_damping = infoGlobal.m_damping;
459                                                 info2.cfm = &currentConstraintRow->m_cfm;
460                                                 info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit;
461                                                 info2.m_upperLimit = &currentConstraintRow->m_upperLimit;
462                                                 info2.m_numIterations = infoGlobal.m_numIterations;
463                                                 m_gpuData->m_cpuConstraints[i].getInfo2(&info2, &m_gpuData->m_cpuBodies[0]);
464
465                                                 ///finalize the constraint setup
466                                                 for (j = 0; j < info1; j++)
467                                                 {
468                                                         b3GpuSolverConstraint& solverConstraint = currentConstraintRow[j];
469
470                                                         if (solverConstraint.m_upperLimit >= m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold())
471                                                         {
472                                                                 solverConstraint.m_upperLimit = m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold();
473                                                         }
474
475                                                         if (solverConstraint.m_lowerLimit <= -m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold())
476                                                         {
477                                                                 solverConstraint.m_lowerLimit = -m_gpuData->m_cpuConstraints[i].getBreakingImpulseThreshold();
478                                                         }
479
480                                                         //                                              solverConstraint.m_originalContactPoint = constraint;
481
482                                                         b3Matrix3x3& invInertiaWorldA = m_gpuData->m_cpuInertias[constraint.getRigidBodyA()].m_invInertiaWorld;
483                                                         {
484                                                                 //b3Vector3 angularFactorA(1,1,1);
485                                                                 const b3Vector3& ftorqueAxis1 = solverConstraint.m_relpos1CrossNormal;
486                                                                 solverConstraint.m_angularComponentA = invInertiaWorldA * ftorqueAxis1;  //*angularFactorA;
487                                                         }
488
489                                                         b3Matrix3x3& invInertiaWorldB = m_gpuData->m_cpuInertias[constraint.getRigidBodyB()].m_invInertiaWorld;
490                                                         {
491                                                                 const b3Vector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal;
492                                                                 solverConstraint.m_angularComponentB = invInertiaWorldB * ftorqueAxis2;  //*constraint.getRigidBodyB().getAngularFactor();
493                                                         }
494
495                                                         {
496                                                                 //it is ok to use solverConstraint.m_contactNormal instead of -solverConstraint.m_contactNormal
497                                                                 //because it gets multiplied iMJlB
498                                                                 b3Vector3 iMJlA = solverConstraint.m_contactNormal * rbA.m_invMass;
499                                                                 b3Vector3 iMJaA = invInertiaWorldA * solverConstraint.m_relpos1CrossNormal;
500                                                                 b3Vector3 iMJlB = solverConstraint.m_contactNormal * rbB.m_invMass;  //sign of normal?
501                                                                 b3Vector3 iMJaB = invInertiaWorldB * solverConstraint.m_relpos2CrossNormal;
502
503                                                                 b3Scalar sum = iMJlA.dot(solverConstraint.m_contactNormal);
504                                                                 sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal);
505                                                                 sum += iMJlB.dot(solverConstraint.m_contactNormal);
506                                                                 sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal);
507                                                                 b3Scalar fsum = b3Fabs(sum);
508                                                                 b3Assert(fsum > B3_EPSILON);
509                                                                 solverConstraint.m_jacDiagABInv = fsum > B3_EPSILON ? b3Scalar(1.) / sum : 0.f;
510                                                         }
511
512                                                         ///fix rhs
513                                                         ///todo: add force/torque accelerators
514                                                         {
515                                                                 b3Scalar rel_vel;
516                                                                 b3Scalar vel1Dotn = solverConstraint.m_contactNormal.dot(rbA.m_linVel) + solverConstraint.m_relpos1CrossNormal.dot(rbA.m_angVel);
517                                                                 b3Scalar vel2Dotn = -solverConstraint.m_contactNormal.dot(rbB.m_linVel) + solverConstraint.m_relpos2CrossNormal.dot(rbB.m_angVel);
518
519                                                                 rel_vel = vel1Dotn + vel2Dotn;
520
521                                                                 b3Scalar restitution = 0.f;
522                                                                 b3Scalar positionalError = solverConstraint.m_rhs;  //already filled in by getConstraintInfo2
523                                                                 b3Scalar velocityError = restitution - rel_vel * info2.m_damping;
524                                                                 b3Scalar penetrationImpulse = positionalError * solverConstraint.m_jacDiagABInv;
525                                                                 b3Scalar velocityImpulse = velocityError * solverConstraint.m_jacDiagABInv;
526                                                                 solverConstraint.m_rhs = penetrationImpulse + velocityImpulse;
527                                                                 solverConstraint.m_appliedImpulse = 0.f;
528                                                         }
529                                                 }
530                                         }
531                                 }
532
533                                 m_gpuData->m_gpuConstraintRows->copyFromHost(m_tmpSolverNonContactConstraintPool);
534                                 m_gpuData->m_gpuConstraintInfo1->copyFromHost(m_tmpConstraintSizesPool);
535
536                                 if (m_gpuData->m_batchSizes.size() == 0)
537                                         m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints);
538                                 else
539                                         m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints);
540
541                                 m_gpuData->m_gpuSolverBodies->copyFromHost(m_tmpSolverBodyPool);
542
543                         }  //end useGpuInfo2
544                 }
545
546 #ifdef B3_SUPPORT_CONTACT_CONSTRAINTS
547                 {
548                         int i;
549
550                         for (i = 0; i < numManifolds; i++)
551                         {
552                                 b3Contact4& manifold = manifoldPtr[i];
553                                 convertContact(bodies, inertias, &manifold, infoGlobal);
554                         }
555                 }
556 #endif  //B3_SUPPORT_CONTACT_CONSTRAINTS
557         }
558
559         //      b3ContactSolverInfo info = infoGlobal;
560
561         //      int numNonContactPool = m_tmpSolverNonContactConstraintPool.size();
562         //      int numConstraintPool = m_tmpSolverContactConstraintPool.size();
563         //      int numFrictionPool = m_tmpSolverContactFrictionConstraintPool.size();
564
565         return 0.f;
566 }
567
568 ///a straight copy from GPU/OpenCL kernel, for debugging
569 __inline void internalApplyImpulse(b3GpuSolverBody* body, const b3Vector3& linearComponent, const b3Vector3& angularComponent, float impulseMagnitude)
570 {
571         body->m_deltaLinearVelocity += linearComponent * impulseMagnitude * body->m_linearFactor;
572         body->m_deltaAngularVelocity += angularComponent * (impulseMagnitude * body->m_angularFactor);
573 }
574
575 void resolveSingleConstraintRowGeneric2(b3GpuSolverBody* body1, b3GpuSolverBody* body2, b3GpuSolverConstraint* c)
576 {
577         float deltaImpulse = c->m_rhs - b3Scalar(c->m_appliedImpulse) * c->m_cfm;
578         float deltaVel1Dotn = b3Dot(c->m_contactNormal, body1->m_deltaLinearVelocity) + b3Dot(c->m_relpos1CrossNormal, body1->m_deltaAngularVelocity);
579         float deltaVel2Dotn = -b3Dot(c->m_contactNormal, body2->m_deltaLinearVelocity) + b3Dot(c->m_relpos2CrossNormal, body2->m_deltaAngularVelocity);
580
581         deltaImpulse -= deltaVel1Dotn * c->m_jacDiagABInv;
582         deltaImpulse -= deltaVel2Dotn * c->m_jacDiagABInv;
583
584         float sum = b3Scalar(c->m_appliedImpulse) + deltaImpulse;
585         if (sum < c->m_lowerLimit)
586         {
587                 deltaImpulse = c->m_lowerLimit - b3Scalar(c->m_appliedImpulse);
588                 c->m_appliedImpulse = c->m_lowerLimit;
589         }
590         else if (sum > c->m_upperLimit)
591         {
592                 deltaImpulse = c->m_upperLimit - b3Scalar(c->m_appliedImpulse);
593                 c->m_appliedImpulse = c->m_upperLimit;
594         }
595         else
596         {
597                 c->m_appliedImpulse = sum;
598         }
599
600         internalApplyImpulse(body1, c->m_contactNormal * body1->m_invMass, c->m_angularComponentA, deltaImpulse);
601         internalApplyImpulse(body2, -c->m_contactNormal * body2->m_invMass, c->m_angularComponentB, deltaImpulse);
602 }
603
604 void b3GpuPgsConstraintSolver::initSolverBody(int bodyIndex, b3GpuSolverBody* solverBody, b3RigidBodyData* rb)
605 {
606         solverBody->m_deltaLinearVelocity.setValue(0.f, 0.f, 0.f);
607         solverBody->m_deltaAngularVelocity.setValue(0.f, 0.f, 0.f);
608         solverBody->internalGetPushVelocity().setValue(0.f, 0.f, 0.f);
609         solverBody->internalGetTurnVelocity().setValue(0.f, 0.f, 0.f);
610
611         b3Assert(rb);
612         //      solverBody->m_worldTransform = getWorldTransform(rb);
613         solverBody->internalSetInvMass(b3MakeVector3(rb->m_invMass, rb->m_invMass, rb->m_invMass));
614         solverBody->m_originalBodyIndex = bodyIndex;
615         solverBody->m_angularFactor = b3MakeVector3(1, 1, 1);
616         solverBody->m_linearFactor = b3MakeVector3(1, 1, 1);
617         solverBody->m_linearVelocity = getLinearVelocity(rb);
618         solverBody->m_angularVelocity = getAngularVelocity(rb);
619 }
620
621 void b3GpuPgsConstraintSolver::averageVelocities()
622 {
623 }
624
625 b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyIterations(b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints1, int numConstraints, const b3ContactSolverInfo& infoGlobal)
626 {
627         //only create the batches once.
628         //@todo: incrementally update batches when constraints are added/activated and/or removed/deactivated
629         B3_PROFILE("GpuSolveGroupCacheFriendlyIterations");
630
631         bool createBatches = m_gpuData->m_batchSizes.size() == 0;
632         {
633                 if (createBatches)
634                 {
635                         m_gpuData->m_batchSizes.resize(0);
636
637                         {
638                                 m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints);
639
640                                 B3_PROFILE("batch joints");
641                                 b3Assert(batchConstraints.size() == numConstraints);
642                                 int simdWidth = numConstraints + 1;
643                                 int numBodies = m_tmpSolverBodyPool.size();
644                                 sortConstraintByBatch3(&batchConstraints[0], numConstraints, simdWidth, m_staticIdx, numBodies);
645
646                                 m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints);
647                         }
648                 }
649                 else
650                 {
651                         /*b3AlignedObjectArray<b3BatchConstraint> cpuCheckBatches;
652                         m_gpuData->m_gpuBatchConstraints->copyToHost(cpuCheckBatches);
653                         b3Assert(cpuCheckBatches.size()==batchConstraints.size());
654                         printf(".\n");
655                         */
656                         //>copyFromHost(batchConstraints);
657                 }
658                 int maxIterations = infoGlobal.m_numIterations;
659
660                 bool useBatching = true;
661
662                 if (useBatching)
663                 {
664                         if (!useGpuSolveJointConstraintRows)
665                         {
666                                 B3_PROFILE("copy to host");
667                                 m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool);
668                                 m_gpuData->m_gpuBatchConstraints->copyToHost(batchConstraints);
669                                 m_gpuData->m_gpuConstraintRows->copyToHost(m_tmpSolverNonContactConstraintPool);
670                                 m_gpuData->m_gpuConstraintInfo1->copyToHost(m_gpuData->m_cpuConstraintInfo1);
671                                 m_gpuData->m_gpuConstraintRowOffsets->copyToHost(m_gpuData->m_cpuConstraintRowOffsets);
672                                 gpuConstraints1->copyToHost(m_gpuData->m_cpuConstraints);
673                         }
674
675                         for (int iteration = 0; iteration < maxIterations; iteration++)
676                         {
677                                 int batchOffset = 0;
678                                 int constraintOffset = 0;
679                                 int numBatches = m_gpuData->m_batchSizes.size();
680                                 for (int bb = 0; bb < numBatches; bb++)
681                                 {
682                                         int numConstraintsInBatch = m_gpuData->m_batchSizes[bb];
683
684                                         if (useGpuSolveJointConstraintRows)
685                                         {
686                                                 B3_PROFILE("solveJointConstraintRowsKernels");
687
688                                                 /*
689                                                 __kernel void solveJointConstraintRows(__global b3GpuSolverBody* solverBodies,
690                                           __global b3BatchConstraint* batchConstraints,
691                                                 __global b3SolverConstraint* rows,
692                                                 __global unsigned int* numConstraintRowsInfo1, 
693                                                 __global unsigned int* rowOffsets,
694                                                 __global b3GpuGenericConstraint* constraints,
695                                                 int batchOffset,
696                                                 int numConstraintsInBatch*/
697
698                                                 b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_solveJointConstraintRowsKernels, "m_solveJointConstraintRowsKernels");
699                                                 launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL());
700                                                 launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL());
701                                                 launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL());
702                                                 launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL());
703                                                 launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL());
704                                                 launcher.setBuffer(gpuConstraints1->getBufferCL());  //to detect disabled constraints
705                                                 launcher.setConst(batchOffset);
706                                                 launcher.setConst(numConstraintsInBatch);
707
708                                                 launcher.launch1D(numConstraintsInBatch);
709                                         }
710                                         else  //useGpu
711                                         {
712                                                 for (int b = 0; b < numConstraintsInBatch; b++)
713                                                 {
714                                                         const b3BatchConstraint& c = batchConstraints[batchOffset + b];
715                                                         /*printf("-----------\n");
716                                                         printf("bb=%d\n",bb);
717                                                         printf("c.batchId = %d\n", c.m_batchId);
718                                                         */
719                                                         b3Assert(c.m_batchId == bb);
720                                                         b3GpuGenericConstraint* constraint = &m_gpuData->m_cpuConstraints[c.m_originalConstraintIndex];
721                                                         if (constraint->m_flags & B3_CONSTRAINT_FLAG_ENABLED)
722                                                         {
723                                                                 int numConstraintRows = m_gpuData->m_cpuConstraintInfo1[c.m_originalConstraintIndex];
724                                                                 int constraintOffset = m_gpuData->m_cpuConstraintRowOffsets[c.m_originalConstraintIndex];
725
726                                                                 for (int jj = 0; jj < numConstraintRows; jj++)
727                                                                 {
728                                                                         //
729                                                                         b3GpuSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[constraintOffset + jj];
730                                                                         //resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint);
731                                                                         resolveSingleConstraintRowGeneric2(&m_tmpSolverBodyPool[constraint.m_solverBodyIdA], &m_tmpSolverBodyPool[constraint.m_solverBodyIdB], &constraint);
732                                                                 }
733                                                         }
734                                                 }
735                                         }  //useGpu
736                                         batchOffset += numConstraintsInBatch;
737                                         constraintOffset += numConstraintsInBatch;
738                                 }
739                         }  //for (int iteration...
740
741                         if (!useGpuSolveJointConstraintRows)
742                         {
743                                 {
744                                         B3_PROFILE("copy from host");
745                                         m_gpuData->m_gpuSolverBodies->copyFromHost(m_tmpSolverBodyPool);
746                                         m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints);
747                                         m_gpuData->m_gpuConstraintRows->copyFromHost(m_tmpSolverNonContactConstraintPool);
748                                 }
749
750                                 //B3_PROFILE("copy to host");
751                                 //m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool);
752                         }
753                         //int sz = sizeof(b3GpuSolverBody);
754                         //printf("cpu sizeof(b3GpuSolverBody)=%d\n",sz);
755                 }
756                 else
757                 {
758                         for (int iteration = 0; iteration < maxIterations; iteration++)
759                         {
760                                 int numJoints = m_tmpSolverNonContactConstraintPool.size();
761                                 for (int j = 0; j < numJoints; j++)
762                                 {
763                                         b3GpuSolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[j];
764                                         resolveSingleConstraintRowGeneric2(&m_tmpSolverBodyPool[constraint.m_solverBodyIdA], &m_tmpSolverBodyPool[constraint.m_solverBodyIdB], &constraint);
765                                 }
766
767                                 if (!m_usePgs)
768                                 {
769                                         averageVelocities();
770                                 }
771                         }
772                 }
773         }
774         clFinish(m_gpuData->m_queue);
775         return 0.f;
776 }
777
778 static b3AlignedObjectArray<int> bodyUsed;
779 static b3AlignedObjectArray<int> curUsed;
780
781 inline int b3GpuPgsConstraintSolver::sortConstraintByBatch3(b3BatchConstraint* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies)
782 {
783         //int sz = sizeof(b3BatchConstraint);
784
785         B3_PROFILE("sortConstraintByBatch3");
786
787         static int maxSwaps = 0;
788         int numSwaps = 0;
789
790         curUsed.resize(2 * simdWidth);
791
792         static int maxNumConstraints = 0;
793         if (maxNumConstraints < numConstraints)
794         {
795                 maxNumConstraints = numConstraints;
796                 //printf("maxNumConstraints  = %d\n",maxNumConstraints );
797         }
798
799         int numUsedArray = numBodies / 32 + 1;
800         bodyUsed.resize(numUsedArray);
801
802         for (int q = 0; q < numUsedArray; q++)
803                 bodyUsed[q] = 0;
804
805         int curBodyUsed = 0;
806
807         int numIter = 0;
808
809 #if defined(_DEBUG)
810         for (int i = 0; i < numConstraints; i++)
811                 cs[i].m_batchId = -1;
812 #endif
813
814         int numValidConstraints = 0;
815         //      int unprocessedConstraintIndex = 0;
816
817         int batchIdx = 0;
818
819         {
820                 B3_PROFILE("cpu batch innerloop");
821
822                 while (numValidConstraints < numConstraints)
823                 {
824                         numIter++;
825                         int nCurrentBatch = 0;
826                         //      clear flag
827                         for (int i = 0; i < curBodyUsed; i++)
828                                 bodyUsed[curUsed[i] / 32] = 0;
829
830                         curBodyUsed = 0;
831
832                         for (int i = numValidConstraints; i < numConstraints; i++)
833                         {
834                                 int idx = i;
835                                 b3Assert(idx < numConstraints);
836                                 //      check if it can go
837                                 int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
838                                 int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
839                                 int bodyA = abs(bodyAS);
840                                 int bodyB = abs(bodyBS);
841                                 bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
842                                 bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
843                                 int aUnavailable = 0;
844                                 int bUnavailable = 0;
845                                 if (!aIsStatic)
846                                 {
847                                         aUnavailable = bodyUsed[bodyA / 32] & (1 << (bodyA & 31));
848                                 }
849                                 if (!aUnavailable)
850                                         if (!bIsStatic)
851                                         {
852                                                 bUnavailable = bodyUsed[bodyB / 32] & (1 << (bodyB & 31));
853                                         }
854
855                                 if (aUnavailable == 0 && bUnavailable == 0)  // ok
856                                 {
857                                         if (!aIsStatic)
858                                         {
859                                                 bodyUsed[bodyA / 32] |= (1 << (bodyA & 31));
860                                                 curUsed[curBodyUsed++] = bodyA;
861                                         }
862                                         if (!bIsStatic)
863                                         {
864                                                 bodyUsed[bodyB / 32] |= (1 << (bodyB & 31));
865                                                 curUsed[curBodyUsed++] = bodyB;
866                                         }
867
868                                         cs[idx].m_batchId = batchIdx;
869
870                                         if (i != numValidConstraints)
871                                         {
872                                                 b3Swap(cs[i], cs[numValidConstraints]);
873                                                 numSwaps++;
874                                         }
875
876                                         numValidConstraints++;
877                                         {
878                                                 nCurrentBatch++;
879                                                 if (nCurrentBatch == simdWidth)
880                                                 {
881                                                         nCurrentBatch = 0;
882                                                         for (int i = 0; i < curBodyUsed; i++)
883                                                                 bodyUsed[curUsed[i] / 32] = 0;
884                                                         curBodyUsed = 0;
885                                                 }
886                                         }
887                                 }
888                         }
889                         m_gpuData->m_batchSizes.push_back(nCurrentBatch);
890                         batchIdx++;
891                 }
892         }
893
894 #if defined(_DEBUG)
895         //              debugPrintf( "nBatches: %d\n", batchIdx );
896         for (int i = 0; i < numConstraints; i++)
897         {
898                 b3Assert(cs[i].m_batchId != -1);
899         }
900 #endif
901
902         if (maxSwaps < numSwaps)
903         {
904                 maxSwaps = numSwaps;
905                 //printf("maxSwaps = %d\n", maxSwaps);
906         }
907
908         return batchIdx;
909 }
910
911 /// b3PgsJacobiSolver Sequentially applies impulses
912 b3Scalar b3GpuPgsConstraintSolver::solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias,
913                                                                                           int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal)
914 {
915         B3_PROFILE("solveJoints");
916         //you need to provide at least some bodies
917
918         solveGroupCacheFriendlySetup(gpuBodies, gpuInertias, numBodies, gpuConstraints, numConstraints, infoGlobal);
919
920         solveGroupCacheFriendlyIterations(gpuConstraints, numConstraints, infoGlobal);
921
922         solveGroupCacheFriendlyFinish(gpuBodies, gpuInertias, numBodies, gpuConstraints, numConstraints, infoGlobal);
923
924         return 0.f;
925 }
926
927 void b3GpuPgsConstraintSolver::solveJoints(int numBodies, b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias,
928                                                                                    int numConstraints, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints)
929 {
930         b3ContactSolverInfo infoGlobal;
931         infoGlobal.m_splitImpulse = false;
932         infoGlobal.m_timeStep = 1.f / 60.f;
933         infoGlobal.m_numIterations = 4;  //4;
934                                                                          //     infoGlobal.m_solverMode|=B3_SOLVER_USE_2_FRICTION_DIRECTIONS|B3_SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS|B3_SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION;
935         //infoGlobal.m_solverMode|=B3_SOLVER_USE_2_FRICTION_DIRECTIONS|B3_SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS;
936         infoGlobal.m_solverMode |= B3_SOLVER_USE_2_FRICTION_DIRECTIONS;
937
938         //if (infoGlobal.m_solverMode & B3_SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS)
939         //if ((infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS) && (infoGlobal.m_solverMode & B3_SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION))
940
941         solveGroup(gpuBodies, gpuInertias, numBodies, gpuConstraints, numConstraints, infoGlobal);
942 }
943
944 //b3AlignedObjectArray<b3RigidBodyData> testBodies;
945
946 b3Scalar b3GpuPgsConstraintSolver::solveGroupCacheFriendlyFinish(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal)
947 {
948         B3_PROFILE("solveGroupCacheFriendlyFinish");
949         //      int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
950         //      int i,j;
951
952         {
953                 if (gpuBreakConstraints)
954                 {
955                         B3_PROFILE("breakViolatedConstraintsKernel");
956                         b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_breakViolatedConstraintsKernel, "m_breakViolatedConstraintsKernel");
957                         launcher.setBuffer(gpuConstraints->getBufferCL());
958                         launcher.setBuffer(m_gpuData->m_gpuConstraintInfo1->getBufferCL());
959                         launcher.setBuffer(m_gpuData->m_gpuConstraintRowOffsets->getBufferCL());
960                         launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL());
961                         launcher.setConst(numConstraints);
962                         launcher.launch1D(numConstraints);
963                 }
964                 else
965                 {
966                         gpuConstraints->copyToHost(m_gpuData->m_cpuConstraints);
967                         m_gpuData->m_gpuBatchConstraints->copyToHost(m_gpuData->m_cpuBatchConstraints);
968                         m_gpuData->m_gpuConstraintRows->copyToHost(m_gpuData->m_cpuConstraintRows);
969                         gpuConstraints->copyToHost(m_gpuData->m_cpuConstraints);
970                         m_gpuData->m_gpuConstraintInfo1->copyToHost(m_gpuData->m_cpuConstraintInfo1);
971                         m_gpuData->m_gpuConstraintRowOffsets->copyToHost(m_gpuData->m_cpuConstraintRowOffsets);
972
973                         for (int cid = 0; cid < numConstraints; cid++)
974                         {
975                                 int originalConstraintIndex = batchConstraints[cid].m_originalConstraintIndex;
976                                 int constraintRowOffset = m_gpuData->m_cpuConstraintRowOffsets[originalConstraintIndex];
977                                 int numRows = m_gpuData->m_cpuConstraintInfo1[originalConstraintIndex];
978                                 if (numRows)
979                                 {
980                                         //      printf("cid=%d, breakingThreshold =%f\n",cid,breakingThreshold);
981                                         for (int i = 0; i < numRows; i++)
982                                         {
983                                                 int rowIndex = constraintRowOffset + i;
984                                                 int orgConstraintIndex = m_gpuData->m_cpuConstraintRows[rowIndex].m_originalConstraintIndex;
985                                                 float breakingThreshold = m_gpuData->m_cpuConstraints[orgConstraintIndex].m_breakingImpulseThreshold;
986                                                 //      printf("rows[%d].m_appliedImpulse=%f\n",rowIndex,rows[rowIndex].m_appliedImpulse);
987                                                 if (b3Fabs(m_gpuData->m_cpuConstraintRows[rowIndex].m_appliedImpulse) >= breakingThreshold)
988                                                 {
989                                                         m_gpuData->m_cpuConstraints[orgConstraintIndex].m_flags = 0;  //&= ~B3_CONSTRAINT_FLAG_ENABLED;
990                                                 }
991                                         }
992                                 }
993                         }
994
995                         gpuConstraints->copyFromHost(m_gpuData->m_cpuConstraints);
996                 }
997         }
998
999         {
1000                 if (useGpuWriteBackVelocities)
1001                 {
1002                         B3_PROFILE("GPU write back velocities and transforms");
1003
1004                         b3LauncherCL launcher(m_gpuData->m_queue, m_gpuData->m_writeBackVelocitiesKernel, "m_writeBackVelocitiesKernel");
1005                         launcher.setBuffer(gpuBodies->getBufferCL());
1006                         launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL());
1007                         launcher.setConst(numBodies);
1008                         launcher.launch1D(numBodies);
1009                         clFinish(m_gpuData->m_queue);
1010                         //                      m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool);
1011                         //                      m_gpuData->m_gpuBodies->copyToHostPointer(bodies,numBodies);
1012                         //m_gpuData->m_gpuBodies->copyToHost(testBodies);
1013                 }
1014                 else
1015                 {
1016                         B3_PROFILE("CPU write back velocities and transforms");
1017
1018                         m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool);
1019                         gpuBodies->copyToHost(m_gpuData->m_cpuBodies);
1020                         for (int i = 0; i < m_tmpSolverBodyPool.size(); i++)
1021                         {
1022                                 int bodyIndex = m_tmpSolverBodyPool[i].m_originalBodyIndex;
1023                                 //printf("bodyIndex=%d\n",bodyIndex);
1024                                 b3Assert(i == bodyIndex);
1025
1026                                 b3RigidBodyData* body = &m_gpuData->m_cpuBodies[bodyIndex];
1027                                 if (body->m_invMass)
1028                                 {
1029                                         if (infoGlobal.m_splitImpulse)
1030                                                 m_tmpSolverBodyPool[i].writebackVelocityAndTransform(infoGlobal.m_timeStep, infoGlobal.m_splitImpulseTurnErp);
1031                                         else
1032                                                 m_tmpSolverBodyPool[i].writebackVelocity();
1033
1034                                         if (m_usePgs)
1035                                         {
1036                                                 body->m_linVel = m_tmpSolverBodyPool[i].m_linearVelocity;
1037                                                 body->m_angVel = m_tmpSolverBodyPool[i].m_angularVelocity;
1038                                         }
1039                                         else
1040                                         {
1041                                                 b3Assert(0);
1042                                         }
1043                                         /*                      
1044                                         if (infoGlobal.m_splitImpulse)
1045                                         {
1046                                                 body->m_pos = m_tmpSolverBodyPool[i].m_worldTransform.getOrigin();
1047                                                 b3Quaternion orn;
1048                                                 orn = m_tmpSolverBodyPool[i].m_worldTransform.getRotation();
1049                                                 body->m_quat = orn;
1050                                         }
1051                                         */
1052                                 }
1053                         }  //for
1054
1055                         gpuBodies->copyFromHost(m_gpuData->m_cpuBodies);
1056                 }
1057         }
1058
1059         clFinish(m_gpuData->m_queue);
1060
1061         m_tmpSolverContactConstraintPool.resizeNoInitialize(0);
1062         m_tmpSolverNonContactConstraintPool.resizeNoInitialize(0);
1063         m_tmpSolverContactFrictionConstraintPool.resizeNoInitialize(0);
1064         m_tmpSolverContactRollingFrictionConstraintPool.resizeNoInitialize(0);
1065
1066         m_tmpSolverBodyPool.resizeNoInitialize(0);
1067         return 0.f;
1068 }