2 Copyright (c) 2012 Advanced Micro Devices, Inc.
4 This software is provided 'as-is', without any express or implied warranty.
5 In no event will the authors be held liable for any damages arising from the use of this software.
6 Permission is granted to anyone to use this software for any purpose,
7 including commercial applications, and to alter it and redistribute it freely,
8 subject to the following restrictions:
10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
12 3. This notice may not be removed or altered from any source distribution.
14 //Originally written by Takahiro Harada
// File-scope, mutable runtime toggles.
18 ///useNewBatchingKernel is a rewritten kernel using just a single thread of the warp, for experiments
// Read by batchContacts(): true selects m_batchingKernelNew, false selects m_batchingKernel.
19 bool useNewBatchingKernel = true;
// Read by convertToConstraints(): true runs the contact-to-constraint conversion in a host-side loop
// (after copying bodies, contacts and inertia data to the host).
20 bool gConvertConstraintOnCpu = false;
22 #define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"
23 #define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl"
24 #define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl"
25 #define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl"
26 #define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl"
27 #define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl"
29 #include "Bullet3Dynamics/shared/b3ConvertConstraint4.h"
31 #include "kernels/solverSetup.h"
32 #include "kernels/solverSetup2.h"
34 #include "kernels/solveContact.h"
35 #include "kernels/solveFriction.h"
37 #include "kernels/batchingKernels.h"
38 #include "kernels/batchingKernelsNew.h"
40 #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
41 #include "Bullet3Common/b3Vector3.h"
43 struct SolverDebugInfo
// Holds two pointers to device-side unsigned-int arrays. The field names match the b3Solver
// members of the same name that the constructor sizes to B3_SOLVER_N_CELLS.
// NOTE(review): no use of this struct is visible in this listing — confirm whether it is still needed.
74 struct ParallelSolveData
76 b3OpenCLArray<unsigned int>* m_numConstraints;
77 b3OpenCLArray<unsigned int>* m_offsets;
// b3Solver constructor.
// Creates the parallel-primitive helpers (radix sort, prefix scan, bound search), allocates the
// device-side work buffers, then compiles the embedded OpenCL sources and creates every kernel
// that the destructor later releases.
//   ctx, device, queue : OpenCL handles forwarded to every helper, buffer and compile call below.
//   pairCapacity       : used here only to size m_sortDataBuffer (rounded up to a multiple of 512).
// NOTE(review): this listing omits some original lines (for example the declaration of pErrNum and
// the braces), so only the visible statements are described.
81 b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity)
85 m_batchSizes(ctx, queue),
// Parallel primitives; scan and search are sized for one entry per solver cell.
88 m_sort32 = new b3RadixSort32CL(ctx, device, queue);
89 m_scan = new b3PrefixScanCL(ctx, device, queue, B3_SOLVER_N_CELLS);
90 m_search = new b3BoundSearchCL(ctx, device, queue, B3_SOLVER_N_CELLS);
// Sort buffer capacity is pairCapacity rounded up to the next multiple of 512.
92 const int sortSize = B3NEXTMULTIPLEOF(pairCapacity, 512);
94 m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx, queue, sortSize);
95 m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(ctx, queue);
// Per-cell constraint counts and offsets: one unsigned int per cell.
97 m_numConstraints = new b3OpenCLArray<unsigned int>(ctx, queue, B3_SOLVER_N_CELLS);
98 m_numConstraints->resize(B3_SOLVER_N_CELLS);
100 m_offsets = new b3OpenCLArray<unsigned int>(ctx, queue, B3_SOLVER_N_CELLS);
101 m_offsets->resize(B3_SOLVER_N_CELLS);
// No extra preprocessor macros are passed to the OpenCL compiler.
102 const char* additionalMacros = "";
103 // const char* srcFileNameForCaching="";
// Kernel sources are the string constants provided by the kernels/ headers included at the top of the file.
106 const char* batchKernelSource = batchingKernelsCL;
107 const char* batchKernelNewSource = batchingKernelsNewCL;
109 const char* solverSetupSource = solverSetupCL;
110 const char* solverSetup2Source = solverSetup2CL;
111 const char* solveContactSource = solveContactCL;
112 const char* solveFrictionSource = solveFrictionCL;
// Compile one cl_program per source; the B3_*_PATH macro is passed as the last argument of each call.
// NOTE(review): no clReleaseProgram for these program handles is visible in this listing —
// confirm they are released elsewhere or intentionally kept alive.
115 cl_program solveContactProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveContactSource, &pErrNum, additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
116 b3Assert(solveContactProg);
118 cl_program solveFrictionProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveFrictionSource, &pErrNum, additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
119 b3Assert(solveFrictionProg);
121 cl_program solverSetup2Prog = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetup2Source, &pErrNum, additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
122 b3Assert(solverSetup2Prog);
124 cl_program solverSetupProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetupSource, &pErrNum, additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
125 b3Assert(solverSetupProg);
// Create each kernel by entry-point name from the matching program; every handle is asserted non-null.
127 m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg, additionalMacros);
128 b3Assert(m_solveFrictionKernel);
130 m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg, additionalMacros);
131 b3Assert(m_solveContactKernel);
133 m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg, additionalMacros);
134 b3Assert(m_contactToConstraintKernel);
// The next three kernels all come from the solverSetup2 program.
136 m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog, additionalMacros);
137 b3Assert(m_setSortDataKernel);
139 m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog, additionalMacros);
140 b3Assert(m_reorderContactKernel);
142 m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog, additionalMacros);
143 b3Assert(m_copyConstraintKernel);
// Both batching kernels are built up front; batchContacts() chooses between them at run time.
147 cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelSource, &pErrNum, additionalMacros, B3_BATCHING_PATH);
148 //cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, 0, &pErrNum,additionalMacros, B3_BATCHING_PATH,true);
149 b3Assert(batchingProg);
151 m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg, additionalMacros);
152 b3Assert(m_batchingKernel);
155 cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelNewSource, &pErrNum, additionalMacros, B3_BATCHING_NEW_PATH);
156 b3Assert(batchingNewProg);
158 m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg, additionalMacros);
159 //m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesBruteForce", &pErrNum, batchingNewProg,additionalMacros );
160 b3Assert(m_batchingKernelNew);
// b3Solver destructor: frees heap-allocated device arrays and releases the eight OpenCL kernels
// that the constructor created.
// NOTE(review): deletes for m_offsets, m_sort32, m_scan and m_search (all allocated with new in the
// constructor) are not visible in this listing, which omits some original lines — confirm they
// exist in the full file.
164 b3Solver::~b3Solver()
167 delete m_numConstraints;
168 delete m_sortDataBuffer;
169 delete m_contactBuffer2;
// Batching kernels.
175 clReleaseKernel(m_batchingKernel);
176 clReleaseKernel(m_batchingKernelNew);
// Solve kernels.
178 clReleaseKernel(m_solveContactKernel);
179 clReleaseKernel(m_solveFrictionKernel);
// Setup kernels.
181 clReleaseKernel(m_contactToConstraintKernel);
182 clReleaseKernel(m_setSortDataKernel);
183 clReleaseKernel(m_reorderContactKernel);
184 clReleaseKernel(m_copyConstraintKernel);
// Host-side contact solve for one b3GpuConstraint4, which holds up to four contact points.
//   cs                          : constraint; m_appliedRambdaDt[ic] is updated in place.
//   posA/posB                   : body positions, used to form the lever arms to each contact point.
//   linVelA/angVelA (and B)     : body velocities, passed by non-const reference.
//   invMassA/B, invInertiaA/B   : inverse mass and inverse inertia used to scale the impulses.
//   maxRambdaDt / minRambdaDt   : per-point upper and lower bounds for the accumulated impulse.
// NOTE(review): this listing omits some original lines (e.g. the second operand of the sum that
// starts at calcRelVel, and whatever consumes linImp/angImp), so those parts are not described.
// The use of the JACOBI template parameter is likewise not visible here.
187 template <bool JACOBI>
188 static __inline void solveContact(b3GpuConstraint4& cs,
189 const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
190 const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
191 float maxRambdaDt[4], float minRambdaDt[4])
// One pass over the four contact slots.
202 for (int ic = 0; ic < 4; ic++)
204 // dont necessary because this makes change to 0
// Slots with a zero inverse Jacobian coefficient are skipped.
205 if (cs.m_jacCoeffInv[ic] == 0.f) continue;
208 b3Vector3 angular0, angular1, linear;
// Lever arms from each body position to the contact point.
209 b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA;
210 b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB;
211 setLinearAndAngular((const b3Vector3&)cs.m_linear, (const b3Vector3&)r0, (const b3Vector3&)r1, &linear, &angular0, &angular1);
// Impulse candidate: relative velocity term (plus an operand not visible here), scaled by m_jacCoeffInv.
213 float rambdaDt = calcRelVel((const b3Vector3&)cs.m_linear, (const b3Vector3&)-cs.m_linear, angular0, angular1,
214 linVelA, angVelA, linVelB, angVelB) +
216 rambdaDt *= cs.m_jacCoeffInv[ic];
// Clamp the accumulated impulse to [minRambdaDt, maxRambdaDt]; rambdaDt becomes the change
// relative to the previously stored sum, and the clamped sum is written back to the constraint.
219 float prevSum = cs.m_appliedRambdaDt[ic];
220 float updated = prevSum;
222 updated = b3Max(updated, minRambdaDt[ic]);
223 updated = b3Min(updated, maxRambdaDt[ic]);
224 rambdaDt = updated - prevSum;
225 cs.m_appliedRambdaDt[ic] = updated;
// Linear and angular impulses for both bodies; body B receives the opposite linear direction.
228 b3Vector3 linImp0 = invMassA * linear * rambdaDt;
229 b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt;
230 b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt;
231 b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt;
// Sanity check: the X component of each linear impulse must be finite.
233 b3Assert(_finite(linImp0.getX()));
234 b3Assert(_finite(linImp1.getX()));
// Host-side friction solve for one b3GpuConstraint4 along two tangent directions.
//   cs                          : constraint; m_fAppliedRambdaDt[0..1] is updated in place.
//   posA/posB                   : body positions, used to form lever arms to cs.m_center.
//   linVelA/angVelA (and B)     : body velocities, passed by non-const reference; the angular
//                                 velocities are damped along the normal at the end of the function.
//   invMassA/B, invInertiaA/B   : inverse mass and inverse inertia used to scale the impulses.
//   maxRambdaDt / minRambdaDt   : bounds for the accumulated friction impulse (entries 0 and 1 are read).
// NOTE(review): this listing omits some original lines (braces, possibly preprocessor directives
// around the two tangent computations, and whatever consumes linImp/angImp); only visible
// statements are described.
262 static __inline void solveFriction(b3GpuConstraint4& cs,
263 const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
264 const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
265 float maxRambdaDt[4], float minRambdaDt[4])
// Nothing to do only when BOTH tangent coefficients are zero. The loop below reads
// m_fJacCoeffInv[i] for i = 0 and 1, so both entries must be tested here; the previous code
// tested entry 0 twice and therefore skipped tangent 1 whenever entry 0 was zero.
267 if (cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[1] == 0) return;
268 const b3Vector3& center = (const b3Vector3&)cs.m_center;
// Friction normal is the negated constraint direction.
270 b3Vector3 n = -(const b3Vector3&)cs.m_linear;
272 b3Vector3 tangent[2];
// Tangent basis from the normal.
274 b3PlaneSpace1(n, tangent[0], tangent[1]);
// Tangent basis derived from the first contact point relative to the center.
276 b3Vector3 r = cs.m_worldPos[0] - center;
277 tangent[0] = cross3(n, r);
278 tangent[1] = cross3(tangent[0], n);
279 tangent[0] = normalize3(tangent[0]);
280 tangent[1] = normalize3(tangent[1]);
283 b3Vector3 angular0, angular1, linear;
// Lever arms from each body position to the constraint center.
284 b3Vector3 r0 = center - posA;
285 b3Vector3 r1 = center - posB;
// One pass per tangent direction.
286 for (int i = 0; i < 2; i++)
288 setLinearAndAngular(tangent[i], r0, r1, &linear, &angular0, &angular1);
289 float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
290 linVelA, angVelA, linVelB, angVelB);
291 rambdaDt *= cs.m_fJacCoeffInv[i];
// Clamp the accumulated friction impulse to [minRambdaDt, maxRambdaDt]; rambdaDt becomes the
// change relative to the stored sum, and the clamped sum is written back to the constraint.
294 float prevSum = cs.m_fAppliedRambdaDt[i];
295 float updated = prevSum;
297 updated = b3Max(updated, minRambdaDt[i]);
298 updated = b3Min(updated, maxRambdaDt[i]);
299 rambdaDt = updated - prevSum;
300 cs.m_fAppliedRambdaDt[i] = updated;
// Linear and angular impulses for both bodies; body B receives the opposite linear direction.
303 b3Vector3 linImp0 = invMassA * linear * rambdaDt;
304 b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt;
305 b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt;
306 b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt;
// Sanity check: the X component of each linear impulse must be finite.
308 b3Assert(_finite(linImp0.getX()));
309 b3Assert(_finite(linImp1.getX()));
317 { // angular damping for point constraint
318 b3Vector3 ab = (posB - posA).normalized();
319 b3Vector3 ac = (center - posA).normalized();
// Damp when the center lies nearly on the A-to-B axis, or when either body has zero inverse mass.
320 if (b3Dot(ab, ac) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
322 float angNA = b3Dot(n, angVelA);
323 float angNB = b3Dot(n, angVelB);
// Remove 10% of each body's angular velocity component along the normal.
325 angVelA -= (angNA * 0.1f) * n;
326 angVelB -= (angNB * 0.1f) * n;
331 b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
332 b3AlignedObjectArray<b3InertiaData>& m_shapes;
333 b3AlignedObjectArray<b3GpuConstraint4>& m_constraints;
334 b3AlignedObjectArray<int>* m_batchSizes;
339 bool m_solveFriction;
// Host-side work item that solves the constraints of one cell, batch by batch.
// Holds references to the host copies of bodies, inertia data and constraints, plus a pointer to
// the per-cell batch-size table. m_solveFriction selects the friction branch (true) or the
// contact branch (false) in the visible loop body.
// NOTE(review): this listing omits some original lines (braces, the method header that contains
// the loop, the declarations of offset and sum, and the terminator of the block comment that
// starts further down), so only visible statements are described.
343 struct SolveTask // : public ThreadPool::Task
345 SolveTask(b3AlignedObjectArray<b3RigidBodyData>& bodies, b3AlignedObjectArray<b3InertiaData>& shapes, b3AlignedObjectArray<b3GpuConstraint4>& constraints,
346 int start, int nConstraints, int maxNumBatches, b3AlignedObjectArray<int>* wgUsedBodies, int curWgidx, b3AlignedObjectArray<int>* batchSizes, int cellIndex)
// m_solveFriction defaults to true; callers overwrite it after construction.
347 : m_bodies(bodies), m_shapes(shapes), m_constraints(constraints), m_batchSizes(batchSizes), m_cellIndex(cellIndex), m_curWgidx(curWgidx), m_start(start), m_nConstraints(nConstraints), m_solveFriction(true), m_maxNumBatches(maxNumBatches)
351 unsigned short int getType() { return 0; }
// Walk every possible batch slot of this cell. The bound is the compile-time B3_MAX_NUM_BATCHES,
// not the m_maxNumBatches member.
356 for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++)
// Batch sizes are stored per cell, B3_MAX_NUM_BATCHES entries each. m_batchSizes is dereferenced
// unconditionally here.
358 int numInBatch = m_batchSizes->at(m_cellIndex * B3_MAX_NUM_BATCHES + ii);
362 for (int jj = 0; jj < numInBatch; jj++)
// Constraints of a cell are expected to be laid out contiguously, grouped by batch index.
364 int i = m_start + offset + jj;
365 int batchId = m_constraints[i].m_batchIdx;
366 b3Assert(batchId == ii);
367 float frictionCoeff = m_constraints[i].getFrictionCoeff();
368 int aIdx = (int)m_constraints[i].m_bodyA;
369 int bIdx = (int)m_constraints[i].m_bodyB;
370 // int localBatch = m_constraints[i].m_batchIdx;
371 b3RigidBodyData& bodyA = m_bodies[aIdx];
372 b3RigidBodyData& bodyB = m_bodies[bIdx];
// Contact branch: accumulated impulse bounded to [0, FLT_MAX] per contact point.
374 if (!m_solveFriction)
376 float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
377 float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f};
379 solveContact<false>(m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3&)m_shapes[aIdx].m_invInertiaWorld,
380 (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3&)m_shapes[bIdx].m_invInertiaWorld,
381 maxRambdaDt, minRambdaDt);
// Friction branch: bounds are +/- frictionCoeff times the summed applied contact impulses.
385 float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
386 float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f};
388 for (int j = 0; j < 4; j++)
390 sum += m_constraints[i].m_appliedRambdaDt[j];
// NOTE(review): this overwrites the coefficient read from the constraint above with a fixed 0.7 — confirm intent.
392 frictionCoeff = 0.7f;
393 for (int j = 0; j < 4; j++)
395 maxRambdaDt[j] = frictionCoeff * sum;
396 minRambdaDt[j] = -maxRambdaDt[j];
398 solveFriction(m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3&)m_shapes[aIdx].m_invInertiaWorld,
399 (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3&)m_shapes[bIdx].m_invInertiaWorld,
400 maxRambdaDt, minRambdaDt);
// Advance to the first constraint of the next batch.
403 offset += numInBatch;
405 /* for (int bb=0;bb<m_maxNumBatches;bb++)
407 //for(int ic=m_nConstraints-1; ic>=0; ic--)
408 for(int ic=0; ic<m_nConstraints; ic++)
411 int i = m_start + ic;
412 if (m_constraints[i].m_batchIdx != bb)
415 float frictionCoeff = m_constraints[i].getFrictionCoeff();
416 int aIdx = (int)m_constraints[i].m_bodyA;
417 int bIdx = (int)m_constraints[i].m_bodyB;
418 int localBatch = m_constraints[i].m_batchIdx;
419 b3RigidBodyData& bodyA = m_bodies[aIdx];
420 b3RigidBodyData& bodyB = m_bodies[bIdx];
422 if( !m_solveFriction )
424 float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
425 float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
427 solveContact<false>( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld,
428 (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld,
429 maxRambdaDt, minRambdaDt );
433 float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
434 float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
436 for(int j=0; j<4; j++)
438 sum +=m_constraints[i].m_appliedRambdaDt[j];
440 frictionCoeff = 0.7f;
441 for(int j=0; j<4; j++)
443 maxRambdaDt[j] = frictionCoeff*sum;
444 minRambdaDt[j] = -maxRambdaDt[j];
446 solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld,
447 (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld,
448 maxRambdaDt, minRambdaDt );
456 b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
457 b3AlignedObjectArray<b3InertiaData>& m_shapes;
458 b3AlignedObjectArray<b3GpuConstraint4>& m_constraints;
459 b3AlignedObjectArray<int>* m_batchSizes;
464 bool m_solveFriction;
// Host (CPU) path of the contact solver.
// Copies bodies, inertia data, constraints, per-cell counts and per-cell offsets to host arrays,
// builds SolveTask objects per cell for a contact phase (m_solveFriction = false) and then a
// friction phase (m_solveFriction = true), each repeated m_nIterations times, and finally copies
// bodies, inertia data and constraints back to the device buffers.
//   bodyBuf, shapeBuf, constraint : device arrays that are read and then overwritten.
//   n, maxNumBatches              : forwarded to SolveTask.
//   batchSizes                    : per-cell batch-size table forwarded to SolveTask.
// NOTE(review): this listing omits some original lines (braces, preprocessor directives, the
// statements that execute each task, and the branch on useBatches), so control flow between the
// visible statements is not fully described.
468 void b3Solver::solveContactConstraintHost(b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf,
469 b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, b3AlignedObjectArray<int>* batchSizes)
// NOTE(review): the following region uses a different formatting style and contains an
// unterminated block comment in this listing — it may be disabled experiment code; confirm.
473 int nSplitX = B3_SOLVER_N_SPLIT_X;
474 int nSplitY = B3_SOLVER_N_SPLIT_Y;
475 int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;
476 for (int z=0;z<4;z++)
478 for (int y=0;y<4;y++)
480 for (int x=0;x<4;x++)
482 int newIndex = (x+y*nSplitX+z*nSplitX*nSplitY);
483 // printf("newIndex=%d\n",newIndex);
485 int zIdx = newIndex/(nSplitX*nSplitY);
486 int remain = newIndex%(nSplitX*nSplitY);
487 int yIdx = remain/nSplitX;
488 int xIdx = remain%nSplitX;
489 // printf("newIndex=%d\n",newIndex);
494 //for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
495 for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
497 for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
499 int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
500 int remain= (wgIdx%((nSplitX*nSplitY)/4));
501 int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
502 int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
504 /*int zIdx = newIndex/(nSplitX*nSplitY);
505 int remain = newIndex%(nSplitX*nSplitY);
506 int yIdx = remain/nSplitX;
507 int xIdx = remain%nSplitX;
509 int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
510 // printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
// Pull everything the host solve needs from the device.
516 b3AlignedObjectArray<b3RigidBodyData> bodyNative;
517 bodyBuf->copyToHost(bodyNative);
518 b3AlignedObjectArray<b3InertiaData> shapeNative;
519 shapeBuf->copyToHost(shapeNative);
520 b3AlignedObjectArray<b3GpuConstraint4> constraintNative;
521 constraint->copyToHost(constraintNative);
523 b3AlignedObjectArray<unsigned int> numConstraintsHost;
524 m_numConstraints->copyToHost(numConstraintsHost);
526 //printf("------------------------\n");
527 b3AlignedObjectArray<unsigned int> offsetsHost;
528 m_offsets->copyToHost(offsetsHost);
529 static int frame = 0;
// Hard-coded to true in the visible code.
530 bool useBatches = true;
// Contact phase: for every iteration and every cell batch, visit one cell per workgroup index.
533 for (int iter = 0; iter < m_nIterations; iter++)
535 for (int cellBatch = 0; cellBatch < B3_SOLVER_N_BATCHES; cellBatch++)
537 int nSplitX = B3_SOLVER_N_SPLIT_X;
538 int nSplitY = B3_SOLVER_N_SPLIT_Y;
539 int numWorkgroups = B3_SOLVER_N_CELLS / B3_SOLVER_N_BATCHES;
540 //printf("cell Batch %d\n",cellBatch);
// One (initially empty) int array per cell; the array base is handed to SolveTask as wgUsedBodies.
541 b3AlignedObjectArray<int> usedBodies[B3_SOLVER_N_CELLS];
542 for (int i = 0; i < B3_SOLVER_N_CELLS; i++)
544 usedBodies[i].resize(0);
547 //for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
548 for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++)
// Map (wgIdx, cellBatch) to a cell: wgIdx picks a block with even base coordinates, and bits
// 0, 1 and 2 of cellBatch add 0 or 1 to the x, y and z coordinate respectively.
550 int zIdx = (wgIdx / ((nSplitX * nSplitY) / 4)) * 2 + ((cellBatch & 4) >> 2);
551 int remain = (wgIdx % ((nSplitX * nSplitY) / 4));
552 int yIdx = (remain / (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1);
553 int xIdx = (remain % (nSplitX / 2)) * 2 + (cellBatch & 1);
554 int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY);
// Empty-cell check; the statement it guards is not visible in this listing.
556 if (numConstraintsHost[cellIdx] == 0)
559 //printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
560 //printf("cell %d has %d constraints\n", cellIdx,numConstraintsHost[cellIdx]);
568 //printf("frame=%d, Cell xIdx=%x, yIdx=%d ",frame, xIdx,yIdx);
569 //printf("cellBatch=%d, wgIdx=%d, #constraints in cell=%d\n",cellBatch,wgIdx,numConstraintsHost[cellIdx]);
// The cell's constraints occupy [start, start + numConstraintsInCell) in constraintNative.
571 const int start = offsetsHost[cellIdx];
572 int numConstraintsInCell = numConstraintsHost[cellIdx];
573 // const int end = start + numConstraintsInCell;
575 SolveTask task(bodyNative, shapeNative, constraintNative, start, numConstraintsInCell, maxNumBatches, usedBodies, wgIdx, batchSizes, cellIdx);
576 task.m_solveFriction = false;
// Friction phase: same cell traversal, with m_solveFriction = true and no usedBodies array.
582 for (int iter = 0; iter < m_nIterations; iter++)
584 for (int cellBatch = 0; cellBatch < B3_SOLVER_N_BATCHES; cellBatch++)
586 int nSplitX = B3_SOLVER_N_SPLIT_X;
587 int nSplitY = B3_SOLVER_N_SPLIT_Y;
589 int numWorkgroups = B3_SOLVER_N_CELLS / B3_SOLVER_N_BATCHES;
591 for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++)
593 int zIdx = (wgIdx / ((nSplitX * nSplitY) / 4)) * 2 + ((cellBatch & 4) >> 2);
594 int remain = (wgIdx % ((nSplitX * nSplitY) / 4));
595 int yIdx = (remain / (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1);
596 int xIdx = (remain % (nSplitX / 2)) * 2 + (cellBatch & 1);
598 int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY);
600 if (numConstraintsHost[cellIdx] == 0)
603 //printf("yIdx=%d\n",yIdx);
605 const int start = offsetsHost[cellIdx];
606 int numConstraintsInCell = numConstraintsHost[cellIdx];
607 // const int end = start + numConstraintsInCell;
609 SolveTask task(bodyNative, shapeNative, constraintNative, start, numConstraintsInCell, maxNumBatches, 0, 0, batchSizes, cellIdx);
610 task.m_solveFriction = true;
// Whole-range variant: a single task over all n constraints, contact first and then friction.
// NOTE(review): these tasks pass 0 for batchSizes, while the visible SolveTask loop dereferences
// m_batchSizes unconditionally — if this path can run it would dereference a null pointer.
// It presumably sits behind the useBatches flag above (hard-coded true); confirm in the full file.
618 for (int iter = 0; iter < m_nIterations; iter++)
620 SolveTask task(bodyNative, shapeNative, constraintNative, 0, n, maxNumBatches, 0, 0, 0, 0);
621 task.m_solveFriction = false;
625 for (int iter = 0; iter < m_nIterations; iter++)
627 SolveTask task(bodyNative, shapeNative, constraintNative, 0, n, maxNumBatches, 0, 0, 0, 0);
628 task.m_solveFriction = true;
// Push the host-side results back to the device buffers.
633 bodyBuf->copyFromHost(bodyNative);
634 shapeBuf->copyFromHost(shapeNative);
635 constraint->copyFromHost(constraintNative);
// Debug helper: for one cell batch, copies the per-cell counts, offsets and constraints to the
// host, prints the cells visited, and compares each cell's body indices against the bodies already
// recorded from earlier cells of the same batch.
//   bodyBuf, shapeBuf : not referenced by any visible statement.
//   constraint        : device constraint array, copied to the host.
//   m_numConstraints  : per-cell constraint counts (parameter, despite the member-style name).
//   m_offsets         : per-cell start offsets into the constraint array.
// NOTE(review): this listing omits some original lines (the final parameter, which the body uses
// as batchId, the braces, and the statements guarded by the if-tests below).
639 void checkConstraintBatch(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
640 const b3OpenCLArray<b3InertiaData>* shapeBuf,
641 b3OpenCLArray<b3GpuConstraint4>* constraint,
642 b3OpenCLArray<unsigned int>* m_numConstraints,
643 b3OpenCLArray<unsigned int>* m_offsets,
646 // b3BufferInfoCL( m_numConstraints->getBufferCL() ),
647 // b3BufferInfoCL( m_offsets->getBufferCL() )
649 int cellBatch = batchId;
650 const int nn = B3_SOLVER_N_CELLS;
651 // int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES;
// Host copies of the per-cell counts (gN) and offsets (gOffsets).
653 b3AlignedObjectArray<unsigned int> gN;
654 m_numConstraints->copyToHost(gN);
655 b3AlignedObjectArray<unsigned int> gOffsets;
656 m_offsets->copyToHost(gOffsets);
657 int nSplitX = B3_SOLVER_N_SPLIT_X;
658 int nSplitY = B3_SOLVER_N_SPLIT_Y;
660 // int bIdx = batchId;
662 b3AlignedObjectArray<b3GpuConstraint4> cpuConstraints;
663 constraint->copyToHost(cpuConstraints);
665 printf("batch = %d\n", batchId);
667 int numWorkgroups = nn / B3_SOLVER_N_BATCHES;
// Body indices seen so far across all cells of this batch.
668 b3AlignedObjectArray<int> usedBodies;
670 for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++)
672 printf("wgIdx = %d ", wgIdx);
// NOTE(review): this index arithmetic differs from solveContactConstraintHost in the same file
// (there: z divides by (nSplitX*nSplitY)/4, remain is taken modulo (nSplitX*nSplitY)/4, y uses
// the quotient and x the remainder). Here the divisor grouping and the x/y roles are different —
// confirm which mapping matches the GPU kernel.
674 int zIdx = (wgIdx / ((nSplitX * nSplitY)) / 2) * 2 + ((cellBatch & 4) >> 2);
675 int remain = wgIdx % ((nSplitX * nSplitY));
676 int yIdx = (remain % (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1);
677 int xIdx = (remain / (nSplitX / 2)) * 2 + (cellBatch & 1);
679 int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY);
680 printf("cellIdx=%d\n", cellIdx);
// Empty-cell check; the statement it guards is not visible in this listing.
681 if (gN[cellIdx] == 0)
// The cell's constraints occupy [start, end) in cpuConstraints.
684 const int start = gOffsets[cellIdx];
685 const int end = start + gN[cellIdx];
// First pass: test whether either body of each constraint was already recorded by an earlier cell.
687 for (int c = start; c < end; c++)
// This local reference shadows the function parameter named constraint.
689 b3GpuConstraint4& constraint = cpuConstraints[c];
690 //printf("constraint (%d,%d)\n", constraint.m_bodyA,constraint.m_bodyB);
// A result below size() means the index was found; the guarded statements are not visible here.
691 if (usedBodies.findLinearSearch(constraint.m_bodyA) < usedBodies.size())
695 if (usedBodies.findLinearSearch(constraint.m_bodyB) < usedBodies.size())
// Second pass: record this cell's bodies after the whole cell has been checked.
701 for (int c = start; c < end; c++)
703 b3GpuConstraint4& constraint = cpuConstraints[c];
704 usedBodies.push_back(constraint.m_bodyA);
705 usedBodies.push_back(constraint.m_bodyB);
710 static bool verify = false;
// GPU path of the contact solver.
// For m_nIterations iterations and every cell batch, launches m_solveContactKernel; then, in a
// second set of iterations, launches m_solveFrictionKernel. Each launch uses
// 64 * B3_SOLVER_N_CELLS / B3_SOLVER_N_BATCHES work items with a local size of 64.
//   bodyBuf, shapeBuf, constraint : device buffers bound as kernel arguments.
//   n                             : stored in cdata.x (the setConst for it is commented out).
//   maxNumBatches                 : stored in cdata.y and passed to the kernels.
// NOTE(review): this listing omits some original lines (braces, preprocessor directives around the
// debug-only code, the declaration of nSplit, and the assignment of cdata.z), so the debug and
// file-replay statements below are reproduced as they appear without describing their guards.
712 void b3Solver::solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
713 b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches)
715 b3Int4 cdata = b3MakeInt4(n, 0, 0, 0);
717 const int nn = B3_SOLVER_N_CELLS;
720 cdata.y = maxNumBatches; //250;
// One group of 64 work items per cell of the current batch.
722 int numWorkItems = 64 * nn / B3_SOLVER_N_BATCHES;
// Debug-info buffers (host and device), one entry per work item.
724 SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
725 adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems);
// Contact phase.
729 B3_PROFILE("m_batchSolveKernel iterations");
730 for (int iter = 0; iter < m_nIterations; iter++)
732 for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
736 checkConstraintBatch(bodyBuf, shapeBuf, constraint, m_numConstraints, m_offsets, ib);
740 memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems);
741 gpuDebugInfo.write(debugInfo, numWorkItems);
746 b3LauncherCL launcher(m_queue, m_solveContactKernel, "m_solveContactKernel");
// Kernel buffer arguments, in order: bodies, inertia data, constraints, per-cell counts, per-cell offsets.
749 b3BufferInfoCL bInfo[] = {
751 b3BufferInfoCL(bodyBuf->getBufferCL()),
752 b3BufferInfoCL(shapeBuf->getBufferCL()),
753 b3BufferInfoCL(constraint->getBufferCL()),
754 b3BufferInfoCL(m_numConstraints->getBufferCL()),
755 b3BufferInfoCL(m_offsets->getBufferCL())
758 b3BufferInfoCL(&gpuDebugInfo)
762 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
763 //launcher.setConst( cdata.x );
764 launcher.setConst(cdata.y);
765 launcher.setConst(cdata.z);
// Grid split counts along x, y and z.
767 nSplit.x = B3_SOLVER_N_SPLIT_X;
768 nSplit.y = B3_SOLVER_N_SPLIT_Y;
769 nSplit.z = B3_SOLVER_N_SPLIT_Z;
771 launcher.setConst(nSplit);
772 launcher.launch1D(numWorkItems, 64);
// Replay of serialized launch arguments from a file.
// NOTE(review): in the visible lines the fread result is not checked and no free(buf) or
// fclose(f) appears — confirm against the full file.
775 const char* fileName = "m_batchSolveKernel.bin";
776 FILE* f = fopen(fileName, "rb");
780 if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
782 printf("error, cannot get file size\n");
786 unsigned char* buf = (unsigned char*)malloc(sizeInBytes);
787 fread(buf, sizeInBytes, 1, f);
788 int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context);
789 int num = *(int*)&buf[serializedBytes];
791 launcher.launch1D(num);
793 //this clFinish is for testing on errors
801 gpuDebugInfo.read(debugInfo, numWorkItems);
// Report every work item that flagged a debug value. The format string now has one conversion
// per argument (work-item index, then value); previously a single %d consumed the index and the
// value itself was never printed.
803 for (int i = 0; i < numWorkItems; i++)
805 if (debugInfo[i].m_valInt2 > 0)
807 printf("debugInfo[%d].m_valInt2 = %d\n", i, debugInfo[i].m_valInt2);
810 if (debugInfo[i].m_valInt3 > 0)
812 printf("debugInfo[%d].m_valInt3 = %d\n", i, debugInfo[i].m_valInt3);
// Hard-coded to true in the visible code.
823 bool applyFriction = true;
// Friction phase: same buffer bindings and constants, launched on m_solveFrictionKernel.
826 B3_PROFILE("m_batchSolveKernel iterations2");
827 for (int iter = 0; iter < m_nIterations; iter++)
829 for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
833 b3BufferInfoCL bInfo[] = {
834 b3BufferInfoCL(bodyBuf->getBufferCL()),
835 b3BufferInfoCL(shapeBuf->getBufferCL()),
836 b3BufferInfoCL(constraint->getBufferCL()),
837 b3BufferInfoCL(m_numConstraints->getBufferCL()),
838 b3BufferInfoCL(m_offsets->getBufferCL())
841 b3BufferInfoCL(&gpuDebugInfo)
844 b3LauncherCL launcher(m_queue, m_solveFrictionKernel, "m_solveFrictionKernel");
845 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
846 //launcher.setConst( cdata.x );
847 launcher.setConst(cdata.y);
848 launcher.setConst(cdata.z);
850 nSplit.x = B3_SOLVER_N_SPLIT_X;
851 nSplit.y = B3_SOLVER_N_SPLIT_Y;
852 nSplit.z = B3_SOLVER_N_SPLIT_Z;
854 launcher.setConst(nSplit);
856 launcher.launch1D(64 * nn / B3_SOLVER_N_BATCHES, 64);
// Converts nContacts contacts into solver constraints, written to contactCOut.
// contactCOut is resized to nContacts first. If gConvertConstraintOnCpu is set, the inputs are
// copied to the host, each contact is converted with setConstraint4, and the result is uploaded;
// the visible GPU statements instead launch m_contactToConstraintKernel over nContacts work items
// with a local size of 64.
//   bodyBuf, shapeBuf : rigid-body and inertia data, indexed by body index.
//   contactsIn        : input contacts.
//   cfg               : supplies m_dt, m_positionDrift and m_positionConstraintCoeff.
// NOTE(review): this listing omits some original lines (braces, the declaration of the cdata
// struct, the else between the two paths, and the last argument of setConstraint4).
867 void b3Solver::convertToConstraints(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
868 const b3OpenCLArray<b3InertiaData>* shapeBuf,
869 b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData,
870 int nContacts, const ConstraintCfg& cfg)
872 // b3OpenCLArray<b3GpuConstraint4>* constraintNative =0;
873 contactCOut->resize(nContacts);
878 float m_positionDrift;
879 float m_positionConstraintCoeff;
// Parameters shared by both conversion paths.
884 cdata.m_nContacts = nContacts;
885 cdata.m_dt = cfg.m_dt;
886 cdata.m_positionDrift = cfg.m_positionDrift;
887 cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff;
// Host path.
889 if (gConvertConstraintOnCpu)
891 b3AlignedObjectArray<b3RigidBodyData> gBodies;
892 bodyBuf->copyToHost(gBodies);
894 b3AlignedObjectArray<b3Contact4> gContact;
895 contactsIn->copyToHost(gContact);
897 b3AlignedObjectArray<b3InertiaData> gShapes;
898 shapeBuf->copyToHost(gShapes);
900 b3AlignedObjectArray<b3GpuConstraint4> gConstraintOut;
901 gConstraintOut.resize(nContacts);
903 B3_PROFILE("cpu contactToConstraintKernel");
904 for (int gIdx = 0; gIdx < nContacts; gIdx++)
// The body index is the absolute value of the stored field (the sign is dropped here).
906 int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);
907 int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);
909 b3Float4 posA = gBodies[aIdx].m_pos;
910 b3Float4 linVelA = gBodies[aIdx].m_linVel;
911 b3Float4 angVelA = gBodies[aIdx].m_angVel;
912 float invMassA = gBodies[aIdx].m_invMass;
// NOTE(review): this reads m_initInvInertia, whereas SolveTask in this file uses
// m_invInertiaWorld — confirm this matches what the GPU kernel uses.
913 b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia;
915 b3Float4 posB = gBodies[bIdx].m_pos;
916 b3Float4 linVelB = gBodies[bIdx].m_linVel;
917 b3Float4 angVelB = gBodies[bIdx].m_angVel;
918 float invMassB = gBodies[bIdx].m_invMass;
919 b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia;
921 b3ContactConstraint4_t cs;
923 setConstraint4(posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,
924 &gContact[gIdx], cdata.m_dt, cdata.m_positionDrift, cdata.m_positionConstraintCoeff,
// Carry the batch index from the contact over to the constraint.
927 cs.m_batchIdx = gContact[gIdx].m_batchIdx;
// The shared constraint struct is reinterpreted as b3GpuConstraint4 when stored.
929 gConstraintOut[gIdx] = (b3GpuConstraint4&)cs;
932 contactCOut->copyFromHost(gConstraintOut);
// Device path: buffer arguments are contacts, bodies, inertia data and the output constraints.
936 B3_PROFILE("gpu m_contactToConstraintKernel");
938 b3BufferInfoCL bInfo[] = {b3BufferInfoCL(contactsIn->getBufferCL()), b3BufferInfoCL(bodyBuf->getBufferCL()), b3BufferInfoCL(shapeBuf->getBufferCL()),
939 b3BufferInfoCL(contactCOut->getBufferCL())};
940 b3LauncherCL launcher(m_queue, m_contactToConstraintKernel, "m_contactToConstraintKernel");
941 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
942 //launcher.setConst( cdata );
// The parameters are passed as four separate constants instead of one struct.
944 launcher.setConst(cdata.m_nContacts);
945 launcher.setConst(cdata.m_dt);
946 launcher.setConst(cdata.m_positionDrift);
947 launcher.setConst(cdata.m_positionConstraintCoeff);
949 launcher.launch1D(nContacts, 64);
// Sorts contacts by grid cell: fills sort keys with m_setSortDataKernel, radix-sorts them,
// derives per-cell counts (bound search) and offsets (prefix scan), then reorders the contacts
// with m_reorderContactKernel.
// NOTE(review): this function uses identifiers that differ from the rest of the file
// (NEXTMULTIPLEOF rather than B3NEXTMULTIPLEOF, m_contactBuffer rather than m_contactBuffer2,
// N_OBJ_PER_SPLIT, B3_SOLVER_N_SPLIT) and a different formatting style. It is possibly disabled
// legacy code; the listing omits lines, so its guard (if any) is not visible — confirm.
956 void b3Solver::sortContacts( const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
957 b3OpenCLArray<b3Contact4>* contactsIn, void* additionalData,
958 int nContacts, const b3Solver::ConstraintCfg& cfg )
963 const int sortAlignment = 512; // todo. get this out of sort
964 if( cfg.m_enableParallelSolve )
// The sort works on nContacts rounded up to the sort alignment.
968 int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );
970 b3OpenCLArray<unsigned int>* countsNative = m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost );
971 b3OpenCLArray<unsigned int>* offsetsNative = m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost );
// The key-generation kernel is launched with a local size of 64.
982 b3Assert( sortSize%64 == 0 );
984 cdata.m_nContacts = nContacts;
985 cdata.m_staticIdx = cfg.m_staticIdx;
986 cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent);
987 cdata.m_nSplit = B3_SOLVER_N_SPLIT;
// Fill m_sortDataBuffer from the contacts and bodies.
990 b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
991 b3LauncherCL launcher( m_queue, m_setSortDataKernel );
992 launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
993 launcher.setConst( cdata );
994 launcher.launch1D( sortSize, 64 );
997 { // 3. sort by cell idx
// n is referenced only by the commented-out lines below in this listing.
998 int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
1000 //if( n <= 0xffff ) sortBit = 16;
1001 //if( n <= 0xff ) sortBit = 8;
1002 m_sort32->execute(*m_sortDataBuffer,sortSize);
1004 { // 4. find entries
// Count entries per cell, then scan the counts into offsets.
1005 m_search->execute( *m_sortDataBuffer, nContacts, *countsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, b3BoundSearchCL::COUNT);
1007 m_scan->execute( *countsNative, *offsetsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT );
1010 { // 5. sort constraints by cellIdx
1011 // todo. preallocate this
1012 // b3Assert( contactsIn->getType() == TYPE_HOST );
1013 // b3OpenCLArray<b3Contact4>* out = BufferUtils::map<TYPE_CL, false>( data->m_device, contactsIn ); // copying contacts to this buffer
// Only cdata.x is assigned before the struct is passed to the kernel.
1018 b3Int4 cdata; cdata.x = nContacts;
1019 b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( m_contactBuffer->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
1020 b3LauncherCL launcher( m_queue, m_reorderContactKernel );
1021 launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
1022 launcher.setConst( cdata );
1023 launcher.launch1D( nContacts, 64 );
1025 // BufferUtils::unmap<true>( out, contactsIn, nContacts );
// Launches the batching kernel that assigns contacts to batches.
// The kernel is m_batchingKernelNew or m_batchingKernel depending on useNewBatchingKernel, and
// runs with 64 * B3_SOLVER_N_CELLS work items and a local size of 64.
//   contacts      : contact buffer; bound as a kernel argument only for the old kernel.
//   nContacts     : stored in cdata.x (the setConst for cdata is commented out).
//   nNative       : per-cell counts; also determines the size of m_batchSizes.
//   offsetsNative : per-cell offsets.
//   staticIdx     : passed to the kernel as a constant.
// NOTE(review): this listing omits some original lines (braces, the declaration of cdata, and the
// opening preprocessor directives of the BATCH_DEBUG sections), so the debug statements are
// reproduced without describing their guards.
1033 void b3Solver::batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* nNative, b3OpenCLArray<unsigned int>* offsetsNative, int staticIdx)
1035 int numWorkItems = 64 * B3_SOLVER_N_CELLS;
1037 B3_PROFILE("batch generation");
1040 cdata.x = nContacts;
1042 cdata.z = staticIdx;
// Debug-info buffers (host and device), zeroed and uploaded before the launch.
1045 SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
1046 adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems);
1047 memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems);
1048 gpuDebugInfo.write(debugInfo, numWorkItems);
1052 b3BufferInfoCL bInfo[] = {
1053 b3BufferInfoCL( contacts->getBufferCL() ),
1054 b3BufferInfoCL( m_contactBuffer2->getBufferCL()),
1055 b3BufferInfoCL( nNative->getBufferCL() ),
1056 b3BufferInfoCL( offsetsNative->getBufferCL() ),
1058 , b3BufferInfoCL(&gpuDebugInfo)
// m_batchSizes is resized to match nNative before being bound to the kernel.
1064 m_batchSizes.resize(nNative->size());
1065 B3_PROFILE("batchingKernel");
1066 //b3LauncherCL launcher( m_queue, m_batchingKernel);
1067 cl_kernel k = useNewBatchingKernel ? m_batchingKernelNew : m_batchingKernel;
1069 b3LauncherCL launcher(m_queue, k, "*batchingKernel");
// Only the old kernel takes the input contact buffer as its first argument.
1070 if (!useNewBatchingKernel)
1072 launcher.setBuffer(contacts->getBufferCL());
1074 launcher.setBuffer(m_contactBuffer2->getBufferCL());
1075 launcher.setBuffer(nNative->getBufferCL());
1076 launcher.setBuffer(offsetsNative->getBufferCL());
1078 launcher.setBuffer(m_batchSizes.getBufferCL());
1080 //launcher.setConst( cdata );
1081 launcher.setConst(staticIdx);
1083 launcher.launch1D(numWorkItems, 64);
1084 //clFinish(m_queue);
1085 //b3AlignedObjectArray<int> batchSizesCPU;
1086 //m_batchSizes.copyToHost(batchSizesCPU);
// Debug read-back.
// NOTE(review): no delete[] for hostContacts or debugInfo is visible in this listing, and
// m_contactBuffer is used here while the rest of the function uses m_contactBuffer2 — confirm.
1092 b3Contact4* hostContacts = new b3Contact4[nContacts];
1093 m_contactBuffer->read(hostContacts, nContacts);
1096 gpuDebugInfo.read(debugInfo, numWorkItems);
1099 for (int i = 0; i < numWorkItems; i++)
1101 if (debugInfo[i].m_valInt1 > 0)
1105 if (debugInfo[i].m_valInt2 > 0)
1107 printf("catch22\n");
1110 if (debugInfo[i].m_valInt3 > 0)
1112 printf("catch666\n");
1115 if (debugInfo[i].m_valInt4 > 0)
1117 printf("catch777\n");
1121 #endif //BATCH_DEBUG
1124 // copy buffer to buffer
1125 //b3Assert(m_contactBuffer->size()==nContacts);
1126 //contacts->copyFromOpenCLArray( *m_contactBuffer);
1127 //clFinish(m_queue);//needed?