dali-physics/third-party/bullet3/src/Bullet3OpenCL/RigidBody/b3GpuPgsContactSolver.cpp

   1
   2 bool gUseLargeBatches = false;  // NOTE(review): of the flags in this group only gCpuSortContactsDeterminism is read in the reviewed part of this file; the others are presumably CPU-fallback/debug switches consumed further down - confirm
   3 bool gCpuBatchContacts = false;
   4 bool gCpuSolveConstraint = false;
   5 bool gCpuRadixSort = false;
   6 bool gCpuSetSortData = false;
   7 bool gCpuSortContactsDeterminism = false;  // while false, solveContacts takes its "GPU Sort contact constraints (determinism)" path
   8 bool gUseCpuCopyConstraints = false;
   9 bool gUseScanHost = false;
  10 bool gReorderContactsOnCpu = false;
  11
  12 bool optionalSortContactsDeterminism = true;  // solveContacts only enters its determinism contact-sort when this is true
  13
  14 #include "b3GpuPgsContactSolver.h"
  15 #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
  16
  17 #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
  18 #include "Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h"
  19 #include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
  20 #include <string.h>
  21 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
  22 #include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
  23 #include "b3Solver.h"
  24
  25 #define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"  // last argument of compileCLProgramFromString for solverSetupCL; NOTE(review): the kernel text itself comes from the embedded string, what the path is used for is not visible here - confirm
  26 #define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl"  // passed alongside solverSetup2CL
  27 #define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl"  // passed alongside solveContactCL
  28 #define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl"  // passed alongside solveFrictionCL
  29 #define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl"  // passed alongside batchingKernelsCL
  30 #define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl"  // passed alongside batchingKernelsNewCL
  31
  32 #include "kernels/solverSetup.h"
  33 #include "kernels/solverSetup2.h"
  34 #include "kernels/solveContact.h"
  35 #include "kernels/solveFriction.h"
  36 #include "kernels/batchingKernels.h"
  37 #include "kernels/batchingKernelsNew.h"
  38
  39 struct b3GpuBatchingPgsSolverInternalData  // private state of b3GpuPgsContactSolver: allocated and filled by its constructor, torn down by its destructor
  40 {
  41         cl_context m_context;  // OpenCL context handed to the constructor
  42         cl_device_id m_device;  // OpenCL device handed to the constructor
  43         cl_command_queue m_queue;  // command queue handed to the constructor; used for kernel launches and clFinish
  44         int m_pairCapacity;  // pairCapacity constructor argument
  45         int m_nIterations;  // set to 4 by the constructor
  46
  47         b3OpenCLArray<b3GpuConstraint4>* m_contactCGPU;  // constraint buffer, constructed with pairCapacity
  48         b3OpenCLArray<unsigned int>* m_numConstraints;  // resized to B3_SOLVER_N_CELLS entries by the constructor
  49         b3OpenCLArray<unsigned int>* m_offsets;  // resized to B3_SOLVER_N_CELLS entries by the constructor
  50
  51         b3Solver* m_solverGPU;  // owned helper solver; the solve/sort code reads its m_numConstraints, m_offsets, m_batchSizes and m_sort32
  52
  53         cl_kernel m_batchingKernel;  // "CreateBatches" (batchingKernels)
  54         cl_kernel m_batchingKernelNew;  // "CreateBatchesNew" (batchingKernelsNew)
  55         cl_kernel m_solveContactKernel;  // "BatchSolveKernelContact" (solveContact)
  56         cl_kernel m_solveSingleContactKernel;  // "solveSingleContactKernel" (solveContact)
  57         cl_kernel m_solveSingleFrictionKernel;  // "solveSingleFrictionKernel" (solveFriction)
  58         cl_kernel m_solveFrictionKernel;  // "BatchSolveKernelFriction" (solveFriction)
  59         cl_kernel m_contactToConstraintKernel;  // "ContactToConstraintKernel" (solverSetup)
  60         cl_kernel m_setSortDataKernel;  // "SetSortDataKernel" (solverSetup2)
  61         cl_kernel m_reorderContactKernel;  // "ReorderContactKernel" (solverSetup2)
  62         cl_kernel m_copyConstraintKernel;  // "CopyConstraintKernel" (solverSetup2)
  63
  64         cl_kernel m_setDeterminismSortDataBodyAKernel;  // "SetDeterminismSortDataBodyA" (solverSetup2)
  65         cl_kernel m_setDeterminismSortDataBodyBKernel;  // "SetDeterminismSortDataBodyB" (solverSetup2)
  66         cl_kernel m_setDeterminismSortDataChildShapeAKernel;  // "SetDeterminismSortDataChildShapeA" (solverSetup2)
  67         cl_kernel m_setDeterminismSortDataChildShapeBKernel;  // "SetDeterminismSortDataChildShapeB" (solverSetup2)
  68
  69         class b3RadixSort32CL* m_sort32;  // owned; created in the constructor
  70         class b3BoundSearchCL* m_search;  // owned; created with B3_SOLVER_N_CELLS
  71         class b3PrefixScanCL* m_scan;  // owned; created with B3_SOLVER_N_CELLS
  72
  73         b3OpenCLArray<b3SortData>* m_sortDataBuffer;  // constructed with B3NEXTMULTIPLEOF(pairCapacity, 512)
  74         b3OpenCLArray<b3Contact4>* m_contactBuffer;  // owned; NOTE(review): not used in the reviewed part of this file
  75
  76         b3OpenCLArray<b3RigidBodyData>* m_bodyBufferGPU;  // pointed at the caller's body cl_mem via setFromOpenCLBuffer in solveContacts
  77         b3OpenCLArray<b3InertiaData>* m_inertiaBufferGPU;  // pointed at the caller's inertia cl_mem via setFromOpenCLBuffer in solveContacts
  78         b3OpenCLArray<b3Contact4>* m_pBufContactOutGPU;  // pointed at the caller's contact cl_mem via setFromOpenCLBuffer in solveContacts
  79
  80         b3OpenCLArray<b3Contact4>* m_pBufContactOutGPUCopy;  // receives a copy of the contacts for the GPU determinism sort
  81         b3OpenCLArray<b3SortData>* m_contactKeyValues;  // sort entries written by the SetDeterminismSortData* kernels and radix-sorted between them
  82
  83         b3AlignedObjectArray<unsigned int> m_idxBuffer;  // NOTE(review): not used in the reviewed part of this file
  84         b3AlignedObjectArray<b3SortData> m_sortData;  // NOTE(review): not used in the reviewed part of this file
  85         b3AlignedObjectArray<b3Contact4> m_old;  // NOTE(review): not used in the reviewed part of this file
  86
  87         b3AlignedObjectArray<int> m_batchSizes;  // NOTE(review): not used in the reviewed part of this file
  88         b3OpenCLArray<int>* m_batchSizesGpu;  // owned; created in the constructor
  89 };
  90
  91 b3GpuPgsContactSolver::b3GpuPgsContactSolver(cl_context ctx, cl_device_id device, cl_command_queue q, int pairCapacity)
  92 {
  93         m_debugOutput = 0;
  94         m_data = new b3GpuBatchingPgsSolverInternalData;
  95         m_data->m_context = ctx;
  96         m_data->m_device = device;
  97         m_data->m_queue = q;
  98         m_data->m_pairCapacity = pairCapacity;
  99         m_data->m_nIterations = 4;
 100         m_data->m_batchSizesGpu = new b3OpenCLArray<int>(ctx, q);
 101         m_data->m_bodyBufferGPU = new b3OpenCLArray<b3RigidBodyData>(ctx, q);
 102         m_data->m_inertiaBufferGPU = new b3OpenCLArray<b3InertiaData>(ctx, q);
 103         m_data->m_pBufContactOutGPU = new b3OpenCLArray<b3Contact4>(ctx, q);
 104
 105         m_data->m_pBufContactOutGPUCopy = new b3OpenCLArray<b3Contact4>(ctx, q);
 106         m_data->m_contactKeyValues = new b3OpenCLArray<b3SortData>(ctx, q);
 107
 108         m_data->m_solverGPU = new b3Solver(ctx, device, q, 512 * 1024);
 109
 110         m_data->m_sort32 = new b3RadixSort32CL(ctx, device, m_data->m_queue);
 111         m_data->m_scan = new b3PrefixScanCL(ctx, device, m_data->m_queue, B3_SOLVER_N_CELLS);
 112         m_data->m_search = new b3BoundSearchCL(ctx, device, m_data->m_queue, B3_SOLVER_N_CELLS);
 113
 114         const int sortSize = B3NEXTMULTIPLEOF(pairCapacity, 512);
 115
 116         m_data->m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx, m_data->m_queue, sortSize);
 117         m_data->m_contactBuffer = new b3OpenCLArray<b3Contact4>(ctx, m_data->m_queue);
 118
 119         m_data->m_numConstraints = new b3OpenCLArray<unsigned int>(ctx, m_data->m_queue, B3_SOLVER_N_CELLS);
 120         m_data->m_numConstraints->resize(B3_SOLVER_N_CELLS);
 121
 122         m_data->m_contactCGPU = new b3OpenCLArray<b3GpuConstraint4>(ctx, q, pairCapacity);
 123
 124         m_data->m_offsets = new b3OpenCLArray<unsigned int>(ctx, m_data->m_queue, B3_SOLVER_N_CELLS);
 125         m_data->m_offsets->resize(B3_SOLVER_N_CELLS);
 126         const char* additionalMacros = "";
 127         //const char* srcFileNameForCaching="";
 128
 129         cl_int pErrNum;
 130         const char* batchKernelSource = batchingKernelsCL;
 131         const char* batchKernelNewSource = batchingKernelsNewCL;
 132         const char* solverSetupSource = solverSetupCL;
 133         const char* solverSetup2Source = solverSetup2CL;
 134         const char* solveContactSource = solveContactCL;
 135         const char* solveFrictionSource = solveFrictionCL;
 136
 137         {
 138                 cl_program solveContactProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveContactSource, &pErrNum, additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
 139                 b3Assert(solveContactProg);
 140
 141                 cl_program solveFrictionProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveFrictionSource, &pErrNum, additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
 142                 b3Assert(solveFrictionProg);
 143
 144                 cl_program solverSetup2Prog = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetup2Source, &pErrNum, additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
 145
 146                 b3Assert(solverSetup2Prog);
 147
 148                 cl_program solverSetupProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetupSource, &pErrNum, additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
 149                 b3Assert(solverSetupProg);
 150
 151                 m_data->m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg, additionalMacros);
 152                 b3Assert(m_data->m_solveFrictionKernel);
 153
 154                 m_data->m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg, additionalMacros);
 155                 b3Assert(m_data->m_solveContactKernel);
 156
 157                 m_data->m_solveSingleContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "solveSingleContactKernel", &pErrNum, solveContactProg, additionalMacros);
 158                 b3Assert(m_data->m_solveSingleContactKernel);
 159
 160                 m_data->m_solveSingleFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "solveSingleFrictionKernel", &pErrNum, solveFrictionProg, additionalMacros);
 161                 b3Assert(m_data->m_solveSingleFrictionKernel);
 162
 163                 m_data->m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg, additionalMacros);
 164                 b3Assert(m_data->m_contactToConstraintKernel);
 165
 166                 m_data->m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog, additionalMacros);
 167                 b3Assert(m_data->m_setSortDataKernel);
 168
 169                 m_data->m_setDeterminismSortDataBodyAKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyA", &pErrNum, solverSetup2Prog, additionalMacros);
 170                 b3Assert(m_data->m_setDeterminismSortDataBodyAKernel);
 171
 172                 m_data->m_setDeterminismSortDataBodyBKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataBodyB", &pErrNum, solverSetup2Prog, additionalMacros);
 173                 b3Assert(m_data->m_setDeterminismSortDataBodyBKernel);
 174
 175                 m_data->m_setDeterminismSortDataChildShapeAKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeA", &pErrNum, solverSetup2Prog, additionalMacros);
 176                 b3Assert(m_data->m_setDeterminismSortDataChildShapeAKernel);
 177
 178                 m_data->m_setDeterminismSortDataChildShapeBKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetDeterminismSortDataChildShapeB", &pErrNum, solverSetup2Prog, additionalMacros);
 179                 b3Assert(m_data->m_setDeterminismSortDataChildShapeBKernel);
 180
 181                 m_data->m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog, additionalMacros);
 182                 b3Assert(m_data->m_reorderContactKernel);
 183
 184                 m_data->m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog, additionalMacros);
 185                 b3Assert(m_data->m_copyConstraintKernel);
 186         }
 187
 188         {
 189                 cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelSource, &pErrNum, additionalMacros, B3_BATCHING_PATH);
 190                 b3Assert(batchingProg);
 191
 192                 m_data->m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg, additionalMacros);
 193                 b3Assert(m_data->m_batchingKernel);
 194         }
 195
 196         {
 197                 cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelNewSource, &pErrNum, additionalMacros, B3_BATCHING_NEW_PATH);
 198                 b3Assert(batchingNewProg);
 199
 200                 m_data->m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg, additionalMacros);
 201                 b3Assert(m_data->m_batchingKernelNew);
 202         }
 203 }
 204
 205 b3GpuPgsContactSolver::~b3GpuPgsContactSolver()
 206 {
 207         delete m_data->m_batchSizesGpu;
 208         delete m_data->m_bodyBufferGPU;
 209         delete m_data->m_inertiaBufferGPU;
 210         delete m_data->m_pBufContactOutGPU;
 211         delete m_data->m_pBufContactOutGPUCopy;
 212         delete m_data->m_contactKeyValues;
 213
 214         delete m_data->m_contactCGPU;
 215         delete m_data->m_numConstraints;
 216         delete m_data->m_offsets;
 217         delete m_data->m_sortDataBuffer;
 218         delete m_data->m_contactBuffer;
 219
 220         delete m_data->m_sort32;
 221         delete m_data->m_scan;
 222         delete m_data->m_search;
 223         delete m_data->m_solverGPU;
 224
 225         clReleaseKernel(m_data->m_batchingKernel);
 226         clReleaseKernel(m_data->m_batchingKernelNew);
 227         clReleaseKernel(m_data->m_solveSingleContactKernel);
 228         clReleaseKernel(m_data->m_solveSingleFrictionKernel);
 229         clReleaseKernel(m_data->m_solveContactKernel);
 230         clReleaseKernel(m_data->m_solveFrictionKernel);
 231
 232         clReleaseKernel(m_data->m_contactToConstraintKernel);
 233         clReleaseKernel(m_data->m_setSortDataKernel);
 234         clReleaseKernel(m_data->m_reorderContactKernel);
 235         clReleaseKernel(m_data->m_copyConstraintKernel);
 236
 237         clReleaseKernel(m_data->m_setDeterminismSortDataBodyAKernel);
 238         clReleaseKernel(m_data->m_setDeterminismSortDataBodyBKernel);
 239         clReleaseKernel(m_data->m_setDeterminismSortDataChildShapeAKernel);
 240         clReleaseKernel(m_data->m_setDeterminismSortDataChildShapeBKernel);
 241
 242         delete m_data;
 243 }
 244
 245 struct b3ConstraintCfg
 246 {
 247         b3ConstraintCfg(float dt = 0.f) : m_positionDrift(0.005f), m_positionConstraintCoeff(0.2f), m_dt(dt), m_staticIdx(0) {}
 248
 249         float m_positionDrift;
 250         float m_positionConstraintCoeff;
 251         float m_dt;
 252         bool m_enableParallelSolve;
 253         float m_batchCellSize;
 254         int m_staticIdx;
 255 };
 256
 257 void b3GpuPgsContactSolver::solveContactConstraintBatchSizes(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
 258                                                                                                                          b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes)  //const b3OpenCLArray<int>* gpuBatchSizes)
 259 {
 260         B3_PROFILE("solveContactConstraintBatchSizes");
 261         int numBatches = batchSizes->size() / B3_MAX_NUM_BATCHES;
 262         for (int iter = 0; iter < numIterations; iter++)
 263         {
 264                 for (int cellId = 0; cellId < numBatches; cellId++)
 265                 {
 266                         int offset = 0;
 267                         for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++)
 268                         {
 269                                 int numInBatch = batchSizes->at(cellId * B3_MAX_NUM_BATCHES + ii);
 270                                 if (!numInBatch)
 271                                         break;
 272
 273                                 {
 274                                         b3LauncherCL launcher(m_data->m_queue, m_data->m_solveSingleContactKernel, "m_solveSingleContactKernel");
 275                                         launcher.setBuffer(bodyBuf->getBufferCL());
 276                                         launcher.setBuffer(shapeBuf->getBufferCL());
 277                                         launcher.setBuffer(constraint->getBufferCL());
 278                                         launcher.setConst(cellId);
 279                                         launcher.setConst(offset);
 280                                         launcher.setConst(numInBatch);
 281                                         launcher.launch1D(numInBatch);
 282                                         offset += numInBatch;
 283                                 }
 284                         }
 285                 }
 286         }
 287
 288         for (int iter = 0; iter < numIterations; iter++)
 289         {
 290                 for (int cellId = 0; cellId < numBatches; cellId++)
 291                 {
 292                         int offset = 0;
 293                         for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++)
 294                         {
 295                                 int numInBatch = batchSizes->at(cellId * B3_MAX_NUM_BATCHES + ii);
 296                                 if (!numInBatch)
 297                                         break;
 298
 299                                 {
 300                                         b3LauncherCL launcher(m_data->m_queue, m_data->m_solveSingleFrictionKernel, "m_solveSingleFrictionKernel");
 301                                         launcher.setBuffer(bodyBuf->getBufferCL());
 302                                         launcher.setBuffer(shapeBuf->getBufferCL());
 303                                         launcher.setBuffer(constraint->getBufferCL());
 304                                         launcher.setConst(cellId);
 305                                         launcher.setConst(offset);
 306                                         launcher.setConst(numInBatch);
 307                                         launcher.launch1D(numInBatch);
 308                                         offset += numInBatch;
 309                                 }
 310                         }
 311                 }
 312         }
 313 }
 314
 315 void b3GpuPgsContactSolver::solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
 316                                                                                                    b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes)  //,const b3OpenCLArray<int>* gpuBatchSizes)
 317 {
 318         //sort the contacts
 319
 320         b3Int4 cdata = b3MakeInt4(n, 0, 0, 0);
 321         {
 322                 const int nn = B3_SOLVER_N_CELLS;
 323
 324                 cdata.x = 0;
 325                 cdata.y = maxNumBatches;  //250;
 326
 327                 int numWorkItems = 64 * nn / B3_SOLVER_N_BATCHES;
 328 #ifdef DEBUG_ME
 329                 SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
 330                 adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems);
 331 #endif
 332
 333                 {
 334                         B3_PROFILE("m_batchSolveKernel iterations");
 335                         for (int iter = 0; iter < numIterations; iter++)
 336                         {
 337                                 for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
 338                                 {
 339 #ifdef DEBUG_ME
 340                                         memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems);
 341                                         gpuDebugInfo.write(debugInfo, numWorkItems);
 342 #endif
 343
 344                                         cdata.z = ib;
 345
 346                                         b3LauncherCL launcher(m_data->m_queue, m_data->m_solveContactKernel, "m_solveContactKernel");
 347 #if 1
 348
 349                                         b3BufferInfoCL bInfo[] = {
 350
 351                                                 b3BufferInfoCL(bodyBuf->getBufferCL()),
 352                                                 b3BufferInfoCL(shapeBuf->getBufferCL()),
 353                                                 b3BufferInfoCL(constraint->getBufferCL()),
 354                                                 b3BufferInfoCL(m_data->m_solverGPU->m_numConstraints->getBufferCL()),
 355                                                 b3BufferInfoCL(m_data->m_solverGPU->m_offsets->getBufferCL())
 356 #ifdef DEBUG_ME
 357                                                         ,
 358                                                 b3BufferInfoCL(&gpuDebugInfo)
 359 #endif
 360                                         };
 361
 362                                         launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
 363                                         launcher.setBuffer(m_data->m_solverGPU->m_batchSizes.getBufferCL());
 364                                         //launcher.setConst(  cdata.x );
 365                                         launcher.setConst(cdata.y);
 366                                         launcher.setConst(cdata.z);
 367                                         b3Int4 nSplit;
 368                                         nSplit.x = B3_SOLVER_N_SPLIT_X;
 369                                         nSplit.y = B3_SOLVER_N_SPLIT_Y;
 370                                         nSplit.z = B3_SOLVER_N_SPLIT_Z;
 371
 372                                         launcher.setConst(nSplit);
 373                                         launcher.launch1D(numWorkItems, 64);
 374
 375 #else
 376                                         const char* fileName = "m_batchSolveKernel.bin";
 377                                         FILE* f = fopen(fileName, "rb");
 378                                         if (f)
 379                                         {
 380                                                 int sizeInBytes = 0;
 381                                                 if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
 382                                                 {
 383                                                         printf("error, cannot get file size\n");
 384                                                         exit(0);
 385                                                 }
 386
 387                                                 unsigned char* buf = (unsigned char*)malloc(sizeInBytes);
 388                                                 fread(buf, sizeInBytes, 1, f);
 389                                                 int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context);
 390                                                 int num = *(int*)&buf[serializedBytes];
 391
 392                                                 launcher.launch1D(num);
 393
 394                                                 //this clFinish is for testing on errors
 395                                                 clFinish(m_queue);
 396                                         }
 397
 398 #endif
 399
 400 #ifdef DEBUG_ME
 401                                         clFinish(m_queue);
 402                                         gpuDebugInfo.read(debugInfo, numWorkItems);
 403                                         clFinish(m_queue);
 404                                         for (int i = 0; i < numWorkItems; i++)
 405                                         {
 406                                                 if (debugInfo[i].m_valInt2 > 0)
 407                                                 {
 408                                                         printf("debugInfo[i].m_valInt2 = %d\n", i, debugInfo[i].m_valInt2);
 409                                                 }
 410
 411                                                 if (debugInfo[i].m_valInt3 > 0)
 412                                                 {
 413                                                         printf("debugInfo[i].m_valInt3 = %d\n", i, debugInfo[i].m_valInt3);
 414                                                 }
 415                                         }
 416 #endif  //DEBUG_ME
 417                                 }
 418                         }
 419
 420                         clFinish(m_data->m_queue);
 421                 }
 422
 423                 cdata.x = 1;
 424                 bool applyFriction = true;
 425                 if (applyFriction)
 426                 {
 427                         B3_PROFILE("m_batchSolveKernel iterations2");
 428                         for (int iter = 0; iter < numIterations; iter++)
 429                         {
 430                                 for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
 431                                 {
 432                                         cdata.z = ib;
 433
 434                                         b3BufferInfoCL bInfo[] = {
 435                                                 b3BufferInfoCL(bodyBuf->getBufferCL()),
 436                                                 b3BufferInfoCL(shapeBuf->getBufferCL()),
 437                                                 b3BufferInfoCL(constraint->getBufferCL()),
 438                                                 b3BufferInfoCL(m_data->m_solverGPU->m_numConstraints->getBufferCL()),
 439                                                 b3BufferInfoCL(m_data->m_solverGPU->m_offsets->getBufferCL())
 440 #ifdef DEBUG_ME
 441                                                         ,
 442                                                 b3BufferInfoCL(&gpuDebugInfo)
 443 #endif  //DEBUG_ME
 444                                         };
 445                                         b3LauncherCL launcher(m_data->m_queue, m_data->m_solveFrictionKernel, "m_solveFrictionKernel");
 446                                         launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
 447                                         launcher.setBuffer(m_data->m_solverGPU->m_batchSizes.getBufferCL());
 448                                         //launcher.setConst(  cdata.x );
 449                                         launcher.setConst(cdata.y);
 450                                         launcher.setConst(cdata.z);
 451
 452                                         b3Int4 nSplit;
 453                                         nSplit.x = B3_SOLVER_N_SPLIT_X;
 454                                         nSplit.y = B3_SOLVER_N_SPLIT_Y;
 455                                         nSplit.z = B3_SOLVER_N_SPLIT_Z;
 456
 457                                         launcher.setConst(nSplit);
 458
 459                                         launcher.launch1D(64 * nn / B3_SOLVER_N_BATCHES, 64);
 460                                 }
 461                         }
 462                         clFinish(m_data->m_queue);
 463                 }
 464 #ifdef DEBUG_ME
 465                 delete[] debugInfo;
 466 #endif  //DEBUG_ME
 467         }
 468 }
 469
 470 static bool sortfnc(const b3SortData& a, const b3SortData& b)
 471 {
 472         return (a.m_key < b.m_key);
 473 }
 474
 475 static bool b3ContactCmp(const b3Contact4& p, const b3Contact4& q)
 476 {
 477         return ((p.m_bodyAPtrAndSignBit < q.m_bodyAPtrAndSignBit) ||
 478                         ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit < q.m_bodyBPtrAndSignBit)) ||
 479                         ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA < q.m_childIndexA) ||
 480                         ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA < q.m_childIndexA) ||
 481                         ((p.m_bodyAPtrAndSignBit == q.m_bodyAPtrAndSignBit) && (p.m_bodyBPtrAndSignBit == q.m_bodyBPtrAndSignBit) && p.m_childIndexA == q.m_childIndexA && p.m_childIndexB < q.m_childIndexB));
 482 }
 483
 484 #define USE_SPATIAL_BATCHING 1
 485 #define USE_4x4_GRID 1
 486
 487 #ifndef USE_SPATIAL_BATCHING
 488 static const int gridTable4x4[] =
 489         {
 490                 0, 1, 17, 16,
 491                 1, 2, 18, 19,
 492                 17, 18, 32, 3,
 493                 16, 19, 3, 34};
 494 static const int gridTable8x8[] =
 495         {
 496                 0, 2, 3, 16, 17, 18, 19, 1,
 497                 66, 64, 80, 67, 82, 81, 65, 83,
 498                 131, 144, 128, 130, 147, 129, 145, 146,
 499                 208, 195, 194, 192, 193, 211, 210, 209,
 500                 21, 22, 23, 5, 4, 6, 7, 20,
 501                 86, 85, 69, 87, 70, 68, 84, 71,
 502                 151, 133, 149, 150, 135, 148, 132, 134,
 503                 197, 27, 214, 213, 212, 199, 198, 196
 504
 505 };
 506
 507 #endif
 508
 509 void SetSortDataCPU(b3Contact4* gContact, b3RigidBodyData* gBodies, b3SortData* gSortDataOut, int nContacts, float scale, const b3Int4& nSplit, int staticIdx)
 510 {
 511         for (int gIdx = 0; gIdx < nContacts; gIdx++)
 512         {
 513                 if (gIdx < nContacts)
 514                 {
 515                         int aPtrAndSignBit = gContact[gIdx].m_bodyAPtrAndSignBit;
 516                         int bPtrAndSignBit = gContact[gIdx].m_bodyBPtrAndSignBit;
 517
 518                         int aIdx = abs(aPtrAndSignBit);
 519                         int bIdx = abs(bPtrAndSignBit);
 520
 521                         bool aStatic = (aPtrAndSignBit < 0) || (aPtrAndSignBit == staticIdx);
 522
 523 #if USE_SPATIAL_BATCHING
 524                         int idx = (aStatic) ? bIdx : aIdx;
 525                         b3Vector3 p = gBodies[idx].m_pos;
 526                         int xIdx = (int)((p.x - ((p.x < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.x - 1);
 527                         int yIdx = (int)((p.y - ((p.y < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.y - 1);
 528                         int zIdx = (int)((p.z - ((p.z < 0.f) ? 1.f : 0.f)) * scale) & (nSplit.z - 1);
 529
 530                         int newIndex = (xIdx + yIdx * nSplit.x + zIdx * nSplit.x * nSplit.y);
 531
 532 #else  //USE_SPATIAL_BATCHING
 533                         bool bStatic = (bPtrAndSignBit < 0) || (bPtrAndSignBit == staticIdx);
 534
 535 #if USE_4x4_GRID
 536                         int aa = aIdx & 3;
 537                         int bb = bIdx & 3;
 538                         if (aStatic)
 539                                 aa = bb;
 540                         if (bStatic)
 541                                 bb = aa;
 542
 543                         int gridIndex = aa + bb * 4;
 544                         int newIndex = gridTable4x4[gridIndex];
 545 #else   //USE_4x4_GRID
 546                         int aa = aIdx & 7;
 547                         int bb = bIdx & 7;
 548                         if (aStatic)
 549                                 aa = bb;
 550                         if (bStatic)
 551                                 bb = aa;
 552
 553                         int gridIndex = aa + bb * 8;
 554                         int newIndex = gridTable8x8[gridIndex];
 555 #endif  //USE_4x4_GRID
 556 #endif  //USE_SPATIAL_BATCHING
 557
 558                         gSortDataOut[gIdx].x = newIndex;
 559                         gSortDataOut[gIdx].y = gIdx;
 560                 }
 561                 else
 562                 {
 563                         gSortDataOut[gIdx].x = 0xffffffff;
 564                 }
 565         }
 566 }
 567
 568 void b3GpuPgsContactSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const b3Config& config, int static0Index)
 569 {
 570         B3_PROFILE("solveContacts");
 571         m_data->m_bodyBufferGPU->setFromOpenCLBuffer(bodyBuf, numBodies);
 572         m_data->m_inertiaBufferGPU->setFromOpenCLBuffer(inertiaBuf, numBodies);
 573         m_data->m_pBufContactOutGPU->setFromOpenCLBuffer(contactBuf, numContacts);
 574
 575         if (optionalSortContactsDeterminism)
 576         {
 577                 if (!gCpuSortContactsDeterminism)
 578                 {
 579                         B3_PROFILE("GPU Sort contact constraints (determinism)");
 580
 581                         m_data->m_pBufContactOutGPUCopy->resize(numContacts);
 582                         m_data->m_contactKeyValues->resize(numContacts);
 583
 584                         m_data->m_pBufContactOutGPU->copyToCL(m_data->m_pBufContactOutGPUCopy->getBufferCL(), numContacts, 0, 0);
 585
 586                         {
 587                                 b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeBKernel, "m_setDeterminismSortDataChildShapeBKernel");
 588                                 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
 589                                 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
 590                                 launcher.setConst(numContacts);
 591                                 launcher.launch1D(numContacts, 64);
 592                         }
 593                         m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
 594                         {
 595                                 b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataChildShapeAKernel, "m_setDeterminismSortDataChildShapeAKernel");
 596                                 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
 597                                 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
 598                                 launcher.setConst(numContacts);
 599                                 launcher.launch1D(numContacts, 64);
 600                         }
 601                         m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
 602                         {
 603                                 b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyBKernel, "m_setDeterminismSortDataBodyBKernel");
 604                                 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
 605                                 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
 606                                 launcher.setConst(numContacts);
 607                                 launcher.launch1D(numContacts, 64);
 608                         }
 609
 610                         m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
 611
 612                         {
 613                                 b3LauncherCL launcher(m_data->m_queue, m_data->m_setDeterminismSortDataBodyAKernel, "m_setDeterminismSortDataBodyAKernel");
 614                                 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
 615                                 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
 616                                 launcher.setConst(numContacts);
 617                                 launcher.launch1D(numContacts, 64);
 618                         }
 619
 620                         m_data->m_solverGPU->m_sort32->execute(*m_data->m_contactKeyValues);
 621
 622                         {
 623                                 B3_PROFILE("gpu reorderContactKernel (determinism)");
 624
 625                                 b3Int4 cdata;
 626                                 cdata.x = numContacts;
 627
 628                                 //b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), b3BufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL())
 629                                 //      , b3BufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
 630                                 b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_reorderContactKernel, "m_reorderContactKernel");
 631                                 launcher.setBuffer(m_data->m_pBufContactOutGPUCopy->getBufferCL());
 632                                 launcher.setBuffer(m_data->m_pBufContactOutGPU->getBufferCL());
 633                                 launcher.setBuffer(m_data->m_contactKeyValues->getBufferCL());
 634                                 launcher.setConst(cdata);
 635                                 launcher.launch1D(numContacts, 64);
 636                         }
 637                 }
 638                 else
 639                 {
 640                         B3_PROFILE("CPU Sort contact constraints (determinism)");
 641                         b3AlignedObjectArray<b3Contact4> cpuConstraints;
 642                         m_data->m_pBufContactOutGPU->copyToHost(cpuConstraints);
 643                         bool sort = true;
 644                         if (sort)
 645                         {
 646                                 cpuConstraints.quickSort(b3ContactCmp);
 647
 648                                 for (int i = 0; i < cpuConstraints.size(); i++)
 649                                 {
 650                                         cpuConstraints[i].m_batchIdx = i;
 651                                 }
 652                         }
 653                         m_data->m_pBufContactOutGPU->copyFromHost(cpuConstraints);
 654                         if (m_debugOutput == 100)
 655                         {
 656                                 for (int i = 0; i < cpuConstraints.size(); i++)
 657                                 {
 658                                         printf("c[%d].m_bodyA = %d, m_bodyB = %d, batchId = %d\n", i, cpuConstraints[i].m_bodyAPtrAndSignBit, cpuConstraints[i].m_bodyBPtrAndSignBit, cpuConstraints[i].m_batchIdx);
 659                                 }
 660                         }
 661
 662                         m_debugOutput++;
 663                 }
 664         }
 665
 666         int nContactOut = m_data->m_pBufContactOutGPU->size();
 667
 668         bool useSolver = true;
 669
 670         if (useSolver)
 671         {
 672                 float dt = 1. / 60.;
 673                 b3ConstraintCfg csCfg(dt);
 674                 csCfg.m_enableParallelSolve = true;
 675                 csCfg.m_batchCellSize = 6;
 676                 csCfg.m_staticIdx = static0Index;
 677
 678                 b3OpenCLArray<b3RigidBodyData>* bodyBuf = m_data->m_bodyBufferGPU;
 679
 680                 void* additionalData = 0;  //m_data->m_frictionCGPU;
 681                 const b3OpenCLArray<b3InertiaData>* shapeBuf = m_data->m_inertiaBufferGPU;
 682                 b3OpenCLArray<b3GpuConstraint4>* contactConstraintOut = m_data->m_contactCGPU;
 683                 int nContacts = nContactOut;
 684
 685                 int maxNumBatches = 0;
 686
 687                 if (!gUseLargeBatches)
 688                 {
 689                         if (m_data->m_solverGPU->m_contactBuffer2)
 690                         {
 691                                 m_data->m_solverGPU->m_contactBuffer2->resize(nContacts);
 692                         }
 693
 694                         if (m_data->m_solverGPU->m_contactBuffer2 == 0)
 695                         {
 696                                 m_data->m_solverGPU->m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(m_data->m_context, m_data->m_queue, nContacts);
 697                                 m_data->m_solverGPU->m_contactBuffer2->resize(nContacts);
 698                         }
 699
 700                         //clFinish(m_data->m_queue);
 701
 702                         {
 703                                 B3_PROFILE("batching");
 704                                 //@todo: just reserve it, without copy of original contact (unless we use warmstarting)
 705
 706                                 //const b3OpenCLArray<b3RigidBodyData>* bodyNative = bodyBuf;
 707
 708                                 {
 709                                         //b3OpenCLArray<b3RigidBodyData>* bodyNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, bodyBuf );
 710                                         //b3OpenCLArray<b3Contact4>* contactNative = b3OpenCLArrayUtils::map<adl::TYPE_CL, true>( data->m_device, contactsIn );
 711
 712                                         const int sortAlignment = 512;  // todo. get this out of sort
 713                                         if (csCfg.m_enableParallelSolve)
 714                                         {
 715                                                 int sortSize = B3NEXTMULTIPLEOF(nContacts, sortAlignment);
 716
 717                                                 b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
 718                                                 b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
 719
 720                                                 if (!gCpuSetSortData)
 721                                                 {  //   2. set cell idx
 722                                                         B3_PROFILE("GPU set cell idx");
 723                                                         struct CB
 724                                                         {
 725                                                                 int m_nContacts;
 726                                                                 int m_staticIdx;
 727                                                                 float m_scale;
 728                                                                 b3Int4 m_nSplit;
 729                                                         };
 730
 731                                                         b3Assert(sortSize % 64 == 0);
 732                                                         CB cdata;
 733                                                         cdata.m_nContacts = nContacts;
 734                                                         cdata.m_staticIdx = csCfg.m_staticIdx;
 735                                                         cdata.m_scale = 1.f / csCfg.m_batchCellSize;
 736                                                         cdata.m_nSplit.x = B3_SOLVER_N_SPLIT_X;
 737                                                         cdata.m_nSplit.y = B3_SOLVER_N_SPLIT_Y;
 738                                                         cdata.m_nSplit.z = B3_SOLVER_N_SPLIT_Z;
 739
 740                                                         m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
 741
 742                                                         b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL()), b3BufferInfoCL(bodyBuf->getBufferCL()), b3BufferInfoCL(m_data->m_solverGPU->m_sortDataBuffer->getBufferCL())};
 743                                                         b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_setSortDataKernel, "m_setSortDataKernel");
 744                                                         launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
 745                                                         launcher.setConst(cdata.m_nContacts);
 746                                                         launcher.setConst(cdata.m_scale);
 747                                                         launcher.setConst(cdata.m_nSplit);
 748                                                         launcher.setConst(cdata.m_staticIdx);
 749
 750                                                         launcher.launch1D(sortSize, 64);
 751                                                 }
 752                                                 else
 753                                                 {
 754                                                         m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
 755                                                         b3AlignedObjectArray<b3SortData> sortDataCPU;
 756                                                         m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataCPU);
 757
 758                                                         b3AlignedObjectArray<b3Contact4> contactCPU;
 759                                                         m_data->m_pBufContactOutGPU->copyToHost(contactCPU);
 760                                                         b3AlignedObjectArray<b3RigidBodyData> bodiesCPU;
 761                                                         bodyBuf->copyToHost(bodiesCPU);
 762                                                         float scale = 1.f / csCfg.m_batchCellSize;
 763                                                         b3Int4 nSplit;
 764                                                         nSplit.x = B3_SOLVER_N_SPLIT_X;
 765                                                         nSplit.y = B3_SOLVER_N_SPLIT_Y;
 766                                                         nSplit.z = B3_SOLVER_N_SPLIT_Z;
 767
 768                                                         SetSortDataCPU(&contactCPU[0], &bodiesCPU[0], &sortDataCPU[0], nContacts, scale, nSplit, csCfg.m_staticIdx);
 769
 770                                                         m_data->m_solverGPU->m_sortDataBuffer->copyFromHost(sortDataCPU);
 771                                                 }
 772
 773                                                 if (!gCpuRadixSort)
 774                                                 {  //   3. sort by cell idx
 775                                                         B3_PROFILE("gpuRadixSort");
 776                                                         //int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
 777                                                         //int sortBit = 32;
 778                                                         //if( n <= 0xffff ) sortBit = 16;
 779                                                         //if( n <= 0xff ) sortBit = 8;
 780                                                         //adl::RadixSort<adl::TYPE_CL>::execute( data->m_sort, *data->m_sortDataBuffer, sortSize );
 781                                                         //adl::RadixSort32<adl::TYPE_CL>::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize );
 782                                                         b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer);
 783                                                         this->m_data->m_solverGPU->m_sort32->execute(keyValuesInOut);
 784                                                 }
 785                                                 else
 786                                                 {
 787                                                         b3OpenCLArray<b3SortData>& keyValuesInOut = *(m_data->m_solverGPU->m_sortDataBuffer);
 788                                                         b3AlignedObjectArray<b3SortData> hostValues;
 789                                                         keyValuesInOut.copyToHost(hostValues);
 790                                                         hostValues.quickSort(sortfnc);
 791                                                         keyValuesInOut.copyFromHost(hostValues);
 792                                                 }
 793
 794                                                 if (gUseScanHost)
 795                                                 {
 796                                                         //      4. find entries
 797                                                         B3_PROFILE("cpuBoundSearch");
 798                                                         b3AlignedObjectArray<unsigned int> countsHost;
 799                                                         countsNative->copyToHost(countsHost);
 800
 801                                                         b3AlignedObjectArray<b3SortData> sortDataHost;
 802                                                         m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost);
 803
 804                                                         //m_data->m_solverGPU->m_search->executeHost(*m_data->m_solverGPU->m_sortDataBuffer,nContacts,*countsNative,B3_SOLVER_N_CELLS,b3BoundSearchCL::COUNT);
 805                                                         m_data->m_solverGPU->m_search->executeHost(sortDataHost, nContacts, countsHost, B3_SOLVER_N_CELLS, b3BoundSearchCL::COUNT);
 806
 807                                                         countsNative->copyFromHost(countsHost);
 808
 809                                                         //adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative,
 810                                                         //      B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, adl::BoundSearchBase::COUNT );
 811
 812                                                         //unsigned int sum;
 813                                                         //m_data->m_solverGPU->m_scan->execute(*countsNative,*offsetsNative, B3_SOLVER_N_CELLS);//,&sum );
 814                                                         b3AlignedObjectArray<unsigned int> offsetsHost;
 815                                                         offsetsHost.resize(offsetsNative->size());
 816
 817                                                         m_data->m_solverGPU->m_scan->executeHost(countsHost, offsetsHost, B3_SOLVER_N_CELLS);  //,&sum );
 818                                                         offsetsNative->copyFromHost(offsetsHost);
 819
 820                                                         //printf("sum = %d\n",sum);
 821                                                 }
 822                                                 else
 823                                                 {
 824                                                         //      4. find entries
 825                                                         B3_PROFILE("gpuBoundSearch");
 826                                                         m_data->m_solverGPU->m_search->execute(*m_data->m_solverGPU->m_sortDataBuffer, nContacts, *countsNative, B3_SOLVER_N_CELLS, b3BoundSearchCL::COUNT);
 827                                                         m_data->m_solverGPU->m_scan->execute(*countsNative, *offsetsNative, B3_SOLVER_N_CELLS);  //,&sum );
 828                                                 }
 829
 830                                                 if (nContacts)
 831                                                 {  //   5. sort constraints by cellIdx
 832                                                         if (gReorderContactsOnCpu)
 833                                                         {
 834                                                                 B3_PROFILE("cpu m_reorderContactKernel");
 835                                                                 b3AlignedObjectArray<b3SortData> sortDataHost;
 836                                                                 m_data->m_solverGPU->m_sortDataBuffer->copyToHost(sortDataHost);
 837                                                                 b3AlignedObjectArray<b3Contact4> inContacts;
 838                                                                 b3AlignedObjectArray<b3Contact4> outContacts;
 839                                                                 m_data->m_pBufContactOutGPU->copyToHost(inContacts);
 840                                                                 outContacts.resize(inContacts.size());
 841                                                                 for (int i = 0; i < nContacts; i++)
 842                                                                 {
 843                                                                         int srcIdx = sortDataHost[i].y;
 844                                                                         outContacts[i] = inContacts[srcIdx];
 845                                                                 }
 846                                                                 m_data->m_solverGPU->m_contactBuffer2->copyFromHost(outContacts);
 847
 848                                                                 /*                                                              "void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n"
 849                                                                 "{\n"
 850                                                                 "       int nContacts = cb.x;\n"
 851                                                                 "       int gIdx = GET_GLOBAL_IDX;\n"
 852                                                                 "       if( gIdx < nContacts )\n"
 853                                                                 "       {\n"
 854                                                                 "               int srcIdx = sortData[gIdx].y;\n"
 855                                                                 "               out[gIdx] = in[srcIdx];\n"
 856                                                                 "       }\n"
 857                                                                 "}\n"
 858                                                                 */
 859                                                         }
 860                                                         else
 861                                                         {
 862                                                                 B3_PROFILE("gpu m_reorderContactKernel");
 863
 864                                                                 b3Int4 cdata;
 865                                                                 cdata.x = nContacts;
 866
 867                                                                 b3BufferInfoCL bInfo[] = {
 868                                                                         b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL()),
 869                                                                         b3BufferInfoCL(m_data->m_solverGPU->m_contactBuffer2->getBufferCL()), b3BufferInfoCL(m_data->m_solverGPU->m_sortDataBuffer->getBufferCL())};
 870
 871                                                                 b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_reorderContactKernel, "m_reorderContactKernel");
 872                                                                 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
 873                                                                 launcher.setConst(cdata);
 874                                                                 launcher.launch1D(nContacts, 64);
 875                                                         }
 876                                                 }
 877                                         }
 878                                 }
 879
 880                                 //clFinish(m_data->m_queue);
 881
 882                                 //                              {
 883                                 //                              b3AlignedObjectArray<unsigned int> histogram;
 884                                 //                              m_data->m_solverGPU->m_numConstraints->copyToHost(histogram);
 885                                 //                              printf(",,,\n");
 886                                 //                              }
 887
 888                                 if (nContacts)
 889                                 {
 890                                         if (gUseCpuCopyConstraints)
 891                                         {
 892                                                 for (int i = 0; i < nContacts; i++)
 893                                                 {
 894                                                         m_data->m_pBufContactOutGPU->copyFromOpenCLArray(*m_data->m_solverGPU->m_contactBuffer2);
 895                                                         //                                                      m_data->m_solverGPU->m_contactBuffer2->getBufferCL();
 896                                                         //                                              m_data->m_pBufContactOutGPU->getBufferCL()
 897                                                 }
 898                                         }
 899                                         else
 900                                         {
 901                                                 B3_PROFILE("gpu m_copyConstraintKernel");
 902                                                 b3Int4 cdata;
 903                                                 cdata.x = nContacts;
 904                                                 b3BufferInfoCL bInfo[] = {
 905                                                         b3BufferInfoCL(m_data->m_solverGPU->m_contactBuffer2->getBufferCL()),
 906                                                         b3BufferInfoCL(m_data->m_pBufContactOutGPU->getBufferCL())};
 907
 908                                                 b3LauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_copyConstraintKernel, "m_copyConstraintKernel");
 909                                                 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
 910                                                 launcher.setConst(cdata);
 911                                                 launcher.launch1D(nContacts, 64);
 912                                                 //we use the clFinish for proper benchmark/profile
 913                                                 clFinish(m_data->m_queue);
 914                                         }
 915                                 }
 916
 917                                 //                              bool compareGPU = false;
 918                                 if (nContacts)
 919                                 {
 920                                         if (!gCpuBatchContacts)
 921                                         {
 922                                                 B3_PROFILE("gpu batchContacts");
 923                                                 maxNumBatches = 250;  //250;
 924                                                 m_data->m_solverGPU->batchContacts(m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx);
 925                                                 clFinish(m_data->m_queue);
 926                                         }
 927                                         else
 928                                         {
 929                                                 B3_PROFILE("cpu batchContacts");
 930                                                 static b3AlignedObjectArray<b3Contact4> cpuContacts;
 931                                                 b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2;
 932                                                 {
 933                                                         B3_PROFILE("copyToHost");
 934                                                         contactsIn->copyToHost(cpuContacts);
 935                                                 }
 936                                                 b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
 937                                                 b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
 938
 939                                                 b3AlignedObjectArray<unsigned int> nNativeHost;
 940                                                 b3AlignedObjectArray<unsigned int> offsetsNativeHost;
 941
 942                                                 {
 943                                                         B3_PROFILE("countsNative/offsetsNative copyToHost");
 944                                                         countsNative->copyToHost(nNativeHost);
 945                                                         offsetsNative->copyToHost(offsetsNativeHost);
 946                                                 }
 947
 948                                                 int numNonzeroGrid = 0;
 949
 950                                                 if (gUseLargeBatches)
 951                                                 {
 952                                                         m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES);
 953                                                         int totalNumConstraints = cpuContacts.size();
 954                                                         //int simdWidth =numBodies+1;//-1;//64;//-1;//32;
 955                                                         int numBatches = sortConstraintByBatch3(&cpuContacts[0], totalNumConstraints, totalNumConstraints + 1, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[0]);  //     on GPU
 956                                                         maxNumBatches = b3Max(numBatches, maxNumBatches);
 957                                                         static int globalMaxBatch = 0;
 958                                                         if (maxNumBatches > globalMaxBatch)
 959                                                         {
 960                                                                 globalMaxBatch = maxNumBatches;
 961                                                                 b3Printf("maxNumBatches = %d\n", maxNumBatches);
 962                                                         }
 963                                                 }
 964                                                 else
 965                                                 {
 966                                                         m_data->m_batchSizes.resize(B3_SOLVER_N_CELLS * B3_MAX_NUM_BATCHES);
 967                                                         B3_PROFILE("cpu batch grid");
 968                                                         for (int i = 0; i < B3_SOLVER_N_CELLS; i++)
 969                                                         {
 970                                                                 int n = (nNativeHost)[i];
 971                                                                 int offset = (offsetsNativeHost)[i];
 972                                                                 if (n)
 973                                                                 {
 974                                                                         numNonzeroGrid++;
 975                                                                         int simdWidth = numBodies + 1;                                                                                                                                 //-1;//64;//-1;//32;
 976                                                                         int numBatches = sortConstraintByBatch3(&cpuContacts[0] + offset, n, simdWidth, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[i * B3_MAX_NUM_BATCHES]);  //       on GPU
 977                                                                         maxNumBatches = b3Max(numBatches, maxNumBatches);
 978                                                                         static int globalMaxBatch = 0;
 979                                                                         if (maxNumBatches > globalMaxBatch)
 980                                                                         {
 981                                                                                 globalMaxBatch = maxNumBatches;
 982                                                                                 b3Printf("maxNumBatches = %d\n", maxNumBatches);
 983                                                                         }
 984                                                                         //we use the clFinish for proper benchmark/profile
 985                                                                 }
 986                                                         }
 987                                                         //clFinish(m_data->m_queue);
 988                                                 }
 989                                                 {
 990                                                         B3_PROFILE("m_contactBuffer->copyFromHost");
 991                                                         m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts);
 992                                                 }
 993                                         }
 994                                 }
 995                         }
 996                 }
 997
 998                 //printf("maxNumBatches = %d\n", maxNumBatches);
 999
1000                 if (gUseLargeBatches)
1001                 {
1002                         if (nContacts)
1003                         {
1004                                 B3_PROFILE("cpu batchContacts");
1005                                 static b3AlignedObjectArray<b3Contact4> cpuContacts;
1006                                 //                              b3OpenCLArray<b3Contact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2;
1007                                 {
1008                                         B3_PROFILE("copyToHost");
1009                                         m_data->m_pBufContactOutGPU->copyToHost(cpuContacts);
1010                                 }
1011                                 //                              b3OpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
1012                                 //                              b3OpenCLArray<unsigned int>* offsetsNative = m_data->m_solverGPU->m_offsets;
1013
1014                                 //                              int numNonzeroGrid=0;
1015
1016                                 {
1017                                         m_data->m_batchSizes.resize(B3_MAX_NUM_BATCHES);
1018                                         int totalNumConstraints = cpuContacts.size();
1019                                         //                              int simdWidth =numBodies+1;//-1;//64;//-1;//32;
1020                                         int numBatches = sortConstraintByBatch3(&cpuContacts[0], totalNumConstraints, totalNumConstraints + 1, csCfg.m_staticIdx, numBodies, &m_data->m_batchSizes[0]);  //     on GPU
1021                                         maxNumBatches = b3Max(numBatches, maxNumBatches);
1022                                         static int globalMaxBatch = 0;
1023                                         if (maxNumBatches > globalMaxBatch)
1024                                         {
1025                                                 globalMaxBatch = maxNumBatches;
1026                                                 b3Printf("maxNumBatches = %d\n", maxNumBatches);
1027                                         }
1028                                 }
1029                                 {
1030                                         B3_PROFILE("m_contactBuffer->copyFromHost");
1031                                         m_data->m_solverGPU->m_contactBuffer2->copyFromHost((b3AlignedObjectArray<b3Contact4>&)cpuContacts);
1032                                 }
1033                         }
1034                 }
1035
1036                 if (nContacts)
1037                 {
1038                         B3_PROFILE("gpu convertToConstraints");
1039                         m_data->m_solverGPU->convertToConstraints(bodyBuf,
1040                                                                                                           shapeBuf, m_data->m_solverGPU->m_contactBuffer2,
1041                                                                                                           contactConstraintOut,
1042                                                                                                           additionalData, nContacts,
1043                                                                                                           (b3SolverBase::ConstraintCfg&)csCfg);
1044                         clFinish(m_data->m_queue);
1045                 }
1046
1047                 if (1)
1048                 {
1049                         int numIter = 4;
1050
1051                         m_data->m_solverGPU->m_nIterations = numIter;  //10
1052                         if (!gCpuSolveConstraint)
1053                         {
1054                                 B3_PROFILE("GPU solveContactConstraint");
1055
1056                                 /*m_data->m_solverGPU->solveContactConstraint(
1057                                 m_data->m_bodyBufferGPU,
1058                                 m_data->m_inertiaBufferGPU,
1059                                 m_data->m_contactCGPU,0,
1060                                 nContactOut ,
1061                                 maxNumBatches);
1062                                 */
1063
1064                                 //m_data->m_batchSizesGpu->copyFromHost(m_data->m_batchSizes);
1065
1066                                 if (gUseLargeBatches)
1067                                 {
1068                                         solveContactConstraintBatchSizes(m_data->m_bodyBufferGPU,
1069                                                                                                          m_data->m_inertiaBufferGPU,
1070                                                                                                          m_data->m_contactCGPU, 0,
1071                                                                                                          nContactOut,
1072                                                                                                          maxNumBatches, numIter, &m_data->m_batchSizes);
1073                                 }
1074                                 else
1075                                 {
1076                                         solveContactConstraint(
1077                                                 m_data->m_bodyBufferGPU,
1078                                                 m_data->m_inertiaBufferGPU,
1079                                                 m_data->m_contactCGPU, 0,
1080                                                 nContactOut,
1081                                                 maxNumBatches, numIter, &m_data->m_batchSizes);  //m_data->m_batchSizesGpu);
1082                                 }
1083                         }
1084                         else
1085                         {
1086                                 B3_PROFILE("Host solveContactConstraint");
1087
1088                                 m_data->m_solverGPU->solveContactConstraintHost(m_data->m_bodyBufferGPU, m_data->m_inertiaBufferGPU, m_data->m_contactCGPU, 0, nContactOut, maxNumBatches, &m_data->m_batchSizes);
1089                         }
1090                 }
1091
1092 #if 0
1093         if (0)
1094         {
1095             B3_PROFILE("read body velocities back to CPU");
1096             //read body updated linear/angular velocities back to CPU
1097             m_data->m_bodyBufferGPU->read(
1098                                                   m_data->m_bodyBufferCPU->m_ptr,numOfConvexRBodies);
1099             adl::DeviceUtils::waitForCompletion( m_data->m_deviceCL );
1100         }
1101 #endif
1102         }
1103 }
1104
1105 void b3GpuPgsContactSolver::batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx)
1106 {
1107 }
1108
1109 b3AlignedObjectArray<unsigned int> idxBuffer;
1110 b3AlignedObjectArray<b3SortData> sortData;
1111 b3AlignedObjectArray<b3Contact4> old;
1112
1113 inline int b3GpuPgsContactSolver::sortConstraintByBatch(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies)
1114 {
1115         B3_PROFILE("sortConstraintByBatch");
1116         int numIter = 0;
1117
1118         sortData.resize(n);
1119         idxBuffer.resize(n);
1120         old.resize(n);
1121
1122         unsigned int* idxSrc = &idxBuffer[0];
1123         unsigned int* idxDst = &idxBuffer[0];
1124         int nIdxSrc, nIdxDst;
1125
1126         const int N_FLG = 256;
1127         const int FLG_MASK = N_FLG - 1;
1128         unsigned int flg[N_FLG / 32];
1129 #if defined(_DEBUG)
1130         for (int i = 0; i < n; i++)
1131                 cs[i].getBatchIdx() = -1;
1132 #endif
1133         for (int i = 0; i < n; i++)
1134                 idxSrc[i] = i;
1135         nIdxSrc = n;
1136
1137         int batchIdx = 0;
1138
1139         {
1140                 B3_PROFILE("cpu batch innerloop");
1141                 while (nIdxSrc)
1142                 {
1143                         numIter++;
1144                         nIdxDst = 0;
1145                         int nCurrentBatch = 0;
1146
1147                         //      clear flag
1148                         for (int i = 0; i < N_FLG / 32; i++) flg[i] = 0;
1149
1150                         for (int i = 0; i < nIdxSrc; i++)
1151                         {
1152                                 int idx = idxSrc[i];
1153
1154                                 b3Assert(idx < n);
1155                                 //      check if it can go
1156                                 int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
1157                                 int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
1158
1159                                 int bodyA = abs(bodyAS);
1160                                 int bodyB = abs(bodyBS);
1161
1162                                 int aIdx = bodyA & FLG_MASK;
1163                                 int bIdx = bodyB & FLG_MASK;
1164
1165                                 unsigned int aUnavailable = flg[aIdx / 32] & (1 << (aIdx & 31));
1166                                 unsigned int bUnavailable = flg[bIdx / 32] & (1 << (bIdx & 31));
1167
1168                                 bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
1169                                 bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
1170
1171                                 //use inv_mass!
1172                                 aUnavailable = !aIsStatic ? aUnavailable : 0;  //
1173                                 bUnavailable = !bIsStatic ? bUnavailable : 0;
1174
1175                                 if (aUnavailable == 0 && bUnavailable == 0)  // ok
1176                                 {
1177                                         if (!aIsStatic)
1178                                                 flg[aIdx / 32] |= (1 << (aIdx & 31));
1179                                         if (!bIsStatic)
1180                                                 flg[bIdx / 32] |= (1 << (bIdx & 31));
1181
1182                                         cs[idx].getBatchIdx() = batchIdx;
1183                                         sortData[idx].m_key = batchIdx;
1184                                         sortData[idx].m_value = idx;
1185
1186                                         {
1187                                                 nCurrentBatch++;
1188                                                 if (nCurrentBatch == simdWidth)
1189                                                 {
1190                                                         nCurrentBatch = 0;
1191                                                         for (int i = 0; i < N_FLG / 32; i++) flg[i] = 0;
1192                                                 }
1193                                         }
1194                                 }
1195                                 else
1196                                 {
1197                                         idxDst[nIdxDst++] = idx;
1198                                 }
1199                         }
1200                         b3Swap(idxSrc, idxDst);
1201                         b3Swap(nIdxSrc, nIdxDst);
1202                         batchIdx++;
1203                 }
1204         }
1205         {
1206                 B3_PROFILE("quickSort");
1207                 sortData.quickSort(sortfnc);
1208         }
1209
1210         {
1211                 B3_PROFILE("reorder");
1212                 //      reorder
1213
1214                 memcpy(&old[0], cs, sizeof(b3Contact4) * n);
1215                 for (int i = 0; i < n; i++)
1216                 {
1217                         int idx = sortData[i].m_value;
1218                         cs[i] = old[idx];
1219                 }
1220         }
1221
1222 #if defined(_DEBUG)
1223         //              debugPrintf( "nBatches: %d\n", batchIdx );
1224         for (int i = 0; i < n; i++)
1225         {
1226                 b3Assert(cs[i].getBatchIdx() != -1);
1227         }
1228 #endif
1229         return batchIdx;
1230 }
1231
1232 b3AlignedObjectArray<int> bodyUsed2;
1233
1234 inline int b3GpuPgsContactSolver::sortConstraintByBatch2(b3Contact4* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies)
1235 {
1236         B3_PROFILE("sortConstraintByBatch2");
1237
1238         bodyUsed2.resize(2 * simdWidth);
1239
1240         for (int q = 0; q < 2 * simdWidth; q++)
1241                 bodyUsed2[q] = 0;
1242
1243         int curBodyUsed = 0;
1244
1245         int numIter = 0;
1246
1247         m_data->m_sortData.resize(numConstraints);
1248         m_data->m_idxBuffer.resize(numConstraints);
1249         m_data->m_old.resize(numConstraints);
1250
1251         unsigned int* idxSrc = &m_data->m_idxBuffer[0];
1252
1253 #if defined(_DEBUG)
1254         for (int i = 0; i < numConstraints; i++)
1255                 cs[i].getBatchIdx() = -1;
1256 #endif
1257         for (int i = 0; i < numConstraints; i++)
1258                 idxSrc[i] = i;
1259
1260         int numValidConstraints = 0;
1261         //      int unprocessedConstraintIndex = 0;
1262
1263         int batchIdx = 0;
1264
1265         {
1266                 B3_PROFILE("cpu batch innerloop");
1267
1268                 while (numValidConstraints < numConstraints)
1269                 {
1270                         numIter++;
1271                         int nCurrentBatch = 0;
1272                         //      clear flag
1273                         for (int i = 0; i < curBodyUsed; i++)
1274                                 bodyUsed2[i] = 0;
1275                         curBodyUsed = 0;
1276
1277                         for (int i = numValidConstraints; i < numConstraints; i++)
1278                         {
1279                                 int idx = idxSrc[i];
1280                                 b3Assert(idx < numConstraints);
1281                                 //      check if it can go
1282                                 int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
1283                                 int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
1284                                 int bodyA = abs(bodyAS);
1285                                 int bodyB = abs(bodyBS);
1286                                 bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
1287                                 bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
1288                                 int aUnavailable = 0;
1289                                 int bUnavailable = 0;
1290                                 if (!aIsStatic)
1291                                 {
1292                                         for (int j = 0; j < curBodyUsed; j++)
1293                                         {
1294                                                 if (bodyA == bodyUsed2[j])
1295                                                 {
1296                                                         aUnavailable = 1;
1297                                                         break;
1298                                                 }
1299                                         }
1300                                 }
1301                                 if (!aUnavailable)
1302                                         if (!bIsStatic)
1303                                         {
1304                                                 for (int j = 0; j < curBodyUsed; j++)
1305                                                 {
1306                                                         if (bodyB == bodyUsed2[j])
1307                                                         {
1308                                                                 bUnavailable = 1;
1309                                                                 break;
1310                                                         }
1311                                                 }
1312                                         }
1313
1314                                 if (aUnavailable == 0 && bUnavailable == 0)  // ok
1315                                 {
1316                                         if (!aIsStatic)
1317                                         {
1318                                                 bodyUsed2[curBodyUsed++] = bodyA;
1319                                         }
1320                                         if (!bIsStatic)
1321                                         {
1322                                                 bodyUsed2[curBodyUsed++] = bodyB;
1323                                         }
1324
1325                                         cs[idx].getBatchIdx() = batchIdx;
1326                                         m_data->m_sortData[idx].m_key = batchIdx;
1327                                         m_data->m_sortData[idx].m_value = idx;
1328
1329                                         if (i != numValidConstraints)
1330                                         {
1331                                                 b3Swap(idxSrc[i], idxSrc[numValidConstraints]);
1332                                         }
1333
1334                                         numValidConstraints++;
1335                                         {
1336                                                 nCurrentBatch++;
1337                                                 if (nCurrentBatch == simdWidth)
1338                                                 {
1339                                                         nCurrentBatch = 0;
1340                                                         for (int i = 0; i < curBodyUsed; i++)
1341                                                                 bodyUsed2[i] = 0;
1342
1343                                                         curBodyUsed = 0;
1344                                                 }
1345                                         }
1346                                 }
1347                         }
1348
1349                         batchIdx++;
1350                 }
1351         }
1352         {
1353                 B3_PROFILE("quickSort");
1354                 //m_data->m_sortData.quickSort(sortfnc);
1355         }
1356
1357         {
1358                 B3_PROFILE("reorder");
1359                 //      reorder
1360
1361                 memcpy(&m_data->m_old[0], cs, sizeof(b3Contact4) * numConstraints);
1362
1363                 for (int i = 0; i < numConstraints; i++)
1364                 {
1365                         b3Assert(m_data->m_sortData[idxSrc[i]].m_value == idxSrc[i]);
1366                         int idx = m_data->m_sortData[idxSrc[i]].m_value;
1367                         cs[i] = m_data->m_old[idx];
1368                 }
1369         }
1370
1371 #if defined(_DEBUG)
1372         //              debugPrintf( "nBatches: %d\n", batchIdx );
1373         for (int i = 0; i < numConstraints; i++)
1374         {
1375                 b3Assert(cs[i].getBatchIdx() != -1);
1376         }
1377 #endif
1378
1379         return batchIdx;
1380 }
1381
1382 b3AlignedObjectArray<int> bodyUsed;
1383 b3AlignedObjectArray<int> curUsed;
1384
1385 inline int b3GpuPgsContactSolver::sortConstraintByBatch3(b3Contact4* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies, int* batchSizes)
1386 {
1387         B3_PROFILE("sortConstraintByBatch3");
1388
1389         static int maxSwaps = 0;
1390         int numSwaps = 0;
1391
1392         curUsed.resize(2 * simdWidth);
1393
1394         static int maxNumConstraints = 0;
1395         if (maxNumConstraints < numConstraints)
1396         {
1397                 maxNumConstraints = numConstraints;
1398                 //printf("maxNumConstraints  = %d\n",maxNumConstraints );
1399         }
1400
1401         int numUsedArray = numBodies / 32 + 1;
1402         bodyUsed.resize(numUsedArray);
1403
1404         for (int q = 0; q < numUsedArray; q++)
1405                 bodyUsed[q] = 0;
1406
1407         int curBodyUsed = 0;
1408
1409         int numIter = 0;
1410
1411         m_data->m_sortData.resize(0);
1412         m_data->m_idxBuffer.resize(0);
1413         m_data->m_old.resize(0);
1414
1415 #if defined(_DEBUG)
1416         for (int i = 0; i < numConstraints; i++)
1417                 cs[i].getBatchIdx() = -1;
1418 #endif
1419
1420         int numValidConstraints = 0;
1421         //      int unprocessedConstraintIndex = 0;
1422
1423         int batchIdx = 0;
1424
1425         {
1426                 B3_PROFILE("cpu batch innerloop");
1427
1428                 while (numValidConstraints < numConstraints)
1429                 {
1430                         numIter++;
1431                         int nCurrentBatch = 0;
1432                         batchSizes[batchIdx] = 0;
1433
1434                         //      clear flag
1435                         for (int i = 0; i < curBodyUsed; i++)
1436                                 bodyUsed[curUsed[i] / 32] = 0;
1437
1438                         curBodyUsed = 0;
1439
1440                         for (int i = numValidConstraints; i < numConstraints; i++)
1441                         {
1442                                 int idx = i;
1443                                 b3Assert(idx < numConstraints);
1444                                 //      check if it can go
1445                                 int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
1446                                 int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
1447                                 int bodyA = abs(bodyAS);
1448                                 int bodyB = abs(bodyBS);
1449                                 bool aIsStatic = (bodyAS < 0) || bodyAS == staticIdx;
1450                                 bool bIsStatic = (bodyBS < 0) || bodyBS == staticIdx;
1451                                 int aUnavailable = 0;
1452                                 int bUnavailable = 0;
1453                                 if (!aIsStatic)
1454                                 {
1455                                         aUnavailable = bodyUsed[bodyA / 32] & (1 << (bodyA & 31));
1456                                 }
1457                                 if (!aUnavailable)
1458                                         if (!bIsStatic)
1459                                         {
1460                                                 bUnavailable = bodyUsed[bodyB / 32] & (1 << (bodyB & 31));
1461                                         }
1462
1463                                 if (aUnavailable == 0 && bUnavailable == 0)  // ok
1464                                 {
1465                                         if (!aIsStatic)
1466                                         {
1467                                                 bodyUsed[bodyA / 32] |= (1 << (bodyA & 31));
1468                                                 curUsed[curBodyUsed++] = bodyA;
1469                                         }
1470                                         if (!bIsStatic)
1471                                         {
1472                                                 bodyUsed[bodyB / 32] |= (1 << (bodyB & 31));
1473                                                 curUsed[curBodyUsed++] = bodyB;
1474                                         }
1475
1476                                         cs[idx].getBatchIdx() = batchIdx;
1477
1478                                         if (i != numValidConstraints)
1479                                         {
1480                                                 b3Swap(cs[i], cs[numValidConstraints]);
1481                                                 numSwaps++;
1482                                         }
1483
1484                                         numValidConstraints++;
1485                                         {
1486                                                 nCurrentBatch++;
1487                                                 if (nCurrentBatch == simdWidth)
1488                                                 {
1489                                                         batchSizes[batchIdx] += simdWidth;
1490                                                         nCurrentBatch = 0;
1491                                                         for (int i = 0; i < curBodyUsed; i++)
1492                                                                 bodyUsed[curUsed[i] / 32] = 0;
1493                                                         curBodyUsed = 0;
1494                                                 }
1495                                         }
1496                                 }
1497                         }
1498
1499                         if (batchIdx >= B3_MAX_NUM_BATCHES)
1500                         {
1501                                 b3Error("batchIdx>=B3_MAX_NUM_BATCHES");
1502                                 b3Assert(0);
1503                                 break;
1504                         }
1505
1506                         batchSizes[batchIdx] += nCurrentBatch;
1507
1508                         batchIdx++;
1509                 }
1510         }
1511
1512 #if defined(_DEBUG)
1513         //              debugPrintf( "nBatches: %d\n", batchIdx );
1514         for (int i = 0; i < numConstraints; i++)
1515         {
1516                 b3Assert(cs[i].getBatchIdx() != -1);
1517         }
1518 #endif
1519
1520         batchSizes[batchIdx] = 0;
1521
1522         if (maxSwaps < numSwaps)
1523         {
1524                 maxSwaps = numSwaps;
1525                 //printf("maxSwaps = %d\n", maxSwaps);
1526         }
1527
1528         return batchIdx;
1529 }