2 bool searchIncremental3dSapOnGpu = true;
4 #include "b3GpuSapBroadphase.h"
5 #include "Bullet3Common/b3Vector3.h"
6 #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
7 #include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h"
9 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
10 #include "kernels/sapKernels.h"
12 #include "Bullet3Common/b3MinMax.h"
14 #define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
23 b3OpenCLArray<int> m_pairCount;
26 b3OpenCLArray<b3SapAabb> m_allAabbsGPU;
27 b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
29 virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()
33 virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()
38 b3OpenCLArray<b3Vector3> m_sum;
39 b3OpenCLArray<b3Vector3> m_sum2;
40 b3OpenCLArray<b3Vector3> m_dst;
42 b3OpenCLArray<int> m_smallAabbsMappingGPU;
43 b3AlignedObjectArray<int> m_smallAabbsMappingCPU;
45 b3OpenCLArray<int> m_largeAabbsMappingGPU;
46 b3AlignedObjectArray<int> m_largeAabbsMappingCPU;
49 b3OpenCLArray<b3Int4> m_overlappingPairs;
51 //temporary gpu work memory
52 b3OpenCLArray<b3SortData> m_gpuSmallSortData;
53 b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs;
55 class b3PrefixScanFloat4CL* m_prefixScanFloat4;
58 b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q, b3GpuSapKernelType kernelType)
63 m_objectMinMaxIndexGPUaxis0(ctx, q),
64 m_objectMinMaxIndexGPUaxis1(ctx, q),
65 m_objectMinMaxIndexGPUaxis2(ctx, q),
66 m_objectMinMaxIndexGPUaxis0prev(ctx, q),
67 m_objectMinMaxIndexGPUaxis1prev(ctx, q),
68 m_objectMinMaxIndexGPUaxis2prev(ctx, q),
69 m_sortedAxisGPU0(ctx, q),
70 m_sortedAxisGPU1(ctx, q),
71 m_sortedAxisGPU2(ctx, q),
72 m_sortedAxisGPU0prev(ctx, q),
73 m_sortedAxisGPU1prev(ctx, q),
74 m_sortedAxisGPU2prev(ctx, q),
75 m_addedHostPairsGPU(ctx, q),
76 m_removedHostPairsGPU(ctx, q),
77 m_addedCountGPU(ctx, q),
78 m_removedCountGPU(ctx, q),
81 m_allAabbsGPU(ctx, q),
85 m_smallAabbsMappingGPU(ctx, q),
86 m_largeAabbsMappingGPU(ctx, q),
87 m_overlappingPairs(ctx, q),
88 m_gpuSmallSortData(ctx, q),
89 m_gpuSmallSortedAabbs(ctx, q)
91 const char* sapSrc = sapCL;
97 cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH);
98 b3Assert(errNum == CL_SUCCESS);
100 b3Assert(errNum == CL_SUCCESS);
102 m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context, m_device, m_queue);
104 m_prefixScanFloat4 = 0;
110 case B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU:
115 case B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU:
117 m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelBruteForce", &errNum, sapProg);
121 case B3_GPU_SAP_KERNEL_ORIGINAL:
123 m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelOriginal", &errNum, sapProg);
126 case B3_GPU_SAP_KERNEL_BARRIER:
128 m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelBarrier", &errNum, sapProg);
131 case B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY:
133 m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelLocalSharedMemory", &errNum, sapProg);
139 m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelLocalSharedMemory", &errNum, sapProg);
140 b3Error("Unknown 3D GPU SAP provided, fallback to computePairsKernelLocalSharedMemory");
144 m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg);
145 b3Assert(errNum == CL_SUCCESS);
147 m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "prepareSumVarianceKernel", &errNum, sapProg);
148 b3Assert(errNum == CL_SUCCESS);
150 m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "flipFloatKernel", &errNum, sapProg);
152 m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg);
154 m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "scatterKernel", &errNum, sapProg);
156 m_sorter = new b3RadixSort32CL(m_context, m_device, m_queue);
159 b3GpuSapBroadphase::~b3GpuSapBroadphase()
162 delete m_prefixScanFloat4;
164 clReleaseKernel(m_scatterKernel);
165 clReleaseKernel(m_flipFloatKernel);
166 clReleaseKernel(m_copyAabbsKernel);
167 clReleaseKernel(m_sapKernel);
168 clReleaseKernel(m_sap2Kernel);
169 clReleaseKernel(m_prepareSumVarianceKernel);
172 /// conservative test for overlap between two aabbs
173 static bool TestAabbAgainstAabb2(const b3Vector3& aabbMin1, const b3Vector3& aabbMax1,
174 const b3Vector3& aabbMin2, const b3Vector3& aabbMax2)
177 overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? false : overlap;
178 overlap = (aabbMin1.getZ() > aabbMax2.getZ() || aabbMax1.getZ() < aabbMin2.getZ()) ? false : overlap;
179 overlap = (aabbMin1.getY() > aabbMax2.getY() || aabbMax1.getY() < aabbMin2.getY()) ? false : overlap;
183 //http://stereopsis.com/radix.html
184 static unsigned int FloatFlip(float fl)
186 unsigned int f = *(unsigned int*)&fl;
187 unsigned int mask = -(int)(f >> 31) | 0x80000000;
191 void b3GpuSapBroadphase::init3dSap()
193 if (m_currentBuffer < 0)
195 m_allAabbsGPU.copyToHost(m_allAabbsCPU);
198 for (int axis = 0; axis < 3; axis++)
200 for (int buf = 0; buf < 2; buf++)
202 int totalNumAabbs = m_allAabbsCPU.size();
203 int numEndPoints = 2 * totalNumAabbs;
204 m_sortedAxisCPU[axis][buf].resize(numEndPoints);
206 if (buf == m_currentBuffer)
208 for (int i = 0; i < totalNumAabbs; i++)
210 m_sortedAxisCPU[axis][buf][i * 2].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis]) - 1;
211 m_sortedAxisCPU[axis][buf][i * 2].m_value = i * 2;
212 m_sortedAxisCPU[axis][buf][i * 2 + 1].m_key = FloatFlip(m_allAabbsCPU[i].m_max[axis]) + 1;
213 m_sortedAxisCPU[axis][buf][i * 2 + 1].m_value = i * 2 + 1;
219 for (int axis = 0; axis < 3; axis++)
221 m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]);
224 for (int axis = 0; axis < 3; axis++)
226 //int totalNumAabbs = m_allAabbsCPU.size();
227 int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size();
228 m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(numEndPoints);
229 for (int i = 0; i < numEndPoints; i++)
231 int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value;
232 int newDest = destIndex / 2;
235 m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y = i;
239 m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x = i;
246 static bool b3PairCmp(const b3Int4& p, const b3Int4& q)
248 return ((p.x < q.x) || ((p.x == q.x) && (p.y < q.y)));
251 static bool operator==(const b3Int4& a, const b3Int4& b)
253 return a.x == b.x && a.y == b.y;
256 static bool operator<(const b3Int4& a, const b3Int4& b)
258 return a.x < b.x || (a.x == b.x && a.y < b.y);
261 static bool operator>(const b3Int4& a, const b3Int4& b)
263 return a.x > b.x || (a.x == b.x && a.y > b.y);
266 b3AlignedObjectArray<b3Int4> addedHostPairs;
267 b3AlignedObjectArray<b3Int4> removedHostPairs;
269 b3AlignedObjectArray<b3SapAabb> preAabbs;
271 void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
273 //static int framepje = 0;
274 //printf("framepje=%d\n",framepje++);
276 B3_PROFILE("calculateOverlappingPairsHostIncremental3Sap");
278 addedHostPairs.resize(0);
279 removedHostPairs.resize(0);
281 b3Assert(m_currentBuffer >= 0);
284 preAabbs.resize(m_allAabbsCPU.size());
285 for (int i = 0; i < preAabbs.size(); i++)
287 preAabbs[i] = m_allAabbsCPU[i];
291 if (m_currentBuffer < 0)
294 B3_PROFILE("m_allAabbsGPU.copyToHost");
295 m_allAabbsGPU.copyToHost(m_allAabbsCPU);
298 b3AlignedObjectArray<b3Int4> allPairs;
300 B3_PROFILE("m_overlappingPairs.copyToHost");
301 m_overlappingPairs.copyToHost(allPairs);
306 printf("ab[40].min=%f,%f,%f,ab[40].max=%f,%f,%f\n",
307 m_allAabbsCPU[40].m_min[0], m_allAabbsCPU[40].m_min[1], m_allAabbsCPU[40].m_min[2],
308 m_allAabbsCPU[40].m_max[0], m_allAabbsCPU[40].m_max[1], m_allAabbsCPU[40].m_max[2]);
312 printf("ab[53].min=%f,%f,%f,ab[53].max=%f,%f,%f\n",
313 m_allAabbsCPU[53].m_min[0], m_allAabbsCPU[53].m_min[1], m_allAabbsCPU[53].m_min[2],
314 m_allAabbsCPU[53].m_max[0], m_allAabbsCPU[53].m_max[1], m_allAabbsCPU[53].m_max[2]);
321 int index = allPairs.findBinarySearch(newPair);
322 printf("hasPair(40,53)=%d out of %d\n", index, allPairs.size());
325 int overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[40].m_min, (const b3Vector3&)m_allAabbsCPU[40].m_max, (const b3Vector3&)m_allAabbsCPU[53].m_min, (const b3Vector3&)m_allAabbsCPU[53].m_max);
326 printf("overlap=%d\n", overlap);
331 int prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[40].m_min, (const b3Vector3&)preAabbs[40].m_max, (const b3Vector3&)preAabbs[53].m_min, (const b3Vector3&)preAabbs[53].m_max);
332 printf("prevoverlap=%d\n", prevOverlap);
336 printf("unknown prevoverlap\n");
343 for (int i = 0; i < m_allAabbsCPU.size(); i++)
345 //printf("aabb[%d] min=%f,%f,%f max=%f,%f,%f\n",i,m_allAabbsCPU[i].m_min[0],m_allAabbsCPU[i].m_min[1],m_allAabbsCPU[i].m_min[2], m_allAabbsCPU[i].m_max[0],m_allAabbsCPU[i].m_max[1],m_allAabbsCPU[i].m_max[2]);
348 for (int axis = 0; axis < 3; axis++)
350 for (int buf = 0; buf < 2; buf++)
352 b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size() * 2);
357 m_currentBuffer = 1 - m_currentBuffer;
359 int totalNumAabbs = m_allAabbsCPU.size();
362 B3_PROFILE("assign m_sortedAxisCPU(FloatFlip)");
363 for (int i = 0; i < totalNumAabbs; i++)
365 unsigned int keyMin[3];
366 unsigned int keyMax[3];
367 for (int axis = 0; axis < 3; axis++)
369 float vmin = m_allAabbsCPU[i].m_min[axis];
370 float vmax = m_allAabbsCPU[i].m_max[axis];
371 keyMin[axis] = FloatFlip(vmin);
372 keyMax[axis] = FloatFlip(vmax);
374 m_sortedAxisCPU[axis][m_currentBuffer][i * 2].m_key = keyMin[axis] - 1;
375 m_sortedAxisCPU[axis][m_currentBuffer][i * 2].m_value = i * 2;
376 m_sortedAxisCPU[axis][m_currentBuffer][i * 2 + 1].m_key = keyMax[axis] + 1;
377 m_sortedAxisCPU[axis][m_currentBuffer][i * 2 + 1].m_value = i * 2 + 1;
379 //printf("aabb[%d] min=%u,%u,%u max %u,%u,%u\n", i,keyMin[0],keyMin[1],keyMin[2],keyMax[0],keyMax[1],keyMax[2]);
384 B3_PROFILE("sort m_sortedAxisCPU");
385 for (int axis = 0; axis < 3; axis++)
386 m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]);
392 for (int axis=0;axis<3;axis++)
394 //printf("axis %d\n",axis);
395 for (int i=0;i<m_sortedAxisCPU[axis][m_currentBuffer].size();i++)
397 //int key = m_sortedAxisCPU[axis][m_currentBuffer][i].m_key;
398 //int value = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value;
399 //printf("[%d]=%d\n",i,value);
407 B3_PROFILE("assign m_objectMinMaxIndexCPU");
408 for (int axis = 0; axis < 3; axis++)
410 int totalNumAabbs = m_allAabbsCPU.size();
411 int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size();
412 m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(totalNumAabbs);
413 for (int i = 0; i < numEndPoints; i++)
415 int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value;
416 int newDest = destIndex / 2;
419 m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y = i;
423 m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x = i;
432 printf("==========================\n");
433 for (int axis=0;axis<3;axis++)
435 unsigned int curMinIndex40 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][40].x;
436 unsigned int curMaxIndex40 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][40].y;
437 unsigned int prevMaxIndex40 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][40].y;
438 unsigned int prevMinIndex40 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][40].x;
440 int dmin40 = curMinIndex40 - prevMinIndex40;
441 int dmax40 = curMinIndex40 - prevMinIndex40;
442 printf("axis %d curMinIndex40=%d prevMinIndex40=%d\n",axis,curMinIndex40, prevMinIndex40);
443 printf("axis %d curMaxIndex40=%d prevMaxIndex40=%d\n",axis,curMaxIndex40, prevMaxIndex40);
445 printf(".........................\n");
446 for (int axis=0;axis<3;axis++)
448 unsigned int curMinIndex53 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][53].x;
449 unsigned int curMaxIndex53 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][53].y;
450 unsigned int prevMaxIndex53 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][53].y;
451 unsigned int prevMinIndex53 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][53].x;
453 int dmin40 = curMinIndex53 - prevMinIndex53;
454 int dmax40 = curMinIndex53 - prevMinIndex53;
455 printf("axis %d curMinIndex53=%d prevMinIndex53=%d\n",axis,curMinIndex53, prevMinIndex53);
456 printf("axis %d curMaxIndex53=%d prevMaxIndex53=%d\n",axis,curMaxIndex53, prevMaxIndex53);
462 int a = m_objectMinMaxIndexCPU[0][m_currentBuffer].size();
463 int b = m_objectMinMaxIndexCPU[1][m_currentBuffer].size();
464 int c = m_objectMinMaxIndexCPU[2][m_currentBuffer].size();
468 if (searchIncremental3dSapOnGpu)
470 B3_PROFILE("computePairsIncremental3dSapKernelGPU");
471 int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size();
472 int maxCapacity = 1024*1024;
474 B3_PROFILE("copy from host");
475 m_objectMinMaxIndexGPUaxis0.copyFromHost(m_objectMinMaxIndexCPU[0][m_currentBuffer]);
476 m_objectMinMaxIndexGPUaxis1.copyFromHost(m_objectMinMaxIndexCPU[1][m_currentBuffer]);
477 m_objectMinMaxIndexGPUaxis2.copyFromHost(m_objectMinMaxIndexCPU[2][m_currentBuffer]);
478 m_objectMinMaxIndexGPUaxis0prev.copyFromHost(m_objectMinMaxIndexCPU[0][1-m_currentBuffer]);
479 m_objectMinMaxIndexGPUaxis1prev.copyFromHost(m_objectMinMaxIndexCPU[1][1-m_currentBuffer]);
480 m_objectMinMaxIndexGPUaxis2prev.copyFromHost(m_objectMinMaxIndexCPU[2][1-m_currentBuffer]);
482 m_sortedAxisGPU0.copyFromHost(m_sortedAxisCPU[0][m_currentBuffer]);
483 m_sortedAxisGPU1.copyFromHost(m_sortedAxisCPU[1][m_currentBuffer]);
484 m_sortedAxisGPU2.copyFromHost(m_sortedAxisCPU[2][m_currentBuffer]);
485 m_sortedAxisGPU0prev.copyFromHost(m_sortedAxisCPU[0][1-m_currentBuffer]);
486 m_sortedAxisGPU1prev.copyFromHost(m_sortedAxisCPU[1][1-m_currentBuffer]);
487 m_sortedAxisGPU2prev.copyFromHost(m_sortedAxisCPU[2][1-m_currentBuffer]);
490 m_addedHostPairsGPU.resize(maxCapacity);
491 m_removedHostPairsGPU.resize(maxCapacity);
493 m_addedCountGPU.resize(0);
494 m_addedCountGPU.push_back(0);
495 m_removedCountGPU.resize(0);
496 m_removedCountGPU.push_back(0);
500 B3_PROFILE("launch1D");
501 b3LauncherCL launcher(m_queue, m_computePairsIncremental3dSapKernel,"m_computePairsIncremental3dSapKernel");
502 launcher.setBuffer(m_objectMinMaxIndexGPUaxis0.getBufferCL());
503 launcher.setBuffer(m_objectMinMaxIndexGPUaxis1.getBufferCL());
504 launcher.setBuffer(m_objectMinMaxIndexGPUaxis2.getBufferCL());
505 launcher.setBuffer(m_objectMinMaxIndexGPUaxis0prev.getBufferCL());
506 launcher.setBuffer(m_objectMinMaxIndexGPUaxis1prev.getBufferCL());
507 launcher.setBuffer(m_objectMinMaxIndexGPUaxis2prev.getBufferCL());
509 launcher.setBuffer(m_sortedAxisGPU0.getBufferCL());
510 launcher.setBuffer(m_sortedAxisGPU1.getBufferCL());
511 launcher.setBuffer(m_sortedAxisGPU2.getBufferCL());
512 launcher.setBuffer(m_sortedAxisGPU0prev.getBufferCL());
513 launcher.setBuffer(m_sortedAxisGPU1prev.getBufferCL());
514 launcher.setBuffer(m_sortedAxisGPU2prev.getBufferCL());
517 launcher.setBuffer(m_addedHostPairsGPU.getBufferCL());
518 launcher.setBuffer(m_removedHostPairsGPU.getBufferCL());
519 launcher.setBuffer(m_addedCountGPU.getBufferCL());
520 launcher.setBuffer(m_removedCountGPU.getBufferCL());
521 launcher.setConst(maxCapacity);
522 launcher.setConst( numObjects);
523 launcher.launch1D( numObjects);
528 B3_PROFILE("copy to host");
529 int addedCountGPU = m_addedCountGPU.at(0);
530 m_addedHostPairsGPU.resize(addedCountGPU);
531 m_addedHostPairsGPU.copyToHost(addedHostPairs);
533 //printf("addedCountGPU=%d\n",addedCountGPU);
534 int removedCountGPU = m_removedCountGPU.at(0);
535 m_removedHostPairsGPU.resize(removedCountGPU);
536 m_removedHostPairsGPU.copyToHost(removedHostPairs);
537 //printf("removedCountGPU=%d\n",removedCountGPU);
547 int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size();
549 B3_PROFILE("actual search");
550 for (int i = 0; i < numObjects; i++)
552 //int numObjects = m_objectMinMaxIndexCPU[axis][m_currentBuffer].size();
553 //int checkObjects[]={40,53};
554 //int numCheckObjects = sizeof(checkObjects)/sizeof(int);
556 //for (int a=0;a<numCheckObjects ;a++)
558 for (int axis = 0; axis < 3; axis++)
560 //int i = checkObjects[a];
562 unsigned int curMinIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].x;
563 unsigned int curMaxIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].y;
564 unsigned int prevMinIndex = m_objectMinMaxIndexCPU[axis][1 - m_currentBuffer][i].x;
565 int dmin = curMinIndex - prevMinIndex;
567 unsigned int prevMaxIndex = m_objectMinMaxIndexCPU[axis][1 - m_currentBuffer][i].y;
569 int dmax = curMaxIndex - prevMaxIndex;
572 //printf("for object %d, dmin=%d\n",i,dmin);
576 //printf("for object %d, dmax=%d\n",i,dmax);
578 for (int otherbuffer = 0; otherbuffer < 2; otherbuffer++)
582 int stepMin = dmin < 0 ? -1 : 1;
583 for (int j = prevMinIndex; j != curMinIndex; j += stepMin)
585 int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y;
586 int otherIndex = otherIndex2 / 2;
589 bool otherIsMax = ((otherIndex2 & 1) != 0);
593 //bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max);
594 //bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max);
598 for (int ax = 0; ax < 3; ax++)
600 if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) ||
601 (m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x))
605 // b3Assert(overlap2==overlap);
607 bool prevOverlap = true;
609 for (int ax = 0; ax < 3; ax++)
611 if ((m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].y) ||
612 (m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].x))
616 //b3Assert(overlap==overlap2);
620 if (overlap && !prevOverlap)
627 newPair.y = otherIndex;
631 newPair.x = otherIndex;
634 addedHostPairs.push_back(newPair);
639 if (!overlap && prevOverlap)
646 removedPair.y = otherIndex;
650 removedPair.x = otherIndex;
653 removedHostPairs.push_back(removedPair);
657 } //if (otherIndex!=i)
663 int stepMax = dmax < 0 ? -1 : 1;
664 for (int j = prevMaxIndex; j != curMaxIndex; j += stepMax)
666 int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y;
667 int otherIndex = otherIndex2 / 2;
670 //bool otherIsMin = ((otherIndex2&1)==0);
673 //bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max);
674 //bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max);
678 for (int ax = 0; ax < 3; ax++)
680 if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) ||
681 (m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x))
684 //b3Assert(overlap2==overlap);
686 bool prevOverlap = true;
688 for (int ax = 0; ax < 3; ax++)
690 if ((m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].y) ||
691 (m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].x))
697 if (overlap && !prevOverlap)
704 newPair.y = otherIndex;
708 newPair.x = otherIndex;
711 addedHostPairs.push_back(newPair);
716 if (!overlap && prevOverlap)
718 //if (otherIndex2&1==0) -> min?
724 removedPair.y = otherIndex;
728 removedPair.x = otherIndex;
731 removedHostPairs.push_back(removedPair);
736 } //if (otherIndex!=i)
739 } //for (int otherbuffer
741 } //for (int i=0;i<numObjects
744 //remove duplicates and add/remove then to existing m_overlappingPairs
748 B3_PROFILE("sort allPairs");
749 allPairs.quickSort(b3PairCmp);
752 B3_PROFILE("sort addedHostPairs");
753 addedHostPairs.quickSort(b3PairCmp);
756 B3_PROFILE("sort removedHostPairs");
757 removedHostPairs.quickSort(b3PairCmp);
765 int uniqueRemovedPairs = 0;
767 b3AlignedObjectArray<int> removedPositions;
770 B3_PROFILE("actual removing");
771 for (int i = 0; i < removedHostPairs.size(); i++)
773 b3Int4 removedPair = removedHostPairs[i];
774 if ((removedPair.x != prevPair.x) || (removedPair.y != prevPair.y))
776 int index1 = allPairs.findBinarySearch(removedPair);
780 int index2 = allPairs.findLinearSearch(removedPair);
781 b3Assert(index1 == index2);
783 //b3Assert(index1!=allPairs.size());
784 if (index1 < allPairs.size())
787 uniqueRemovedPairs++;
788 removedPositions.push_back(index1);
790 //printf("framepje(%d) remove pair(%d):%d,%d\n",framepje,i,removedPair.x,removedPair.y);
794 prevPair = removedPair;
797 if (uniqueRemovedPairs)
799 for (int i = 0; i < removedPositions.size(); i++)
801 allPairs[removedPositions[i]].x = INT_MAX;
802 allPairs[removedPositions[i]].y = INT_MAX;
804 allPairs.quickSort(b3PairCmp);
805 allPairs.resize(allPairs.size() - uniqueRemovedPairs);
808 //if (uniqueRemovedPairs)
809 // printf("uniqueRemovedPairs=%d\n",uniqueRemovedPairs);
810 //printf("removedHostPairs.size = %d\n",removedHostPairs.size());
815 int uniqueAddedPairs = 0;
816 b3AlignedObjectArray<b3Int4> actualAddedPairs;
819 B3_PROFILE("actual adding");
820 for (int i = 0; i < addedHostPairs.size(); i++)
822 b3Int4 newPair = addedHostPairs[i];
823 if ((newPair.x != prevPair.x) || (newPair.y != prevPair.y))
826 int index1 = allPairs.findBinarySearch(newPair);
828 int index2 = allPairs.findLinearSearch(newPair);
829 b3Assert(index1 == index2);
831 b3Assert(index1 == allPairs.size());
832 if (index1 != allPairs.size())
837 if (index1 == allPairs.size())
841 actualAddedPairs.push_back(newPair);
846 for (int i = 0; i < actualAddedPairs.size(); i++)
848 //printf("framepje (%d), new pair(%d):%d,%d\n",framepje,i,actualAddedPairs[i].x,actualAddedPairs[i].y);
849 allPairs.push_back(actualAddedPairs[i]);
853 //if (uniqueAddedPairs)
854 // printf("uniqueAddedPairs=%d\n", uniqueAddedPairs);
857 B3_PROFILE("m_overlappingPairs.copyFromHost");
858 m_overlappingPairs.copyFromHost(allPairs);
862 void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs)
865 // if (m_currentBuffer>=0)
866 // return calculateOverlappingPairsHostIncremental3Sap();
868 b3Assert(m_allAabbsCPU.size() == m_allAabbsGPU.size());
869 m_allAabbsGPU.copyToHost(m_allAabbsCPU);
873 B3_PROFILE("CPU compute best variance axis");
874 b3Vector3 s = b3MakeVector3(0, 0, 0), s2 = b3MakeVector3(0, 0, 0);
875 int numRigidBodies = m_smallAabbsMappingCPU.size();
877 for (int i = 0; i < numRigidBodies; i++)
879 b3SapAabb aabb = this->m_allAabbsCPU[m_smallAabbsMappingCPU[i]];
881 b3Vector3 maxAabb = b3MakeVector3(aabb.m_max[0], aabb.m_max[1], aabb.m_max[2]);
882 b3Vector3 minAabb = b3MakeVector3(aabb.m_min[0], aabb.m_min[1], aabb.m_min[2]);
883 b3Vector3 centerAabb = (maxAabb + minAabb) * 0.5f;
886 s2 += centerAabb * centerAabb;
888 b3Vector3 v = s2 - (s * s) / (float)numRigidBodies;
896 b3AlignedObjectArray<b3Int4> hostPairs;
899 int numSmallAabbs = m_smallAabbsMappingCPU.size();
900 for (int i = 0; i < numSmallAabbs; i++)
902 b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]];
903 //float reference = smallAabbi.m_max[axis];
905 for (int j = i + 1; j < numSmallAabbs; j++)
907 b3SapAabb smallAabbj = m_allAabbsCPU[m_smallAabbsMappingCPU[j]];
909 if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max,
910 (b3Vector3&)smallAabbj.m_min, (b3Vector3&)smallAabbj.m_max))
913 int a = smallAabbi.m_minIndices[3];
914 int b = smallAabbj.m_minIndices[3];
917 pair.x = a; //store the original index in the unsorted aabb array
922 pair.x = b; //store the original index in the unsorted aabb array
925 hostPairs.push_back(pair);
932 int numSmallAabbs = m_smallAabbsMappingCPU.size();
933 for (int i = 0; i < numSmallAabbs; i++)
935 b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]];
937 //float reference = smallAabbi.m_max[axis];
938 int numLargeAabbs = m_largeAabbsMappingCPU.size();
940 for (int j = 0; j < numLargeAabbs; j++)
942 b3SapAabb largeAabbj = m_allAabbsCPU[m_largeAabbsMappingCPU[j]];
943 if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max,
944 (b3Vector3&)largeAabbj.m_min, (b3Vector3&)largeAabbj.m_max))
947 int a = largeAabbj.m_minIndices[3];
948 int b = smallAabbi.m_minIndices[3];
952 pair.y = b; //store the original index in the unsorted aabb array
957 pair.y = a; //store the original index in the unsorted aabb array
960 hostPairs.push_back(pair);
966 if (hostPairs.size() > maxPairs)
968 hostPairs.resize(maxPairs);
971 if (hostPairs.size())
973 m_overlappingPairs.copyFromHost(hostPairs);
977 m_overlappingPairs.resize(0);
983 void b3GpuSapBroadphase::reset()
985 m_allAabbsGPU.resize(0);
986 m_allAabbsCPU.resize(0);
988 m_smallAabbsMappingGPU.resize(0);
989 m_smallAabbsMappingCPU.resize(0);
991 m_pairCount.resize(0);
993 m_largeAabbsMappingGPU.resize(0);
994 m_largeAabbsMappingCPU.resize(0);
997 void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs)
999 if (m_sapKernel == 0)
1001 calculateOverlappingPairsHost(maxPairs);
1005 //if (m_currentBuffer>=0)
1006 // return calculateOverlappingPairsHostIncremental3Sap();
1008 //calculateOverlappingPairsHost(maxPairs);
1010 B3_PROFILE("GPU 1-axis SAP calculateOverlappingPairs");
1015 //bool syncOnHost = false;
1017 int numSmallAabbs = m_smallAabbsMappingCPU.size();
1018 if (m_prefixScanFloat4 && numSmallAabbs)
1020 B3_PROFILE("GPU compute best variance axis");
1022 if (m_dst.size() != (numSmallAabbs + 1))
1024 m_dst.resize(numSmallAabbs + 128);
1025 m_sum.resize(numSmallAabbs + 128);
1026 m_sum2.resize(numSmallAabbs + 128);
1027 m_sum.at(numSmallAabbs) = b3MakeVector3(0, 0, 0); //slow?
1028 m_sum2.at(numSmallAabbs) = b3MakeVector3(0, 0, 0); //slow?
1031 b3LauncherCL launcher(m_queue, m_prepareSumVarianceKernel, "m_prepareSumVarianceKernel");
1032 launcher.setBuffer(m_allAabbsGPU.getBufferCL());
1034 launcher.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
1035 launcher.setBuffer(m_sum.getBufferCL());
1036 launcher.setBuffer(m_sum2.getBufferCL());
1037 launcher.setConst(numSmallAabbs);
1038 int num = numSmallAabbs;
1039 launcher.launch1D(num);
1043 m_prefixScanFloat4->execute(m_sum, m_dst, numSmallAabbs + 1, &s);
1044 m_prefixScanFloat4->execute(m_sum2, m_dst, numSmallAabbs + 1, &s2);
1046 b3Vector3 v = s2 - (s * s) / (float)numSmallAabbs;
1054 m_gpuSmallSortData.resize(numSmallAabbs);
1057 if (m_smallAabbsMappingGPU.size())
1059 B3_PROFILE("flipFloatKernel");
1060 b3BufferInfoCL bInfo[] = {
1061 b3BufferInfoCL(m_allAabbsGPU.getBufferCL(), true),
1062 b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL(), true),
1063 b3BufferInfoCL(m_gpuSmallSortData.getBufferCL())};
1064 b3LauncherCL launcher(m_queue, m_flipFloatKernel, "m_flipFloatKernel");
1065 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
1066 launcher.setConst(numSmallAabbs);
1067 launcher.setConst(axis);
1069 int num = numSmallAabbs;
1070 launcher.launch1D(num);
1074 if (m_gpuSmallSortData.size())
1076 B3_PROFILE("gpu radix sort");
1077 m_sorter->execute(m_gpuSmallSortData);
1081 m_gpuSmallSortedAabbs.resize(numSmallAabbs);
1084 B3_PROFILE("scatterKernel");
1086 b3BufferInfoCL bInfo[] = {
1087 b3BufferInfoCL(m_allAabbsGPU.getBufferCL(), true),
1088 b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL(), true),
1089 b3BufferInfoCL(m_gpuSmallSortData.getBufferCL(), true),
1090 b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())};
1091 b3LauncherCL launcher(m_queue, m_scatterKernel, "m_scatterKernel ");
1092 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
1093 launcher.setConst(numSmallAabbs);
1094 int num = numSmallAabbs;
1095 launcher.launch1D(num);
1099 m_overlappingPairs.resize(maxPairs);
1101 m_pairCount.resize(0);
1102 m_pairCount.push_back(0);
1106 int numLargeAabbs = m_largeAabbsMappingGPU.size();
1107 if (numLargeAabbs && numSmallAabbs)
1110 B3_PROFILE("sap2Kernel");
1111 b3BufferInfoCL bInfo[] = {
1112 b3BufferInfoCL(m_allAabbsGPU.getBufferCL()),
1113 b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()),
1114 b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()),
1115 b3BufferInfoCL(m_overlappingPairs.getBufferCL()),
1116 b3BufferInfoCL(m_pairCount.getBufferCL())};
1117 b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel");
1118 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
1119 launcher.setConst(numLargeAabbs);
1120 launcher.setConst(numSmallAabbs);
1121 launcher.setConst(axis);
1122 launcher.setConst(maxPairs);
1123 //@todo: use actual maximum work item sizes of the device instead of hardcoded values
1124 launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64);
1126 numPairs = m_pairCount.at(0);
1127 if (numPairs > maxPairs)
1129 b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
1130 numPairs = maxPairs;
1134 if (m_gpuSmallSortedAabbs.size())
1136 B3_PROFILE("sapKernel");
1137 b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL()), b3BufferInfoCL(m_overlappingPairs.getBufferCL()), b3BufferInfoCL(m_pairCount.getBufferCL())};
1138 b3LauncherCL launcher(m_queue, m_sapKernel, "m_sapKernel");
1139 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
1140 launcher.setConst(numSmallAabbs);
1141 launcher.setConst(axis);
1142 launcher.setConst(maxPairs);
1144 int num = numSmallAabbs;
1146 int buffSize = launcher.getSerializationBufferSize();
1147 unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
1148 for (int i=0;i<buffSize+1;i++)
1150 unsigned char* ptr = (unsigned char*)&buf[i];
1153 int actualWrite = launcher.serializeArguments(buf,buffSize);
1155 unsigned char* cptr = (unsigned char*)&buf[buffSize];
1156 // printf("buf[buffSize] = %d\n",*cptr);
1158 assert(buf[buffSize]==0xff);//check for buffer overrun
1159 int* ptr = (int*)&buf[buffSize];
1163 FILE* f = fopen("m_sapKernelArgs.bin","wb");
1164 fwrite(buf,buffSize+sizeof(int),1,f);
1168 launcher.launch1D(num);
1171 numPairs = m_pairCount.at(0);
1172 if (numPairs > maxPairs)
1174 b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
1175 numPairs = maxPairs;
1176 m_pairCount.resize(0);
1177 m_pairCount.push_back(maxPairs);
1184 b3LauncherCL launcher(m_queue, m_sapKernel);
1186 const char* fileName = "m_sapKernelArgs.bin";
1187 FILE* f = fopen(fileName, "rb");
1190 int sizeInBytes = 0;
1191 if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
1193 printf("error, cannot get file size\n");
1197 unsigned char* buf = (unsigned char*)malloc(sizeInBytes);
1198 fread(buf, sizeInBytes, 1, f);
1199 int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context);
1200 int num = *(int*)&buf[serializedBytes];
1201 launcher.launch1D(num);
1203 b3OpenCLArray<int> pairCount(m_context, m_queue);
1204 int numElements = launcher.m_arrays[2]->size() / sizeof(int);
1205 pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(), numElements);
1206 numPairs = pairCount.at(0);
1207 //printf("overlapping pairs = %d\n",numPairs);
1208 b3AlignedObjectArray<b3Int4> hostOoverlappingPairs;
1209 b3OpenCLArray<b3Int4> tmpGpuPairs(m_context, m_queue);
1210 tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(), numPairs);
1212 tmpGpuPairs.copyToHost(hostOoverlappingPairs);
1213 m_overlappingPairs.copyFromHost(hostOoverlappingPairs);
1214 //printf("hello %d\n", m_overlappingPairs.size());
1220 printf("error: cannot find file %s\n", fileName);
1227 m_overlappingPairs.resize(numPairs);
1229 } //B3_PROFILE("GPU_RADIX SORT");
1233 void b3GpuSapBroadphase::writeAabbsToGpu()
1235 m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU);
1236 m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU);
1238 m_allAabbsGPU.copyFromHost(m_allAabbsCPU); //might not be necessary, the 'setupGpuAabbsFull' already takes care of this
1241 void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
1243 int index = userPtr;
1245 for (int i = 0; i < 4; i++)
1247 aabb.m_min[i] = aabbMin[i];
1248 aabb.m_max[i] = aabbMax[i];
1250 aabb.m_minIndices[3] = index;
1251 aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();
1252 m_largeAabbsMappingCPU.push_back(m_allAabbsCPU.size());
1254 m_allAabbsCPU.push_back(aabb);
1257 void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
1259 int index = userPtr;
1261 for (int i = 0; i < 4; i++)
1263 aabb.m_min[i] = aabbMin[i];
1264 aabb.m_max[i] = aabbMax[i];
1266 aabb.m_minIndices[3] = index;
1267 aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();
1268 m_smallAabbsMappingCPU.push_back(m_allAabbsCPU.size());
1270 m_allAabbsCPU.push_back(aabb);
1273 cl_mem b3GpuSapBroadphase::getAabbBufferWS()
1275 return m_allAabbsGPU.getBufferCL();
1278 int b3GpuSapBroadphase::getNumOverlap()
1280 return m_overlappingPairs.size();
1282 cl_mem b3GpuSapBroadphase::getOverlappingPairBuffer()
1284 return m_overlappingPairs.getBufferCL();
1287 b3OpenCLArray<b3Int4>& b3GpuSapBroadphase::getOverlappingPairsGPU()
1289 return m_overlappingPairs;
1291 b3OpenCLArray<int>& b3GpuSapBroadphase::getSmallAabbIndicesGPU()
1293 return m_smallAabbsMappingGPU;
1295 b3OpenCLArray<int>& b3GpuSapBroadphase::getLargeAabbIndicesGPU()
1297 return m_largeAabbsMappingGPU;