2 This software is provided 'as-is', without any express or implied warranty.
3 In no event will the authors be held liable for any damages arising from the use of this software.
4 Permission is granted to anyone to use this software for any purpose,
5 including commercial applications, and to alter it and redistribute it freely,
6 subject to the following restrictions:
8 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
9 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
10 3. This notice may not be removed or altered from any source distribution.
12 //Initial Author Jackson Lee, 2014
14 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
15 #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
17 #include "b3GpuParallelLinearBvh.h"
19 b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : m_queue(queue),
20 m_radixSorter(context, device, queue),
22 m_rootNodeIndex(context, queue),
23 m_maxDistanceFromRoot(context, queue),
24 m_temp(context, queue),
26 m_internalNodeAabbs(context, queue),
27 m_internalNodeLeafIndexRanges(context, queue),
28 m_internalNodeChildNodes(context, queue),
29 m_internalNodeParentNodes(context, queue),
31 m_commonPrefixes(context, queue),
32 m_commonPrefixLengths(context, queue),
33 m_distanceFromRoot(context, queue),
35 m_leafNodeParentNodes(context, queue),
36 m_mortonCodesAndAabbIndicies(context, queue),
37 m_mergedAabb(context, queue),
38 m_leafNodeAabbs(context, queue),
40 m_largeAabbs(context, queue)
42 m_rootNodeIndex.resize(1);
43 m_maxDistanceFromRoot.resize(1);
47 const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl";
49 const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h
51 char* additionalMacros = 0;
52 m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH);
53 b3Assert(m_parallelLinearBvhProgram);
55 m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros);
56 b3Assert(m_separateAabbsKernel);
57 m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros);
58 b3Assert(m_findAllNodesMergedAabbKernel);
59 m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros);
60 b3Assert(m_assignMortonCodesAndAabbIndiciesKernel);
62 m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros);
63 b3Assert(m_computeAdjacentPairCommonPrefixKernel);
64 m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros);
65 b3Assert(m_buildBinaryRadixTreeLeafNodesKernel);
66 m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros);
67 b3Assert(m_buildBinaryRadixTreeInternalNodesKernel);
68 m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros);
69 b3Assert(m_findDistanceFromRootKernel);
70 m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros);
71 b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel);
73 m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros);
74 b3Assert(m_findLeafIndexRangesKernel);
76 m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros);
77 b3Assert(m_plbvhCalculateOverlappingPairsKernel);
78 m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros);
79 b3Assert(m_plbvhRayTraverseKernel);
80 m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros);
81 b3Assert(m_plbvhLargeAabbAabbTestKernel);
82 m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros);
83 b3Assert(m_plbvhLargeAabbRayTestKernel);
86 b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()
88 clReleaseKernel(m_separateAabbsKernel);
89 clReleaseKernel(m_findAllNodesMergedAabbKernel);
90 clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel);
92 clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel);
93 clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel);
94 clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel);
95 clReleaseKernel(m_findDistanceFromRootKernel);
96 clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel);
98 clReleaseKernel(m_findLeafIndexRangesKernel);
100 clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel);
101 clReleaseKernel(m_plbvhRayTraverseKernel);
102 clReleaseKernel(m_plbvhLargeAabbAabbTestKernel);
103 clReleaseKernel(m_plbvhLargeAabbRayTestKernel);
105 clReleaseProgram(m_parallelLinearBvhProgram);
108 void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
109 const b3OpenCLArray<int>& largeAabbIndices)
111 B3_PROFILE("b3ParallelLinearBvh::build()");
113 int numLargeAabbs = largeAabbIndices.size();
114 int numSmallAabbs = smallAabbIndices.size();
116 //Since all AABBs(both large and small) are input as a contiguous array,
117 //with 2 additional arrays used to indicate the indices of large and small AABBs,
118 //it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH.
120 B3_PROFILE("Separate large and small AABBs");
122 m_largeAabbs.resize(numLargeAabbs);
123 m_leafNodeAabbs.resize(numSmallAabbs);
125 //Write large AABBs into m_largeAabbs
127 b3BufferInfoCL bufferInfo[] =
129 b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
130 b3BufferInfoCL(largeAabbIndices.getBufferCL()),
132 b3BufferInfoCL(m_largeAabbs.getBufferCL())};
134 b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
135 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
136 launcher.setConst(numLargeAabbs);
138 launcher.launch1D(numLargeAabbs);
141 //Write small AABBs into m_leafNodeAabbs
143 b3BufferInfoCL bufferInfo[] =
145 b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
146 b3BufferInfoCL(smallAabbIndices.getBufferCL()),
148 b3BufferInfoCL(m_leafNodeAabbs.getBufferCL())};
150 b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
151 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
152 launcher.setConst(numSmallAabbs);
154 launcher.launch1D(numSmallAabbs);
161 int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs
162 int numInternalNodes = numLeaves - 1;
166 //Number of leaf nodes is checked in calculateOverlappingPairs() and testRaysAgainstBvhAabbs(),
167 //so it does not matter if numLeaves == 0 and rootNodeIndex == -1
168 int rootNodeIndex = numLeaves - 1;
169 m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1);
171 //Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm,
172 //m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index
173 //instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work.
174 //( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs )
178 leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set
180 m_mortonCodesAndAabbIndicies.resize(1);
181 m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1);
189 m_internalNodeAabbs.resize(numInternalNodes);
190 m_internalNodeLeafIndexRanges.resize(numInternalNodes);
191 m_internalNodeChildNodes.resize(numInternalNodes);
192 m_internalNodeParentNodes.resize(numInternalNodes);
194 m_commonPrefixes.resize(numInternalNodes);
195 m_commonPrefixLengths.resize(numInternalNodes);
196 m_distanceFromRoot.resize(numInternalNodes);
198 m_leafNodeParentNodes.resize(numLeaves);
199 m_mortonCodesAndAabbIndicies.resize(numLeaves);
200 m_mergedAabb.resize(numLeaves);
203 //Find the merged AABB of all small AABBs; this is used to define the size of
204 //each cell in the virtual grid for the next kernel(2^10 cells in each dimension).
206 B3_PROFILE("Find AABB of merged nodes");
208 m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array
210 for (int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;
211 numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)
213 b3BufferInfoCL bufferInfo[] =
215 b3BufferInfoCL(m_mergedAabb.getBufferCL()) //Resulting AABB is stored in m_mergedAabb[0]
218 b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel");
219 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
220 launcher.setConst(numAabbsNeedingMerge);
222 launcher.launch1D(numAabbsNeedingMerge);
228 //Insert the center of the AABBs into a virtual grid,
229 //then convert the discrete grid coordinates into a morton code
230 //For each element in m_mortonCodesAndAabbIndicies, set
231 // m_key == morton code (value to sort by)
232 // m_value == small AABB index
234 B3_PROFILE("Assign morton codes");
236 b3BufferInfoCL bufferInfo[] =
238 b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
239 b3BufferInfoCL(m_mergedAabb.getBufferCL()),
240 b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL())};
242 b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel");
243 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
244 launcher.setConst(numLeaves);
246 launcher.launch1D(numLeaves);
252 B3_PROFILE("Sort leaves by morton codes");
254 m_radixSorter.execute(m_mortonCodesAndAabbIndicies);
259 constructBinaryRadixTree();
261 //Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices.
262 //The root node contains leaf node indices in the range [0, numLeafNodes - 1].
263 //The child nodes of each node split their parent's index range into 2 contiguous halves.
265 //For example, if the root has indices [0, 31], its children might partition that range into [0, 11] and [12, 31].
266 //The next level in the tree could then split those ranges into [0, 2], [3, 11], [12, 22], and [23, 31].
268 //This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice
270 B3_PROFILE("m_findLeafIndexRangesKernel");
272 b3BufferInfoCL bufferInfo[] =
274 b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
275 b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL())};
277 b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel");
278 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
279 launcher.setConst(numInternalNodes);
281 launcher.launch1D(numInternalNodes);
286 void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs)
288 int maxPairs = out_overlappingPairs.size();
289 b3OpenCLArray<int>& numPairsGpu = m_temp;
292 numPairsGpu.copyFromHostPointer(&reset, 1);
295 if (m_leafNodeAabbs.size() > 1)
297 B3_PROFILE("PLBVH small-small AABB test");
299 int numQueryAabbs = m_leafNodeAabbs.size();
301 b3BufferInfoCL bufferInfo[] =
303 b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
305 b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
306 b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
307 b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
308 b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
309 b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
311 b3BufferInfoCL(numPairsGpu.getBufferCL()),
312 b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
314 b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel");
315 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
316 launcher.setConst(maxPairs);
317 launcher.setConst(numQueryAabbs);
319 launcher.launch1D(numQueryAabbs);
323 int numLargeAabbRigids = m_largeAabbs.size();
324 if (numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0)
326 B3_PROFILE("PLBVH large-small AABB test");
328 int numQueryAabbs = m_leafNodeAabbs.size();
330 b3BufferInfoCL bufferInfo[] =
332 b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
333 b3BufferInfoCL(m_largeAabbs.getBufferCL()),
335 b3BufferInfoCL(numPairsGpu.getBufferCL()),
336 b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
338 b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel");
339 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
340 launcher.setConst(maxPairs);
341 launcher.setConst(numLargeAabbRigids);
342 launcher.setConst(numQueryAabbs);
344 launcher.launch1D(numQueryAabbs);
350 numPairsGpu.copyToHostPointer(&numPairs, 1);
351 if (numPairs > maxPairs)
353 b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
355 numPairsGpu.copyFromHostPointer(&maxPairs, 1);
358 out_overlappingPairs.resize(numPairs);
361 void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
362 b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)
364 B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()");
366 int numRays = rays.size();
367 int maxRayRigidPairs = out_rayRigidPairs.size();
370 out_numRayRigidPairs.copyFromHostPointer(&reset, 1);
373 if (m_leafNodeAabbs.size() > 0)
375 B3_PROFILE("PLBVH ray test small AABB");
377 b3BufferInfoCL bufferInfo[] =
379 b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
381 b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
382 b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
383 b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
384 b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
385 b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
387 b3BufferInfoCL(rays.getBufferCL()),
389 b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
390 b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
392 b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel");
393 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
394 launcher.setConst(maxRayRigidPairs);
395 launcher.setConst(numRays);
397 launcher.launch1D(numRays);
401 int numLargeAabbRigids = m_largeAabbs.size();
402 if (numLargeAabbRigids > 0)
404 B3_PROFILE("PLBVH ray test large AABB");
406 b3BufferInfoCL bufferInfo[] =
408 b3BufferInfoCL(m_largeAabbs.getBufferCL()),
409 b3BufferInfoCL(rays.getBufferCL()),
411 b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
412 b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
414 b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel");
415 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
416 launcher.setConst(numLargeAabbRigids);
417 launcher.setConst(maxRayRigidPairs);
418 launcher.setConst(numRays);
420 launcher.launch1D(numRays);
425 int numRayRigidPairs = -1;
426 out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1);
428 if (numRayRigidPairs > maxRayRigidPairs)
429 b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs);
432 void b3GpuParallelLinearBvh::constructBinaryRadixTree()
434 B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()");
436 int numLeaves = m_leafNodeAabbs.size();
437 int numInternalNodes = numLeaves - 1;
439 //Each internal node is placed in between 2 leaf nodes.
440 //By using this arrangement and computing the common prefix between
441 //these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree.
443 B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel");
445 b3BufferInfoCL bufferInfo[] =
447 b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
448 b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
449 b3BufferInfoCL(m_commonPrefixLengths.getBufferCL())};
451 b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel");
452 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
453 launcher.setConst(numInternalNodes);
455 launcher.launch1D(numInternalNodes);
459 //For each leaf node, select its parent node by
460 //comparing the 2 nearest internal nodes and assign child node indices
462 B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel");
464 b3BufferInfoCL bufferInfo[] =
466 b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
467 b3BufferInfoCL(m_leafNodeParentNodes.getBufferCL()),
468 b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL())};
470 b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel");
471 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
472 launcher.setConst(numLeaves);
474 launcher.launch1D(numLeaves);
478 //For each internal node, perform 2 binary searches among the other internal nodes
479 //to its left and right to find its potential parent nodes and assign child node indices
481 B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel");
483 b3BufferInfoCL bufferInfo[] =
485 b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
486 b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
487 b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
488 b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
489 b3BufferInfoCL(m_rootNodeIndex.getBufferCL())};
491 b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel");
492 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
493 launcher.setConst(numInternalNodes);
495 launcher.launch1D(numInternalNodes);
499 //Find the number of nodes separating each internal node and the root node
500 //so that the AABBs can be set using the next kernel.
501 //Also determine the maximum number of nodes separating an internal node and the root node.
503 B3_PROFILE("m_findDistanceFromRootKernel");
505 b3BufferInfoCL bufferInfo[] =
507 b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
508 b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
509 b3BufferInfoCL(m_maxDistanceFromRoot.getBufferCL()),
510 b3BufferInfoCL(m_distanceFromRoot.getBufferCL())};
512 b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel");
513 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
514 launcher.setConst(numInternalNodes);
516 launcher.launch1D(numInternalNodes);
520 //Starting from the internal nodes nearest to the leaf nodes, recursively move up
521 //the tree towards the root to set the AABBs of each internal node; each internal node
522 //checks its children and merges their AABBs
524 B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel");
526 int maxDistanceFromRoot = -1;
528 B3_PROFILE("copy maxDistanceFromRoot to CPU");
529 m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1);
533 for (int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)
535 b3BufferInfoCL bufferInfo[] =
537 b3BufferInfoCL(m_distanceFromRoot.getBufferCL()),
538 b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
539 b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
540 b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
541 b3BufferInfoCL(m_internalNodeAabbs.getBufferCL())};
543 b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel");
544 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
545 launcher.setConst(maxDistanceFromRoot);
546 launcher.setConst(distanceFromRoot);
547 launcher.setConst(numInternalNodes);
549 //It may seem inefficent to launch a thread for each internal node when a
550 //much smaller number of nodes is actually processed, but this is actually
551 //faster than determining the exact nodes that are ready to merge their child AABBs.
552 launcher.launch1D(numInternalNodes);