[dali_2.3.21] Merge branch 'devel/master'
[platform/core/uifw/dali-toolkit.git] / dali-physics / third-party / bullet3 / src / Bullet3OpenCL / BroadphaseCollision / b3GpuParallelLinearBvh.cpp
1 /*
2 This software is provided 'as-is', without any express or implied warranty.
3 In no event will the authors be held liable for any damages arising from the use of this software.
4 Permission is granted to anyone to use this software for any purpose,
5 including commercial applications, and to alter it and redistribute it freely,
6 subject to the following restrictions:
7
8 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
9 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
10 3. This notice may not be removed or altered from any source distribution.
11 */
12 //Initial Author Jackson Lee, 2014
13
14 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
15 #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
16
17 #include "b3GpuParallelLinearBvh.h"
18
19 b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : m_queue(queue),
20                                                                                                                                                                                                                                   m_radixSorter(context, device, queue),
21
22                                                                                                                                                                                                                                   m_rootNodeIndex(context, queue),
23                                                                                                                                                                                                                                   m_maxDistanceFromRoot(context, queue),
24                                                                                                                                                                                                                                   m_temp(context, queue),
25
26                                                                                                                                                                                                                                   m_internalNodeAabbs(context, queue),
27                                                                                                                                                                                                                                   m_internalNodeLeafIndexRanges(context, queue),
28                                                                                                                                                                                                                                   m_internalNodeChildNodes(context, queue),
29                                                                                                                                                                                                                                   m_internalNodeParentNodes(context, queue),
30
31                                                                                                                                                                                                                                   m_commonPrefixes(context, queue),
32                                                                                                                                                                                                                                   m_commonPrefixLengths(context, queue),
33                                                                                                                                                                                                                                   m_distanceFromRoot(context, queue),
34
35                                                                                                                                                                                                                                   m_leafNodeParentNodes(context, queue),
36                                                                                                                                                                                                                                   m_mortonCodesAndAabbIndicies(context, queue),
37                                                                                                                                                                                                                                   m_mergedAabb(context, queue),
38                                                                                                                                                                                                                                   m_leafNodeAabbs(context, queue),
39
40                                                                                                                                                                                                                                   m_largeAabbs(context, queue)
41 {
42         m_rootNodeIndex.resize(1);
43         m_maxDistanceFromRoot.resize(1);
44         m_temp.resize(1);
45
46         //
47         const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl";
48
49         const char* kernelSource = parallelLinearBvhCL;  //parallelLinearBvhCL.h
50         cl_int error;
51         char* additionalMacros = 0;
52         m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH);
53         b3Assert(m_parallelLinearBvhProgram);
54
55         m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros);
56         b3Assert(m_separateAabbsKernel);
57         m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros);
58         b3Assert(m_findAllNodesMergedAabbKernel);
59         m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros);
60         b3Assert(m_assignMortonCodesAndAabbIndiciesKernel);
61
62         m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros);
63         b3Assert(m_computeAdjacentPairCommonPrefixKernel);
64         m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros);
65         b3Assert(m_buildBinaryRadixTreeLeafNodesKernel);
66         m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros);
67         b3Assert(m_buildBinaryRadixTreeInternalNodesKernel);
68         m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros);
69         b3Assert(m_findDistanceFromRootKernel);
70         m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros);
71         b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel);
72
73         m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros);
74         b3Assert(m_findLeafIndexRangesKernel);
75
76         m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros);
77         b3Assert(m_plbvhCalculateOverlappingPairsKernel);
78         m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros);
79         b3Assert(m_plbvhRayTraverseKernel);
80         m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros);
81         b3Assert(m_plbvhLargeAabbAabbTestKernel);
82         m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros);
83         b3Assert(m_plbvhLargeAabbRayTestKernel);
84 }
85
86 b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()
87 {
88         clReleaseKernel(m_separateAabbsKernel);
89         clReleaseKernel(m_findAllNodesMergedAabbKernel);
90         clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel);
91
92         clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel);
93         clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel);
94         clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel);
95         clReleaseKernel(m_findDistanceFromRootKernel);
96         clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel);
97
98         clReleaseKernel(m_findLeafIndexRangesKernel);
99
100         clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel);
101         clReleaseKernel(m_plbvhRayTraverseKernel);
102         clReleaseKernel(m_plbvhLargeAabbAabbTestKernel);
103         clReleaseKernel(m_plbvhLargeAabbRayTestKernel);
104
105         clReleaseProgram(m_parallelLinearBvhProgram);
106 }
107
108 void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
109                                                                    const b3OpenCLArray<int>& largeAabbIndices)
110 {
111         B3_PROFILE("b3ParallelLinearBvh::build()");
112
113         int numLargeAabbs = largeAabbIndices.size();
114         int numSmallAabbs = smallAabbIndices.size();
115
116         //Since all AABBs(both large and small) are input as a contiguous array,
117         //with 2 additional arrays used to indicate the indices of large and small AABBs,
118         //it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH.
119         {
120                 B3_PROFILE("Separate large and small AABBs");
121
122                 m_largeAabbs.resize(numLargeAabbs);
123                 m_leafNodeAabbs.resize(numSmallAabbs);
124
125                 //Write large AABBs into m_largeAabbs
126                 {
127                         b3BufferInfoCL bufferInfo[] =
128                                 {
129                                         b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
130                                         b3BufferInfoCL(largeAabbIndices.getBufferCL()),
131
132                                         b3BufferInfoCL(m_largeAabbs.getBufferCL())};
133
134                         b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
135                         launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
136                         launcher.setConst(numLargeAabbs);
137
138                         launcher.launch1D(numLargeAabbs);
139                 }
140
141                 //Write small AABBs into m_leafNodeAabbs
142                 {
143                         b3BufferInfoCL bufferInfo[] =
144                                 {
145                                         b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
146                                         b3BufferInfoCL(smallAabbIndices.getBufferCL()),
147
148                                         b3BufferInfoCL(m_leafNodeAabbs.getBufferCL())};
149
150                         b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
151                         launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
152                         launcher.setConst(numSmallAabbs);
153
154                         launcher.launch1D(numSmallAabbs);
155                 }
156
157                 clFinish(m_queue);
158         }
159
160         //
161         int numLeaves = numSmallAabbs;  //Number of leaves in the BVH == Number of rigid bodies with small AABBs
162         int numInternalNodes = numLeaves - 1;
163
164         if (numLeaves < 2)
165         {
166                 //Number of leaf nodes is checked in calculateOverlappingPairs() and testRaysAgainstBvhAabbs(),
167                 //so it does not matter if numLeaves == 0 and rootNodeIndex == -1
168                 int rootNodeIndex = numLeaves - 1;
169                 m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1);
170
171                 //Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm,
172                 //m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index
173                 //instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work.
174                 //( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs )
175                 if (numLeaves == 1)
176                 {
177                         b3SortData leaf;
178                         leaf.m_value = 0;  //1 leaf so index is always 0; leaf.m_key does not need to be set
179
180                         m_mortonCodesAndAabbIndicies.resize(1);
181                         m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1);
182                 }
183
184                 return;
185         }
186
187         //
188         {
189                 m_internalNodeAabbs.resize(numInternalNodes);
190                 m_internalNodeLeafIndexRanges.resize(numInternalNodes);
191                 m_internalNodeChildNodes.resize(numInternalNodes);
192                 m_internalNodeParentNodes.resize(numInternalNodes);
193
194                 m_commonPrefixes.resize(numInternalNodes);
195                 m_commonPrefixLengths.resize(numInternalNodes);
196                 m_distanceFromRoot.resize(numInternalNodes);
197
198                 m_leafNodeParentNodes.resize(numLeaves);
199                 m_mortonCodesAndAabbIndicies.resize(numLeaves);
200                 m_mergedAabb.resize(numLeaves);
201         }
202
203         //Find the merged AABB of all small AABBs; this is used to define the size of
204         //each cell in the virtual grid for the next kernel(2^10 cells in each dimension).
205         {
206                 B3_PROFILE("Find AABB of merged nodes");
207
208                 m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs);  //Need to make a copy since the kernel modifies the array
209
210                 for (int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;
211                          numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)
212                 {
213                         b3BufferInfoCL bufferInfo[] =
214                                 {
215                                         b3BufferInfoCL(m_mergedAabb.getBufferCL())  //Resulting AABB is stored in m_mergedAabb[0]
216                                 };
217
218                         b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel");
219                         launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
220                         launcher.setConst(numAabbsNeedingMerge);
221
222                         launcher.launch1D(numAabbsNeedingMerge);
223                 }
224
225                 clFinish(m_queue);
226         }
227
228         //Insert the center of the AABBs into a virtual grid,
229         //then convert the discrete grid coordinates into a morton code
230         //For each element in m_mortonCodesAndAabbIndicies, set
231         //      m_key == morton code (value to sort by)
232         //      m_value == small AABB index
233         {
234                 B3_PROFILE("Assign morton codes");
235
236                 b3BufferInfoCL bufferInfo[] =
237                         {
238                                 b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
239                                 b3BufferInfoCL(m_mergedAabb.getBufferCL()),
240                                 b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL())};
241
242                 b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel");
243                 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
244                 launcher.setConst(numLeaves);
245
246                 launcher.launch1D(numLeaves);
247                 clFinish(m_queue);
248         }
249
250         //
251         {
252                 B3_PROFILE("Sort leaves by morton codes");
253
254                 m_radixSorter.execute(m_mortonCodesAndAabbIndicies);
255                 clFinish(m_queue);
256         }
257
258         //
259         constructBinaryRadixTree();
260
261         //Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices.
262         //The root node contains leaf node indices in the range [0, numLeafNodes - 1].
263         //The child nodes of each node split their parent's index range into 2 contiguous halves.
264         //
265         //For example, if the root has indices [0, 31], its children might partition that range into [0, 11] and [12, 31].
266         //The next level in the tree could then split those ranges into [0, 2], [3, 11], [12, 22], and [23, 31].
267         //
268         //This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice
269         {
270                 B3_PROFILE("m_findLeafIndexRangesKernel");
271
272                 b3BufferInfoCL bufferInfo[] =
273                         {
274                                 b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
275                                 b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL())};
276
277                 b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel");
278                 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
279                 launcher.setConst(numInternalNodes);
280
281                 launcher.launch1D(numInternalNodes);
282                 clFinish(m_queue);
283         }
284 }
285
286 void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs)
287 {
288         int maxPairs = out_overlappingPairs.size();
289         b3OpenCLArray<int>& numPairsGpu = m_temp;
290
291         int reset = 0;
292         numPairsGpu.copyFromHostPointer(&reset, 1);
293
294         //
295         if (m_leafNodeAabbs.size() > 1)
296         {
297                 B3_PROFILE("PLBVH small-small AABB test");
298
299                 int numQueryAabbs = m_leafNodeAabbs.size();
300
301                 b3BufferInfoCL bufferInfo[] =
302                         {
303                                 b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
304
305                                 b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
306                                 b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
307                                 b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
308                                 b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
309                                 b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
310
311                                 b3BufferInfoCL(numPairsGpu.getBufferCL()),
312                                 b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
313
314                 b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel");
315                 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
316                 launcher.setConst(maxPairs);
317                 launcher.setConst(numQueryAabbs);
318
319                 launcher.launch1D(numQueryAabbs);
320                 clFinish(m_queue);
321         }
322
323         int numLargeAabbRigids = m_largeAabbs.size();
324         if (numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0)
325         {
326                 B3_PROFILE("PLBVH large-small AABB test");
327
328                 int numQueryAabbs = m_leafNodeAabbs.size();
329
330                 b3BufferInfoCL bufferInfo[] =
331                         {
332                                 b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
333                                 b3BufferInfoCL(m_largeAabbs.getBufferCL()),
334
335                                 b3BufferInfoCL(numPairsGpu.getBufferCL()),
336                                 b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
337
338                 b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel");
339                 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
340                 launcher.setConst(maxPairs);
341                 launcher.setConst(numLargeAabbRigids);
342                 launcher.setConst(numQueryAabbs);
343
344                 launcher.launch1D(numQueryAabbs);
345                 clFinish(m_queue);
346         }
347
348         //
349         int numPairs = -1;
350         numPairsGpu.copyToHostPointer(&numPairs, 1);
351         if (numPairs > maxPairs)
352         {
353                 b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
354                 numPairs = maxPairs;
355                 numPairsGpu.copyFromHostPointer(&maxPairs, 1);
356         }
357
358         out_overlappingPairs.resize(numPairs);
359 }
360
361 void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
362                                                                                                          b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)
363 {
364         B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()");
365
366         int numRays = rays.size();
367         int maxRayRigidPairs = out_rayRigidPairs.size();
368
369         int reset = 0;
370         out_numRayRigidPairs.copyFromHostPointer(&reset, 1);
371
372         //
373         if (m_leafNodeAabbs.size() > 0)
374         {
375                 B3_PROFILE("PLBVH ray test small AABB");
376
377                 b3BufferInfoCL bufferInfo[] =
378                         {
379                                 b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
380
381                                 b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
382                                 b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
383                                 b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
384                                 b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
385                                 b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
386
387                                 b3BufferInfoCL(rays.getBufferCL()),
388
389                                 b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
390                                 b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
391
392                 b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel");
393                 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
394                 launcher.setConst(maxRayRigidPairs);
395                 launcher.setConst(numRays);
396
397                 launcher.launch1D(numRays);
398                 clFinish(m_queue);
399         }
400
401         int numLargeAabbRigids = m_largeAabbs.size();
402         if (numLargeAabbRigids > 0)
403         {
404                 B3_PROFILE("PLBVH ray test large AABB");
405
406                 b3BufferInfoCL bufferInfo[] =
407                         {
408                                 b3BufferInfoCL(m_largeAabbs.getBufferCL()),
409                                 b3BufferInfoCL(rays.getBufferCL()),
410
411                                 b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
412                                 b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
413
414                 b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel");
415                 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
416                 launcher.setConst(numLargeAabbRigids);
417                 launcher.setConst(maxRayRigidPairs);
418                 launcher.setConst(numRays);
419
420                 launcher.launch1D(numRays);
421                 clFinish(m_queue);
422         }
423
424         //
425         int numRayRigidPairs = -1;
426         out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1);
427
428         if (numRayRigidPairs > maxRayRigidPairs)
429                 b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs);
430 }
431
432 void b3GpuParallelLinearBvh::constructBinaryRadixTree()
433 {
434         B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()");
435
436         int numLeaves = m_leafNodeAabbs.size();
437         int numInternalNodes = numLeaves - 1;
438
439         //Each internal node is placed in between 2 leaf nodes.
440         //By using this arrangement and computing the common prefix between
441         //these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree.
442         {
443                 B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel");
444
445                 b3BufferInfoCL bufferInfo[] =
446                         {
447                                 b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
448                                 b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
449                                 b3BufferInfoCL(m_commonPrefixLengths.getBufferCL())};
450
451                 b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel");
452                 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
453                 launcher.setConst(numInternalNodes);
454
455                 launcher.launch1D(numInternalNodes);
456                 clFinish(m_queue);
457         }
458
459         //For each leaf node, select its parent node by
460         //comparing the 2 nearest internal nodes and assign child node indices
461         {
462                 B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel");
463
464                 b3BufferInfoCL bufferInfo[] =
465                         {
466                                 b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
467                                 b3BufferInfoCL(m_leafNodeParentNodes.getBufferCL()),
468                                 b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL())};
469
470                 b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel");
471                 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
472                 launcher.setConst(numLeaves);
473
474                 launcher.launch1D(numLeaves);
475                 clFinish(m_queue);
476         }
477
478         //For each internal node, perform 2 binary searches among the other internal nodes
479         //to its left and right to find its potential parent nodes and assign child node indices
480         {
481                 B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel");
482
483                 b3BufferInfoCL bufferInfo[] =
484                         {
485                                 b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
486                                 b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
487                                 b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
488                                 b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
489                                 b3BufferInfoCL(m_rootNodeIndex.getBufferCL())};
490
491                 b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel");
492                 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
493                 launcher.setConst(numInternalNodes);
494
495                 launcher.launch1D(numInternalNodes);
496                 clFinish(m_queue);
497         }
498
499         //Find the number of nodes separating each internal node and the root node
500         //so that the AABBs can be set using the next kernel.
501         //Also determine the maximum number of nodes separating an internal node and the root node.
502         {
503                 B3_PROFILE("m_findDistanceFromRootKernel");
504
505                 b3BufferInfoCL bufferInfo[] =
506                         {
507                                 b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
508                                 b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
509                                 b3BufferInfoCL(m_maxDistanceFromRoot.getBufferCL()),
510                                 b3BufferInfoCL(m_distanceFromRoot.getBufferCL())};
511
512                 b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel");
513                 launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
514                 launcher.setConst(numInternalNodes);
515
516                 launcher.launch1D(numInternalNodes);
517                 clFinish(m_queue);
518         }
519
520         //Starting from the internal nodes nearest to the leaf nodes, recursively move up
521         //the tree towards the root to set the AABBs of each internal node; each internal node
522         //checks its children and merges their AABBs
523         {
524                 B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel");
525
526                 int maxDistanceFromRoot = -1;
527                 {
528                         B3_PROFILE("copy maxDistanceFromRoot to CPU");
529                         m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1);
530                         clFinish(m_queue);
531                 }
532
533                 for (int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)
534                 {
535                         b3BufferInfoCL bufferInfo[] =
536                                 {
537                                         b3BufferInfoCL(m_distanceFromRoot.getBufferCL()),
538                                         b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
539                                         b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
540                                         b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
541                                         b3BufferInfoCL(m_internalNodeAabbs.getBufferCL())};
542
543                         b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel");
544                         launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
545                         launcher.setConst(maxDistanceFromRoot);
546                         launcher.setConst(distanceFromRoot);
547                         launcher.setConst(numInternalNodes);
548
549                         //It may seem inefficent to launch a thread for each internal node when a
550                         //much smaller number of nodes is actually processed, but this is actually
551                         //faster than determining the exact nodes that are ready to merge their child AABBs.
552                         launcher.launch1D(numInternalNodes);
553                 }
554
555                 clFinish(m_queue);
556         }
557 }