2 Copyright (c) 2012 Advanced Micro Devices, Inc.
4 This software is provided 'as-is', without any express or implied warranty.
5 In no event will the authors be held liable for any damages arising from the use of this software.
6 Permission is granted to anyone to use this software for any purpose,
7 including commercial applications, and to alter it and redistribute it freely,
8 subject to the following restrictions:
10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
12 3. This notice may not be removed or altered from any source distribution.
14 //Originally written by Erwin Coumans
16 #define NEW_PAIR_MARKER -1
35 /// conservative test for overlap between two aabbs
36 bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
37 bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
40 overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
41 overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
42 overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
45 bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);
46 bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)
49 overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
50 overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
51 overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
55 bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);
56 bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)
59 overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
60 overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
61 overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
66 __kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)
68 int i = get_global_id(0);
69 if (i>=numUnsortedAabbs)
72 int j = get_global_id(1);
73 if (j>=numUnSortedAabbs2)
77 __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];
78 __global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];
80 if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))
84 int xIndex = unsortedAabbPtr[0].m_minIndices[3];
85 int yIndex = unsortedAabbPtr2[0].m_minIndices[3];
95 myPair.z = NEW_PAIR_MARKER;
96 myPair.w = NEW_PAIR_MARKER;
99 int curPair = atomic_inc (pairCount);
100 if (curPair<maxPairs)
102 pairsOut[curPair] = myPair; //flush to main memory
109 __kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
111 int i = get_global_id(0);
114 for (int j=i+1;j<numObjects;j++)
116 if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
119 myPair.x = aabbs[i].m_minIndices[3];
120 myPair.y = aabbs[j].m_minIndices[3];
121 myPair.z = NEW_PAIR_MARKER;
122 myPair.w = NEW_PAIR_MARKER;
124 int curPair = atomic_inc (pairCount);
125 if (curPair<maxPairs)
127 pairsOut[curPair] = myPair; //flush to main memory
133 __kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
135 int i = get_global_id(0);
138 for (int j=i+1;j<numObjects;j++)
140 if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis]))
144 if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
147 myPair.x = aabbs[i].m_minIndices[3];
148 myPair.y = aabbs[j].m_minIndices[3];
149 myPair.z = NEW_PAIR_MARKER;
150 myPair.w = NEW_PAIR_MARKER;
152 int curPair = atomic_inc (pairCount);
153 if (curPair<maxPairs)
155 pairsOut[curPair] = myPair; //flush to main memory
164 __kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
166 int i = get_global_id(0);
167 int localId = get_local_id(0);
169 __local int numActiveWgItems[1];
170 __local int breakRequest[1];
174 numActiveWgItems[0] = 0;
177 barrier(CLK_LOCAL_MEM_FENCE);
178 atomic_inc(numActiveWgItems);
179 barrier(CLK_LOCAL_MEM_FENCE);
185 barrier(CLK_LOCAL_MEM_FENCE);
189 if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis]))
193 atomic_inc(breakRequest);
199 barrier(CLK_LOCAL_MEM_FENCE);
201 if (j>=numObjects && !localBreak)
203 atomic_inc(breakRequest);
206 barrier(CLK_LOCAL_MEM_FENCE);
210 if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
213 myPair.x = aabbs[i].m_minIndices[3];
214 myPair.y = aabbs[j].m_minIndices[3];
215 myPair.z = NEW_PAIR_MARKER;
216 myPair.w = NEW_PAIR_MARKER;
218 int curPair = atomic_inc (pairCount);
219 if (curPair<maxPairs)
221 pairsOut[curPair] = myPair; //flush to main memory
227 } while (breakRequest[0]<numActiveWgItems[0]);
231 __kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
233 int i = get_global_id(0);
234 int localId = get_local_id(0);
236 __local int numActiveWgItems[1];
237 __local int breakRequest[1];
238 __local btAabbCL localAabbs[128];// = aabbs[i];
242 myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
243 float testValue = myAabb.m_maxElems[axis];
247 numActiveWgItems[0] = 0;
252 localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
253 localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
255 barrier(CLK_LOCAL_MEM_FENCE);
256 atomic_inc(numActiveWgItems);
257 barrier(CLK_LOCAL_MEM_FENCE);
263 barrier(CLK_LOCAL_MEM_FENCE);
267 if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis]))
271 atomic_inc(breakRequest);
277 barrier(CLK_LOCAL_MEM_FENCE);
279 if (j>=numObjects && !localBreak)
281 atomic_inc(breakRequest);
284 barrier(CLK_LOCAL_MEM_FENCE);
288 if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
291 myPair.x = myAabb.m_minIndices[3];
292 myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
293 myPair.z = NEW_PAIR_MARKER;
294 myPair.w = NEW_PAIR_MARKER;
296 int curPair = atomic_inc (pairCount);
297 if (curPair<maxPairs)
299 pairsOut[curPair] = myPair; //flush to main memory
304 barrier(CLK_LOCAL_MEM_FENCE);
311 localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
312 localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
316 } while (breakRequest[0]<numActiveWgItems[0]);
323 //http://stereopsis.com/radix.html
324 unsigned int FloatFlip(float fl);
325 unsigned int FloatFlip(float fl)
327 unsigned int f = *(unsigned int*)&fl;
328 unsigned int mask = -(int)(f >> 31) | 0x80000000;
331 float IFloatFlip(unsigned int f);
332 float IFloatFlip(unsigned int f)
334 unsigned int mask = ((f >> 31) - 1) | 0x80000000;
335 unsigned int fl = f ^ mask;
342 __kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)
344 int i = get_global_id(0);
347 int src = destAabbs[i].m_maxIndices[3];
348 destAabbs[i] = allAabbs[src];
349 destAabbs[i].m_maxIndices[3] = src;
353 __kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)
355 int i = get_global_id(0);
360 sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);
366 __kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)
368 int i = get_global_id(0);
372 sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];
377 __kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)
379 int i = get_global_id(0);
383 btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];
386 s = (smallAabb.m_max+smallAabb.m_min)*0.5f;