Imported Upstream version 2.81
[platform/upstream/libbullet.git] / src / BulletMultiThreaded / btGpu3DGridBroadphaseSharedCode.h
1 /*
2 Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
3 Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
4
5 This software is provided 'as-is', without any express or implied warranty.
6 In no event will the authors be held liable for any damages arising from the use of this software.
7 Permission is granted to anyone to use this software for any purpose, 
8 including commercial applications, and to alter it and redistribute it freely, 
9 subject to the following restrictions:
10
11 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13 3. This notice may not be removed or altered from any source distribution.
14 */
15
16 //----------------------------------------------------------------------------------------
17
18 //----------------------------------------------------------------------------------------
19 //----------------------------------------------------------------------------------------
20 //----------------------------------------------------------------------------------------
21 //----------------------------------------------------------------------------------------
22 //               K E R N E L    F U N C T I O N S 
23 //----------------------------------------------------------------------------------------
24 //----------------------------------------------------------------------------------------
25 //----------------------------------------------------------------------------------------
26 //----------------------------------------------------------------------------------------
27 //----------------------------------------------------------------------------------------
28 //----------------------------------------------------------------------------------------
29
30 // calculate position in uniform grid
31 BT_GPU___device__ int3 bt3DGrid_calcGridPos(float4 p)
32 {
33     int3 gridPos;
34     gridPos.x = (int)floor((p.x - BT_GPU_params.m_worldOriginX) / BT_GPU_params.m_cellSizeX);
35     gridPos.y = (int)floor((p.y - BT_GPU_params.m_worldOriginY) / BT_GPU_params.m_cellSizeY);
36     gridPos.z = (int)floor((p.z - BT_GPU_params.m_worldOriginZ) / BT_GPU_params.m_cellSizeZ);
37     return gridPos;
38 } // bt3DGrid_calcGridPos()
39
40 //----------------------------------------------------------------------------------------
41
42 // calculate address in grid from position (clamping to edges)
43 BT_GPU___device__ uint bt3DGrid_calcGridHash(int3 gridPos)
44 {
45     gridPos.x = BT_GPU_max(0, BT_GPU_min(gridPos.x, (int)BT_GPU_params.m_gridSizeX - 1));
46     gridPos.y = BT_GPU_max(0, BT_GPU_min(gridPos.y, (int)BT_GPU_params.m_gridSizeY - 1));
47     gridPos.z = BT_GPU_max(0, BT_GPU_min(gridPos.z, (int)BT_GPU_params.m_gridSizeZ - 1));
48     return BT_GPU___mul24(BT_GPU___mul24(gridPos.z, BT_GPU_params.m_gridSizeY), BT_GPU_params.m_gridSizeX) + BT_GPU___mul24(gridPos.y, BT_GPU_params.m_gridSizeX) + gridPos.x;
49 } // bt3DGrid_calcGridHash()
50
51 //----------------------------------------------------------------------------------------
52
53 // calculate grid hash value for each body using its AABB
54 BT_GPU___global__ void calcHashAABBD(bt3DGrid3F1U* pAABB, uint2* pHash, uint numBodies)
55 {
56     int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
57     if(index >= (int)numBodies)
58         {
59                 return;
60         }
61         bt3DGrid3F1U bbMin = pAABB[index*2];
62         bt3DGrid3F1U bbMax = pAABB[index*2 + 1];
63         float4 pos;
64         pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
65         pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
66         pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
67     // get address in grid
68     int3 gridPos = bt3DGrid_calcGridPos(pos);
69     uint gridHash = bt3DGrid_calcGridHash(gridPos);
70     // store grid hash and body index
71     pHash[index] = BT_GPU_make_uint2(gridHash, index);
72 } // calcHashAABBD()
73
74 //----------------------------------------------------------------------------------------
75
76 BT_GPU___global__ void findCellStartD(uint2* pHash, uint* cellStart, uint numBodies)
77 {
78     int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
79     if(index >= (int)numBodies)
80         {
81                 return;
82         }
83     uint2 sortedData = pHash[index];
84         // Load hash data into shared memory so that we can look 
85         // at neighboring body's hash value without loading
86         // two hash values per thread
87         BT_GPU___shared__ uint sharedHash[257];
88         sharedHash[BT_GPU_threadIdx.x+1] = sortedData.x;
89         if((index > 0) && (BT_GPU_threadIdx.x == 0))
90         {
91                 // first thread in block must load neighbor body hash
92                 volatile uint2 prevData = pHash[index-1];
93                 sharedHash[0] = prevData.x;
94         }
95         BT_GPU___syncthreads();
96         if((index == 0) || (sortedData.x != sharedHash[BT_GPU_threadIdx.x]))
97         {
98                 cellStart[sortedData.x] = index;
99         }
100 } // findCellStartD()
101
102 //----------------------------------------------------------------------------------------
103
104 BT_GPU___device__ uint cudaTestAABBOverlap(bt3DGrid3F1U min0, bt3DGrid3F1U max0, bt3DGrid3F1U min1, bt3DGrid3F1U max1)
105 {
106         return  (min0.fx <= max1.fx)&& (min1.fx <= max0.fx) && 
107                         (min0.fy <= max1.fy)&& (min1.fy <= max0.fy) && 
108                         (min0.fz <= max1.fz)&& (min1.fz <= max0.fz); 
109 } // cudaTestAABBOverlap()
110  
111 //----------------------------------------------------------------------------------------
112
113 BT_GPU___device__ void findPairsInCell( int3    gridPos,
114                                                                                 uint    index,
115                                                                                 uint2*  pHash,
116                                                                                 uint*   pCellStart,
117                                                                                 bt3DGrid3F1U* pAABB, 
118                                                                                 uint*   pPairBuff,
119                                                                                 uint2*  pPairBuffStartCurr,
120                                                                                 uint    numBodies)
121 {
122     if (        (gridPos.x < 0) || (gridPos.x > (int)BT_GPU_params.m_gridSizeX - 1)
123                 ||      (gridPos.y < 0) || (gridPos.y > (int)BT_GPU_params.m_gridSizeY - 1)
124                 ||  (gridPos.z < 0) || (gridPos.z > (int)BT_GPU_params.m_gridSizeZ - 1)) 
125     {
126                 return;
127         }
128     uint gridHash = bt3DGrid_calcGridHash(gridPos);
129     // get start of bucket for this cell
130     uint bucketStart = pCellStart[gridHash];
131     if (bucketStart == 0xffffffff)
132         {
133         return;   // cell empty
134         }
135         // iterate over bodies in this cell
136     uint2 sortedData = pHash[index];
137         uint unsorted_indx = sortedData.y;
138     bt3DGrid3F1U min0 = BT_GPU_FETCH(pAABB, unsorted_indx*2); 
139         bt3DGrid3F1U max0 = BT_GPU_FETCH(pAABB, unsorted_indx*2 + 1);
140         uint handleIndex =  min0.uw;
141         uint2 start_curr = pPairBuffStartCurr[handleIndex];
142         uint start = start_curr.x;
143         uint curr = start_curr.y;
144         uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
145         uint curr_max = start_curr_next.x - start - 1;
146         uint bucketEnd = bucketStart + BT_GPU_params.m_maxBodiesPerCell;
147         bucketEnd = (bucketEnd > numBodies) ? numBodies : bucketEnd;
148         for(uint index2 = bucketStart; index2 < bucketEnd; index2++) 
149         {
150         uint2 cellData = pHash[index2];
151         if (cellData.x != gridHash)
152         {
153                         break;   // no longer in same bucket
154                 }
155                 uint unsorted_indx2 = cellData.y;
156         if (unsorted_indx2 < unsorted_indx) // check not colliding with self
157         {   
158                         bt3DGrid3F1U min1 = BT_GPU_FETCH(pAABB, unsorted_indx2*2);
159                         bt3DGrid3F1U max1 = BT_GPU_FETCH(pAABB, unsorted_indx2*2 + 1);
160                         if(cudaTestAABBOverlap(min0, max0, min1, max1))
161                         {
162                                 uint handleIndex2 = min1.uw;
163                                 uint k;
164                                 for(k = 0; k < curr; k++)
165                                 {
166                                         uint old_pair = pPairBuff[start+k] & (~BT_3DGRID_PAIR_ANY_FLG);
167                                         if(old_pair == handleIndex2)
168                                         {
169                                                 pPairBuff[start+k] |= BT_3DGRID_PAIR_FOUND_FLG;
170                                                 break;
171                                         }
172                                 }
173                                 if(k == curr)
174                                 {
175                                         if(curr >= curr_max) 
176                                         { // not a good solution, but let's avoid crash
177                                                 break;
178                                         }
179                                         pPairBuff[start+curr] = handleIndex2 | BT_3DGRID_PAIR_NEW_FLG;
180                                         curr++;
181                                 }
182                         }
183                 }
184         }
185         pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, curr);
186     return;
187 } // findPairsInCell()
188
189 //----------------------------------------------------------------------------------------
190
191 BT_GPU___global__ void findOverlappingPairsD(   bt3DGrid3F1U*   pAABB, uint2* pHash, uint* pCellStart, 
192                                                                                                 uint* pPairBuff, uint2* pPairBuffStartCurr, uint numBodies)
193 {
194     int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
195     if(index >= (int)numBodies)
196         {
197                 return;
198         }
199     uint2 sortedData = pHash[index];
200         uint unsorted_indx = sortedData.y;
201         bt3DGrid3F1U bbMin = BT_GPU_FETCH(pAABB, unsorted_indx*2);
202         bt3DGrid3F1U bbMax = BT_GPU_FETCH(pAABB, unsorted_indx*2 + 1);
203         float4 pos;
204         pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
205         pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
206         pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
207     // get address in grid
208     int3 gridPos = bt3DGrid_calcGridPos(pos);
209     // examine only neighbouring cells
210     for(int z=-1; z<=1; z++) {
211         for(int y=-1; y<=1; y++) {
212             for(int x=-1; x<=1; x++) {
213                 findPairsInCell(gridPos + BT_GPU_make_int3(x, y, z), index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numBodies);
214             }
215         }
216     }
217 } // findOverlappingPairsD()
218
219 //----------------------------------------------------------------------------------------
220
221 BT_GPU___global__ void findPairsLargeD( bt3DGrid3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff, 
222                                                                                 uint2* pPairBuffStartCurr, uint numBodies, uint numLarge)
223 {
224     int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
225     if(index >= (int)numBodies)
226         {
227                 return;
228         }
229     uint2 sortedData = pHash[index];
230         uint unsorted_indx = sortedData.y;
231         bt3DGrid3F1U min0 = BT_GPU_FETCH(pAABB, unsorted_indx*2);
232         bt3DGrid3F1U max0 = BT_GPU_FETCH(pAABB, unsorted_indx*2 + 1);
233         uint handleIndex =  min0.uw;
234         uint2 start_curr = pPairBuffStartCurr[handleIndex];
235         uint start = start_curr.x;
236         uint curr = start_curr.y;
237         uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
238         uint curr_max = start_curr_next.x - start - 1;
239     for(uint i = 0; i < numLarge; i++)
240     {
241                 uint indx2 = numBodies + i;
242                 bt3DGrid3F1U min1 = BT_GPU_FETCH(pAABB, indx2*2);
243                 bt3DGrid3F1U max1 = BT_GPU_FETCH(pAABB, indx2*2 + 1);
244                 if(cudaTestAABBOverlap(min0, max0, min1, max1))
245                 {
246                         uint k;
247                         uint handleIndex2 =  min1.uw;
248                         for(k = 0; k < curr; k++)
249                         {
250                                 uint old_pair = pPairBuff[start+k] & (~BT_3DGRID_PAIR_ANY_FLG);
251                                 if(old_pair == handleIndex2)
252                                 {
253                                         pPairBuff[start+k] |= BT_3DGRID_PAIR_FOUND_FLG;
254                                         break;
255                                 }
256                         }
257                         if(k == curr)
258                         {
259                                 pPairBuff[start+curr] = handleIndex2 | BT_3DGRID_PAIR_NEW_FLG;
260                                 if(curr >= curr_max) 
261                                 { // not a good solution, but let's avoid crash
262                                         break;
263                                 }
264                                 curr++;
265                         }
266                 }
267     }
268         pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, curr);
269     return;
270 } // findPairsLargeD()
271
272 //----------------------------------------------------------------------------------------
273
274 BT_GPU___global__ void computePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, 
275                                                                                                 uint* pPairScan, bt3DGrid3F1U* pAABB, uint numBodies)
276 {
277     int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
278     if(index >= (int)numBodies)
279         {
280                 return;
281         }
282         bt3DGrid3F1U bbMin = pAABB[index * 2];
283         uint handleIndex = bbMin.uw;
284         uint2 start_curr = pPairBuffStartCurr[handleIndex];
285         uint start = start_curr.x;
286         uint curr = start_curr.y;
287         uint *pInp = pPairBuff + start;
288         uint num_changes = 0;
289         for(uint k = 0; k < curr; k++, pInp++)
290         {
291                 if(!((*pInp) & BT_3DGRID_PAIR_FOUND_FLG))
292                 {
293                         num_changes++;
294                 }
295         }
296         pPairScan[index+1] = num_changes;
297 } // computePairCacheChangesD()
298
299 //----------------------------------------------------------------------------------------
300
301 BT_GPU___global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan,
302                                                                                                    uint* pPairOut, bt3DGrid3F1U* pAABB, uint numBodies)
303 {
304     int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
305     if(index >= (int)numBodies)
306         {
307                 return;
308         }
309         bt3DGrid3F1U bbMin = pAABB[index * 2];
310         uint handleIndex = bbMin.uw;
311         uint2 start_curr = pPairBuffStartCurr[handleIndex];
312         uint start = start_curr.x;
313         uint curr = start_curr.y;
314         uint* pInp = pPairBuff + start;
315         uint* pOut = pPairOut + pPairScan[index];
316         uint* pOut2 = pInp;
317         uint num = 0; 
318         for(uint k = 0; k < curr; k++, pInp++)
319         {
320                 if(!((*pInp) & BT_3DGRID_PAIR_FOUND_FLG))
321                 {
322                         *pOut = *pInp;
323                         pOut++;
324                 }
325                 if((*pInp) & BT_3DGRID_PAIR_ANY_FLG)
326                 {
327                         *pOut2 = (*pInp) & (~BT_3DGRID_PAIR_ANY_FLG);
328                         pOut2++;
329                         num++;
330                 }
331         }
332         pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, num);
333 } // squeezeOverlappingPairBuffD()
334
335
336 //----------------------------------------------------------------------------------------
337 //----------------------------------------------------------------------------------------
338 //----------------------------------------------------------------------------------------
339 //----------------------------------------------------------------------------------------
340 //               E N D   O F    K E R N E L    F U N C T I O N S 
341 //----------------------------------------------------------------------------------------
342 //----------------------------------------------------------------------------------------
343 //----------------------------------------------------------------------------------------
344 //----------------------------------------------------------------------------------------
345
346 extern "C"
347 {
348
349 //----------------------------------------------------------------------------------------
350
351 void BT_GPU_PREF(calcHashAABB)(bt3DGrid3F1U* pAABB, unsigned int* hash, unsigned int numBodies)
352 {
353     int numThreads, numBlocks;
354     BT_GPU_PREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
355     // execute the kernel
356     BT_GPU_EXECKERNEL(numBlocks, numThreads, calcHashAABBD, (pAABB, (uint2*)hash, numBodies));
357     // check if kernel invocation generated an error
358     BT_GPU_CHECK_ERROR("calcHashAABBD kernel execution failed");
359 } // calcHashAABB()
360
361 //----------------------------------------------------------------------------------------
362
363 void BT_GPU_PREF(findCellStart(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells))
364 {
365     int numThreads, numBlocks;
366     BT_GPU_PREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
367         BT_GPU_SAFE_CALL(BT_GPU_Memset(cellStart, 0xffffffff, numCells*sizeof(uint)));
368         BT_GPU_EXECKERNEL(numBlocks, numThreads, findCellStartD, ((uint2*)hash, (uint*)cellStart, numBodies));
369     BT_GPU_CHECK_ERROR("Kernel execution failed: findCellStartD");
370 } // findCellStart()
371
372 //----------------------------------------------------------------------------------------
373
374 void BT_GPU_PREF(findOverlappingPairs(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int*        pPairBuffStartCurr, unsigned int        numBodies))
375 {
376 #if B_CUDA_USE_TEX
377     BT_GPU_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, numBodies * 2 * sizeof(bt3DGrid3F1U)));
378 #endif
379     int numThreads, numBlocks;
380     BT_GPU_PREF(computeGridSize)(numBodies, 64, numBlocks, numThreads);
381     BT_GPU_EXECKERNEL(numBlocks, numThreads, findOverlappingPairsD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies));
382     BT_GPU_CHECK_ERROR("Kernel execution failed: bt_CudaFindOverlappingPairsD");
383 #if B_CUDA_USE_TEX
384     BT_GPU_SAFE_CALL(cudaUnbindTexture(pAABBTex));
385 #endif
386 } // findOverlappingPairs()
387
388 //----------------------------------------------------------------------------------------
389
390 void BT_GPU_PREF(findPairsLarge(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies, unsigned int numLarge))
391 {
392 #if B_CUDA_USE_TEX
393     BT_GPU_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, (numBodies+numLarge) * 2 * sizeof(bt3DGrid3F1U)));
394 #endif
395     int numThreads, numBlocks;
396     BT_GPU_PREF(computeGridSize)(numBodies, 64, numBlocks, numThreads);
397     BT_GPU_EXECKERNEL(numBlocks, numThreads, findPairsLargeD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies,numLarge));
398     BT_GPU_CHECK_ERROR("Kernel execution failed: btCuda_findPairsLargeD");
399 #if B_CUDA_USE_TEX
400     BT_GPU_SAFE_CALL(cudaUnbindTexture(pAABBTex));
401 #endif
402 } // findPairsLarge()
403
404 //----------------------------------------------------------------------------------------
405
406 void BT_GPU_PREF(computePairCacheChanges(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, bt3DGrid3F1U* pAABB, unsigned int numBodies))
407 {
408     int numThreads, numBlocks;
409     BT_GPU_PREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
410     BT_GPU_EXECKERNEL(numBlocks, numThreads, computePairCacheChangesD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,pAABB,numBodies));
411     BT_GPU_CHECK_ERROR("Kernel execution failed: btCudaComputePairCacheChangesD");
412 } // computePairCacheChanges()
413
414 //----------------------------------------------------------------------------------------
415
416 void BT_GPU_PREF(squeezeOverlappingPairBuff(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, unsigned int* pPairOut, bt3DGrid3F1U* pAABB, unsigned int numBodies))
417 {
418     int numThreads, numBlocks;
419     BT_GPU_PREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
420     BT_GPU_EXECKERNEL(numBlocks, numThreads, squeezeOverlappingPairBuffD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,(uint*)pPairOut,pAABB,numBodies));
421     BT_GPU_CHECK_ERROR("Kernel execution failed: btCudaSqueezeOverlappingPairBuffD");
422 } // btCuda_squeezeOverlappingPairBuff()
423
424 //------------------------------------------------------------------------------------------------
425
426 } // extern "C"
427
428 //------------------------------------------------------------------------------------------------
429 //------------------------------------------------------------------------------------------------
430 //------------------------------------------------------------------------------------------------