dali-physics/third-party/bullet3/src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl

   1 /*
   2 Copyright (c) 2012 Advanced Micro Devices, Inc.
   3
   4 This software is provided 'as-is', without any express or implied warranty.
   5 In no event will the authors be held liable for any damages arising from the use of this software.
   6 Permission is granted to anyone to use this software for any purpose,
   7 including commercial applications, and to alter it and redistribute it freely,
   8 subject to the following restrictions:
   9
  10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
  11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
  12 3. This notice may not be removed or altered from any source distribution.
  13 */
  14 //Originally written by Erwin Coumans
  15
  16 #define NEW_PAIR_MARKER -1
  17
  18 typedef struct
  19 {
  20         union
  21         {
  22                 float4  m_min;
  23                 float   m_minElems[4];
  24                 int                     m_minIndices[4];
  25         };
  26         union
  27         {
  28                 float4  m_max;
  29                 float   m_maxElems[4];
  30                 int                     m_maxIndices[4];
  31         };
  32 } btAabbCL;
  33
  34
  35 /// conservative test for overlap between two aabbs
  36 bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
  37 bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
  38 {
  39         bool overlap = true;
  40         overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
  41         overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
  42         overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
  43         return overlap;
  44 }
  45 bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);
  46 bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)
  47 {
  48         bool overlap = true;
  49         overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
  50         overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
  51         overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
  52         return overlap;
  53 }
  54
  55 bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);
  56 bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)
  57 {
  58         bool overlap = true;
  59         overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
  60         overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
  61         overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
  62         return overlap;
  63 }
  64
  65
  66 __kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping,  __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)
  67 {
  68         int i = get_global_id(0);
  69         if (i>=numUnsortedAabbs)
  70                 return;
  71
  72         int j = get_global_id(1);
  73         if (j>=numUnSortedAabbs2)
  74                 return;
  75
  76
  77         __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];
  78         __global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];
  79
  80         if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))
  81         {
  82                 int4 myPair;
  83
  84                 int xIndex = unsortedAabbPtr[0].m_minIndices[3];
  85                 int yIndex = unsortedAabbPtr2[0].m_minIndices[3];
  86                 if (xIndex>yIndex)
  87                 {
  88                         int tmp = xIndex;
  89                         xIndex=yIndex;
  90                         yIndex=tmp;
  91                 }
  92
  93                 myPair.x = xIndex;
  94                 myPair.y = yIndex;
  95                 myPair.z = NEW_PAIR_MARKER;
  96                 myPair.w = NEW_PAIR_MARKER;
  97
  98
  99                 int curPair = atomic_inc (pairCount);
 100                 if (curPair<maxPairs)
 101                 {
 102                                 pairsOut[curPair] = myPair; //flush to main memory
 103                 }
 104         }
 105 }
 106
 107
 108
 109 __kernel void   computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
 110 {
 111         int i = get_global_id(0);
 112         if (i>=numObjects)
 113                 return;
 114         for (int j=i+1;j<numObjects;j++)
 115         {
 116                 if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
 117                 {
 118                         int4 myPair;
 119                         myPair.x = aabbs[i].m_minIndices[3];
 120                         myPair.y = aabbs[j].m_minIndices[3];
 121                         myPair.z = NEW_PAIR_MARKER;
 122                         myPair.w = NEW_PAIR_MARKER;
 123
 124                         int curPair = atomic_inc (pairCount);
 125                         if (curPair<maxPairs)
 126                         {
 127                                         pairsOut[curPair] = myPair; //flush to main memory
 128                         }
 129                 }
 130         }
 131 }
 132
 133 __kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
 134 {
 135         int i = get_global_id(0);
 136         if (i>=numObjects)
 137                 return;
 138         for (int j=i+1;j<numObjects;j++)
 139         {
 140         if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis]))
 141                 {
 142                         break;
 143                 }
 144                 if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
 145                 {
 146                         int4 myPair;
 147                         myPair.x = aabbs[i].m_minIndices[3];
 148                         myPair.y = aabbs[j].m_minIndices[3];
 149                         myPair.z = NEW_PAIR_MARKER;
 150                         myPair.w = NEW_PAIR_MARKER;
 151
 152                         int curPair = atomic_inc (pairCount);
 153                         if (curPair<maxPairs)
 154                         {
 155                                         pairsOut[curPair] = myPair; //flush to main memory
 156                         }
 157                 }
 158         }
 159 }
 160
 161
 162
 163
 164 __kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
 165 {
 166         int i = get_global_id(0);
 167         int localId = get_local_id(0);
 168
 169         __local int numActiveWgItems[1];
 170         __local int breakRequest[1];
 171
 172         if (localId==0)
 173         {
 174                 numActiveWgItems[0] = 0;
 175                 breakRequest[0] = 0;
 176         }
 177         barrier(CLK_LOCAL_MEM_FENCE);
 178         atomic_inc(numActiveWgItems);
 179         barrier(CLK_LOCAL_MEM_FENCE);
 180         int localBreak = 0;
 181
 182         int j=i+1;
 183         do
 184         {
 185                 barrier(CLK_LOCAL_MEM_FENCE);
 186
 187                 if (j<numObjects)
 188                 {
 189                 if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis]))
 190                         {
 191                                 if (!localBreak)
 192                                 {
 193                                         atomic_inc(breakRequest);
 194                                         localBreak = 1;
 195                                 }
 196                         }
 197                 }
 198
 199                 barrier(CLK_LOCAL_MEM_FENCE);
 200
 201                 if (j>=numObjects && !localBreak)
 202                 {
 203                         atomic_inc(breakRequest);
 204                         localBreak = 1;
 205                 }
 206                 barrier(CLK_LOCAL_MEM_FENCE);
 207
 208                 if (!localBreak)
 209                 {
 210                         if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
 211                         {
 212                                 int4 myPair;
 213                                 myPair.x = aabbs[i].m_minIndices[3];
 214                                 myPair.y = aabbs[j].m_minIndices[3];
 215                                 myPair.z = NEW_PAIR_MARKER;
 216                                 myPair.w = NEW_PAIR_MARKER;
 217
 218                                 int curPair = atomic_inc (pairCount);
 219                                 if (curPair<maxPairs)
 220                                 {
 221                                                 pairsOut[curPair] = myPair; //flush to main memory
 222                                 }
 223                         }
 224                 }
 225                 j++;
 226
 227         } while (breakRequest[0]<numActiveWgItems[0]);
 228 }
 229
 230
 231 __kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
 232 {
 233         int i = get_global_id(0);
 234         int localId = get_local_id(0);
 235
 236         __local int numActiveWgItems[1];
 237         __local int breakRequest[1];
 238         __local btAabbCL localAabbs[128];// = aabbs[i];
 239
 240         btAabbCL myAabb;
 241
 242         myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
 243         float testValue =       myAabb.m_maxElems[axis];
 244
 245         if (localId==0)
 246         {
 247                 numActiveWgItems[0] = 0;
 248                 breakRequest[0] = 0;
 249         }
 250         int localCount=0;
 251         int block=0;
 252         localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
 253         localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
 254
 255         barrier(CLK_LOCAL_MEM_FENCE);
 256         atomic_inc(numActiveWgItems);
 257         barrier(CLK_LOCAL_MEM_FENCE);
 258         int localBreak = 0;
 259
 260         int j=i+1;
 261         do
 262         {
 263                 barrier(CLK_LOCAL_MEM_FENCE);
 264
 265                 if (j<numObjects)
 266                 {
 267                 if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis]))
 268                         {
 269                                 if (!localBreak)
 270                                 {
 271                                         atomic_inc(breakRequest);
 272                                         localBreak = 1;
 273                                 }
 274                         }
 275                 }
 276
 277                 barrier(CLK_LOCAL_MEM_FENCE);
 278
 279                 if (j>=numObjects && !localBreak)
 280                 {
 281                         atomic_inc(breakRequest);
 282                         localBreak = 1;
 283                 }
 284                 barrier(CLK_LOCAL_MEM_FENCE);
 285
 286                 if (!localBreak)
 287                 {
 288                         if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
 289                         {
 290                                 int4 myPair;
 291                                 myPair.x = myAabb.m_minIndices[3];
 292                                 myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
 293                                 myPair.z = NEW_PAIR_MARKER;
 294                                 myPair.w = NEW_PAIR_MARKER;
 295
 296                                 int curPair = atomic_inc (pairCount);
 297                                 if (curPair<maxPairs)
 298                                 {
 299                                                 pairsOut[curPair] = myPair; //flush to main memory
 300                                 }
 301                         }
 302                 }
 303
 304                 barrier(CLK_LOCAL_MEM_FENCE);
 305
 306                 localCount++;
 307                 if (localCount==64)
 308                 {
 309                         localCount = 0;
 310                         block+=64;
 311                         localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
 312                         localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
 313                 }
 314                 j++;
 315
 316         } while (breakRequest[0]<numActiveWgItems[0]);
 317
 318 }
 319
 320
 321
 322
 323 //http://stereopsis.com/radix.html
 324 unsigned int FloatFlip(float fl);
 325 unsigned int FloatFlip(float fl)
 326 {
 327         unsigned int f = *(unsigned int*)&fl;
 328         unsigned int mask = -(int)(f >> 31) | 0x80000000;
 329         return f ^ mask;
 330 }
 331 float IFloatFlip(unsigned int f);
 332 float IFloatFlip(unsigned int f)
 333 {
 334         unsigned int mask = ((f >> 31) - 1) | 0x80000000;
 335         unsigned int fl = f ^ mask;
 336         return *(float*)&fl;
 337 }
 338
 339
 340
 341
 342 __kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)
 343 {
 344         int i = get_global_id(0);
 345         if (i>=numObjects)
 346                 return;
 347         int src = destAabbs[i].m_maxIndices[3];
 348         destAabbs[i] = allAabbs[src];
 349         destAabbs[i].m_maxIndices[3] = src;
 350 }
 351
 352
 353 __kernel void   flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)
 354 {
 355         int i = get_global_id(0);
 356         if (i>=numObjects)
 357                 return;
 358
 359
 360         sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);
 361         sortData[i].y = i;
 362
 363 }
 364
 365
 366 __kernel void   scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)
 367 {
 368         int i = get_global_id(0);
 369         if (i>=numObjects)
 370                 return;
 371
 372         sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];
 373 }
 374
 375
 376
 377 __kernel void   prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)
 378 {
 379         int i = get_global_id(0);
 380         if (i>=numAabbs)
 381                 return;
 382
 383         btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];
 384
 385         float4 s;
 386         s = (smallAabb.m_max+smallAabb.m_min)*0.5f;
 387         sum[i]=s;
 388         sum2[i]=s*s;
 389 }