Demos/ParticlesOpenCL/ParticlesOCL.cl

   1 MSTRINGIFY(\r
   2 \r
   3 /*\r
   4 Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org\r
   5 Copyright (C) 2006 - 2009 Sony Computer Entertainment Inc. \r
   6 \r
   7 This software is provided 'as-is', without any express or implied warranty.\r
   8 In no event will the authors be held liable for any damages arising from the use of this software.\r
   9 Permission is granted to anyone to use this software for any purpose, \r
  10 including commercial applications, and to alter it and redistribute it freely, \r
  11 subject to the following restrictions:\r
  12 \r
  13 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\r
  14 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\r
  15 3. This notice may not be removed or altered from any source distribution.\r
  16 */\r
  17 \r
  18 \r
  19 \r
  20 int4 getGridPos(float4 worldPos, __global float4* pParams)\r
  21 {\r
  22     int4 gridPos;\r
  23     gridPos.x = (int)floor((worldPos.x - pParams[1].x) / pParams[3].x);\r
  24     gridPos.y = (int)floor((worldPos.y - pParams[1].y) / pParams[3].y);\r
  25     gridPos.z = (int)floor((worldPos.z - pParams[1].z) / pParams[3].z);\r
  26     return gridPos;\r
  27 }\r
  28 \r
  29 unsigned int getPosHash(int4 gridPos, __global float4* pParams)\r
  30 {\r
  31         int4 gridDim = *((__global int4*)(pParams + 4));\r
  32         if(gridPos.x < 0) gridPos.x = 0;\r
  33         if(gridPos.x >= gridDim.x) gridPos.x = gridDim.x - 1;\r
  34         if(gridPos.y < 0) gridPos.y = 0;\r
  35         if(gridPos.y >= gridDim.y) gridPos.y = gridDim.y - 1;\r
  36         if(gridPos.z < 0) gridPos.z = 0;\r
  37         if(gridPos.z >= gridDim.z) gridPos.z = gridDim.z - 1;\r
  38         unsigned int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\r
  39         return hash;\r
  40 } \r
  41 \r
  42 \r
  43 __kernel void kComputeCellId(   int numParticles, \r
  44                                                                 __global float4* pPos, \r
  45                                                                 __global int2* pPosHash,\r
  46                                                                 __global float4* pParams GUID_ARG)\r
  47 {\r
  48     int index = get_global_id(0);\r
  49     if(index >= numParticles)\r
  50     {\r
  51                 return;\r
  52     }\r
  53         float4 pos = pPos[index];\r
  54         int4 gridPos = getGridPos(pos, pParams);\r
  55         unsigned int hash = getPosHash(gridPos, pParams);\r
  56         pPosHash[index].x = hash;\r
  57         pPosHash[index].y = index;\r
  58 }\r
  59 \r
  60 __kernel void kClearCellStart(  int numCells, \r
  61                                                                 __global int* pCellStart GUID_ARG)\r
  62 {\r
  63     int index = get_global_id(0);\r
  64     if(index >= numCells)\r
  65         {\r
  66                 return;\r
  67         }\r
  68         pCellStart[index] = -1;\r
  69 }\r
  70 \r
  71 __kernel void kFindCellStart(   int numParticles, \r
  72                                                                 __global int2* pHash, \r
  73                                                                 __global int* cellStart,\r
  74                                                                 __global float4* pPos,\r
  75                                                                 __global float4* pVel,\r
  76                                                                 __global float4* pSortedPos,\r
  77                                                                 __global float4* pSortedVel GUID_ARG)\r
  78 {\r
  79     int index = get_global_id(0);\r
  80         __local int sharedHash[1025];//maximum workgroup size 1024\r
  81         int2 sortedData;\r
  82         \r
  83     if(index < numParticles)\r
  84         {\r
  85 \r
  86                 sortedData = pHash[index];\r
  87                 // Load hash data into shared memory so that we can look \r
  88                 // at neighboring body's hash value without loading\r
  89                 // two hash values per thread\r
  90                 sharedHash[get_local_id(0) + 1] = sortedData.x;\r
  91                 if((index > 0) && (get_local_id(0) == 0))\r
  92                 {\r
  93                         // first thread in block must load neighbor body hash\r
  94                         sharedHash[0] = pHash[index-1].x;\r
  95                 }\r
  96                 \r
  97         }\r
  98     barrier(CLK_LOCAL_MEM_FENCE);\r
  99         \r
 100         if(index < numParticles)\r
 101         {\r
 102                 if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\r
 103                 {\r
 104                         cellStart[sortedData.x] = index;\r
 105                 }\r
 106                 int unsortedIndex = sortedData.y;\r
 107                 float4 pos = pPos[unsortedIndex];\r
 108                 float4 vel = pVel[unsortedIndex];\r
 109                 pSortedPos[index] = pos;\r
 110                 pSortedVel[index] = vel;\r
 111         }\r
 112 }\r
 113 \r
 114 __kernel void kIntegrateMotion( int numParticles,\r
 115                                                                 __global float4* pPos, \r
 116                                                                 __global float4* pVel, \r
 117                                                                 __global float4* pParams, \r
 118                                                                 float timeStep GUID_ARG)\r
 119 {\r
 120     int index = get_global_id(0);\r
 121     if(index >= numParticles)\r
 122     {\r
 123                 return;\r
 124         }\r
 125     float4 pos = pPos[index];\r
 126     float4 vel = pVel[index];\r
 127     pos.w = 1.0f;\r
 128     vel.w = 0.0f;\r
 129     // apply gravity\r
 130     float4 gravity = *((__global float4*)(pParams + 0));\r
 131     float particleRad = pParams[5].x;\r
 132     float globalDamping = pParams[5].y;\r
 133     float boundaryDamping = pParams[5].z;\r
 134     vel += gravity * timeStep;\r
 135     vel *= globalDamping;\r
 136     // integrate position\r
 137     pos += vel * timeStep;\r
 138     // collide with world boundaries\r
 139     float4 worldMin = *((__global float4*)(pParams + 1));\r
 140     float4 worldMax = *((__global float4*)(pParams + 2));\r
 141     \r
 142     \r
 143     if(pos.x < (worldMin.x + 2*particleRad))\r
 144     {\r
 145         pos.x = worldMin.x + 2*particleRad;\r
 146         vel.x *= boundaryDamping;\r
 147     }\r
 148     if(pos.x > (worldMax.x - 2*particleRad))\r
 149     {\r
 150         pos.x = worldMax.x - 2*particleRad;\r
 151         vel.x *= boundaryDamping;\r
 152     }\r
 153     if(pos.y < (worldMin.y + 2*particleRad))\r
 154     {\r
 155         pos.y = worldMin.y + 2*particleRad;\r
 156         vel.y *= boundaryDamping;\r
 157     }\r
 158     if(pos.y > (worldMax.y - 2*particleRad))\r
 159     {\r
 160         pos.y = worldMax.y - 2*particleRad;\r
 161         vel.y *= boundaryDamping;\r
 162     }\r
 163     if(pos.z < (worldMin.z + 2*particleRad))\r
 164     {\r
 165         pos.z = worldMin.z + 2*particleRad;\r
 166         vel.z *= boundaryDamping;\r
 167     }\r
 168     if(pos.z > (worldMax.z - 2*particleRad))\r
 169     {\r
 170         pos.z = worldMax.z - 2*particleRad;\r
 171         vel.z *= boundaryDamping;\r
 172     }\r
 173     // write back position and velocity\r
 174     pPos[index] = pos;\r
 175     pVel[index] = vel;\r
 176 }\r
 177 \r
 178 \r
 179 float4 collideTwoParticles(\r
 180     float4 posA,\r
 181     float4 posB,\r
 182     float4 velA,\r
 183     float4 velB,\r
 184     float radiusA,\r
 185     float radiusB,\r
 186     float spring,\r
 187     float damping,\r
 188     float shear,\r
 189     float attraction\r
 190 )\r
 191 {\r
 192     //Calculate relative position\r
 193     float4     relPos = posB - posA; relPos.w = 0.f;\r
 194     float        dist = sqrt(relPos.x * relPos.x + relPos.y * relPos.y + relPos.z * relPos.z);\r
 195     float collideDist = radiusA + radiusB;\r
 196 \r
 197     float4 force = (float4)0.f;\r
 198     if(dist < collideDist){\r
 199         float4 norm = relPos * (1.f / dist); norm.w = 0.f;\r
 200 \r
 201         //Relative velocity\r
 202         float4 relVel = velB - velA; relVel.w = 0.f;\r
 203 \r
 204         //Relative tangential velocity\r
 205         float relVelDotNorm = relVel.x * norm.x + relVel.y * norm.y + relVel.z * norm.z;\r
 206         float4 tanVel = relVel - norm * relVelDotNorm;  tanVel.w = 0.f;\r
 207 \r
 208         //Spring force (potential)\r
 209         float springFactor = -spring * (collideDist - dist);\r
 210         force = springFactor * norm + damping * relVel + shear * tanVel + attraction * relPos;\r
 211         force.w = 0.f;\r
 212     }\r
 213     return force;\r
 214 }\r
 215 \r
 216 \r
 217 __kernel void kCollideParticles(int numParticles,\r
 218                                                                 __global float4*                pVel,          //output: new velocity\r
 219                                                                 __global const float4* pSortedPos, //input: reordered positions\r
 220                                                                 __global const float4* pSortedVel, //input: reordered velocities\r
 221                                                                 __global const int2   *pPosHash,        //input: reordered particle indices\r
 222                                                                 __global const int   *pCellStart,    //input: cell boundaries\r
 223                                                                 __global float4* pParams GUID_ARG)\r
 224 {\r
 225     int index = get_global_id(0);\r
 226     if(index >= numParticles)\r
 227         {\r
 228         return;\r
 229         }\r
 230 \r
 231     float4   posA = pSortedPos[index];\r
 232     float4   velA = pSortedVel[index];\r
 233     float4 force = (float4)0.f;\r
 234     float particleRad = pParams[5].x;\r
 235     float collisionDamping = pParams[5].w;\r
 236     float spring = pParams[6].x;\r
 237     float shear = pParams[6].y;\r
 238     float attraction = pParams[6].z;\r
 239     int unsortedIndex = pPosHash[index].y;\r
 240 \r
 241     //Get address in grid\r
 242     int4 gridPosA = getGridPos(posA, pParams);\r
 243 \r
 244     //Accumulate surrounding cells\r
 245     int4 gridPosB; \r
 246     for(int z = -1; z <= 1; z++)\r
 247         {\r
 248                 gridPosB.z = gridPosA.z + z;\r
 249         for(int y = -1; y <= 1; y++)\r
 250                 {\r
 251                         gridPosB.y = gridPosA.y + y;\r
 252             for(int x = -1; x <= 1; x++)\r
 253             {\r
 254                                 gridPosB.x = gridPosA.x + x;\r
 255                 //Get start particle index for this cell\r
 256                 uint hashB = getPosHash(gridPosB, pParams);\r
 257                 int startI = pCellStart[hashB];\r
 258                 //Skip empty cell\r
 259                 if(startI < 0)\r
 260                 {\r
 261                     continue;\r
 262                 }\r
 263                //Iterate over particles in this cell\r
 264                 int endI = startI + 32;\r
 265                 if(endI >= numParticles) \r
 266                                         endI = numParticles ;\r
 267                                         \r
 268                 for(int j = startI; j < endI; j++)\r
 269                 {\r
 270                                         uint hashC = pPosHash[j].x;\r
 271                                         if(hashC != hashB)\r
 272                                         {\r
 273                                                 break;\r
 274                                         }\r
 275                                         if(j == index)\r
 276                                         {\r
 277                                                 continue;\r
 278                                         }\r
 279                     float4 posB = pSortedPos[j];\r
 280                     float4 velB = pSortedVel[j];\r
 281                     //Collide two spheres\r
 282                     force += collideTwoParticles(       posA, posB, velA, velB, particleRad, particleRad, \r
 283                                                                                                         spring, collisionDamping, shear, attraction);\r
 284                 }\r
 285             }\r
 286                 }\r
 287         }     \r
 288     //Write new velocity back to original unsorted location\r
 289     pVel[unsortedIndex] = velA + force;\r
 290 }\r
 291 \r
 292 \r
 293 \r
 294 \r
 295 \r
 296 \r
 297 /*\r
 298  * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.\r
 299  *\r
 300  * NVIDIA Corporation and its licensors retain all intellectual property and \r
 301  * proprietary rights in and to this software and related documentation. \r
 302  * Any use, reproduction, disclosure, or distribution of this software \r
 303  * and related documentation without an express license agreement from\r
 304  * NVIDIA Corporation is strictly prohibited.\r
 305  *\r
 306  * Please refer to the applicable NVIDIA end user license agreement (EULA) \r
 307  * associated with this source code for terms and conditions that govern \r
 308  * your use of this NVIDIA software.\r
 309  * \r
 310  */\r
 311 \r
 312 \r
 313 \r
 314 inline void ComparatorPrivate(int2* keyA, int2* keyB, uint dir)\r
 315 {\r
 316     if((keyA[0].x > keyB[0].x) == dir)\r
 317     {\r
 318                 int2 tmp = *keyA;\r
 319                 *keyA = *keyB;\r
 320                 *keyB = tmp;\r
 321     }\r
 322 }\r
 323 \r
 324 inline void ComparatorLocal(__local int2* keyA, __local int2* keyB, uint dir)\r
 325 {\r
 326     if((keyA[0].x > keyB[0].x) == dir)\r
 327     {\r
 328                 int2 tmp = *keyA;\r
 329                 *keyA = *keyB;\r
 330                 *keyB = tmp;\r
 331     }\r
 332 }\r
 333 \r
 334 ////////////////////////////////////////////////////////////////////////////////\r
 335 // Monolithic bitonic sort kernel for short arrays fitting into local memory\r
 336 ////////////////////////////////////////////////////////////////////////////////\r
 337 __kernel void kBitonicSortCellIdLocal(__global int2* pKey, uint arrayLength, uint dir GUID_ARG)\r
 338 {\r
 339     __local int2 l_key[LOCAL_SIZE_MAX];\r
 340     int localSizeLimit = get_local_size(0) * 2;\r
 341 \r
 342     //Offset to the beginning of subbatch and load data\r
 343     pKey += get_group_id(0) * localSizeLimit + get_local_id(0);\r
 344     l_key[get_local_id(0) +                    0] = pKey[                   0];\r
 345     l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];\r
 346 \r
 347     for(uint size = 2; size < arrayLength; size <<= 1)\r
 348     {\r
 349         //Bitonic merge\r
 350         uint ddd = dir ^ ( (get_local_id(0) & (size / 2)) != 0 );\r
 351         for(uint stride = size / 2; stride > 0; stride >>= 1)\r
 352         {\r
 353             barrier(CLK_LOCAL_MEM_FENCE);\r
 354             uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));\r
 355             ComparatorLocal(&l_key[pos +      0], &l_key[pos + stride], ddd);\r
 356         }\r
 357     }\r
 358 \r
 359     //ddd == dir for the last bitonic merge step\r
 360     {\r
 361         for(uint stride = arrayLength / 2; stride > 0; stride >>= 1)\r
 362         {\r
 363             barrier(CLK_LOCAL_MEM_FENCE);\r
 364             uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));\r
 365             ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], dir);\r
 366         }\r
 367     }\r
 368 \r
 369     barrier(CLK_LOCAL_MEM_FENCE);\r
 370     pKey[                   0] = l_key[get_local_id(0) +                    0];\r
 371     pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];\r
 372 }\r
 373 \r
 374 ////////////////////////////////////////////////////////////////////////////////\r
 375 // Bitonic sort kernel for large arrays (not fitting into local memory)\r
 376 ////////////////////////////////////////////////////////////////////////////////\r
 377 //Bottom-level bitonic sort\r
 378 //Almost the same as bitonicSortLocal with the only exception\r
 379 //of even / odd subarrays (of LOCAL_SIZE_LIMIT points) being\r
 380 //sorted in opposite directions\r
 381 __kernel void kBitonicSortCellIdLocal1(__global int2* pKey GUID_ARG)\r
 382 {\r
 383     __local int2 l_key[LOCAL_SIZE_MAX];\r
 384     uint localSizeLimit = get_local_size(0) * 2;\r
 385 \r
 386     //Offset to the beginning of subarray and load data\r
 387     pKey += get_group_id(0) * localSizeLimit + get_local_id(0);\r
 388     l_key[get_local_id(0) +                    0] = pKey[                   0];\r
 389     l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];\r
 390 \r
 391     uint comparatorI = get_global_id(0) & ((localSizeLimit / 2) - 1);\r
 392 \r
 393     for(uint size = 2; size < localSizeLimit; size <<= 1)\r
 394     {\r
 395         //Bitonic merge\r
 396         uint ddd = (comparatorI & (size / 2)) != 0;\r
 397         for(uint stride = size / 2; stride > 0; stride >>= 1)\r
 398         {\r
 399             barrier(CLK_LOCAL_MEM_FENCE);\r
 400             uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));\r
 401             ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);\r
 402         }\r
 403     }\r
 404 \r
 405     //Odd / even arrays of localSizeLimit elements\r
 406     //sorted in opposite directions\r
 407     {\r
 408         uint ddd = (get_group_id(0) & 1);\r
 409         for(uint stride = localSizeLimit / 2; stride > 0; stride >>= 1)\r
 410         {\r
 411             barrier(CLK_LOCAL_MEM_FENCE);\r
 412             uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));\r
 413             ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);\r
 414         }\r
 415     }\r
 416 \r
 417     barrier(CLK_LOCAL_MEM_FENCE);\r
 418     pKey[                   0] = l_key[get_local_id(0) +                    0];\r
 419     pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];\r
 420 }\r
 421 \r
 422 //Bitonic merge iteration for 'stride' >= LOCAL_SIZE_LIMIT\r
 423 __kernel void kBitonicSortCellIdMergeGlobal(__global int2* pKey, uint arrayLength, uint size, uint stride, uint dir GUID_ARG)\r
 424 {\r
 425     uint global_comparatorI = get_global_id(0);\r
 426     uint        comparatorI = global_comparatorI & (arrayLength / 2 - 1);\r
 427 \r
 428     //Bitonic merge\r
 429     uint ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );\r
 430     uint pos = 2 * global_comparatorI - (global_comparatorI & (stride - 1));\r
 431 \r
 432     int2 keyA = pKey[pos +      0];\r
 433     int2 keyB = pKey[pos + stride];\r
 434 \r
 435     ComparatorPrivate(&keyA, &keyB, ddd);\r
 436 \r
 437     pKey[pos +      0] = keyA;\r
 438     pKey[pos + stride] = keyB;\r
 439 }\r
 440 \r
 441 //Combined bitonic merge steps for\r
 442 //'size' > LOCAL_SIZE_LIMIT and 'stride' = [1 .. LOCAL_SIZE_LIMIT / 2]\r
 443 __kernel void kBitonicSortCellIdMergeLocal(__global int2* pKey, uint arrayLength, uint stride, uint size, uint dir GUID_ARG)\r
 444 {\r
 445     __local int2 l_key[LOCAL_SIZE_MAX];\r
 446     int localSizeLimit = get_local_size(0) * 2;\r
 447 \r
 448     pKey += get_group_id(0) * localSizeLimit + get_local_id(0);\r
 449     l_key[get_local_id(0) +                    0] = pKey[                   0];\r
 450     l_key[get_local_id(0) + (localSizeLimit / 2)] = pKey[(localSizeLimit / 2)];\r
 451 \r
 452     //Bitonic merge\r
 453     uint comparatorI = get_global_id(0) & ((arrayLength / 2) - 1);\r
 454     uint         ddd = dir ^ ( (comparatorI & (size / 2)) != 0 );\r
 455     for(; stride > 0; stride >>= 1)\r
 456     {\r
 457         barrier(CLK_LOCAL_MEM_FENCE);\r
 458         uint pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));\r
 459         ComparatorLocal(&l_key[pos + 0], &l_key[pos + stride], ddd);\r
 460     }\r
 461 \r
 462     barrier(CLK_LOCAL_MEM_FENCE);\r
 463     pKey[                   0] = l_key[get_local_id(0) +                    0];\r
 464     pKey[(localSizeLimit / 2)] = l_key[get_local_id(0) + (localSizeLimit / 2)];\r
 465 }\r
 466 \r
 467 );\r