dali-physics/third-party/bullet3/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl

   1 /*
   2 Copyright (c) 2012 Advanced Micro Devices, Inc.
   3
   4 This software is provided 'as-is', without any express or implied warranty.
   5 In no event will the authors be held liable for any damages arising from the use of this software.
   6 Permission is granted to anyone to use this software for any purpose,
   7 including commercial applications, and to alter it and redistribute it freely,
   8 subject to the following restrictions:
   9
  10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
  11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
  12 3. This notice may not be removed or altered from any source distribution.
  13 */
  14 //Originally written by Takahiro Harada
  15
  16 #include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
  17
  18 #pragma OPENCL EXTENSION cl_amd_printf : enable
  19 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
  20 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
  21 #pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
  22 #pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
  23
  24 #ifdef cl_ext_atomic_counters_32
  25 #pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
  26 #else
  27 #define counter32_t volatile __global int*
  28 #endif
  29
  30
  31 typedef unsigned int u32;
  32 typedef unsigned short u16;
  33 typedef unsigned char u8;
  34
  35 #define GET_GROUP_IDX get_group_id(0)
  36 #define GET_LOCAL_IDX get_local_id(0)
  37 #define GET_GLOBAL_IDX get_global_id(0)
  38 #define GET_GROUP_SIZE get_local_size(0)
  39 #define GET_NUM_GROUPS get_num_groups(0)
  40 #define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
  41 #define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
  42 #define AtomInc(x) atom_inc(&(x))
  43 #define AtomInc1(x, out) out = atom_inc(&(x))
  44 #define AppendInc(x, out) out = atomic_inc(x)
  45 #define AtomAdd(x, value) atom_add(&(x), value)
  46 #define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
  47 #define AtomXhg(x, value) atom_xchg ( &(x), value )
  48
  49
  50 #define SELECT_UINT4( b, a, condition ) select( b,a,condition )
  51
  52 #define make_float4 (float4)
  53 #define make_float2 (float2)
  54 #define make_uint4 (uint4)
  55 #define make_int4 (int4)
  56 #define make_uint2 (uint2)
  57 #define make_int2 (int2)
  58
  59
  60 #define max2 max
  61 #define min2 min
  62
  63
  64 #define WG_SIZE 64
  65
  66
  67
  68
  69
  70 typedef struct
  71 {
  72         int m_n;
  73         int m_start;
  74         int m_staticIdx;
  75         int m_paddings[1];
  76 } ConstBuffer;
  77
  78 typedef struct
  79 {
  80         int m_a;
  81         int m_b;
  82         u32 m_idx;
  83 }Elem;
  84
  85 #define STACK_SIZE (WG_SIZE*10)
  86 //#define STACK_SIZE (WG_SIZE)
  87 #define RING_SIZE 1024
  88 #define RING_SIZE_MASK (RING_SIZE-1)
  89 #define CHECK_SIZE (WG_SIZE)
  90
  91
  92 #define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)
  93 #define RING_END ldsTmp
  94
  95 u32 readBuf(__local u32* buff, int idx)
  96 {
  97         idx = idx % (32*CHECK_SIZE);
  98         int bitIdx = idx%32;
  99         int bufIdx = idx/32;
 100         return buff[bufIdx] & (1<<bitIdx);
 101 }
 102
 103 void writeBuf(__local u32* buff, int idx)
 104 {
 105         idx = idx % (32*CHECK_SIZE);
 106         int bitIdx = idx%32;
 107         int bufIdx = idx/32;
 108 //      buff[bufIdx] |= (1<<bitIdx);
 109         atom_or( &buff[bufIdx], (1<<bitIdx) );
 110 }
 111
 112 u32 tryWrite(__local u32* buff, int idx)
 113 {
 114         idx = idx % (32*CHECK_SIZE);
 115         int bitIdx = idx%32;
 116         int bufIdx = idx/32;
 117         u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );
 118         return ((ans >> bitIdx)&1) == 0;
 119 }
 120
 121 //      batching on the GPU
 122 __kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,
 123                 __global const u32* gN, __global const u32* gStart, __global int* batchSizes,
 124                 int m_staticIdx )
 125 {
 126         __local u32 ldsStackIdx[STACK_SIZE];
 127         __local u32 ldsStackEnd;
 128         __local Elem ldsRingElem[RING_SIZE];
 129         __local u32 ldsRingEnd;
 130         __local u32 ldsTmp;
 131         __local u32 ldsCheckBuffer[CHECK_SIZE];
 132         __local u32 ldsFixedBuffer[CHECK_SIZE];
 133         __local u32 ldsGEnd;
 134         __local u32 ldsDstEnd;
 135
 136         int wgIdx = GET_GROUP_IDX;
 137         int lIdx = GET_LOCAL_IDX;
 138
 139         const int m_n = gN[wgIdx];
 140         const int m_start = gStart[wgIdx];
 141
 142         if( lIdx == 0 )
 143         {
 144                 ldsRingEnd = 0;
 145                 ldsGEnd = 0;
 146                 ldsStackEnd = 0;
 147                 ldsDstEnd = m_start;
 148         }
 149
 150
 151
 152 //      while(1)
 153 //was 250
 154         int ie=0;
 155         int maxBatch = 0;
 156         for(ie=0; ie<50; ie++)
 157         {
 158                 ldsFixedBuffer[lIdx] = 0;
 159
 160                 for(int giter=0; giter<4; giter++)
 161                 {
 162                         int ringCap = GET_RING_CAPACITY;
 163
 164                         //      1. fill ring
 165                         if( ldsGEnd < m_n )
 166                         {
 167                                 while( ringCap > WG_SIZE )
 168                                 {
 169                                         if( ldsGEnd >= m_n ) break;
 170                                         if( lIdx < ringCap - WG_SIZE )
 171                                         {
 172                                                 int srcIdx;
 173                                                 AtomInc1( ldsGEnd, srcIdx );
 174                                                 if( srcIdx < m_n )
 175                                                 {
 176                                                         int dstIdx;
 177                                                         AtomInc1( ldsRingEnd, dstIdx );
 178
 179                                                         int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit;
 180                                                         int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit;
 181                                                         ldsRingElem[dstIdx].m_a = (a>b)? b:a;
 182                                                         ldsRingElem[dstIdx].m_b = (a>b)? a:b;
 183                                                         ldsRingElem[dstIdx].m_idx = srcIdx;
 184                                                 }
 185                                         }
 186                                         ringCap = GET_RING_CAPACITY;
 187                                 }
 188                         }
 189
 190                         GROUP_LDS_BARRIER;
 191
 192                         //      2. fill stack
 193                         __local Elem* dst = ldsRingElem;
 194                         if( lIdx == 0 ) RING_END = 0;
 195
 196                         int srcIdx=lIdx;
 197                         int end = ldsRingEnd;
 198
 199                         {
 200                                 for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)
 201                                 {
 202                                         Elem e;
 203                                         if(srcIdx<end) e = ldsRingElem[srcIdx];
 204                                         bool done = (srcIdx<end)?false:true;
 205
 206                                         for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;
 207
 208                                         if( !done )
 209                                         {
 210                                                 int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a));
 211                                                 int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b));
 212
 213                                                 if( aUsed==0 && bUsed==0 )
 214                                                 {
 215                                                         int aAvailable=1;
 216                                                         int bAvailable=1;
 217                                                         int ea = abs(e.m_a);
 218                                                         int eb = abs(e.m_b);
 219
 220                                                         bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);
 221                                                         bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);
 222
 223                                                         if (!aStatic)
 224                                                                 aAvailable = tryWrite( ldsCheckBuffer, ea );
 225                                                         if (!bStatic)
 226                                                                 bAvailable = tryWrite( ldsCheckBuffer, eb );
 227
 228                                                         //aAvailable = aStatic? 1: aAvailable;
 229                                                         //bAvailable = bStatic? 1: bAvailable;
 230
 231                                                         bool success = (aAvailable && bAvailable);
 232                                                         if(success)
 233                                                         {
 234
 235                                                                 if (!aStatic)
 236                                                                         writeBuf( ldsFixedBuffer, ea );
 237                                                                 if (!bStatic)
 238                                                                         writeBuf( ldsFixedBuffer, eb );
 239                                                         }
 240                                                         done = success;
 241                                                 }
 242                                         }
 243
 244                                         //      put it aside
 245                                         if(srcIdx<end)
 246                                         {
 247                                                 if( done )
 248                                                 {
 249                                                         int dstIdx; AtomInc1( ldsStackEnd, dstIdx );
 250                                                         if( dstIdx < STACK_SIZE )
 251                                                                 ldsStackIdx[dstIdx] = e.m_idx;
 252                                                         else{
 253                                                                 done = false;
 254                                                                 AtomAdd( ldsStackEnd, -1 );
 255                                                         }
 256                                                 }
 257                                                 if( !done )
 258                                                 {
 259                                                         int dstIdx; AtomInc1( RING_END, dstIdx );
 260                                                         dst[dstIdx] = e;
 261                                                 }
 262                                         }
 263
 264                                         //      if filled, flush
 265                                         if( ldsStackEnd == STACK_SIZE )
 266                                         {
 267                                                 for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)
 268                                                 {
 269                                                         int idx = m_start + ldsStackIdx[i];
 270                                                         int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
 271                                                         gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
 272                                                         gConstraintsOut[ dstIdx ].m_batchIdx = ie;
 273                                                 }
 274                                                 if( lIdx == 0 ) ldsStackEnd = 0;
 275
 276                                                 //for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE)
 277                                                 ldsFixedBuffer[lIdx] = 0;
 278                                         }
 279                                 }
 280                         }
 281
 282                         if( lIdx == 0 ) ldsRingEnd = RING_END;
 283                 }
 284
 285                 GROUP_LDS_BARRIER;
 286
 287                 for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)
 288                 {
 289                         int idx = m_start + ldsStackIdx[i];
 290                         int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
 291                         gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
 292                         gConstraintsOut[ dstIdx ].m_batchIdx = ie;
 293                 }
 294
 295                 //      in case it couldn't consume any pair. Flush them
 296                 //      todo. Serial batch worth while?
 297                 if( ldsStackEnd == 0 )
 298                 {
 299                         for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)
 300                         {
 301                                 int idx = m_start + ldsRingElem[i].m_idx;
 302                                 int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
 303                                 gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
 304                                 int curBatch = 100+i;
 305                                 if (maxBatch < curBatch)
 306                                         maxBatch = curBatch;
 307
 308                                 gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;
 309
 310                         }
 311                         GROUP_LDS_BARRIER;
 312                         if( lIdx == 0 ) ldsRingEnd = 0;
 313                 }
 314
 315                 if( lIdx == 0 ) ldsStackEnd = 0;
 316
 317                 GROUP_LDS_BARRIER;
 318
 319                 //      termination
 320                 if( ldsGEnd == m_n && ldsRingEnd == 0 )
 321                         break;
 322         }
 323
 324         if( lIdx == 0 )
 325         {
 326                 if (maxBatch < ie)
 327                         maxBatch=ie;
 328                 batchSizes[wgIdx]=maxBatch;
 329         }
 330
 331 }
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353