[dali_2.3.21] Merge branch 'devel/master'
[platform/core/uifw/dali-toolkit.git] / dali-physics / third-party / bullet3 / src / Bullet3OpenCL / RigidBody / kernels / batchingKernels.cl
1 /*
2 Copyright (c) 2012 Advanced Micro Devices, Inc.  
3
4 This software is provided 'as-is', without any express or implied warranty.
5 In no event will the authors be held liable for any damages arising from the use of this software.
6 Permission is granted to anyone to use this software for any purpose, 
7 including commercial applications, and to alter it and redistribute it freely, 
8 subject to the following restrictions:
9
10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
12 3. This notice may not be removed or altered from any source distribution.
13 */
14 //Originally written by Takahiro Harada
15
16 #include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
17
18 #pragma OPENCL EXTENSION cl_amd_printf : enable
19 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
20 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
21 #pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
22 #pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
23
24 #ifdef cl_ext_atomic_counters_32
25 #pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
26 #else
27 #define counter32_t volatile __global int*
28 #endif
29
30
31 typedef unsigned int u32;
32 typedef unsigned short u16;
33 typedef unsigned char u8;
34
35 #define GET_GROUP_IDX get_group_id(0)
36 #define GET_LOCAL_IDX get_local_id(0)
37 #define GET_GLOBAL_IDX get_global_id(0)
38 #define GET_GROUP_SIZE get_local_size(0)
39 #define GET_NUM_GROUPS get_num_groups(0)
40 #define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
41 #define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
42 #define AtomInc(x) atom_inc(&(x))
43 #define AtomInc1(x, out) out = atom_inc(&(x))
44 #define AppendInc(x, out) out = atomic_inc(x)
45 #define AtomAdd(x, value) atom_add(&(x), value)
46 #define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
47 #define AtomXhg(x, value) atom_xchg ( &(x), value )
48
49
50 #define SELECT_UINT4( b, a, condition ) select( b,a,condition )
51
52 #define make_float4 (float4)
53 #define make_float2 (float2)
54 #define make_uint4 (uint4)
55 #define make_int4 (int4)
56 #define make_uint2 (uint2)
57 #define make_int2 (int2)
58
59
60 #define max2 max
61 #define min2 min
62
63
64 #define WG_SIZE 64
65
66
67
68
69
70 typedef struct 
71 {
72         int m_n;
73         int m_start;
74         int m_staticIdx;
75         int m_paddings[1];
76 } ConstBuffer;
77
78 typedef struct 
79 {
80         int m_a;
81         int m_b;
82         u32 m_idx;
83 }Elem;
84
85 #define STACK_SIZE (WG_SIZE*10)
86 //#define STACK_SIZE (WG_SIZE)
87 #define RING_SIZE 1024
88 #define RING_SIZE_MASK (RING_SIZE-1)
89 #define CHECK_SIZE (WG_SIZE)
90
91
92 #define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)
93 #define RING_END ldsTmp
94
95 u32 readBuf(__local u32* buff, int idx)
96 {
97         idx = idx % (32*CHECK_SIZE);
98         int bitIdx = idx%32;
99         int bufIdx = idx/32;
100         return buff[bufIdx] & (1<<bitIdx);
101 }
102
103 void writeBuf(__local u32* buff, int idx)
104 {
105         idx = idx % (32*CHECK_SIZE);
106         int bitIdx = idx%32;
107         int bufIdx = idx/32;
108 //      buff[bufIdx] |= (1<<bitIdx);
109         atom_or( &buff[bufIdx], (1<<bitIdx) );
110 }
111
112 u32 tryWrite(__local u32* buff, int idx)
113 {
114         idx = idx % (32*CHECK_SIZE);
115         int bitIdx = idx%32;
116         int bufIdx = idx/32;
117         u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );
118         return ((ans >> bitIdx)&1) == 0;
119 }
120
121 //      batching on the GPU
122 __kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,
123                 __global const u32* gN, __global const u32* gStart, __global int* batchSizes, 
124                 int m_staticIdx )
125 {
126         __local u32 ldsStackIdx[STACK_SIZE];
127         __local u32 ldsStackEnd;
128         __local Elem ldsRingElem[RING_SIZE];
129         __local u32 ldsRingEnd;
130         __local u32 ldsTmp;
131         __local u32 ldsCheckBuffer[CHECK_SIZE];
132         __local u32 ldsFixedBuffer[CHECK_SIZE];
133         __local u32 ldsGEnd;
134         __local u32 ldsDstEnd;
135
136         int wgIdx = GET_GROUP_IDX;
137         int lIdx = GET_LOCAL_IDX;
138         
139         const int m_n = gN[wgIdx];
140         const int m_start = gStart[wgIdx];
141                 
142         if( lIdx == 0 )
143         {
144                 ldsRingEnd = 0;
145                 ldsGEnd = 0;
146                 ldsStackEnd = 0;
147                 ldsDstEnd = m_start;
148         }
149         
150         
151         
152 //      while(1)
153 //was 250
154         int ie=0;
155         int maxBatch = 0;
156         for(ie=0; ie<50; ie++)
157         {
158                 ldsFixedBuffer[lIdx] = 0;
159
160                 for(int giter=0; giter<4; giter++)
161                 {
162                         int ringCap = GET_RING_CAPACITY;
163                 
164                         //      1. fill ring
165                         if( ldsGEnd < m_n )
166                         {
167                                 while( ringCap > WG_SIZE )
168                                 {
169                                         if( ldsGEnd >= m_n ) break;
170                                         if( lIdx < ringCap - WG_SIZE )
171                                         {
172                                                 int srcIdx;
173                                                 AtomInc1( ldsGEnd, srcIdx );
174                                                 if( srcIdx < m_n )
175                                                 {
176                                                         int dstIdx;
177                                                         AtomInc1( ldsRingEnd, dstIdx );
178                                                         
179                                                         int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit;
180                                                         int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit;
181                                                         ldsRingElem[dstIdx].m_a = (a>b)? b:a;
182                                                         ldsRingElem[dstIdx].m_b = (a>b)? a:b;
183                                                         ldsRingElem[dstIdx].m_idx = srcIdx;
184                                                 }
185                                         }
186                                         ringCap = GET_RING_CAPACITY;
187                                 }
188                         }
189
190                         GROUP_LDS_BARRIER;
191         
192                         //      2. fill stack
193                         __local Elem* dst = ldsRingElem;
194                         if( lIdx == 0 ) RING_END = 0;
195
196                         int srcIdx=lIdx;
197                         int end = ldsRingEnd;
198
199                         {
200                                 for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)
201                                 {
202                                         Elem e;
203                                         if(srcIdx<end) e = ldsRingElem[srcIdx];
204                                         bool done = (srcIdx<end)?false:true;
205
206                                         for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;
207                                         
208                                         if( !done )
209                                         {
210                                                 int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a));
211                                                 int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b));
212
213                                                 if( aUsed==0 && bUsed==0 )
214                                                 {
215                                                         int aAvailable=1;
216                                                         int bAvailable=1;
217                                                         int ea = abs(e.m_a);
218                                                         int eb = abs(e.m_b);
219
220                                                         bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);
221                                                         bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);
222                                                         
223                                                         if (!aStatic)
224                                                                 aAvailable = tryWrite( ldsCheckBuffer, ea );
225                                                         if (!bStatic)
226                                                                 bAvailable = tryWrite( ldsCheckBuffer, eb );
227                                                         
228                                                         //aAvailable = aStatic? 1: aAvailable;
229                                                         //bAvailable = bStatic? 1: bAvailable;
230
231                                                         bool success = (aAvailable && bAvailable);
232                                                         if(success)
233                                                         {
234                                                         
235                                                                 if (!aStatic)
236                                                                         writeBuf( ldsFixedBuffer, ea );
237                                                                 if (!bStatic)
238                                                                         writeBuf( ldsFixedBuffer, eb );
239                                                         }
240                                                         done = success;
241                                                 }
242                                         }
243
244                                         //      put it aside
245                                         if(srcIdx<end)
246                                         {
247                                                 if( done )
248                                                 {
249                                                         int dstIdx; AtomInc1( ldsStackEnd, dstIdx );
250                                                         if( dstIdx < STACK_SIZE )
251                                                                 ldsStackIdx[dstIdx] = e.m_idx;
252                                                         else{
253                                                                 done = false;
254                                                                 AtomAdd( ldsStackEnd, -1 );
255                                                         }
256                                                 }
257                                                 if( !done )
258                                                 {
259                                                         int dstIdx; AtomInc1( RING_END, dstIdx );
260                                                         dst[dstIdx] = e;
261                                                 }
262                                         }
263
264                                         //      if filled, flush
265                                         if( ldsStackEnd == STACK_SIZE )
266                                         {
267                                                 for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)
268                                                 {
269                                                         int idx = m_start + ldsStackIdx[i];
270                                                         int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
271                                                         gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
272                                                         gConstraintsOut[ dstIdx ].m_batchIdx = ie;
273                                                 }
274                                                 if( lIdx == 0 ) ldsStackEnd = 0;
275
276                                                 //for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) 
277                                                 ldsFixedBuffer[lIdx] = 0;
278                                         }
279                                 }
280                         }
281
282                         if( lIdx == 0 ) ldsRingEnd = RING_END;
283                 }
284
285                 GROUP_LDS_BARRIER;
286
287                 for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)
288                 {
289                         int idx = m_start + ldsStackIdx[i];
290                         int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
291                         gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
292                         gConstraintsOut[ dstIdx ].m_batchIdx = ie;
293                 }
294
295                 //      in case it couldn't consume any pair. Flush them
296                 //      todo. Serial batch worth while?
297                 if( ldsStackEnd == 0 )
298                 {
299                         for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)
300                         {
301                                 int idx = m_start + ldsRingElem[i].m_idx;
302                                 int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
303                                 gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
304                                 int curBatch = 100+i;
305                                 if (maxBatch < curBatch)
306                                         maxBatch = curBatch;
307                                 
308                                 gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;
309                                 
310                         }
311                         GROUP_LDS_BARRIER;
312                         if( lIdx == 0 ) ldsRingEnd = 0;
313                 }
314
315                 if( lIdx == 0 ) ldsStackEnd = 0;
316
317                 GROUP_LDS_BARRIER;
318
319                 //      termination
320                 if( ldsGEnd == m_n && ldsRingEnd == 0 )
321                         break;
322         }
323
324         if( lIdx == 0 )
325         {
326                 if (maxBatch < ie)
327                         maxBatch=ie;
328                 batchSizes[wgIdx]=maxBatch;
329         }
330
331 }
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353