[dali_2.3.21] Merge branch 'devel/master'
[platform/core/uifw/dali-toolkit.git] / dali-physics / third-party / bullet3 / src / Bullet3OpenCL / ParallelPrimitives / kernels / RadixSort32Kernels.cl
1 /*
2 Bullet Continuous Collision Detection and Physics Library
3 Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
4
5 This software is provided 'as-is', without any express or implied warranty.
6 In no event will the authors be held liable for any damages arising from the use of this software.
7 Permission is granted to anyone to use this software for any purpose, 
8 including commercial applications, and to alter it and redistribute it freely, 
9 subject to the following restrictions:
10
11 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13 3. This notice may not be removed or altered from any source distribution.
14 */
15 //Author Takahiro Harada
16
17
//#pragma OPENCL EXTENSION cl_amd_printf : enable
//	32-bit atomics on local and global memory are required by the histogram kernels below.
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

typedef unsigned int u32;
//	Thin aliases over the OpenCL work-item built-ins (a 1D launch is assumed).
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define AtomAdd(x, value) atom_add(&(x), value)

//	select( b, a, c ): per component, yields a where the MSB of c is set, else b.
#define SELECT_UINT4( b, a, condition ) select( b,a,condition )


#define make_uint4 (uint4)
#define make_uint2 (uint2)
#define make_int2 (int2)

//	Work-group size of the sort kernels; each work-item handles
//	ELEMENTS_PER_WORK_ITEM keys, so one "block" is 256 keys.
#define WG_SIZE 64
#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE)
//	Radix width: 4 bits per pass -> 16 buckets.
#define BITS_PER_PASS 4
#define NUM_BUCKET (1<<BITS_PER_PASS)
typedef uchar u8;

//	This isn't an optimization for VLIW; it just reduces LDS writes.
#define USE_2LEVEL_REDUCE 1

//#define CHECK_BOUNDARY 1

//#define NV_GPU 1


//	Values scanned per work-item in PrefixScanKernel (tuned per GPU).
//	Cypress
#define nPerWI 16
//	Cayman
//#define nPerWI 20

//	The constant buffer is passed as an int4; these map field names onto its lanes.
#define m_n x
#define m_nWGs y
#define m_startBit z
#define m_nBlocksPerWG w
63
/*
typedef struct
{
	int m_n;
	int m_nWGs;
	int m_startBit;
	int m_nBlocksPerWG;
} ConstBuffer;
*/

//	Key/value pair handled by the *SortData* kernels; sorting is on m_key only,
//	m_value travels along with it.
typedef struct
{
	unsigned int m_key;
	unsigned int m_value;
} SortDataCL;
79
80
//	Exclusive prefix scan of the four lanes of *data, in place.
//	On return data->x..w hold {0, x, x+y, x+y+z}; the total x+y+z+w is returned.
uint prefixScanVectorEx( uint4* data )
{
	uint4 v = data[0];
	data[0].x = 0;
	data[0].y = v.x;
	data[0].z = v.x + v.y;
	data[0].w = v.x + v.y + v.z;
	return v.x + v.y + v.z + v.w;
}
98
//	Work-group-wide exclusive prefix sum of one u32 per work-item.
//	Returns the exclusive prefix for lane lIdx; *totalSum receives the sum of
//	all wgSize values.  sorterSharedMemory must hold 2*wgSize u32s: the lower
//	half is zero-filled so the strided reads below may reach "before" the data
//	safely.  wgSize is 64 or 128.
//	NOTE(review): the dependent LDS read / AtomAdd sequences are separated only
//	by mem_fence, not barrier; this appears to assume the first 64 lanes run in
//	lockstep (AMD wavefront) — confirm before relying on it on other hardware.
u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ )
{
	{	//	Set data: zero apron in the lower half, values in the upper half
		sorterSharedMemory[lIdx] = 0;
		sorterSharedMemory[lIdx+wgSize] = pData;
	}

	GROUP_LDS_BARRIER;

	{	//	Prefix sum
		//	Each of the 64 active lanes owns the odd slot idx (stride 2 covers 128 entries).
		int idx = 2*lIdx + (wgSize+1);
#if defined(USE_2LEVEL_REDUCE)
		if( lIdx < 64 )
		{
			//	Each AtomAdd folds three scan steps into a single LDS update.
			u32 u0, u1, u2;
			u0 = sorterSharedMemory[idx-3];
			u1 = sorterSharedMemory[idx-2];
			u2 = sorterSharedMemory[idx-1];
			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );
			GROUP_MEM_FENCE;

			u0 = sorterSharedMemory[idx-12];
			u1 = sorterSharedMemory[idx-8];
			u2 = sorterSharedMemory[idx-4];
			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );
			GROUP_MEM_FENCE;

			u0 = sorterSharedMemory[idx-48];
			u1 = sorterSharedMemory[idx-32];
			u2 = sorterSharedMemory[idx-16];
			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );
			GROUP_MEM_FENCE;
			if( wgSize > 64 )
			{
				sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
				GROUP_MEM_FENCE;
			}

			//	Fill the even slots so every lane can read its exclusive prefix below.
			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
			GROUP_MEM_FENCE;
		}
#else
		if( lIdx < 64 )
		{
			//	Classic doubling-stride inclusive scan over the odd slots.
			sorterSharedMemory[idx] += sorterSharedMemory[idx-1];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-2];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
			GROUP_MEM_FENCE;
			if( wgSize > 64 )
			{
				sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
				GROUP_MEM_FENCE;
			}

			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
			GROUP_MEM_FENCE;
		}
#endif
	}

	GROUP_LDS_BARRIER;

	*totalSum = sorterSharedMemory[wgSize*2-1];	//	inclusive total of the whole group
	u32 addValue = sorterSharedMemory[lIdx+wgSize-1];	//	exclusive prefix for this lane
	return addValue;
}
173
//__attribute__((reqd_work_group_size(128,1,1)))
//	Exclusive prefix sum of 4*128 values (one uint4 per work-item, wgSize 128).
//	Returns pData's four lanes offset by this work-item's group-wide rank;
//	*totalSum receives the grand total.
uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )
{
	u32 laneTotal = prefixScanVectorEx( &pData );
	u32 base = localPrefixSum( laneTotal, lIdx, totalSum, sorterSharedMemory, 128 );
	return pData + make_uint4( base, base, base, base );
}
181
182
//__attribute__((reqd_work_group_size(64,1,1)))
//	Exclusive prefix sum of 4*64 values (one uint4 per work-item, wgSize 64).
//	Returns pData's four lanes offset by this work-item's group-wide rank;
//	*totalSum receives the grand total.
uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )
{
	u32 laneTotal = prefixScanVectorEx( &pData );
	u32 base = localPrefixSum( laneTotal, lIdx, totalSum, sorterSharedMemory, 64 );
	return pData + make_uint4( base, base, base, base );
}
190
//	Extract byte keyIdx (0..3) from a u32 holding four packed 8-bit counters.
u32 unpack4Key( u32 key, int keyIdx )
{
	return ( key >> (8*keyIdx) ) & 0xff;
}
192
//	Exclusive prefix sum of four 8-bit counters packed in v: byte i of the
//	result is the sum of bytes 0..i-1.  Equivalent to
//	(v<<8)+(v<<16)+(v<<24) — carries wrap into higher bytes exactly as in the
//	original shift-and-add form.
u32 bit8Scan(u32 v)
{
	return v * 0x01010100u;
}
197
198 //===
199
200
201
202
//	Per-lane histogram slot in LDS: bucket idx for lane lIdx lives at
//	localHistogramMat[idx*WG_SIZE + lIdx].  Must be expanded inside a kernel
//	that declares localHistogramMat and lIdx.
#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx]
204
205
//	First pass of the radix sort: per work-group, counts how many keys fall in
//	each of the 16 buckets of the 4-bit digit at startBit.
//	Output layout: histogramOut[bucket*nWGs + wg] (later scanned by PrefixScanKernel).
//	NOTE(review): with CHECK_BOUNDARY disabled, n is assumed to be a multiple
//	of the 256-key block size — confirm the host pads the input.
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb )
{
	//	One row per bucket, one private counter column per lane (no cross-lane contention).
	__local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];

	u32 gIdx = GET_GLOBAL_IDX;
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	u32 wgSize = GET_GROUP_SIZE;
	const int startBit = cb.m_startBit;
	const int n = cb.m_n;
	const int nWGs = cb.m_nWGs;
	const int nBlocksPerWG = cb.m_nBlocksPerWG;

	//	Zero this lane's 16 counters.
	for(int i=0; i<NUM_BUCKET; i++)
	{
		MY_HISTOGRAM(i) = 0;
	}

	GROUP_LDS_BARRIER;

	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
	u32 localKey;

	//	Blocks left between this work-group's starting block and the end of the data.
	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;

	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;

	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)
	{
		//	MY_HISTOGRAM( localKeys.x )++ is much more expensive than atomic add, as it requires a read and a write while atomics can just add on AMD
		//	Using registers didn't perform well. It seems like using localKeys to address requires a lot of ALU ops
		//	AMD: AtomInc performs better while NV prefers ++
		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
		{
#if defined(CHECK_BOUNDARY)
			if( addr+i < n )
#endif
			{
				//	4-bit digit of this key.
				localKey = (gSrc[addr+i]>>startBit) & 0xf;
#if defined(NV_GPU)
				MY_HISTOGRAM( localKey )++;
#else
				AtomInc( MY_HISTOGRAM( localKey ) );
#endif
			}
		}
	}

	GROUP_LDS_BARRIER;
	
	//	Lanes 0..15 each reduce one bucket row; the (i+lIdx)%size rotation
	//	staggers LDS accesses across lanes.
	if( lIdx < NUM_BUCKET )
	{
		u32 sum = 0;
		for(int i=0; i<GET_GROUP_SIZE; i++)
		{
			sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];
		}
		histogramOut[lIdx*nWGs+wgIdx] = sum;
	}
}
268
//	Same as StreamCountKernel, but the keys come from SortDataCL::m_key
//	(key/value sorting path).  Output layout: histogramOut[bucket*nWGs + wg].
//	NOTE(review): with CHECK_BOUNDARY disabled, n is assumed to be a multiple
//	of the 256-element block size — confirm the host pads the input.
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4  cb )
{
	//	One row per bucket, one private counter column per lane.
	__local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];

	u32 gIdx = GET_GLOBAL_IDX;
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	u32 wgSize = GET_GROUP_SIZE;
	const int startBit = cb.m_startBit;
	const int n = cb.m_n;
	const int nWGs = cb.m_nWGs;
	const int nBlocksPerWG = cb.m_nBlocksPerWG;

	for(int i=0; i<NUM_BUCKET; i++)
	{
		MY_HISTOGRAM(i) = 0;
	}

	GROUP_LDS_BARRIER;

	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
	u32 localKey;

	//	Blocks left between this work-group's starting block and the end of the data.
	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;

	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;

	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)
	{
		//	MY_HISTOGRAM( localKeys.x )++ is much more expensive than atomic add, as it requires a read and a write while atomics can just add on AMD
		//	Using registers didn't perform well. It seems like using localKeys to address requires a lot of ALU ops
		//	AMD: AtomInc performs better while NV prefers ++
		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
		{
#if defined(CHECK_BOUNDARY)
			if( addr+i < n )
#endif
			{
				//	4-bit digit of the key member.
				localKey = (gSrc[addr+i].m_key>>startBit) & 0xf;
#if defined(NV_GPU)
				MY_HISTOGRAM( localKey )++;
#else
				AtomInc( MY_HISTOGRAM( localKey ) );
#endif
			}
		}
	}

	GROUP_LDS_BARRIER;
	
	//	Lanes 0..15 each reduce one bucket row (rotated access pattern).
	if( lIdx < NUM_BUCKET )
	{
		u32 sum = 0;
		for(int i=0; i<GET_GROUP_SIZE; i++)
		{
			sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];
		}
		histogramOut[lIdx*nWGs+wgIdx] = sum;
	}
}
331
//	Values scanned per uint4 lane (each of the 128 work-items scans nPerWI values).
#define nPerLane (nPerWI/4)

//	NUM_BUCKET*nWGs < 128*nPerWI
//	In-place exclusive prefix sum over the flattened per-work-group histograms
//	produced by the StreamCount kernels.  Launched as a single work-group of 128.
__kernel
__attribute__((reqd_work_group_size(128,1,1)))
void PrefixScanKernel( __global u32* wHistogram1, int4  cb )
{
	__local u32 ldsTopScanData[128*2];

	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	const int nWGs = cb.m_nWGs;

	//	Load this work-item's nPerWI entries (zero past the end of the table).
	u32 data[nPerWI];
	for(int i=0; i<nPerWI; i++)
	{
		data[i] = 0;
		if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs )
			data[i] = wHistogram1[nPerWI*lIdx+i];
	}

	//	Per-lane partial sums: lane j holds the sum of data[nPerLane*j .. nPerLane*(j+1)-1].
	uint4 myData = make_uint4(0,0,0,0);

	for(int i=0; i<nPerLane; i++)
	{
		myData.x += data[nPerLane*0+i];
		myData.y += data[nPerLane*1+i];
		myData.z += data[nPerLane*2+i];
		myData.w += data[nPerLane*3+i];
	}

	uint totalSum;
	//	Exclusive scan of the 4*128 partial sums across the whole work-group.
	uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData );

//	for(int j=0; j<4; j++) //	somehow it introduces a lot of branches
	//	Exclusive scan within each lane's nPerLane values (manually unrolled over j).
	{	int j = 0;
		u32 sum = 0;
		for(int i=0; i<nPerLane; i++)
		{
			u32 tmp = data[nPerLane*j+i];
			data[nPerLane*j+i] = sum;
			sum += tmp;
		}
	}
	{	int j = 1;
		u32 sum = 0;
		for(int i=0; i<nPerLane; i++)
		{
			u32 tmp = data[nPerLane*j+i];
			data[nPerLane*j+i] = sum;
			sum += tmp;
		}
	}
	{	int j = 2;
		u32 sum = 0;
		for(int i=0; i<nPerLane; i++)
		{
			u32 tmp = data[nPerLane*j+i];
			data[nPerLane*j+i] = sum;
			sum += tmp;
		}
	}
	{	int j = 3;
		u32 sum = 0;
		for(int i=0; i<nPerLane; i++)
		{
			u32 tmp = data[nPerLane*j+i];
			data[nPerLane*j+i] = sum;
			sum += tmp;
		}
	}

	//	Add each lane's scanned base to its local offsets.
	for(int i=0; i<nPerLane; i++)
	{
		data[nPerLane*0+i] += scanned.x;
		data[nPerLane*1+i] += scanned.y;
		data[nPerLane*2+i] += scanned.z;
		data[nPerLane*3+i] += scanned.w;
	}

	//	Write the scanned histogram back in place.
	for(int i=0; i<nPerWI; i++)
	{
		int index = nPerWI*lIdx+i;
		if (index < NUM_BUCKET*nWGs)
			wHistogram1[nPerWI*lIdx+i] = data[i];
	}
}
419
//	4 scan, 4 exchange
//	Sorts the work-group's 4*WG_SIZE keys by the 4-bit digit at startBit using
//	one split (stable 0/1 partition) per bit.  sortData holds this work-item's
//	4 keys; ldsSortData is scratch shared with localPrefixSum64V.
void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)
{
	for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++)
	{
		u32 mask = (1<<bitIdx);
		//	Non-zero lane where the current bit of the digit is set.
		uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask );
		//	1 for each 0-bit key, 0 for each 1-bit key -> scan counts the zeros.
		uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );
		u32 total;
		prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData );
		{
			//	Split: 0-bit keys go to their rank among zeros (prefixSum);
			//	1-bit keys go after all zeros, keeping relative order (stable).
			uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);
			uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );
			dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );

			GROUP_LDS_BARRIER;

			//	Scatter to the new slots, then gather back contiguously.
			ldsSortData[dstAddr.x] = sortData[0];
			ldsSortData[dstAddr.y] = sortData[1];
			ldsSortData[dstAddr.z] = sortData[2];
			ldsSortData[dstAddr.w] = sortData[3];

			GROUP_LDS_BARRIER;

			sortData[0] = ldsSortData[localAddr.x];
			sortData[1] = ldsSortData[localAddr.y];
			sortData[2] = ldsSortData[localAddr.z];
			sortData[3] = ldsSortData[localAddr.w];

			GROUP_LDS_BARRIER;
		}
	}
}
453
//	2 scan, 2 exchange
//	Same result as sort4Bits but handles 2 bits per iteration: the four
//	2-bit-digit counters are packed into one u32 (8 bits each) so a single
//	packed prefix sum ranks all four buckets at once.
void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)
{
	for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)
	{
		//	2-bit digit (0..3) of each of this work-item's 4 keys.
		uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, 
			(sortData[1]>>(startBit+ibit)) & 0x3, 
			(sortData[2]>>(startBit+ibit)) & 0x3, 
			(sortData[3]>>(startBit+ibit)) & 0x3);

		u32 key4;
		u32 sKeyPacked[4] = { 0, 0, 0, 0 };
		{	//	One counter per digit, packed as 4x8 bits; key4 sums all 4 keys.
			sKeyPacked[0] |= 1<<(8*b.x);
			sKeyPacked[1] |= 1<<(8*b.y);
			sKeyPacked[2] |= 1<<(8*b.z);
			sKeyPacked[3] |= 1<<(8*b.w);

			key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];
		}

		u32 rankPacked;
		u32 sumPacked;
		{	//	Packed exclusive scan across the work-group (4 counters in one pass).
			rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );
		}

		GROUP_LDS_BARRIER;

		u32 newOffset[4] = { 0,0,0,0 };
		{
			//	Digit base offsets: byte i of sumScanned = total count of digits < i.
			u32 sumScanned = bit8Scan( sumPacked );

			u32 scannedKeys[4];
			scannedKeys[0] = 1<<(8*b.x);
			scannedKeys[1] = 1<<(8*b.y);
			scannedKeys[2] = 1<<(8*b.z);
			scannedKeys[3] = 1<<(8*b.w);
			{	//	4 scans at once
				u32 sum4 = 0;
				for(int ie=0; ie<4; ie++)
				{
					u32 tmp = scannedKeys[ie];
					scannedKeys[ie] = sum4;
					sum4 += tmp;
				}
			}

			{
				//	Final slot = digit base + group-wide rank + rank among this item's keys.
				u32 sumPlusRank = sumScanned + rankPacked;
				{	u32 ie = b.x;
					scannedKeys[0] += sumPlusRank;
					newOffset[0] = unpack4Key( scannedKeys[0], ie );
				}
				{	u32 ie = b.y;
					scannedKeys[1] += sumPlusRank;
					newOffset[1] = unpack4Key( scannedKeys[1], ie );
				}
				{	u32 ie = b.z;
					scannedKeys[2] += sumPlusRank;
					newOffset[2] = unpack4Key( scannedKeys[2], ie );
				}
				{	u32 ie = b.w;
					scannedKeys[3] += sumPlusRank;
					newOffset[3] = unpack4Key( scannedKeys[3], ie );
				}
			}
		}


		GROUP_LDS_BARRIER;

		{	//	Scatter by new offset, then read back contiguously.
			ldsSortData[newOffset[0]] = sortData[0];
			ldsSortData[newOffset[1]] = sortData[1];
			ldsSortData[newOffset[2]] = sortData[2];
			ldsSortData[newOffset[3]] = sortData[3];

			GROUP_LDS_BARRIER;

			u32 dstAddr = 4*lIdx;
			sortData[0] = ldsSortData[dstAddr+0];
			sortData[1] = ldsSortData[dstAddr+1];
			sortData[2] = ldsSortData[dstAddr+2];
			sortData[3] = ldsSortData[dstAddr+3];

			GROUP_LDS_BARRIER;
		}
	}
}
544
//	LDS histogram for set setIdx (one set per 16 lanes), bucket `key`.
//	Must be expanded where ldsSortData is in scope.
#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key]

//	Final pass (key-only path): locally sorts each 256-key block by the 4-bit
//	digit at startBit and scatters the keys to gDst using the globally scanned
//	histogram rHistogram (layout: rHistogram[bucket*nWGs + wg]).
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4  cb )
{
	__local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];
	__local u32 localHistogramToCarry[NUM_BUCKET];
	__local u32 localHistogram[NUM_BUCKET*2];

	u32 gIdx = GET_GLOBAL_IDX;
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	u32 wgSize = GET_GROUP_SIZE;

	const int n = cb.m_n;
	const int nWGs = cb.m_nWGs;
	const int startBit = cb.m_startBit;
	const int nBlocksPerWG = cb.m_nBlocksPerWG;

	//	Global destination base per bucket; advanced after each block below.
	if( lIdx < (NUM_BUCKET) )
	{
		localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];
	}

	GROUP_LDS_BARRIER;

	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;

	int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;

	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;

	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)
	{
		u32 myHistogram = 0;

		//	NOTE(review): with CHECK_BOUNDARY disabled, n is assumed to be a
		//	multiple of blockSize (tail reads/writes are unguarded).
		u32 sortData[ELEMENTS_PER_WORK_ITEM];
		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
#if defined(CHECK_BOUNDARY)
			sortData[i] = ( addr+i < n )? gSrc[ addr+i ] : 0xffffffff;
#else
			sortData[i] = gSrc[ addr+i ];
#endif

		//	Local sort of the block by the 4-bit digit.
		sort4Bits(sortData, startBit, lIdx, ldsSortData);

		u32 keys[ELEMENTS_PER_WORK_ITEM];
		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
			keys[i] = (sortData[i]>>startBit) & 0xf;

		{	//	create histogram
			u32 setIdx = lIdx/16;
			if( lIdx < NUM_BUCKET )
			{
				localHistogram[lIdx] = 0;
			}
			ldsSortData[lIdx] = 0;
			GROUP_LDS_BARRIER;

			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
#if defined(CHECK_BOUNDARY)
				if( addr+i < n )
#endif

#if defined(NV_GPU)
				SET_HISTOGRAM( setIdx, keys[i] )++;
#else
				AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );
#endif
			
			GROUP_LDS_BARRIER;
			
			//	Reduce the per-set histograms into localHistogram[NUM_BUCKET..];
			//	the lower NUM_BUCKET entries were zeroed above and serve as an
			//	apron for the strided reads of the scan below.
			uint hIdx = NUM_BUCKET+lIdx;
			if( lIdx < NUM_BUCKET )
			{
				u32 sum = 0;
				for(int i=0; i<WG_SIZE/16; i++)
				{
					sum += SET_HISTOGRAM( i, lIdx );
				}
				myHistogram = sum;
				localHistogram[hIdx] = sum;
			}
			GROUP_LDS_BARRIER;

#if defined(USE_2LEVEL_REDUCE)
			//	Exclusive scan of the 16 bucket counts.
			//	NOTE(review): dependent LDS ops separated only by mem_fence —
			//	appears to assume the 16 lanes execute in lockstep; confirm on
			//	non-AMD hardware.
			if( lIdx < NUM_BUCKET )
			{
				localHistogram[hIdx] = localHistogram[hIdx-1];
				GROUP_MEM_FENCE;

				u32 u0, u1, u2;
				u0 = localHistogram[hIdx-3];
				u1 = localHistogram[hIdx-2];
				u2 = localHistogram[hIdx-1];
				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );
				GROUP_MEM_FENCE;
				u0 = localHistogram[hIdx-12];
				u1 = localHistogram[hIdx-8];
				u2 = localHistogram[hIdx-4];
				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );
				GROUP_MEM_FENCE;
			}
#else
			if( lIdx < NUM_BUCKET )
			{
				localHistogram[hIdx] = localHistogram[hIdx-1];
				GROUP_MEM_FENCE;
				localHistogram[hIdx] += localHistogram[hIdx-1];
				GROUP_MEM_FENCE;
				localHistogram[hIdx] += localHistogram[hIdx-2];
				GROUP_MEM_FENCE;
				localHistogram[hIdx] += localHistogram[hIdx-4];
				GROUP_MEM_FENCE;
				localHistogram[hIdx] += localHistogram[hIdx-8];
				GROUP_MEM_FENCE;
			}
#endif
			GROUP_LDS_BARRIER;
		}

		{	//	Scatter: global slot = bucket base + rank of the key within its bucket.
			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
			{
				int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;
				int binIdx = keys[ie];
				int groupOffset = localHistogramToCarry[binIdx];
				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];
#if defined(CHECK_BOUNDARY)
				if( addr+ie < n )
#endif
				gDst[ groupOffset + myIdx ] = sortData[ie];
			}
		}

		GROUP_LDS_BARRIER;

		//	Advance each bucket's base by the number of keys written this block.
		if( lIdx < NUM_BUCKET )
		{
			localHistogramToCarry[lIdx] += myHistogram;
		}
		GROUP_LDS_BARRIER;
	}
}
690
//	2 scan, 2 exchange
//	Same algorithm as sort4Bits1, but carries a value (sortVal) alongside each
//	key through the exchanges: 2 bits per iteration via packed 4x8-bit counters.
void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal)
{
	for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)
	{
		//	2-bit digit (0..3) of each of this work-item's 4 keys.
		uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, 
			(sortData[1]>>(startBit+ibit)) & 0x3, 
			(sortData[2]>>(startBit+ibit)) & 0x3, 
			(sortData[3]>>(startBit+ibit)) & 0x3);

		u32 key4;
		u32 sKeyPacked[4] = { 0, 0, 0, 0 };
		{	//	One counter per digit, packed as 4x8 bits; key4 sums all 4 keys.
			sKeyPacked[0] |= 1<<(8*b.x);
			sKeyPacked[1] |= 1<<(8*b.y);
			sKeyPacked[2] |= 1<<(8*b.z);
			sKeyPacked[3] |= 1<<(8*b.w);

			key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];
		}

		u32 rankPacked;
		u32 sumPacked;
		{	//	Packed exclusive scan across the work-group (4 counters in one pass).
			rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );
		}

		GROUP_LDS_BARRIER;

		u32 newOffset[4] = { 0,0,0,0 };
		{
			//	Digit base offsets: byte i of sumScanned = total count of digits < i.
			u32 sumScanned = bit8Scan( sumPacked );

			u32 scannedKeys[4];
			scannedKeys[0] = 1<<(8*b.x);
			scannedKeys[1] = 1<<(8*b.y);
			scannedKeys[2] = 1<<(8*b.z);
			scannedKeys[3] = 1<<(8*b.w);
			{	//	4 scans at once
				u32 sum4 = 0;
				for(int ie=0; ie<4; ie++)
				{
					u32 tmp = scannedKeys[ie];
					scannedKeys[ie] = sum4;
					sum4 += tmp;
				}
			}

			{
				//	Final slot = digit base + group-wide rank + rank among this item's keys.
				u32 sumPlusRank = sumScanned + rankPacked;
				{	u32 ie = b.x;
					scannedKeys[0] += sumPlusRank;
					newOffset[0] = unpack4Key( scannedKeys[0], ie );
				}
				{	u32 ie = b.y;
					scannedKeys[1] += sumPlusRank;
					newOffset[1] = unpack4Key( scannedKeys[1], ie );
				}
				{	u32 ie = b.z;
					scannedKeys[2] += sumPlusRank;
					newOffset[2] = unpack4Key( scannedKeys[2], ie );
				}
				{	u32 ie = b.w;
					scannedKeys[3] += sumPlusRank;
					newOffset[3] = unpack4Key( scannedKeys[3], ie );
				}
			}
		}


		GROUP_LDS_BARRIER;

		{	//	Scatter keys and values by new offset, then read back contiguously.
			ldsSortData[newOffset[0]] = sortData[0];
			ldsSortData[newOffset[1]] = sortData[1];
			ldsSortData[newOffset[2]] = sortData[2];
			ldsSortData[newOffset[3]] = sortData[3];

			ldsSortVal[newOffset[0]] = sortVal[0];
			ldsSortVal[newOffset[1]] = sortVal[1];
			ldsSortVal[newOffset[2]] = sortVal[2];
			ldsSortVal[newOffset[3]] = sortVal[3];

			GROUP_LDS_BARRIER;

			u32 dstAddr = 4*lIdx;
			sortData[0] = ldsSortData[dstAddr+0];
			sortData[1] = ldsSortData[dstAddr+1];
			sortData[2] = ldsSortData[dstAddr+2];
			sortData[3] = ldsSortData[dstAddr+3];

			sortVal[0] = ldsSortVal[dstAddr+0];
			sortVal[1] = ldsSortVal[dstAddr+1];
			sortVal[2] = ldsSortVal[dstAddr+2];
			sortVal[3] = ldsSortVal[dstAddr+3];

			GROUP_LDS_BARRIER;
		}
	}
}
791
792
793
794
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)
{
	//	One pass of the 4-bit-radix sort for key/value pairs: each work-group
	//	locally sorts its assigned blocks on bits [startBit, startBit+4), then
	//	scatters the elements to their global positions using the per-group
	//	prefix sums previously written to rHistogram.
	//	cb carries the constants: m_n (element count), m_nWGs (work-group count),
	//	m_startBit (current radix shift), m_nBlocksPerWG (blocks per group).
	__local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];
	__local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];
	__local u32 localHistogramToCarry[NUM_BUCKET];	// running global base offset per bucket, advanced after each block
	__local u32 localHistogram[NUM_BUCKET*2];		// [NUM_BUCKET..2*NUM_BUCKET) holds this block's per-bucket counts / exclusive scan

	u32 gIdx = GET_GLOBAL_IDX;
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	u32 wgSize = GET_GROUP_SIZE;

	const int n = cb.m_n;
	const int nWGs = cb.m_nWGs;
	const int startBit = cb.m_startBit;
	const int nBlocksPerWG = cb.m_nBlocksPerWG;

	//	Seed each bucket's carry with this group's global scatter base from the
	//	previously computed histogram (bucket-major layout: bucket*nWGs + group).
	if( lIdx < (NUM_BUCKET) )
	{
		localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];
	}

	GROUP_LDS_BARRIER;
    

	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;

	//	Number of whole blocks remaining from this group's starting block onward.
	int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;

	//	First element this work-item loads within the group's first block.
	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;

	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)
	{

		u32 myHistogram = 0;

		int sortData[ELEMENTS_PER_WORK_ITEM];
		int sortVal[ELEMENTS_PER_WORK_ITEM];

		//	Load ELEMENTS_PER_WORK_ITEM consecutive key/value pairs; with
		//	CHECK_BOUNDARY, out-of-range slots are padded with 0xffffffff so
		//	they sort to the end.
		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
#if defined(CHECK_BOUNDARY)
		{
			sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff;
			sortVal[i] = ( addr+i < n )? gSrc[ addr+i ].m_value : 0xffffffff;
		}
#else
		{
			sortData[i] = gSrc[ addr+i ].m_key;
			sortVal[i] = gSrc[ addr+i ].m_value;
		}
#endif

		//	Local sort of the whole block on the current 4-bit digit
		//	(helper defined earlier in this file; uses the two LDS arrays).
		sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal);

		//	Digit (bucket index, 0..15) of each of my now-sorted elements.
		u32 keys[ELEMENTS_PER_WORK_ITEM];
		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
			keys[i] = (sortData[i]>>startBit) & 0xf;

		{	//	create histogram
			//	Each set of 16 work-items accumulates into its own copy to
			//	reduce atomic contention; SET_HISTOGRAM (defined elsewhere)
			//	presumably maps (setIdx, bucket) into ldsSortData — TODO confirm.
			u32 setIdx = lIdx/16;
			if( lIdx < NUM_BUCKET )
			{
				localHistogram[lIdx] = 0;
			}
			ldsSortData[lIdx] = 0;
			GROUP_LDS_BARRIER;

			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
#if defined(CHECK_BOUNDARY)
				if( addr+i < n )
#endif

#if defined(NV_GPU)
				//	NOTE(review): non-atomic increment — presumably relies on
				//	warp-synchronous execution on NVIDIA hardware; verify.
				SET_HISTOGRAM( setIdx, keys[i] )++;
#else
				AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );
#endif
			
			GROUP_LDS_BARRIER;
			
			//	Reduce the per-set counts into one count per bucket, stored in
			//	the upper half of localHistogram.
			uint hIdx = NUM_BUCKET+lIdx;
			if( lIdx < NUM_BUCKET )
			{
				u32 sum = 0;
				for(int i=0; i<WG_SIZE/16; i++)
				{
					sum += SET_HISTOGRAM( i, lIdx );
				}
				myHistogram = sum;	// remembered to advance the carry after scattering
				localHistogram[hIdx] = sum;
			}
			GROUP_LDS_BARRIER;

			//	Exclusive prefix-scan of the 16 bucket counts in place
			//	(upper half of localHistogram); the lower half provides the
			//	zero "apron" read by hIdx-1..hIdx-12.
#if defined(USE_2LEVEL_REDUCE)
			if( lIdx < NUM_BUCKET )
			{
				localHistogram[hIdx] = localHistogram[hIdx-1];	// shift right: makes the scan exclusive
				GROUP_MEM_FENCE;

				u32 u0, u1, u2;
				u0 = localHistogram[hIdx-3];
				u1 = localHistogram[hIdx-2];
				u2 = localHistogram[hIdx-1];
				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );
				GROUP_MEM_FENCE;
				u0 = localHistogram[hIdx-12];
				u1 = localHistogram[hIdx-8];
				u2 = localHistogram[hIdx-4];
				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );
				GROUP_MEM_FENCE;
			}
#else
			//	Hillis-Steele style scan with strides 1,2,4,8; fences only —
			//	presumably assumes the 16 lanes run in lockstep (verify on
			//	non-wavefront hardware).
			if( lIdx < NUM_BUCKET )
			{
				localHistogram[hIdx] = localHistogram[hIdx-1];	// shift right: makes the scan exclusive
				GROUP_MEM_FENCE;
				localHistogram[hIdx] += localHistogram[hIdx-1];
				GROUP_MEM_FENCE;
				localHistogram[hIdx] += localHistogram[hIdx-2];
				GROUP_MEM_FENCE;
				localHistogram[hIdx] += localHistogram[hIdx-4];
				GROUP_MEM_FENCE;
				localHistogram[hIdx] += localHistogram[hIdx-8];
				GROUP_MEM_FENCE;
			}
#endif
			GROUP_LDS_BARRIER;
		}

	{
			//	Scatter: global destination = bucket's global base (carry)
			//	+ rank of the element within its bucket in this block
			//	(position in the locally sorted block minus the bucket's
			//	local scan offset).
			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
			{
				int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;
				int binIdx = keys[ie];
				int groupOffset = localHistogramToCarry[binIdx];
				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];
#if defined(CHECK_BOUNDARY)
				if( addr+ie < n )
				{
                    if ((groupOffset + myIdx)<n)
                    {
                        //	NOTE(review): the two key==value guards below mean
                        //	elements whose key differs from their value are
                        //	silently dropped in the CHECK_BOUNDARY build —
                        //	looks like a debug remnant or driver workaround;
                        //	confirm intent before relying on this path.
                        if (sortData[ie]==sortVal[ie])
                        {
                            
                            SortDataCL tmp;
                            tmp.m_key = sortData[ie];
                            tmp.m_value = sortVal[ie];
                            if (tmp.m_key == tmp.m_value)
                                gDst[groupOffset + myIdx ] = tmp;
                        }
                        
                    }
				}
#else
                if ((groupOffset + myIdx)<n)
                {
                    gDst[ groupOffset + myIdx ].m_key = sortData[ie];
                    gDst[ groupOffset + myIdx ].m_value = sortVal[ie];
                }
#endif
			}
		}

		GROUP_LDS_BARRIER;

		//	Advance each bucket's global base past the elements this block
		//	just emitted, ready for the next block.
		if( lIdx < NUM_BUCKET )
		{
			localHistogramToCarry[lIdx] += myHistogram;
		}
		GROUP_LDS_BARRIER;
	}
}
969
970
971
972
973
974
975
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)
{
	//	Serial (single-work-item) reference variant of the key/value
	//	sort-and-scatter pass: work-item 0 of each group walks its assigned
	//	element range in order and writes every element to the slot given by
	//	the group's global bucket base plus a running per-bucket count.
	u32 gIdx = GET_GLOBAL_IDX;		// unused; kept for parity with the parallel kernel
	u32 localId = GET_LOCAL_IDX;
	u32 groupId = GET_GROUP_IDX;
	u32 wgSize = GET_GROUP_SIZE;	// unused

	const int shift = cb.m_startBit;
	const int numElems = cb.m_n;
	const int numGroups = cb.m_nWGs;
	const int blocksPerGroup = cb.m_nBlocksPerWG;

	//	Only the first work-item of each group performs the serial scatter.
	if( localId > 0 )
		return;

	//	Elements already emitted into each of the NUM_BUCKET buckets.
	int emitted[NUM_BUCKET];
	for( int b = 0; b < NUM_BUCKET; b++ )
		emitted[b] = 0;

	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
	const int blocksLeft = numElems/blockSize - blocksPerGroup*groupId;
	const int nIter = min( blocksPerGroup, blocksLeft );
	const int first = blockSize*blocksPerGroup*groupId;

	//	The original block/lane/element triple loop scans the group's range
	//	contiguously, so one flat loop visits the same indices in the same order.
	for( int idx = first; idx < first + nIter*blockSize; idx++ )
	{
		if( idx < numElems )
		{
			const int bucket = (gSrc[idx].m_key>>shift) & 0xf;	// 0xf = NUM_TABLES-1
			gDst[ rHistogram[bucket*numGroups+groupId] + emitted[bucket] ] = gSrc[idx];
			emitted[bucket]++;
		}
	}
}
1023
1024
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4  cb )
{
	//	Serial (single-work-item) reference variant of the sort-and-scatter
	//	pass for raw u32 keys: work-item 0 of each group walks its assigned
	//	blocks in order and writes each key to the slot given by the group's
	//	global bucket base plus a running per-bucket count.
	u32 gIdx = GET_GLOBAL_IDX;		// unused; kept for parity with the parallel kernel
	u32 localId = GET_LOCAL_IDX;
	u32 groupId = GET_GROUP_IDX;
	u32 wgSize = GET_GROUP_SIZE;	// unused

	const int shift = cb.m_startBit;
	const int numElems = cb.m_n;
	const int numGroups = cb.m_nWGs;
	const int blocksPerGroup = cb.m_nBlocksPerWG;

	//	Only the first work-item of each group performs the serial scatter.
	if( localId > 0 )
		return;

	//	Keys already emitted into each of the NUM_BUCKET buckets.
	int cnt[NUM_BUCKET];
	for( int b = 0; b < NUM_BUCKET; b++ )
		cnt[b] = 0;

	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
	const int blocksLeft = numElems/blockSize - blocksPerGroup*groupId;
	const int nIter = min( blocksPerGroup, blocksLeft );
	const int first = blockSize*blocksPerGroup*groupId;

	//	Walk block by block; within a block the elements are consecutive,
	//	matching the lane/element order of the original nested loops.
	for( int ib = 0; ib < nIter; ib++ )
	{
		const int base = first + ib*blockSize;
		for( int e = 0; e < blockSize; e++ )
		{
			const int src = base + e;
			if( src >= numElems )
				continue;
			const u32 key = gSrc[src];
			const int bucket = (key>>shift) & 0xf;	// 0xf = NUM_TABLES-1
			gDst[ rHistogram[bucket*numGroups+groupId] + cnt[bucket] ] = key;
			cnt[bucket]++;
		}
	}
}