2 Bullet Continuous Collision Detection and Physics Library
3 Copyright (c) 2011 Advanced Micro Devices, Inc. http://bulletphysics.org
5 This software is provided 'as-is', without any express or implied warranty.
6 In no event will the authors be held liable for any damages arising from the use of this software.
7 Permission is granted to anyone to use this software for any purpose,
8 including commercial applications, and to alter it and redistribute it freely,
9 subject to the following restrictions:
11 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13 3. This notice may not be removed or altered from any source distribution.
15 //Author Takahiro Harada
18 //#pragma OPENCL EXTENSION cl_amd_printf : enable
19 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
20 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
// Convenience typedef + wrappers over OpenCL work-item/work-group built-ins,
// used throughout the radix-sort kernels below.
22 typedef unsigned int u32;
23 #define GET_GROUP_IDX get_group_id(0)
24 #define GET_LOCAL_IDX get_local_id(0)
25 #define GET_GLOBAL_IDX get_global_id(0)
26 #define GET_GROUP_SIZE get_local_size(0)
// Work-group barrier / fence over local (LDS) memory.
27 #define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
28 #define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
// Local-memory atomics (require the cl_khr_*_int32_base_atomics pragmas above).
29 #define AtomInc(x) atom_inc(&(x))
30 #define AtomInc1(x, out) out = atom_inc(&(x))
31 #define AtomAdd(x, value) atom_add(&(x), value)
// NOTE: select(b,a,cond) picks from 'a' where cond's MSB is set, else 'b'.
33 #define SELECT_UINT4( b, a, condition ) select( b,a,condition )
// Vector-literal constructors spelled like CUDA's make_* helpers.
36 #define make_uint4 (uint4)
37 #define make_uint2 (uint2)
38 #define make_int2 (int2)
// Each work-item processes a fixed share of a 256-element block
// (WG_SIZE is expected to be supplied by the host as a build option).
41 #define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE)
// 4 bits sorted per pass -> 16 buckets per pass.
42 #define BITS_PER_PASS 4
43 #define NUM_BUCKET (1<<BITS_PER_PASS)
46 // this isn't optimization for VLIW. But just reducing writes.
47 #define USE_2LEVEL_REDUCE 1
49 //#define CHECK_BOUNDARY 1
// cb is an int4 constant buffer; .w is aliased as the blocks-per-workgroup
// count (the other fields m_startBit/m_n/m_nWGs are presumably aliased
// elsewhere in the full file — TODO confirm).
62 #define m_nBlocksPerWG w
// In-place scan over the 4 lanes of *data, returning the lane total;
// body elided in this excerpt — TODO confirm exclusive vs inclusive
// against the full source (callers add the returned rank back in).
81 uint prefixScanVectorEx( uint4* data )
// Work-group prefix sum of one u32 per work-item, staged in LDS.
// Writes the work-group total to *totalSum and (per the tail below)
// appears to return each item's exclusive prefix via 'addValue'.
// NOTE(review): several lines (barriers, #else arm, braces) are elided
// in this excerpt; ordering between the AtomAdd stages is
// barrier-sensitive — do not reorder.
99 u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ )
// Double-buffer layout: first wgSize entries zeroed, data in upper half.
102 sorterSharedMemory[lIdx] = 0;
103 sorterSharedMemory[lIdx+wgSize] = pData;
// Each item owns the odd slot 2*lIdx + wgSize + 1 for the tree reduction.
109 int idx = 2*lIdx + (wgSize+1);
110 #if defined(USE_2LEVEL_REDUCE)
// Two-level reduce: gather 3 neighbors per step, add once via atomic
// (fewer LDS writes than the classic one-step-per-distance scan).
114 u0 = sorterSharedMemory[idx-3];
115 u1 = sorterSharedMemory[idx-2];
116 u2 = sorterSharedMemory[idx-1];
117 AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );
120 u0 = sorterSharedMemory[idx-12];
121 u1 = sorterSharedMemory[idx-8];
122 u2 = sorterSharedMemory[idx-4];
123 AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );
126 u0 = sorterSharedMemory[idx-48];
127 u1 = sorterSharedMemory[idx-32];
128 u2 = sorterSharedMemory[idx-16];
129 AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );
// Final distance-64 step (only reached for the 128-wide case — presumably
// guarded by a wgSize check elided here; TODO confirm).
133 sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
137 sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
// Fallback: classic Hillis-Steele style doubling-distance scan.
143 sorterSharedMemory[idx] += sorterSharedMemory[idx-1];
145 sorterSharedMemory[idx] += sorterSharedMemory[idx-2];
147 sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
149 sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
151 sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
153 sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
157 sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
161 sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
// Total of the whole work-group lives in the last slot.
169 *totalSum = sorterSharedMemory[wgSize*2-1];
// Exclusive prefix for this item: value one slot before its own.
170 u32 addValue = sorterSharedMemory[lIdx+wgSize-1];
// Vectorized prefix sum for a 128-wide work-group: scan the 4 lanes of
// pData locally, then scan the per-item lane totals across the group and
// add the per-item rank back onto each lane.
174 //__attribute__((reqd_work_group_size(128,1,1)))
175 uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )
176 u32 s4 = prefixScanVectorEx( &pData );
178 u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 );
179 return pData + make_uint4( rank, rank, rank, rank );
// Same as localPrefixSum128V but for a 64-wide work-group.
183 //__attribute__((reqd_work_group_size(64,1,1)))
184 uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )
186 u32 s4 = prefixScanVectorEx( &pData );
187 u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 );
188 return pData + make_uint4( rank, rank, rank, rank );
// Extract byte keyIdx (0..3) from a u32 holding four packed 8-bit counters.
191 u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;}
// Fragment of a byte-wise scan helper (signature elided in this excerpt;
// presumably bit8Scan): exclusive prefix sum of the four packed bytes,
// valid while each partial sum fits in 8 bits.
195 return (v<<8) + (v<<16) + (v<<24);
// Per-work-item histogram slot, laid out bucket-major with WG_SIZE stride
// so counters for different lanes of one bucket sit in different banks.
203 #define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx]
// Pass 1 of the radix sort: count, per work-group, how many keys fall in
// each of the 16 buckets for the current 4-bit digit, and write the counts
// to histogramOut in bucket-major order (bucket*nWGs + wgIdx).
// NOTE(review): loop braces / zero-init bodies are elided in this excerpt.
207 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
208 void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb )
210 __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];
212 u32 gIdx = GET_GLOBAL_IDX;
213 u32 lIdx = GET_LOCAL_IDX;
214 u32 wgIdx = GET_GROUP_IDX;
215 u32 wgSize = GET_GROUP_SIZE;
216 const int startBit = cb.m_startBit;
217 const int n = cb.m_n;
218 const int nWGs = cb.m_nWGs;
219 const int nBlocksPerWG = cb.m_nBlocksPerWG;
// Zero this work-item's NUM_BUCKET counters (body elided).
221 for(int i=0; i<NUM_BUCKET; i++)
228 const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
// Remaining blocks from this work-group's starting offset to end of input.
231 int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;
233 int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
235 for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)
237 // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
238 // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
239 // AMD: AtomInc performs better while NV prefers ++
240 for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
242 #if defined(CHECK_BOUNDARY)
// Extract the 4-bit digit for the current pass.
246 localKey = (gSrc[addr+i]>>startBit) & 0xf;
// Two variants of the counter bump (selected by elided preprocessor arms):
248 MY_HISTOGRAM( localKey )++;
250 AtomInc( MY_HISTOGRAM( localKey ) );
// Reduce the WG_SIZE per-lane counters of bucket lIdx into one sum;
// (i+lIdx) rotation presumably staggers LDS bank access — TODO confirm.
258 if( lIdx < NUM_BUCKET )
261 for(int i=0; i<GET_GROUP_SIZE; i++)
263 sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];
265 histogramOut[lIdx*nWGs+wgIdx] = sum;
// Key/value variant of StreamCountKernel: identical histogramming, but the
// input is SortDataCL structs and the digit comes from the .m_key field.
// Keep in sync with StreamCountKernel above.
270 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
271 void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4 cb )
273 __local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];
275 u32 gIdx = GET_GLOBAL_IDX;
276 u32 lIdx = GET_LOCAL_IDX;
277 u32 wgIdx = GET_GROUP_IDX;
278 u32 wgSize = GET_GROUP_SIZE;
279 const int startBit = cb.m_startBit;
280 const int n = cb.m_n;
281 const int nWGs = cb.m_nWGs;
282 const int nBlocksPerWG = cb.m_nBlocksPerWG;
// Zero this work-item's counters (body elided in this excerpt).
284 for(int i=0; i<NUM_BUCKET; i++)
291 const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
294 int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;
296 int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
298 for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)
300 // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
301 // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
302 // AMD: AtomInc performs better while NV prefers ++
303 for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
305 #if defined(CHECK_BOUNDARY)
// Digit taken from the struct's key, not the raw word.
309 localKey = (gSrc[addr+i].m_key>>startBit) & 0xf;
311 MY_HISTOGRAM( localKey )++;
313 AtomInc( MY_HISTOGRAM( localKey ) );
// Per-bucket reduction and global write-out, as in StreamCountKernel.
321 if( lIdx < NUM_BUCKET )
324 for(int i=0; i<GET_GROUP_SIZE; i++)
326 sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];
328 histogramOut[lIdx*nWGs+wgIdx] = sum;
// Elements handled per vector lane (nPerWI per work-item, split 4 ways).
332 #define nPerLane (nPerWI/4)
// Pass 2: single-work-group exclusive prefix scan over the whole
// NUM_BUCKET*nWGs histogram, in place in wHistogram1, turning counts into
// global scatter offsets. Requires NUM_BUCKET*nWGs < 128*nPerWI (below).
// NOTE(review): the per-lane serial-scan bodies and the 'data'/'sum'
// declarations are partially elided in this excerpt.
334 // NUM_BUCKET*nWGs < 128*nPerWI
336 __attribute__((reqd_work_group_size(128,1,1)))
337 void PrefixScanKernel( __global u32* wHistogram1, int4 cb )
339 __local u32 ldsTopScanData[128*2];
341 u32 lIdx = GET_LOCAL_IDX;
342 u32 wgIdx = GET_GROUP_IDX;
343 const int nWGs = cb.m_nWGs;
// Load this item's nPerWI histogram entries (out-of-range slots presumably
// left zero by an elided initializer — TODO confirm).
346 for(int i=0; i<nPerWI; i++)
349 if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs )
350 data[i] = wHistogram1[nPerWI*lIdx+i];
// Per-lane partial sums of the loaded entries.
353 uint4 myData = make_uint4(0,0,0,0);
355 for(int i=0; i<nPerLane; i++)
357 myData.x += data[nPerLane*0+i];
358 myData.y += data[nPerLane*1+i];
359 myData.z += data[nPerLane*2+i];
360 myData.w += data[nPerLane*3+i];
// Work-group-wide scan of the four lane totals.
364 uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData );
366 // for(int j=0; j<4; j++) // somehow it introduces a lot of branches
// Manually unrolled (j = 0..3) serial exclusive scan within each lane:
// each iteration swaps data[k] with the running 'sum' (declarations and
// sum updates elided between the lines shown).
369 for(int i=0; i<nPerLane; i++)
371 u32 tmp = data[nPerLane*j+i];
372 data[nPerLane*j+i] = sum;
378 for(int i=0; i<nPerLane; i++)
380 u32 tmp = data[nPerLane*j+i];
381 data[nPerLane*j+i] = sum;
387 for(int i=0; i<nPerLane; i++)
389 u32 tmp = data[nPerLane*j+i];
390 data[nPerLane*j+i] = sum;
396 for(int i=0; i<nPerLane; i++)
398 u32 tmp = data[nPerLane*j+i];
399 data[nPerLane*j+i] = sum;
// Add each lane's scanned base offset onto its local exclusive scan.
404 for(int i=0; i<nPerLane; i++)
406 data[nPerLane*0+i] += scanned.x;
407 data[nPerLane*1+i] += scanned.y;
408 data[nPerLane*2+i] += scanned.z;
409 data[nPerLane*3+i] += scanned.w;
// Store the scanned offsets back over the histogram.
412 for(int i=0; i<nPerWI; i++)
414 int index = nPerWI*lIdx+i;
415 if (index < NUM_BUCKET*nWGs)
416 wHistogram1[nPerWI*lIdx+i] = data[i];
// Local 4-bit sort of each work-item's 4 keys across the work-group:
// one stable split (scan + exchange through LDS) per bit of the digit.
// Barriers between the LDS write and read phases are elided in this excerpt.
420 // 4 scan, 4 exchange
421 void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)
423 for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++)
425 u32 mask = (1<<bitIdx);
// Non-zero lane => current bit of the digit is set for that key.
426 uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask );
// 1 for zero-bit keys, 0 for one-bit keys, then scanned to get the rank
// of each zero-bit key.
427 uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );
429 prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData );
431 uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);
// One-bit keys go after all 'total' zero-bit keys, keeping relative order
// (stable split); zero-bit keys go to their scanned rank.
432 uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );
433 dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );
// Scatter into LDS at the split positions...
437 ldsSortData[dstAddr.x] = sortData[0];
438 ldsSortData[dstAddr.y] = sortData[1];
439 ldsSortData[dstAddr.z] = sortData[2];
440 ldsSortData[dstAddr.w] = sortData[3];
// ...and gather back contiguously (barrier elided between these phases).
444 sortData[0] = ldsSortData[localAddr.x];
445 sortData[1] = ldsSortData[localAddr.y];
446 sortData[2] = ldsSortData[localAddr.z];
447 sortData[3] = ldsSortData[localAddr.w];
// Faster variant of sort4Bits: handles 2 bits per iteration by packing four
// 2-bit-bucket counters into one u32 (8 bits each), so only 2 scans and
// 2 LDS exchanges are needed per 4-bit digit. Barriers and some loop/scope
// lines are elided in this excerpt.
454 // 2 scan, 2 exchange
455 void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)
457 for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)
// 2-bit sub-digit (0..3) of each of the item's 4 keys.
459 uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3,
460 (sortData[1]>>(startBit+ibit)) & 0x3,
461 (sortData[2]>>(startBit+ibit)) & 0x3,
462 (sortData[3]>>(startBit+ibit)) & 0x3);
// One packed counter word per key: byte b.x of the word is 1, rest 0.
465 u32 sKeyPacked[4] = { 0, 0, 0, 0 };
467 sKeyPacked[0] |= 1<<(8*b.x);
468 sKeyPacked[1] |= 1<<(8*b.y);
469 sKeyPacked[2] |= 1<<(8*b.z);
470 sKeyPacked[3] |= 1<<(8*b.w);
// Sum of the item's 4 packed counters = per-item per-bucket counts.
472 key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];
// Scan the packed counts across the group; sumPacked holds the group's
// per-bucket totals, rankPacked this item's exclusive packed prefix.
478 rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );
483 u32 newOffset[4] = { 0,0,0,0 };
// Exclusive scan across the 4 bucket bytes (bucket base offsets).
485 u32 sumScanned = bit8Scan( sumPacked );
// Re-derive each key's packed one-hot, then serially exclusive-scan the
// 4 keys of this item (loop below; sum4 update elided).
488 scannedKeys[0] = 1<<(8*b.x);
489 scannedKeys[1] = 1<<(8*b.y);
490 scannedKeys[2] = 1<<(8*b.z);
491 scannedKeys[3] = 1<<(8*b.w);
494 for(int ie=0; ie<4; ie++)
496 u32 tmp = scannedKeys[ie];
497 scannedKeys[ie] = sum4;
// Final LDS slot per key: bucket base + group rank + within-item rank,
// with the relevant byte extracted via unpack4Key (the selecting 'ie'
// presumably comes from elided per-bucket branches — TODO confirm).
503 u32 sumPlusRank = sumScanned + rankPacked;
505 scannedKeys[0] += sumPlusRank;
506 newOffset[0] = unpack4Key( scannedKeys[0], ie );
509 scannedKeys[1] += sumPlusRank;
510 newOffset[1] = unpack4Key( scannedKeys[1], ie );
513 scannedKeys[2] += sumPlusRank;
514 newOffset[2] = unpack4Key( scannedKeys[2], ie );
517 scannedKeys[3] += sumPlusRank;
518 newOffset[3] = unpack4Key( scannedKeys[3], ie );
// Scatter to sorted LDS positions, then gather back contiguously
// (barrier elided between the phases).
527 ldsSortData[newOffset[0]] = sortData[0];
528 ldsSortData[newOffset[1]] = sortData[1];
529 ldsSortData[newOffset[2]] = sortData[2];
530 ldsSortData[newOffset[3]] = sortData[3];
534 u32 dstAddr = 4*lIdx;
535 sortData[0] = ldsSortData[dstAddr+0];
536 sortData[1] = ldsSortData[dstAddr+1];
537 sortData[2] = ldsSortData[dstAddr+2];
538 sortData[3] = ldsSortData[dstAddr+3];
// Histogram slot for one of the WG_SIZE/16 "sets" of 16 lanes, reusing
// ldsSortData as scratch between sort phases.
545 #define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key]
// Pass 3 (keys only): locally sort each block by the current 4-bit digit,
// then scatter to global positions = scanned global bucket offset
// (localHistogramToCarry) + local rank. Many braces/barriers and the
// 'myHistogram' computation are elided in this excerpt.
548 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
549 void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )
551 __local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];
552 __local u32 localHistogramToCarry[NUM_BUCKET];
553 __local u32 localHistogram[NUM_BUCKET*2];
555 u32 gIdx = GET_GLOBAL_IDX;
556 u32 lIdx = GET_LOCAL_IDX;
557 u32 wgIdx = GET_GROUP_IDX;
558 u32 wgSize = GET_GROUP_SIZE;
560 const int n = cb.m_n;
561 const int nWGs = cb.m_nWGs;
562 const int startBit = cb.m_startBit;
563 const int nBlocksPerWG = cb.m_nBlocksPerWG;
// Seed the running global offsets from the scanned histogram (pass 2).
565 if( lIdx < (NUM_BUCKET) )
567 localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];
572 const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
574 int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;
576 int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
578 for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)
// Load this item's keys; CHECK_BOUNDARY pads the tail with 0xffffffff
// (max key) so padding sorts to the end.
582 u32 sortData[ELEMENTS_PER_WORK_ITEM];
583 for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
584 #if defined(CHECK_BOUNDARY)
585 sortData[i] = ( addr+i < n )? gSrc[ addr+i ] : 0xffffffff;
587 sortData[i] = gSrc[ addr+i ];
// Sort the block in LDS by the current digit.
590 sort4Bits(sortData, startBit, lIdx, ldsSortData);
592 u32 keys[ELEMENTS_PER_WORK_ITEM];
593 for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
594 keys[i] = (sortData[i]>>startBit) & 0xf;
596 { // create histogram
// 16 lanes share one histogram set to reduce atomic contention.
597 u32 setIdx = lIdx/16;
598 if( lIdx < NUM_BUCKET )
600 localHistogram[lIdx] = 0;
602 ldsSortData[lIdx] = 0;
605 for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
606 #if defined(CHECK_BOUNDARY)
// Two counter-bump variants (preprocessor arms elided).
611 SET_HISTOGRAM( setIdx, keys[i] )++;
613 AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );
// Reduce the per-set histograms; upper half of localHistogram holds sums.
618 uint hIdx = NUM_BUCKET+lIdx;
619 if( lIdx < NUM_BUCKET )
622 for(int i=0; i<WG_SIZE/16; i++)
624 sum += SET_HISTOGRAM( i, lIdx );
627 localHistogram[hIdx] = sum;
// Exclusive scan of the 16 bucket counts (two-level atomic variant or
// plain doubling scan; guards/barriers elided).
631 #if defined(USE_2LEVEL_REDUCE)
632 if( lIdx < NUM_BUCKET )
// Shift by one slot to make the scan exclusive.
634 localHistogram[hIdx] = localHistogram[hIdx-1];
638 u0 = localHistogram[hIdx-3];
639 u1 = localHistogram[hIdx-2];
640 u2 = localHistogram[hIdx-1];
641 AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );
643 u0 = localHistogram[hIdx-12];
644 u1 = localHistogram[hIdx-8];
645 u2 = localHistogram[hIdx-4];
646 AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );
650 if( lIdx < NUM_BUCKET )
652 localHistogram[hIdx] = localHistogram[hIdx-1];
654 localHistogram[hIdx] += localHistogram[hIdx-1];
656 localHistogram[hIdx] += localHistogram[hIdx-2];
658 localHistogram[hIdx] += localHistogram[hIdx-4];
660 localHistogram[hIdx] += localHistogram[hIdx-8];
// Scatter: global position = carried bucket offset + (position within the
// sorted block - start of this key's bucket in the block).
668 for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
670 int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;
671 int binIdx = keys[ie];
672 int groupOffset = localHistogramToCarry[binIdx];
673 int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];
674 #if defined(CHECK_BOUNDARY)
677 gDst[ groupOffset + myIdx ] = sortData[ie];
// Advance the carried offsets by this block's bucket counts
// ('myHistogram' is presumably computed in an elided line — TODO confirm).
683 if( lIdx < NUM_BUCKET )
685 localHistogramToCarry[lIdx] += myHistogram;
// Key/value twin of sort4Bits1: identical 2-bits-per-iteration split logic,
// but the value array rides along through a second LDS buffer so pairs stay
// matched. Keep the rank computation in sync with sort4Bits1 above.
// Barriers and some scope lines are elided in this excerpt.
691 // 2 scan, 2 exchange
692 void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal)
694 for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)
// 2-bit sub-digit of each key.
696 uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3,
697 (sortData[1]>>(startBit+ibit)) & 0x3,
698 (sortData[2]>>(startBit+ibit)) & 0x3,
699 (sortData[3]>>(startBit+ibit)) & 0x3);
// Packed one-hot counters, 8 bits per bucket.
702 u32 sKeyPacked[4] = { 0, 0, 0, 0 };
704 sKeyPacked[0] |= 1<<(8*b.x);
705 sKeyPacked[1] |= 1<<(8*b.y);
706 sKeyPacked[2] |= 1<<(8*b.z);
707 sKeyPacked[3] |= 1<<(8*b.w);
709 key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];
// Packed group scan: totals in sumPacked, exclusive prefix in rankPacked.
715 rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );
720 u32 newOffset[4] = { 0,0,0,0 };
// Bucket base offsets from a byte-wise exclusive scan.
722 u32 sumScanned = bit8Scan( sumPacked );
725 scannedKeys[0] = 1<<(8*b.x);
726 scannedKeys[1] = 1<<(8*b.y);
727 scannedKeys[2] = 1<<(8*b.z);
728 scannedKeys[3] = 1<<(8*b.w);
// Serial exclusive scan over the item's own 4 keys (sum4 update elided).
731 for(int ie=0; ie<4; ie++)
733 u32 tmp = scannedKeys[ie];
734 scannedKeys[ie] = sum4;
// Final LDS slot per key (byte selected via unpack4Key; the 'ie' selector
// presumably comes from elided branches — TODO confirm).
740 u32 sumPlusRank = sumScanned + rankPacked;
742 scannedKeys[0] += sumPlusRank;
743 newOffset[0] = unpack4Key( scannedKeys[0], ie );
746 scannedKeys[1] += sumPlusRank;
747 newOffset[1] = unpack4Key( scannedKeys[1], ie );
750 scannedKeys[2] += sumPlusRank;
751 newOffset[2] = unpack4Key( scannedKeys[2], ie );
754 scannedKeys[3] += sumPlusRank;
755 newOffset[3] = unpack4Key( scannedKeys[3], ie );
// Scatter keys and values to the same sorted slots...
764 ldsSortData[newOffset[0]] = sortData[0];
765 ldsSortData[newOffset[1]] = sortData[1];
766 ldsSortData[newOffset[2]] = sortData[2];
767 ldsSortData[newOffset[3]] = sortData[3];
769 ldsSortVal[newOffset[0]] = sortVal[0];
770 ldsSortVal[newOffset[1]] = sortVal[1];
771 ldsSortVal[newOffset[2]] = sortVal[2];
772 ldsSortVal[newOffset[3]] = sortVal[3];
// ...then gather both back contiguously (barrier elided between phases).
776 u32 dstAddr = 4*lIdx;
777 sortData[0] = ldsSortData[dstAddr+0];
778 sortData[1] = ldsSortData[dstAddr+1];
779 sortData[2] = ldsSortData[dstAddr+2];
780 sortData[3] = ldsSortData[dstAddr+3];
782 sortVal[0] = ldsSortVal[dstAddr+0];
783 sortVal[1] = ldsSortVal[dstAddr+1];
784 sortVal[2] = ldsSortVal[dstAddr+2];
785 sortVal[3] = ldsSortVal[dstAddr+3];
// Pass 3 (key/value): same structure as SortAndScatterKernel but sorting
// SortDataCL pairs via sort4Bits1KeyValue. Keep in sync with the key-only
// kernel above. Braces/barriers and 'myHistogram' computation are elided.
796 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
797 void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)
799 __local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];
800 __local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];
801 __local u32 localHistogramToCarry[NUM_BUCKET];
802 __local u32 localHistogram[NUM_BUCKET*2];
804 u32 gIdx = GET_GLOBAL_IDX;
805 u32 lIdx = GET_LOCAL_IDX;
806 u32 wgIdx = GET_GROUP_IDX;
807 u32 wgSize = GET_GROUP_SIZE;
809 const int n = cb.m_n;
810 const int nWGs = cb.m_nWGs;
811 const int startBit = cb.m_startBit;
812 const int nBlocksPerWG = cb.m_nBlocksPerWG;
// Seed carried global offsets from the scanned histogram.
814 if( lIdx < (NUM_BUCKET) )
816 localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];
822 const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
824 int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;
826 int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
828 for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)
// Load keys and values; tail padded with 0xffffffff under CHECK_BOUNDARY.
833 int sortData[ELEMENTS_PER_WORK_ITEM];
834 int sortVal[ELEMENTS_PER_WORK_ITEM];
836 for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
837 #if defined(CHECK_BOUNDARY)
839 sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff;
840 sortVal[i] = ( addr+i < n )? gSrc[ addr+i ].m_value : 0xffffffff;
844 sortData[i] = gSrc[ addr+i ].m_key;
845 sortVal[i] = gSrc[ addr+i ].m_value;
// Local key/value sort by the current 4-bit digit.
849 sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal);
851 u32 keys[ELEMENTS_PER_WORK_ITEM];
852 for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
853 keys[i] = (sortData[i]>>startBit) & 0xf;
855 { // create histogram
// 16 lanes share one histogram set to cut atomic contention.
856 u32 setIdx = lIdx/16;
857 if( lIdx < NUM_BUCKET )
859 localHistogram[lIdx] = 0;
861 ldsSortData[lIdx] = 0;
864 for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
865 #if defined(CHECK_BOUNDARY)
870 SET_HISTOGRAM( setIdx, keys[i] )++;
872 AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );
// Reduce per-set histograms into localHistogram's upper half.
877 uint hIdx = NUM_BUCKET+lIdx;
878 if( lIdx < NUM_BUCKET )
881 for(int i=0; i<WG_SIZE/16; i++)
883 sum += SET_HISTOGRAM( i, lIdx );
886 localHistogram[hIdx] = sum;
// Exclusive scan of the 16 bucket counts (atomic or doubling variant).
890 #if defined(USE_2LEVEL_REDUCE)
891 if( lIdx < NUM_BUCKET )
893 localHistogram[hIdx] = localHistogram[hIdx-1];
897 u0 = localHistogram[hIdx-3];
898 u1 = localHistogram[hIdx-2];
899 u2 = localHistogram[hIdx-1];
900 AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );
902 u0 = localHistogram[hIdx-12];
903 u1 = localHistogram[hIdx-8];
904 u2 = localHistogram[hIdx-4];
905 AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );
909 if( lIdx < NUM_BUCKET )
911 localHistogram[hIdx] = localHistogram[hIdx-1];
913 localHistogram[hIdx] += localHistogram[hIdx-1];
915 localHistogram[hIdx] += localHistogram[hIdx-2];
917 localHistogram[hIdx] += localHistogram[hIdx-4];
919 localHistogram[hIdx] += localHistogram[hIdx-8];
// Scatter pairs to their final positions.
927 for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
929 int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;
930 int binIdx = keys[ie];
931 int groupOffset = localHistogramToCarry[binIdx];
932 int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];
933 #if defined(CHECK_BOUNDARY)
936 if ((groupOffset + myIdx)<n)
// NOTE(review): the key==value guards below look like leftover debug
// filtering (writes only pairs whose key equals their value) — confirm
// intent before ever enabling CHECK_BOUNDARY.
938 if (sortData[ie]==sortVal[ie])
942 tmp.m_key = sortData[ie];
943 tmp.m_value = sortVal[ie];
944 if (tmp.m_key == tmp.m_value)
945 gDst[groupOffset + myIdx ] = tmp;
// Non-debug path: write key and value fields directly.
951 if ((groupOffset + myIdx)<n)
953 gDst[ groupOffset + myIdx ].m_key = sortData[ie];
954 gDst[ groupOffset + myIdx ].m_value = sortVal[ie];
// Advance carried offsets by this block's bucket counts ('myHistogram'
// presumably computed in an elided line — TODO confirm).
962 if( lIdx < NUM_BUCKET )
964 localHistogramToCarry[lIdx] += myHistogram;
// Serial reference/fallback scatter for key/value pairs: a single work-item
// (selection guard presumably elided) walks every element in order and
// places it via per-bucket counters — trivially stable, used to validate
// or replace the parallel path. Loop braces and the counter zeroing /
// 'i' derivation lines are elided in this excerpt.
977 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
978 void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)
981 u32 gIdx = GET_GLOBAL_IDX;
982 u32 realLocalIdx = GET_LOCAL_IDX;
983 u32 wgIdx = GET_GROUP_IDX;
984 u32 wgSize = GET_GROUP_SIZE;
985 const int startBit = cb.m_startBit;
986 const int n = cb.m_n;
987 const int nWGs = cb.m_nWGs;
988 const int nBlocksPerWG = cb.m_nBlocksPerWG;
// Running per-bucket write counts for this work-group.
990 int counter[NUM_BUCKET];
995 for (int c=0;c<NUM_BUCKET;c++)
998 const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
1000 int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;
// Emulate the parallel kernels' traversal order so offsets line up
// with rHistogram.
1002 for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)
1004 for (int lIdx=0;lIdx<WG_SIZE;lIdx++)
1006 int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
1008 for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)
// 'i' is presumably addr2+j (derivation elided) — TODO confirm.
1014 tableIdx = (gSrc[i].m_key>>startBit) & 0xf;//0xf = NUM_TABLES-1
1015 gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];
1016 counter[tableIdx] ++;
// Keys-only twin of SortAndScatterSortDataKernelSerial: serial, stable
// scatter of raw u32 keys using per-bucket counters. Keep in sync with the
// key/value serial kernel above. Same elisions apply (guard, braces,
// counter zeroing, derivation of 'i').
1026 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
1027 void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4 cb )
1030 u32 gIdx = GET_GLOBAL_IDX;
1031 u32 realLocalIdx = GET_LOCAL_IDX;
1032 u32 wgIdx = GET_GROUP_IDX;
1033 u32 wgSize = GET_GROUP_SIZE;
1034 const int startBit = cb.m_startBit;
1035 const int n = cb.m_n;
1036 const int nWGs = cb.m_nWGs;
1037 const int nBlocksPerWG = cb.m_nBlocksPerWG;
// Running per-bucket write counts for this work-group.
1039 int counter[NUM_BUCKET];
1044 for (int c=0;c<NUM_BUCKET;c++)
1047 const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
1049 int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;
// Same traversal order as the parallel kernels so rHistogram offsets match.
1051 for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)
1053 for (int lIdx=0;lIdx<WG_SIZE;lIdx++)
1055 int addr2 = iblock*blockSize + blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;
1057 for(int j=0; j<ELEMENTS_PER_WORK_ITEM; j++)
// 'i' is presumably addr2+j (derivation elided) — TODO confirm.
1063 tableIdx = (gSrc[i]>>startBit) & 0xf;//0xf = NUM_TABLES-1
1064 gDst[rHistogram[tableIdx*nWGs+wgIdx] + counter[tableIdx]] = gSrc[i];
1065 counter[tableIdx] ++;