[dali_2.3.21] Merge branch 'devel/master'
[platform/core/uifw/dali-toolkit.git] / dali-physics / third-party / bullet3 / src / Bullet3OpenCL / ParallelPrimitives / kernels / RadixSort32Kernels.cl
1 /*
2 Bullet Continuous Collision Detection and Physics Library
3 Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
4
5 This software is provided 'as-is', without any express or implied warranty.
6 In no event will the authors be held liable for any damages arising from the use of this software.
7 Permission is granted to anyone to use this software for any purpose, 
8 including commercial applications, and to alter it and redistribute it freely, 
9 subject to the following restrictions:
10
11 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13 3. This notice may not be removed or altered from any source distribution.
14 */
15 //Author Takahiro Harada
16
17
//#pragma OPENCL EXTENSION cl_amd_printf : enable
//	32-bit atomics on local and global memory are required by the histogram kernels below.
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

typedef unsigned int u32;
//	Thin aliases over the OpenCL work-item built-ins (a 1D launch is assumed).
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define AtomAdd(x, value) atom_add(&(x), value)

//	select( b, a, c ): per component, yields a where the MSB of c is set, else b.
#define SELECT_UINT4( b, a, condition ) select( b,a,condition )


#define make_uint4 (uint4)
#define make_uint2 (uint2)
#define make_int2 (int2)

//	Work-group size of the sort kernels; each work-item handles
//	ELEMENTS_PER_WORK_ITEM keys, so one "block" is 256 keys.
#define WG_SIZE 64
#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE)
//	Radix width: 4 bits per pass -> 16 buckets.
#define BITS_PER_PASS 4
#define NUM_BUCKET (1<<BITS_PER_PASS)
typedef uchar u8;

//	This isn't an optimization for VLIW; it just reduces LDS writes.
#define USE_2LEVEL_REDUCE 1

//#define CHECK_BOUNDARY 1

//#define NV_GPU 1


//	Values scanned per work-item in PrefixScanKernel (tuned per GPU).
//	Cypress
#define nPerWI 16
//	Cayman
//#define nPerWI 20

//	The constant buffer is passed as an int4; these map field names onto its lanes.
#define m_n x
#define m_nWGs y
#define m_startBit z
#define m_nBlocksPerWG w
63
/*
typedef struct
{
	int m_n;
	int m_nWGs;
	int m_startBit;
	int m_nBlocksPerWG;
} ConstBuffer;
*/

//	Key/value pair handled by the *SortData* kernels; sorting is on m_key only,
//	m_value travels along with it.
typedef struct
{
	unsigned int m_key;
	unsigned int m_value;
} SortDataCL;
79
80
//	Exclusive prefix scan of the four lanes of *data, in place.
//	On return data->x..w hold {0, x, x+y, x+y+z}; the total x+y+z+w is returned.
uint prefixScanVectorEx( uint4* data )
{
	uint4 v = data[0];
	data[0].x = 0;
	data[0].y = v.x;
	data[0].z = v.x + v.y;
	data[0].w = v.x + v.y + v.z;
	return v.x + v.y + v.z + v.w;
}
98
//	Work-group-wide exclusive prefix sum of one u32 per work-item.
//	Returns the exclusive prefix for lane lIdx; *totalSum receives the sum of
//	all wgSize values.  sorterSharedMemory must hold 2*wgSize u32s: the lower
//	half is zero-filled so the strided reads below may reach "before" the data
//	safely.  wgSize is 64 or 128.
//	NOTE(review): the dependent LDS read / AtomAdd sequences are separated only
//	by mem_fence, not barrier; this appears to assume the first 64 lanes run in
//	lockstep (AMD wavefront) — confirm before relying on it on other hardware.
u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ )
{
	{	//	Set data: zero apron in the lower half, values in the upper half
		sorterSharedMemory[lIdx] = 0;
		sorterSharedMemory[lIdx+wgSize] = pData;
	}

	GROUP_LDS_BARRIER;

	{	//	Prefix sum
		//	Each of the 64 active lanes owns the odd slot idx (stride 2 covers 128 entries).
		int idx = 2*lIdx + (wgSize+1);
#if defined(USE_2LEVEL_REDUCE)
		if( lIdx < 64 )
		{
			//	Each AtomAdd folds three scan steps into a single LDS update.
			u32 u0, u1, u2;
			u0 = sorterSharedMemory[idx-3];
			u1 = sorterSharedMemory[idx-2];
			u2 = sorterSharedMemory[idx-1];
			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );
			GROUP_MEM_FENCE;

			u0 = sorterSharedMemory[idx-12];
			u1 = sorterSharedMemory[idx-8];
			u2 = sorterSharedMemory[idx-4];
			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );
			GROUP_MEM_FENCE;

			u0 = sorterSharedMemory[idx-48];
			u1 = sorterSharedMemory[idx-32];
			u2 = sorterSharedMemory[idx-16];
			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );
			GROUP_MEM_FENCE;
			if( wgSize > 64 )
			{
				sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
				GROUP_MEM_FENCE;
			}

			//	Fill the even slots so every lane can read its exclusive prefix below.
			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
			GROUP_MEM_FENCE;
		}
#else
		if( lIdx < 64 )
		{
			//	Classic doubling-stride inclusive scan over the odd slots.
			sorterSharedMemory[idx] += sorterSharedMemory[idx-1];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-2];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-4];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-8];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-16];
			GROUP_MEM_FENCE;
			sorterSharedMemory[idx] += sorterSharedMemory[idx-32];
			GROUP_MEM_FENCE;
			if( wgSize > 64 )
			{
				sorterSharedMemory[idx] += sorterSharedMemory[idx-64];
				GROUP_MEM_FENCE;
			}

			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];
			GROUP_MEM_FENCE;
		}
#endif
	}

	GROUP_LDS_BARRIER;

	*totalSum = sorterSharedMemory[wgSize*2-1];	//	inclusive total of the whole group
	u32 addValue = sorterSharedMemory[lIdx+wgSize-1];	//	exclusive prefix for this lane
	return addValue;
}
173
//__attribute__((reqd_work_group_size(128,1,1)))
//	Exclusive prefix sum of 4*128 values (one uint4 per work-item, wgSize 128).
//	Returns pData's four lanes offset by this work-item's group-wide rank;
//	*totalSum receives the grand total.
uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )
{
	u32 laneTotal = prefixScanVectorEx( &pData );
	u32 base = localPrefixSum( laneTotal, lIdx, totalSum, sorterSharedMemory, 128 );
	return pData + make_uint4( base, base, base, base );
}
181
182
//__attribute__((reqd_work_group_size(64,1,1)))
//	Exclusive prefix sum of 4*64 values (one uint4 per work-item, wgSize 64).
//	Returns pData's four lanes offset by this work-item's group-wide rank;
//	*totalSum receives the grand total.
uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )
{
	u32 laneTotal = prefixScanVectorEx( &pData );
	u32 base = localPrefixSum( laneTotal, lIdx, totalSum, sorterSharedMemory, 64 );
	return pData + make_uint4( base, base, base, base );
}
190
//	Extract byte keyIdx (0..3) from a u32 holding four packed 8-bit counters.
u32 unpack4Key( u32 key, int keyIdx )
{
	return ( key >> (8*keyIdx) ) & 0xff;
}
192
//	Exclusive prefix sum of four 8-bit counters packed in v: byte i of the
//	result is the sum of bytes 0..i-1.  Equivalent to
//	(v<<8)+(v<<16)+(v<<24) — carries wrap into higher bytes exactly as in the
//	original shift-and-add form.
u32 bit8Scan(u32 v)
{
	return v * 0x01010100u;
}
197
198 //===
199
200
201
202
//	Per-lane histogram slot in LDS: bucket idx for lane lIdx lives at
//	localHistogramMat[idx*WG_SIZE + lIdx].  Must be expanded inside a kernel
//	that declares localHistogramMat and lIdx.
#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx]
204
205
//	First pass of the radix sort: per work-group, counts how many keys fall in
//	each of the 16 buckets of the 4-bit digit at startBit.
//	Output layout: histogramOut[bucket*nWGs + wg] (later scanned by PrefixScanKernel).
//	NOTE(review): with CHECK_BOUNDARY disabled, n is assumed to be a multiple
//	of the 256-key block size — confirm the host pads the input.
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb )
{
	//	One row per bucket, one private counter column per lane (no cross-lane contention).
	__local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];

	u32 gIdx = GET_GLOBAL_IDX;
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	u32 wgSize = GET_GROUP_SIZE;
	const int startBit = cb.m_startBit;
	const int n = cb.m_n;
	const int nWGs = cb.m_nWGs;
	const int nBlocksPerWG = cb.m_nBlocksPerWG;

	//	Zero this lane's 16 counters.
	for(int i=0; i<NUM_BUCKET; i++)
	{
		MY_HISTOGRAM(i) = 0;
	}

	GROUP_LDS_BARRIER;

	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
	u32 localKey;

	//	Blocks left between this work-group's starting block and the end of the data.
	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;

	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;

	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)
	{
		//	MY_HISTOGRAM( localKeys.x )++ is much more expensive than atomic add, as it requires a read and a write while atomics can just add on AMD
		//	Using registers didn't perform well. It seems like using localKeys to address requires a lot of ALU ops
		//	AMD: AtomInc performs better while NV prefers ++
		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
		{
#if defined(CHECK_BOUNDARY)
			if( addr+i < n )
#endif
			{
				//	4-bit digit of this key.
				localKey = (gSrc[addr+i]>>startBit) & 0xf;
#if defined(NV_GPU)
				MY_HISTOGRAM( localKey )++;
#else
				AtomInc( MY_HISTOGRAM( localKey ) );
#endif
			}
		}
	}

	GROUP_LDS_BARRIER;
	
	//	Lanes 0..15 each reduce one bucket row; the (i+lIdx)%size rotation
	//	staggers LDS accesses across lanes.
	if( lIdx < NUM_BUCKET )
	{
		u32 sum = 0;
		for(int i=0; i<GET_GROUP_SIZE; i++)
		{
			sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];
		}
		histogramOut[lIdx*nWGs+wgIdx] = sum;
	}
}
268
//	Same as StreamCountKernel, but the keys come from SortDataCL::m_key
//	(key/value sorting path).  Output layout: histogramOut[bucket*nWGs + wg].
//	NOTE(review): with CHECK_BOUNDARY disabled, n is assumed to be a multiple
//	of the 256-element block size — confirm the host pads the input.
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4  cb )
{
	//	One row per bucket, one private counter column per lane.
	__local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];

	u32 gIdx = GET_GLOBAL_IDX;
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	u32 wgSize = GET_GROUP_SIZE;
	const int startBit = cb.m_startBit;
	const int n = cb.m_n;
	const int nWGs = cb.m_nWGs;
	const int nBlocksPerWG = cb.m_nBlocksPerWG;

	for(int i=0; i<NUM_BUCKET; i++)
	{
		MY_HISTOGRAM(i) = 0;
	}

	GROUP_LDS_BARRIER;

	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
	u32 localKey;

	//	Blocks left between this work-group's starting block and the end of the data.
	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;

	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;

	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)
	{
		//	MY_HISTOGRAM( localKeys.x )++ is much more expensive than atomic add, as it requires a read and a write while atomics can just add on AMD
		//	Using registers didn't perform well. It seems like using localKeys to address requires a lot of ALU ops
		//	AMD: AtomInc performs better while NV prefers ++
		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
		{
#if defined(CHECK_BOUNDARY)
			if( addr+i < n )
#endif
			{
				//	4-bit digit of the key member.
				localKey = (gSrc[addr+i].m_key>>startBit) & 0xf;
#if defined(NV_GPU)
				MY_HISTOGRAM( localKey )++;
#else
				AtomInc( MY_HISTOGRAM( localKey ) );
#endif
			}
		}
	}

	GROUP_LDS_BARRIER;
	
	//	Lanes 0..15 each reduce one bucket row (rotated access pattern).
	if( lIdx < NUM_BUCKET )
	{
		u32 sum = 0;
		for(int i=0; i<GET_GROUP_SIZE; i++)
		{
			sum += localHistogramMat[lIdx*WG_SIZE+(i+lIdx)%GET_GROUP_SIZE];
		}
		histogramOut[lIdx*nWGs+wgIdx] = sum;
	}
}
331
//	Values scanned per uint4 lane (each of the 128 work-items scans nPerWI values).
#define nPerLane (nPerWI/4)

//	NUM_BUCKET*nWGs < 128*nPerWI
//	In-place exclusive prefix sum over the flattened per-work-group histograms
//	produced by the StreamCount kernels.  Launched as a single work-group of 128.
__kernel
__attribute__((reqd_work_group_size(128,1,1)))
void PrefixScanKernel( __global u32* wHistogram1, int4  cb )
{
	__local u32 ldsTopScanData[128*2];

	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	const int nWGs = cb.m_nWGs;

	//	Load this work-item's nPerWI entries (zero past the end of the table).
	u32 data[nPerWI];
	for(int i=0; i<nPerWI; i++)
	{
		data[i] = 0;
		if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs )
			data[i] = wHistogram1[nPerWI*lIdx+i];
	}

	//	Per-lane partial sums: lane j holds the sum of data[nPerLane*j .. nPerLane*(j+1)-1].
	uint4 myData = make_uint4(0,0,0,0);

	for(int i=0; i<nPerLane; i++)
	{
		myData.x += data[nPerLane*0+i];
		myData.y += data[nPerLane*1+i];
		myData.z += data[nPerLane*2+i];
		myData.w += data[nPerLane*3+i];
	}

	uint totalSum;
	//	Exclusive scan of the 4*128 partial sums across the whole work-group.
	uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData );

//	for(int j=0; j<4; j++) //	somehow it introduces a lot of branches
	//	Exclusive scan within each lane's nPerLane values (manually unrolled over j).
	{	int j = 0;
		u32 sum = 0;
		for(int i=0; i<nPerLane; i++)
		{
			u32 tmp = data[nPerLane*j+i];
			data[nPerLane*j+i] = sum;
			sum += tmp;
		}
	}
	{	int j = 1;
		u32 sum = 0;
		for(int i=0; i<nPerLane; i++)
		{
			u32 tmp = data[nPerLane*j+i];
			data[nPerLane*j+i] = sum;
			sum += tmp;
		}
	}
	{	int j = 2;
		u32 sum = 0;
		for(int i=0; i<nPerLane; i++)
		{
			u32 tmp = data[nPerLane*j+i];
			data[nPerLane*j+i] = sum;
			sum += tmp;
		}
	}
	{	int j = 3;
		u32 sum = 0;
		for(int i=0; i<nPerLane; i++)
		{
			u32 tmp = data[nPerLane*j+i];
			data[nPerLane*j+i] = sum;
			sum += tmp;
		}
	}

	//	Add each lane's scanned base to its local offsets.
	for(int i=0; i<nPerLane; i++)
	{
		data[nPerLane*0+i] += scanned.x;
		data[nPerLane*1+i] += scanned.y;
		data[nPerLane*2+i] += scanned.z;
		data[nPerLane*3+i] += scanned.w;
	}

	//	Write the scanned histogram back in place.
	for(int i=0; i<nPerWI; i++)
	{
		int index = nPerWI*lIdx+i;
		if (index < NUM_BUCKET*nWGs)
			wHistogram1[nPerWI*lIdx+i] = data[i];
	}
}
419
//	4 scan, 4 exchange
//	Sorts the work-group's 4*WG_SIZE keys by the 4-bit digit at startBit using
//	one split (stable 0/1 partition) per bit.  sortData holds this work-item's
//	4 keys; ldsSortData is scratch shared with localPrefixSum64V.
void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)
{
	for(int bitIdx=0; bitIdx<BITS_PER_PASS; bitIdx++)
	{
		u32 mask = (1<<bitIdx);
		//	Non-zero lane where the current bit of the digit is set.
		uint4 cmpResult = make_uint4( (sortData[0]>>startBit) & mask, (sortData[1]>>startBit) & mask, (sortData[2]>>startBit) & mask, (sortData[3]>>startBit) & mask );
		//	1 for each 0-bit key, 0 for each 1-bit key -> scan counts the zeros.
		uint4 prefixSum = SELECT_UINT4( make_uint4(1,1,1,1), make_uint4(0,0,0,0), cmpResult != make_uint4(0,0,0,0) );
		u32 total;
		prefixSum = localPrefixSum64V( prefixSum, lIdx, &total, ldsSortData );
		{
			//	Split: 0-bit keys go to their rank among zeros (prefixSum);
			//	1-bit keys go after all zeros, keeping relative order (stable).
			uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);
			uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );
			dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );

			GROUP_LDS_BARRIER;

			//	Scatter to the new slots, then gather back contiguously.
			ldsSortData[dstAddr.x] = sortData[0];
			ldsSortData[dstAddr.y] = sortData[1];
			ldsSortData[dstAddr.z] = sortData[2];
			ldsSortData[dstAddr.w] = sortData[3];

			GROUP_LDS_BARRIER;

			sortData[0] = ldsSortData[localAddr.x];
			sortData[1] = ldsSortData[localAddr.y];
			sortData[2] = ldsSortData[localAddr.z];
			sortData[3] = ldsSortData[localAddr.w];

			GROUP_LDS_BARRIER;
		}
	}
}
453
//	2 scan, 2 exchange
//	Same result as sort4Bits but handles 2 bits per iteration: the four
//	2-bit-digit counters are packed into one u32 (8 bits each) so a single
//	packed prefix sum ranks all four buckets at once.
void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)
{
	for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)
	{
		//	2-bit digit (0..3) of each of this work-item's 4 keys.
		uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, 
			(sortData[1]>>(startBit+ibit)) & 0x3, 
			(sortData[2]>>(startBit+ibit)) & 0x3, 
			(sortData[3]>>(startBit+ibit)) & 0x3);

		u32 key4;
		u32 sKeyPacked[4] = { 0, 0, 0, 0 };
		{	//	One counter per digit, packed as 4x8 bits; key4 sums all 4 keys.
			sKeyPacked[0] |= 1<<(8*b.x);
			sKeyPacked[1] |= 1<<(8*b.y);
			sKeyPacked[2] |= 1<<(8*b.z);
			sKeyPacked[3] |= 1<<(8*b.w);

			key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];
		}

		u32 rankPacked;
		u32 sumPacked;
		{	//	Packed exclusive scan across the work-group (4 counters in one pass).
			rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );
		}

		GROUP_LDS_BARRIER;

		u32 newOffset[4] = { 0,0,0,0 };
		{
			//	Digit base offsets: byte i of sumScanned = total count of digits < i.
			u32 sumScanned = bit8Scan( sumPacked );

			u32 scannedKeys[4];
			scannedKeys[0] = 1<<(8*b.x);
			scannedKeys[1] = 1<<(8*b.y);
			scannedKeys[2] = 1<<(8*b.z);
			scannedKeys[3] = 1<<(8*b.w);
			{	//	4 scans at once
				u32 sum4 = 0;
				for(int ie=0; ie<4; ie++)
				{
					u32 tmp = scannedKeys[ie];
					scannedKeys[ie] = sum4;
					sum4 += tmp;
				}
			}

			{
				//	Final slot = digit base + group-wide rank + rank among this item's keys.
				u32 sumPlusRank = sumScanned + rankPacked;
				{	u32 ie = b.x;
					scannedKeys[0] += sumPlusRank;
					newOffset[0] = unpack4Key( scannedKeys[0], ie );
				}
				{	u32 ie = b.y;
					scannedKeys[1] += sumPlusRank;
					newOffset[1] = unpack4Key( scannedKeys[1], ie );
				}
				{	u32 ie = b.z;
					scannedKeys[2] += sumPlusRank;
					newOffset[2] = unpack4Key( scannedKeys[2], ie );
				}
				{	u32 ie = b.w;
					scannedKeys[3] += sumPlusRank;
					newOffset[3] = unpack4Key( scannedKeys[3], ie );
				}
			}
		}


		GROUP_LDS_BARRIER;

		{	//	Scatter by new offset, then read back contiguously.
			ldsSortData[newOffset[0]] = sortData[0];
			ldsSortData[newOffset[1]] = sortData[1];
			ldsSortData[newOffset[2]] = sortData[2];
			ldsSortData[newOffset[3]] = sortData[3];

			GROUP_LDS_BARRIER;

			u32 dstAddr = 4*lIdx;
			sortData[0] = ldsSortData[dstAddr+0];
			sortData[1] = ldsSortData[dstAddr+1];
			sortData[2] = ldsSortData[dstAddr+2];
			sortData[3] = ldsSortData[dstAddr+3];

			GROUP_LDS_BARRIER;
		}
	}
}
544
//	LDS histogram for set setIdx (one set per 16 lanes), bucket `key`.
//	Must be expanded where ldsSortData is in scope.
#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key]

//	Final pass (key-only path): locally sorts each 256-key block by the 4-bit
//	digit at startBit and scatters the keys to gDst using the globally scanned
//	histogram rHistogram (layout: rHistogram[bucket*nWGs + wg]).
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4  cb )
{
	__local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];
	__local u32 localHistogramToCarry[NUM_BUCKET];
	__local u32 localHistogram[NUM_BUCKET*2];

	u32 gIdx = GET_GLOBAL_IDX;
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	u32 wgSize = GET_GROUP_SIZE;

	const int n = cb.m_n;
	const int nWGs = cb.m_nWGs;
	const int startBit = cb.m_startBit;
	const int nBlocksPerWG = cb.m_nBlocksPerWG;

	//	Global destination base per bucket; advanced after each block below.
	if( lIdx < (NUM_BUCKET) )
	{
		localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];
	}

	GROUP_LDS_BARRIER;

	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;

	int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;

	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;

	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)
	{
		u32 myHistogram = 0;

		//	NOTE(review): with CHECK_BOUNDARY disabled, n is assumed to be a
		//	multiple of blockSize (tail reads/writes are unguarded).
		u32 sortData[ELEMENTS_PER_WORK_ITEM];
		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
#if defined(CHECK_BOUNDARY)
			sortData[i] = ( addr+i < n )? gSrc[ addr+i ] : 0xffffffff;
#else
			sortData[i] = gSrc[ addr+i ];
#endif

		//	Local sort of the block by the 4-bit digit.
		sort4Bits(sortData, startBit, lIdx, ldsSortData);

		u32 keys[ELEMENTS_PER_WORK_ITEM];
		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
			keys[i] = (sortData[i]>>startBit) & 0xf;

		{	//	create histogram
			u32 setIdx = lIdx/16;
			if( lIdx < NUM_BUCKET )
			{
				localHistogram[lIdx] = 0;
			}
			ldsSortData[lIdx] = 0;
			GROUP_LDS_BARRIER;

			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
#if defined(CHECK_BOUNDARY)
				if( addr+i < n )
#endif

#if defined(NV_GPU)
				SET_HISTOGRAM( setIdx, keys[i] )++;
#else
				AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );
#endif
			
			GROUP_LDS_BARRIER;
			
			//	Reduce the per-set histograms into localHistogram[NUM_BUCKET..];
			//	the lower NUM_BUCKET entries were zeroed above and serve as an
			//	apron for the strided reads of the scan below.
			uint hIdx = NUM_BUCKET+lIdx;
			if( lIdx < NUM_BUCKET )
			{
				u32 sum = 0;
				for(int i=0; i<WG_SIZE/16; i++)
				{
					sum += SET_HISTOGRAM( i, lIdx );
				}
				myHistogram = sum;
				localHistogram[hIdx] = sum;
			}
			GROUP_LDS_BARRIER;

#if defined(USE_2LEVEL_REDUCE)
			//	Exclusive scan of the 16 bucket counts.
			//	NOTE(review): dependent LDS ops separated only by mem_fence —
			//	appears to assume the 16 lanes execute in lockstep; confirm on
			//	non-AMD hardware.
			if( lIdx < NUM_BUCKET )
			{
				localHistogram[hIdx] = localHistogram[hIdx-1];
				GROUP_MEM_FENCE;

				u32 u0, u1, u2;
				u0 = localHistogram[hIdx-3];
				u1 = localHistogram[hIdx-2];
				u2 = localHistogram[hIdx-1];
				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );
				GROUP_MEM_FENCE;
				u0 = localHistogram[hIdx-12];
				u1 = localHistogram[hIdx-8];
				u2 = localHistogram[hIdx-4];
				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );
				GROUP_MEM_FENCE;
			}
#else
			if( lIdx < NUM_BUCKET )
			{
				localHistogram[hIdx] = localHistogram[hIdx-1];
				GROUP_MEM_FENCE;
				localHistogram[hIdx] += localHistogram[hIdx-1];
				GROUP_MEM_FENCE;
				localHistogram[hIdx] += localHistogram[hIdx-2];
				GROUP_MEM_FENCE;
				localHistogram[hIdx] += localHistogram[hIdx-4];
				GROUP_MEM_FENCE;
				localHistogram[hIdx] += localHistogram[hIdx-8];
				GROUP_MEM_FENCE;
			}
#endif
			GROUP_LDS_BARRIER;
		}

		{	//	Scatter: global slot = bucket base + rank of the key within its bucket.
			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
			{
				int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;
				int binIdx = keys[ie];
				int groupOffset = localHistogramToCarry[binIdx];
				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];
#if defined(CHECK_BOUNDARY)
				if( addr+ie < n )
#endif
				gDst[ groupOffset + myIdx ] = sortData[ie];
			}
		}

		GROUP_LDS_BARRIER;

		//	Advance each bucket's base by the number of keys written this block.
		if( lIdx < NUM_BUCKET )
		{
			localHistogramToCarry[lIdx] += myHistogram;
		}
		GROUP_LDS_BARRIER;
	}
}
690
//	2 scan, 2 exchange
//	Same algorithm as sort4Bits1, but carries a value (sortVal) alongside each
//	key through the exchanges: 2 bits per iteration via packed 4x8-bit counters.
void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal)
{
	for(uint ibit=0; ibit<BITS_PER_PASS; ibit+=2)
	{
		//	2-bit digit (0..3) of each of this work-item's 4 keys.
		uint4 b = make_uint4((sortData[0]>>(startBit+ibit)) & 0x3, 
			(sortData[1]>>(startBit+ibit)) & 0x3, 
			(sortData[2]>>(startBit+ibit)) & 0x3, 
			(sortData[3]>>(startBit+ibit)) & 0x3);

		u32 key4;
		u32 sKeyPacked[4] = { 0, 0, 0, 0 };
		{	//	One counter per digit, packed as 4x8 bits; key4 sums all 4 keys.
			sKeyPacked[0] |= 1<<(8*b.x);
			sKeyPacked[1] |= 1<<(8*b.y);
			sKeyPacked[2] |= 1<<(8*b.z);
			sKeyPacked[3] |= 1<<(8*b.w);

			key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];
		}

		u32 rankPacked;
		u32 sumPacked;
		{	//	Packed exclusive scan across the work-group (4 counters in one pass).
			rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );
		}

		GROUP_LDS_BARRIER;

		u32 newOffset[4] = { 0,0,0,0 };
		{
			//	Digit base offsets: byte i of sumScanned = total count of digits < i.
			u32 sumScanned = bit8Scan( sumPacked );

			u32 scannedKeys[4];
			scannedKeys[0] = 1<<(8*b.x);
			scannedKeys[1] = 1<<(8*b.y);
			scannedKeys[2] = 1<<(8*b.z);
			scannedKeys[3] = 1<<(8*b.w);
			{	//	4 scans at once
				u32 sum4 = 0;
				for(int ie=0; ie<4; ie++)
				{
					u32 tmp = scannedKeys[ie];
					scannedKeys[ie] = sum4;
					sum4 += tmp;
				}
			}

			{
				//	Final slot = digit base + group-wide rank + rank among this item's keys.
				u32 sumPlusRank = sumScanned + rankPacked;
				{	u32 ie = b.x;
					scannedKeys[0] += sumPlusRank;
					newOffset[0] = unpack4Key( scannedKeys[0], ie );
				}
				{	u32 ie = b.y;
					scannedKeys[1] += sumPlusRank;
					newOffset[1] = unpack4Key( scannedKeys[1], ie );
				}
				{	u32 ie = b.z;
					scannedKeys[2] += sumPlusRank;
					newOffset[2] = unpack4Key( scannedKeys[2], ie );
				}
				{	u32 ie = b.w;
					scannedKeys[3] += sumPlusRank;
					newOffset[3] = unpack4Key( scannedKeys[3], ie );
				}
			}
		}


		GROUP_LDS_BARRIER;

		{	//	Scatter keys and values by new offset, then read back contiguously.
			ldsSortData[newOffset[0]] = sortData[0];
			ldsSortData[newOffset[1]] = sortData[1];
			ldsSortData[newOffset[2]] = sortData[2];
			ldsSortData[newOffset[3]] = sortData[3];

			ldsSortVal[newOffset[0]] = sortVal[0];
			ldsSortVal[newOffset[1]] = sortVal[1];
			ldsSortVal[newOffset[2]] = sortVal[2];
			ldsSortVal[newOffset[3]] = sortVal[3];

			GROUP_LDS_BARRIER;

			u32 dstAddr = 4*lIdx;
			sortData[0] = ldsSortData[dstAddr+0];
			sortData[1] = ldsSortData[dstAddr+1];
			sortData[2] = ldsSortData[dstAddr+2];
			sortData[3] = ldsSortData[dstAddr+3];

			sortVal[0] = ldsSortVal[dstAddr+0];
			sortVal[1] = ldsSortVal[dstAddr+1];
			sortVal[2] = ldsSortVal[dstAddr+2];
			sortVal[3] = ldsSortVal[dstAddr+3];

			GROUP_LDS_BARRIER;
		}
	}
}
791
792
793
794
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)
{
	//	One pass of the 4-bit-radix sort for key/value pairs: each work-group
	//	locally sorts its assigned blocks on bits [startBit, startBit+4), then
	//	scatters the elements to their global positions using the per-group
	//	prefix sums previously written to rHistogram.
	//	cb carries the constants: m_n (element count), m_nWGs (work-group count),
	//	m_startBit (current radix shift), m_nBlocksPerWG (blocks per group).
	__local int ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];
	__local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];
	__local u32 localHistogramToCarry[NUM_BUCKET];	// running global base offset per bucket, advanced after each block
	__local u32 localHistogram[NUM_BUCKET*2];		// [NUM_BUCKET..2*NUM_BUCKET) holds this block's per-bucket counts / exclusive scan

	u32 gIdx = GET_GLOBAL_IDX;
	u32 lIdx = GET_LOCAL_IDX;
	u32 wgIdx = GET_GROUP_IDX;
	u32 wgSize = GET_GROUP_SIZE;

	const int n = cb.m_n;
	const int nWGs = cb.m_nWGs;
	const int startBit = cb.m_startBit;
	const int nBlocksPerWG = cb.m_nBlocksPerWG;

	//	Seed each bucket's carry with this group's global scatter base from the
	//	previously computed histogram (bucket-major layout: bucket*nWGs + group).
	if( lIdx < (NUM_BUCKET) )
	{
		localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];
	}

	GROUP_LDS_BARRIER;
    

	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;

	//	Number of whole blocks remaining from this group's starting block onward.
	int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;

	//	First element this work-item loads within the group's first block.
	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;

	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)
	{

		u32 myHistogram = 0;

		int sortData[ELEMENTS_PER_WORK_ITEM];
		int sortVal[ELEMENTS_PER_WORK_ITEM];

		//	Load ELEMENTS_PER_WORK_ITEM consecutive key/value pairs; with
		//	CHECK_BOUNDARY, out-of-range slots are padded with 0xffffffff so
		//	they sort to the end.
		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
#if defined(CHECK_BOUNDARY)
		{
			sortData[i] = ( addr+i < n )? gSrc[ addr+i ].m_key : 0xffffffff;
			sortVal[i] = ( addr+i < n )? gSrc[ addr+i ].m_value : 0xffffffff;
		}
#else
		{
			sortData[i] = gSrc[ addr+i ].m_key;
			sortVal[i] = gSrc[ addr+i ].m_value;
		}
#endif

		//	Local sort of the whole block on the current 4-bit digit
		//	(helper defined earlier in this file; uses the two LDS arrays).
		sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal);

		//	Digit (bucket index, 0..15) of each of my now-sorted elements.
		u32 keys[ELEMENTS_PER_WORK_ITEM];
		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
			keys[i] = (sortData[i]>>startBit) & 0xf;

		{	//	create histogram
			//	Each set of 16 work-items accumulates into its own copy to
			//	reduce atomic contention; SET_HISTOGRAM (defined elsewhere)
			//	presumably maps (setIdx, bucket) into ldsSortData — TODO confirm.
			u32 setIdx = lIdx/16;
			if( lIdx < NUM_BUCKET )
			{
				localHistogram[lIdx] = 0;
			}
			ldsSortData[lIdx] = 0;
			GROUP_LDS_BARRIER;

			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)
#if defined(CHECK_BOUNDARY)
				if( addr+i < n )
#endif

#if defined(NV_GPU)
				//	NOTE(review): non-atomic increment — presumably relies on
				//	warp-synchronous execution on NVIDIA hardware; verify.
				SET_HISTOGRAM( setIdx, keys[i] )++;
#else
				AtomInc( SET_HISTOGRAM( setIdx, keys[i] ) );
#endif
			
			GROUP_LDS_BARRIER;
			
			//	Reduce the per-set counts into one count per bucket, stored in
			//	the upper half of localHistogram.
			uint hIdx = NUM_BUCKET+lIdx;
			if( lIdx < NUM_BUCKET )
			{
				u32 sum = 0;
				for(int i=0; i<WG_SIZE/16; i++)
				{
					sum += SET_HISTOGRAM( i, lIdx );
				}
				myHistogram = sum;	// remembered to advance the carry after scattering
				localHistogram[hIdx] = sum;
			}
			GROUP_LDS_BARRIER;

			//	Exclusive prefix-scan of the 16 bucket counts in place
			//	(upper half of localHistogram); the lower half provides the
			//	zero "apron" read by hIdx-1..hIdx-12.
#if defined(USE_2LEVEL_REDUCE)
			if( lIdx < NUM_BUCKET )
			{
				localHistogram[hIdx] = localHistogram[hIdx-1];	// shift right: makes the scan exclusive
				GROUP_MEM_FENCE;

				u32 u0, u1, u2;
				u0 = localHistogram[hIdx-3];
				u1 = localHistogram[hIdx-2];
				u2 = localHistogram[hIdx-1];
				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );
				GROUP_MEM_FENCE;
				u0 = localHistogram[hIdx-12];
				u1 = localHistogram[hIdx-8];
				u2 = localHistogram[hIdx-4];
				AtomAdd( localHistogram[hIdx], u0 + u1 + u2 );
				GROUP_MEM_FENCE;
			}
#else
			//	Hillis-Steele style scan with strides 1,2,4,8; fences only —
			//	presumably assumes the 16 lanes run in lockstep (verify on
			//	non-wavefront hardware).
			if( lIdx < NUM_BUCKET )
			{
				localHistogram[hIdx] = localHistogram[hIdx-1];	// shift right: makes the scan exclusive
				GROUP_MEM_FENCE;
				localHistogram[hIdx] += localHistogram[hIdx-1];
				GROUP_MEM_FENCE;
				localHistogram[hIdx] += localHistogram[hIdx-2];
				GROUP_MEM_FENCE;
				localHistogram[hIdx] += localHistogram[hIdx-4];
				GROUP_MEM_FENCE;
				localHistogram[hIdx] += localHistogram[hIdx-8];
				GROUP_MEM_FENCE;
			}
#endif
			GROUP_LDS_BARRIER;
		}

	{
			//	Scatter: global destination = bucket's global base (carry)
			//	+ rank of the element within its bucket in this block
			//	(position in the locally sorted block minus the bucket's
			//	local scan offset).
			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)
			{
				int dataIdx = ELEMENTS_PER_WORK_ITEM*lIdx+ie;
				int binIdx = keys[ie];
				int groupOffset = localHistogramToCarry[binIdx];
				int myIdx = dataIdx - localHistogram[NUM_BUCKET+binIdx];
#if defined(CHECK_BOUNDARY)
				if( addr+ie < n )
				{
                    if ((groupOffset + myIdx)<n)
                    {
                        //	NOTE(review): the two key==value guards below mean
                        //	elements whose key differs from their value are
                        //	silently dropped in the CHECK_BOUNDARY build —
                        //	looks like a debug remnant or driver workaround;
                        //	confirm intent before relying on this path.
                        if (sortData[ie]==sortVal[ie])
                        {
                            
                            SortDataCL tmp;
                            tmp.m_key = sortData[ie];
                            tmp.m_value = sortVal[ie];
                            if (tmp.m_key == tmp.m_value)
                                gDst[groupOffset + myIdx ] = tmp;
                        }
                        
                    }
				}
#else
                if ((groupOffset + myIdx)<n)
                {
                    gDst[ groupOffset + myIdx ].m_key = sortData[ie];
                    gDst[ groupOffset + myIdx ].m_value = sortVal[ie];
                }
#endif
			}
		}

		GROUP_LDS_BARRIER;

		//	Advance each bucket's global base past the elements this block
		//	just emitted, ready for the next block.
		if( lIdx < NUM_BUCKET )
		{
			localHistogramToCarry[lIdx] += myHistogram;
		}
		GROUP_LDS_BARRIER;
	}
}
969
970
971
972
973
974
975
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)
{
	//	Serial (single-work-item) reference variant of the key/value
	//	sort-and-scatter pass: work-item 0 of each group walks its assigned
	//	element range in order and writes every element to the slot given by
	//	the group's global bucket base plus a running per-bucket count.
	u32 gIdx = GET_GLOBAL_IDX;		// unused; kept for parity with the parallel kernel
	u32 localId = GET_LOCAL_IDX;
	u32 groupId = GET_GROUP_IDX;
	u32 wgSize = GET_GROUP_SIZE;	// unused

	const int shift = cb.m_startBit;
	const int numElems = cb.m_n;
	const int numGroups = cb.m_nWGs;
	const int blocksPerGroup = cb.m_nBlocksPerWG;

	//	Only the first work-item of each group performs the serial scatter.
	if( localId > 0 )
		return;

	//	Elements already emitted into each of the NUM_BUCKET buckets.
	int emitted[NUM_BUCKET];
	for( int b = 0; b < NUM_BUCKET; b++ )
		emitted[b] = 0;

	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
	const int blocksLeft = numElems/blockSize - blocksPerGroup*groupId;
	const int nIter = min( blocksPerGroup, blocksLeft );
	const int first = blockSize*blocksPerGroup*groupId;

	//	The original block/lane/element triple loop scans the group's range
	//	contiguously, so one flat loop visits the same indices in the same order.
	for( int idx = first; idx < first + nIter*blockSize; idx++ )
	{
		if( idx < numElems )
		{
			const int bucket = (gSrc[idx].m_key>>shift) & 0xf;	// 0xf = NUM_TABLES-1
			gDst[ rHistogram[bucket*numGroups+groupId] + emitted[bucket] ] = gSrc[idx];
			emitted[bucket]++;
		}
	}
}
1023
1024
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4  cb )
{
	//	Serial (single-work-item) reference variant of the sort-and-scatter
	//	pass for raw u32 keys: work-item 0 of each group walks its assigned
	//	blocks in order and writes each key to the slot given by the group's
	//	global bucket base plus a running per-bucket count.
	u32 gIdx = GET_GLOBAL_IDX;		// unused; kept for parity with the parallel kernel
	u32 localId = GET_LOCAL_IDX;
	u32 groupId = GET_GROUP_IDX;
	u32 wgSize = GET_GROUP_SIZE;	// unused

	const int shift = cb.m_startBit;
	const int numElems = cb.m_n;
	const int numGroups = cb.m_nWGs;
	const int blocksPerGroup = cb.m_nBlocksPerWG;

	//	Only the first work-item of each group performs the serial scatter.
	if( localId > 0 )
		return;

	//	Keys already emitted into each of the NUM_BUCKET buckets.
	int cnt[NUM_BUCKET];
	for( int b = 0; b < NUM_BUCKET; b++ )
		cnt[b] = 0;

	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;
	const int blocksLeft = numElems/blockSize - blocksPerGroup*groupId;
	const int nIter = min( blocksPerGroup, blocksLeft );
	const int first = blockSize*blocksPerGroup*groupId;

	//	Walk block by block; within a block the elements are consecutive,
	//	matching the lane/element order of the original nested loops.
	for( int ib = 0; ib < nIter; ib++ )
	{
		const int base = first + ib*blockSize;
		for( int e = 0; e < blockSize; e++ )
		{
			const int src = base + e;
			if( src >= numElems )
				continue;
			const u32 key = gSrc[src];
			const int bucket = (key>>shift) & 0xf;	// 0xf = NUM_TABLES-1
			gDst[ rHistogram[bucket*numGroups+groupId] + cnt[bucket] ] = key;
			cnt[bucket]++;
		}
	}
}