[dali_2.3.21] Merge branch 'devel/master'
[platform/core/uifw/dali-toolkit.git] / dali-physics / third-party / bullet3 / src / Bullet3OpenCL / ParallelPrimitives / b3RadixSort32CL.cpp
1
2 #include "b3RadixSort32CL.h"
3 #include "b3LauncherCL.h"
4 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
5 #include "b3PrefixScanCL.h"
6 #include "b3FillCL.h"
7
8 #define RADIXSORT32_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl"
9
10 #include "kernels/RadixSort32KernelsCL.h"
11
12 b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
13         : m_commandQueue(queue)
14 {
15         b3OpenCLDeviceInfo info;
16         b3OpenCLUtils::getDeviceInfo(device, &info);
17         m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU) != 0;
18
19         m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx, queue);
20         m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx, queue);
21         m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx, queue);
22         m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx, queue);
23         m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx, queue);
24         m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx, queue);
25
26         if (initialCapacity > 0)
27         {
28                 m_workBuffer1->resize(initialCapacity);
29                 m_workBuffer3->resize(initialCapacity);
30                 m_workBuffer3a->resize(initialCapacity);
31                 m_workBuffer4->resize(initialCapacity);
32                 m_workBuffer4a->resize(initialCapacity);
33         }
34
35         m_scan = new b3PrefixScanCL(ctx, device, queue);
36         m_fill = new b3FillCL(ctx, device, queue);
37
38         const char* additionalMacros = "";
39
40         cl_int pErrNum;
41         const char* kernelSource = radixSort32KernelsCL;
42
43         cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, RADIXSORT32_PATH);
44         b3Assert(sortProg);
45
46         m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg, additionalMacros);
47         b3Assert(m_streamCountSortDataKernel);
48
49         m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg, additionalMacros);
50         b3Assert(m_streamCountKernel);
51
52         if (m_deviceCPU)
53         {
54                 m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg, additionalMacros);
55                 b3Assert(m_sortAndScatterSortDataKernel);
56                 m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg, additionalMacros);
57                 b3Assert(m_sortAndScatterKernel);
58         }
59         else
60         {
61                 m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg, additionalMacros);
62                 b3Assert(m_sortAndScatterSortDataKernel);
63                 m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg, additionalMacros);
64                 b3Assert(m_sortAndScatterKernel);
65         }
66
67         m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg, additionalMacros);
68         b3Assert(m_prefixScanKernel);
69 }
70
71 b3RadixSort32CL::~b3RadixSort32CL()
72 {
73         delete m_scan;
74         delete m_fill;
75         delete m_workBuffer1;
76         delete m_workBuffer2;
77         delete m_workBuffer3;
78         delete m_workBuffer3a;
79         delete m_workBuffer4;
80         delete m_workBuffer4a;
81
82         clReleaseKernel(m_streamCountSortDataKernel);
83         clReleaseKernel(m_streamCountKernel);
84         clReleaseKernel(m_sortAndScatterSortDataKernel);
85         clReleaseKernel(m_sortAndScatterKernel);
86         clReleaseKernel(m_prefixScanKernel);
87 }
88
89 void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */)
90 {
91         int n = inout.size();
92         const int BITS_PER_PASS = 8;
93         const int NUM_TABLES = (1 << BITS_PER_PASS);
94
95         int tables[NUM_TABLES];
96         int counter[NUM_TABLES];
97
98         b3SortData* src = &inout[0];
99         b3AlignedObjectArray<b3SortData> workbuffer;
100         workbuffer.resize(inout.size());
101         b3SortData* dst = &workbuffer[0];
102
103         int count = 0;
104         for (int startBit = 0; startBit < sortBits; startBit += BITS_PER_PASS)
105         {
106                 for (int i = 0; i < NUM_TABLES; i++)
107                 {
108                         tables[i] = 0;
109                 }
110
111                 for (int i = 0; i < n; i++)
112                 {
113                         int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
114                         tables[tableIdx]++;
115                 }
116 //#define TEST
117 #ifdef TEST
118                 printf("histogram size=%d\n", NUM_TABLES);
119                 for (int i = 0; i < NUM_TABLES; i++)
120                 {
121                         if (tables[i] != 0)
122                         {
123                                 printf("tables[%d]=%d]\n", i, tables[i]);
124                         }
125                 }
126 #endif  //TEST \
127         //      prefix scan
128                 int sum = 0;
129                 for (int i = 0; i < NUM_TABLES; i++)
130                 {
131                         int iData = tables[i];
132                         tables[i] = sum;
133                         sum += iData;
134                         counter[i] = 0;
135                 }
136
137                 //      distribute
138                 for (int i = 0; i < n; i++)
139                 {
140                         int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
141
142                         dst[tables[tableIdx] + counter[tableIdx]] = src[i];
143                         counter[tableIdx]++;
144                 }
145
146                 b3Swap(src, dst);
147                 count++;
148         }
149
150         if (count & 1)
151         {
152                 b3Assert(0);  //need to copy
153         }
154 }
155
156 void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
157 {
158         b3AlignedObjectArray<b3SortData> inout;
159         keyValuesInOut.copyToHost(inout);
160
161         executeHost(inout, sortBits);
162
163         keyValuesInOut.copyFromHost(inout);
164 }
165
166 void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
167                                                           b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
168 {
169 }
170
171 //#define DEBUG_RADIXSORT
172 //#define DEBUG_RADIXSORT2
173
174 void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
175 {
176         int originalSize = keyValuesInOut.size();
177         int workingSize = originalSize;
178
179         int dataAlignment = DATA_ALIGNMENT;
180
181 #ifdef DEBUG_RADIXSORT2
182         b3AlignedObjectArray<b3SortData> test2;
183         keyValuesInOut.copyToHost(test2);
184         printf("numElem = %d\n", test2.size());
185         for (int i = 0; i < test2.size(); i++)
186         {
187                 printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
188                 printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
189         }
190 #endif  //DEBUG_RADIXSORT2
191
192         b3OpenCLArray<b3SortData>* src = 0;
193
194         if (workingSize % dataAlignment)
195         {
196                 workingSize += dataAlignment - (workingSize % dataAlignment);
197                 m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
198                 m_workBuffer4->resize(workingSize);
199                 b3SortData fillValue;
200                 fillValue.m_key = 0xffffffff;
201                 fillValue.m_value = 0xffffffff;
202
203 #define USE_BTFILL
204 #ifdef USE_BTFILL
205                 m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4, (b3Int2&)fillValue, workingSize - originalSize, originalSize);
206 #else
207                 //fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
208
209                 for (int i = originalSize; i < workingSize; i++)
210                 {
211                         m_workBuffer4->copyFromHostPointer(&fillValue, 1, i);
212                 }
213 #endif  //USE_BTFILL
214
215                 src = m_workBuffer4;
216         }
217         else
218         {
219                 src = &keyValuesInOut;
220                 m_workBuffer4->resize(0);
221         }
222
223         b3Assert(workingSize % DATA_ALIGNMENT == 0);
224         int minCap = NUM_BUCKET * NUM_WGS;
225
226         int n = workingSize;
227
228         m_workBuffer1->resize(minCap);
229         m_workBuffer3->resize(workingSize);
230
231         //      ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
232         b3Assert(BITS_PER_PASS == 4);
233         b3Assert(WG_SIZE == 64);
234         b3Assert((sortBits & 0x3) == 0);
235
236         b3OpenCLArray<b3SortData>* dst = m_workBuffer3;
237
238         b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
239         b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
240
241         int nWGs = NUM_WGS;
242         b3ConstData cdata;
243
244         {
245                 int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE;  //set at 256
246                 int nBlocks = (n + blockSize - 1) / (blockSize);
247                 cdata.m_n = n;
248                 cdata.m_nWGs = NUM_WGS;
249                 cdata.m_startBit = 0;
250                 cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
251                 if (nBlocks < NUM_WGS)
252                 {
253                         cdata.m_nBlocksPerWG = 1;
254                         nWGs = nBlocks;
255                 }
256         }
257
258         int count = 0;
259         for (int ib = 0; ib < sortBits; ib += 4)
260         {
261 #ifdef DEBUG_RADIXSORT2
262                 keyValuesInOut.copyToHost(test2);
263                 printf("numElem = %d\n", test2.size());
264                 for (int i = 0; i < test2.size(); i++)
265                 {
266                         if (test2[i].m_key != test2[i].m_value)
267                         {
268                                 printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
269                                 printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
270                         }
271                 }
272 #endif  //DEBUG_RADIXSORT2
273
274                 cdata.m_startBit = ib;
275
276                 if (src->size())
277                 {
278                         b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
279                         b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel, "m_streamCountSortDataKernel");
280
281                         launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
282                         launcher.setConst(cdata);
283
284                         int num = NUM_WGS * WG_SIZE;
285                         launcher.launch1D(num, WG_SIZE);
286                 }
287
288 #ifdef DEBUG_RADIXSORT
289                 b3AlignedObjectArray<unsigned int> testHist;
290                 srcHisto->copyToHost(testHist);
291                 printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
292                 for (int i = 0; i < testHist.size(); i++)
293                 {
294                         if (testHist[i] != 0)
295                                 printf("testHist[%d]=%d\n", i, testHist[i]);
296                 }
297 #endif  //DEBUG_RADIXSORT
298
299 //fast prefix scan is not working properly on Mac OSX yet
300 #ifdef __APPLE__
301                 bool fastScan = false;
302 #else
303                 bool fastScan = !m_deviceCPU;  //only use fast scan on GPU
304 #endif
305
306                 if (fastScan)
307                 {  //   prefix scan group histogram
308                         b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
309                         b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
310                         launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
311                         launcher.setConst(cdata);
312                         launcher.launch1D(128, 128);
313                         destHisto = srcHisto;
314                 }
315                 else
316                 {
317                         //unsigned int sum; //for debugging
318                         m_scan->execute(*srcHisto, *destHisto, 1920, 0);  //,&sum);
319                 }
320
321 #ifdef DEBUG_RADIXSORT
322                 destHisto->copyToHost(testHist);
323                 printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
324                 for (int i = 0; i < testHist.size(); i++)
325                 {
326                         if (testHist[i] != 0)
327                                 printf("testHist[%d]=%d\n", i, testHist[i]);
328                 }
329
330                 for (int i = 0; i < testHist.size(); i += NUM_WGS)
331                 {
332                         printf("testHist[%d]=%d\n", i / NUM_WGS, testHist[i]);
333                 }
334
335 #endif  //DEBUG_RADIXSORT
336
337 #define USE_GPU
338 #ifdef USE_GPU
339
340                 if (src->size())
341                 {  //   local sort and distribute
342                         b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
343                         b3LauncherCL launcher(m_commandQueue, m_sortAndScatterSortDataKernel, "m_sortAndScatterSortDataKernel");
344                         launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
345                         launcher.setConst(cdata);
346                         launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
347                 }
348 #else
349                 {
350 #define NUM_TABLES 16
351 //#define SEQUENTIAL
352 #ifdef SEQUENTIAL
353                         int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
354                         int tables[NUM_TABLES];
355                         int startBit = ib;
356
357                         destHisto->copyToHost(testHist);
358                         b3AlignedObjectArray<b3SortData> srcHost;
359                         b3AlignedObjectArray<b3SortData> dstHost;
360                         dstHost.resize(src->size());
361
362                         src->copyToHost(srcHost);
363
364                         for (int i = 0; i < NUM_TABLES; i++)
365                         {
366                                 tables[i] = testHist[i * NUM_WGS];
367                         }
368
369                         //      distribute
370                         for (int i = 0; i < n; i++)
371                         {
372                                 int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
373
374                                 dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
375                                 counter2[tableIdx]++;
376                         }
377
378 #else
379
380                         int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
381
382                         int tables[NUM_TABLES];
383                         b3AlignedObjectArray<b3SortData> dstHostOK;
384                         dstHostOK.resize(src->size());
385
386                         destHisto->copyToHost(testHist);
387                         b3AlignedObjectArray<b3SortData> srcHost;
388                         src->copyToHost(srcHost);
389
390                         int blockSize = 256;
391                         int nBlocksPerWG = cdata.m_nBlocksPerWG;
392                         int startBit = ib;
393
394                         {
395                                 for (int i = 0; i < NUM_TABLES; i++)
396                                 {
397                                         tables[i] = testHist[i * NUM_WGS];
398                                 }
399
400                                 //      distribute
401                                 for (int i = 0; i < n; i++)
402                                 {
403                                         int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
404
405                                         dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
406                                         counter2[tableIdx]++;
407                                 }
408                         }
409
410                         b3AlignedObjectArray<b3SortData> dstHost;
411                         dstHost.resize(src->size());
412
413                         int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
414
415                         for (int wgIdx = 0; wgIdx < NUM_WGS; wgIdx++)
416                         {
417                                 int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
418
419                                 int nBlocks = (n) / blockSize - nBlocksPerWG * wgIdx;
420
421                                 for (int iblock = 0; iblock < b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
422                                 {
423                                         for (int lIdx = 0; lIdx < 64; lIdx++)
424                                         {
425                                                 int addr = iblock * blockSize + blockSize * cdata.m_nBlocksPerWG * wgIdx + ELEMENTS_PER_WORK_ITEM * lIdx;
426
427                                                 //      MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
428                                                 //      Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
429                                                 //      AMD: AtomInc performs better while NV prefers ++
430                                                 for (int j = 0; j < ELEMENTS_PER_WORK_ITEM; j++)
431                                                 {
432                                                         if (addr + j < n)
433                                                         {
434                                                                 //  printf ("addr+j=%d\n", addr+j);
435
436                                                                 int i = addr + j;
437
438                                                                 int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
439
440                                                                 int destIndex = testHist[tableIdx * NUM_WGS + wgIdx] + counter[tableIdx];
441
442                                                                 b3SortData ok = dstHostOK[destIndex];
443
444                                                                 if (ok.m_key != srcHost[i].m_key)
445                                                                 {
446                                                                         printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key, srcHost[i].m_key);
447                                                                         printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value, srcHost[i].m_value);
448                                                                 }
449                                                                 if (ok.m_value != srcHost[i].m_value)
450                                                                 {
451                                                                         printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value, srcHost[i].m_value);
452                                                                         printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key, srcHost[i].m_key);
453                                                                 }
454
455                                                                 dstHost[destIndex] = srcHost[i];
456                                                                 counter[tableIdx]++;
457                                                         }
458                                                 }
459                                         }
460                                 }
461                         }
462
463 #endif  //SEQUENTIAL
464
465                         dst->copyFromHost(dstHost);
466                 }
467 #endif  //USE_GPU
468
469 #ifdef DEBUG_RADIXSORT
470                 destHisto->copyToHost(testHist);
471                 printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
472                 for (int i = 0; i < testHist.size(); i++)
473                 {
474                         if (testHist[i] != 0)
475                                 printf("testHist[%d]=%d\n", i, testHist[i]);
476                 }
477 #endif  //DEBUG_RADIXSORT
478                 b3Swap(src, dst);
479                 b3Swap(srcHisto, destHisto);
480
481 #ifdef DEBUG_RADIXSORT2
482                 keyValuesInOut.copyToHost(test2);
483                 printf("numElem = %d\n", test2.size());
484                 for (int i = 0; i < test2.size(); i++)
485                 {
486                         if (test2[i].m_key != test2[i].m_value)
487                         {
488                                 printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
489                                 printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
490                         }
491                 }
492 #endif  //DEBUG_RADIXSORT2
493
494                 count++;
495         }
496
497         if (count & 1)
498         {
499                 b3Assert(0);  //need to copy from workbuffer to keyValuesInOut
500         }
501
502         if (m_workBuffer4->size())
503         {
504                 m_workBuffer4->resize(originalSize);
505                 keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
506         }
507
508 #ifdef DEBUG_RADIXSORT
509         keyValuesInOut.copyToHost(test2);
510
511         printf("numElem = %d\n", test2.size());
512         for (int i = 0; i < test2.size(); i++)
513         {
514                 printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
515                 printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
516         }
517 #endif
518 }
519
520 void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
521 {
522         int originalSize = keysInOut.size();
523         int workingSize = originalSize;
524
525         int dataAlignment = DATA_ALIGNMENT;
526
527         b3OpenCLArray<unsigned int>* src = 0;
528
529         if (workingSize % dataAlignment)
530         {
531                 workingSize += dataAlignment - (workingSize % dataAlignment);
532                 m_workBuffer4a->copyFromOpenCLArray(keysInOut);
533                 m_workBuffer4a->resize(workingSize);
534                 unsigned int fillValue = 0xffffffff;
535
536                 m_fill->execute(*m_workBuffer4a, fillValue, workingSize - originalSize, originalSize);
537
538                 src = m_workBuffer4a;
539         }
540         else
541         {
542                 src = &keysInOut;
543                 m_workBuffer4a->resize(0);
544         }
545
546         b3Assert(workingSize % DATA_ALIGNMENT == 0);
547         int minCap = NUM_BUCKET * NUM_WGS;
548
549         int n = workingSize;
550
551         m_workBuffer1->resize(minCap);
552         m_workBuffer3->resize(workingSize);
553         m_workBuffer3a->resize(workingSize);
554
555         //      ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
556         b3Assert(BITS_PER_PASS == 4);
557         b3Assert(WG_SIZE == 64);
558         b3Assert((sortBits & 0x3) == 0);
559
560         b3OpenCLArray<unsigned int>* dst = m_workBuffer3a;
561
562         b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
563         b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
564
565         int nWGs = NUM_WGS;
566         b3ConstData cdata;
567
568         {
569                 int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE;  //set at 256
570                 int nBlocks = (n + blockSize - 1) / (blockSize);
571                 cdata.m_n = n;
572                 cdata.m_nWGs = NUM_WGS;
573                 cdata.m_startBit = 0;
574                 cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
575                 if (nBlocks < NUM_WGS)
576                 {
577                         cdata.m_nBlocksPerWG = 1;
578                         nWGs = nBlocks;
579                 }
580         }
581
582         int count = 0;
583         for (int ib = 0; ib < sortBits; ib += 4)
584         {
585                 cdata.m_startBit = ib;
586
587                 if (src->size())
588                 {
589                         b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
590                         b3LauncherCL launcher(m_commandQueue, m_streamCountKernel, "m_streamCountKernel");
591
592                         launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
593                         launcher.setConst(cdata);
594
595                         int num = NUM_WGS * WG_SIZE;
596                         launcher.launch1D(num, WG_SIZE);
597                 }
598
599 //fast prefix scan is not working properly on Mac OSX yet
600 #ifdef __APPLE__
601                 bool fastScan = false;
602 #else
603                 bool fastScan = !m_deviceCPU;
604 #endif
605
606                 if (fastScan)
607                 {  //   prefix scan group histogram
608                         b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
609                         b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
610                         launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
611                         launcher.setConst(cdata);
612                         launcher.launch1D(128, 128);
613                         destHisto = srcHisto;
614                 }
615                 else
616                 {
617                         //unsigned int sum; //for debugging
618                         m_scan->execute(*srcHisto, *destHisto, 1920, 0);  //,&sum);
619                 }
620
621                 if (src->size())
622                 {  //   local sort and distribute
623                         b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
624                         b3LauncherCL launcher(m_commandQueue, m_sortAndScatterKernel, "m_sortAndScatterKernel");
625                         launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
626                         launcher.setConst(cdata);
627                         launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
628                 }
629
630                 b3Swap(src, dst);
631                 b3Swap(srcHisto, destHisto);
632
633                 count++;
634         }
635
636         if (count & 1)
637         {
638                 b3Assert(0);  //need to copy from workbuffer to keyValuesInOut
639         }
640
641         if (m_workBuffer4a->size())
642         {
643                 m_workBuffer4a->resize(originalSize);
644                 keysInOut.copyFromOpenCLArray(*m_workBuffer4a);
645         }
646 }