2 #include "b3RadixSort32CL.h"
3 #include "b3LauncherCL.h"
4 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
5 #include "b3PrefixScanCL.h"
8 #define RADIXSORT32_PATH "src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32Kernels.cl"
10 #include "kernels/RadixSort32KernelsCL.h"
12 b3RadixSort32CL::b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity)
13 : m_commandQueue(queue)
15 b3OpenCLDeviceInfo info;
16 b3OpenCLUtils::getDeviceInfo(device, &info);
17 m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU) != 0;
19 m_workBuffer1 = new b3OpenCLArray<unsigned int>(ctx, queue);
20 m_workBuffer2 = new b3OpenCLArray<unsigned int>(ctx, queue);
21 m_workBuffer3 = new b3OpenCLArray<b3SortData>(ctx, queue);
22 m_workBuffer3a = new b3OpenCLArray<unsigned int>(ctx, queue);
23 m_workBuffer4 = new b3OpenCLArray<b3SortData>(ctx, queue);
24 m_workBuffer4a = new b3OpenCLArray<unsigned int>(ctx, queue);
26 if (initialCapacity > 0)
28 m_workBuffer1->resize(initialCapacity);
29 m_workBuffer3->resize(initialCapacity);
30 m_workBuffer3a->resize(initialCapacity);
31 m_workBuffer4->resize(initialCapacity);
32 m_workBuffer4a->resize(initialCapacity);
35 m_scan = new b3PrefixScanCL(ctx, device, queue);
36 m_fill = new b3FillCL(ctx, device, queue);
38 const char* additionalMacros = "";
41 const char* kernelSource = radixSort32KernelsCL;
43 cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, RADIXSORT32_PATH);
46 m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg, additionalMacros);
47 b3Assert(m_streamCountSortDataKernel);
49 m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg, additionalMacros);
50 b3Assert(m_streamCountKernel);
54 m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", &pErrNum, sortProg, additionalMacros);
55 b3Assert(m_sortAndScatterSortDataKernel);
56 m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg, additionalMacros);
57 b3Assert(m_sortAndScatterKernel);
61 m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg, additionalMacros);
62 b3Assert(m_sortAndScatterSortDataKernel);
63 m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg, additionalMacros);
64 b3Assert(m_sortAndScatterKernel);
67 m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg, additionalMacros);
68 b3Assert(m_prefixScanKernel);
71 b3RadixSort32CL::~b3RadixSort32CL()
78 delete m_workBuffer3a;
80 delete m_workBuffer4a;
82 clReleaseKernel(m_streamCountSortDataKernel);
83 clReleaseKernel(m_streamCountKernel);
84 clReleaseKernel(m_sortAndScatterSortDataKernel);
85 clReleaseKernel(m_sortAndScatterKernel);
86 clReleaseKernel(m_prefixScanKernel);
89 void b3RadixSort32CL::executeHost(b3AlignedObjectArray<b3SortData>& inout, int sortBits /* = 32 */)
92 const int BITS_PER_PASS = 8;
93 const int NUM_TABLES = (1 << BITS_PER_PASS);
95 int tables[NUM_TABLES];
96 int counter[NUM_TABLES];
98 b3SortData* src = &inout[0];
99 b3AlignedObjectArray<b3SortData> workbuffer;
100 workbuffer.resize(inout.size());
101 b3SortData* dst = &workbuffer[0];
104 for (int startBit = 0; startBit < sortBits; startBit += BITS_PER_PASS)
106 for (int i = 0; i < NUM_TABLES; i++)
111 for (int i = 0; i < n; i++)
113 int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
118 printf("histogram size=%d\n", NUM_TABLES);
119 for (int i = 0; i < NUM_TABLES; i++)
123 printf("tables[%d]=%d]\n", i, tables[i]);
129 for (int i = 0; i < NUM_TABLES; i++)
131 int iData = tables[i];
138 for (int i = 0; i < n; i++)
140 int tableIdx = (src[i].m_key >> startBit) & (NUM_TABLES - 1);
142 dst[tables[tableIdx] + counter[tableIdx]] = src[i];
152 b3Assert(0); //need to copy
156 void b3RadixSort32CL::executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
158 b3AlignedObjectArray<b3SortData> inout;
159 keyValuesInOut.copyToHost(inout);
161 executeHost(inout, sortBits);
163 keyValuesInOut.copyFromHost(inout);
166 void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
167 b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits)
171 //#define DEBUG_RADIXSORT
172 //#define DEBUG_RADIXSORT2
174 void b3RadixSort32CL::execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits /* = 32 */)
176 int originalSize = keyValuesInOut.size();
177 int workingSize = originalSize;
179 int dataAlignment = DATA_ALIGNMENT;
181 #ifdef DEBUG_RADIXSORT2
182 b3AlignedObjectArray<b3SortData> test2;
183 keyValuesInOut.copyToHost(test2);
184 printf("numElem = %d\n", test2.size());
185 for (int i = 0; i < test2.size(); i++)
187 printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
188 printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
190 #endif //DEBUG_RADIXSORT2
192 b3OpenCLArray<b3SortData>* src = 0;
194 if (workingSize % dataAlignment)
196 workingSize += dataAlignment - (workingSize % dataAlignment);
197 m_workBuffer4->copyFromOpenCLArray(keyValuesInOut);
198 m_workBuffer4->resize(workingSize);
199 b3SortData fillValue;
200 fillValue.m_key = 0xffffffff;
201 fillValue.m_value = 0xffffffff;
205 m_fill->execute((b3OpenCLArray<b3Int2>&)*m_workBuffer4, (b3Int2&)fillValue, workingSize - originalSize, originalSize);
207 //fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side)
209 for (int i = originalSize; i < workingSize; i++)
211 m_workBuffer4->copyFromHostPointer(&fillValue, 1, i);
219 src = &keyValuesInOut;
220 m_workBuffer4->resize(0);
223 b3Assert(workingSize % DATA_ALIGNMENT == 0);
224 int minCap = NUM_BUCKET * NUM_WGS;
228 m_workBuffer1->resize(minCap);
229 m_workBuffer3->resize(workingSize);
231 // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
232 b3Assert(BITS_PER_PASS == 4);
233 b3Assert(WG_SIZE == 64);
234 b3Assert((sortBits & 0x3) == 0);
236 b3OpenCLArray<b3SortData>* dst = m_workBuffer3;
238 b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
239 b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
245 int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256
246 int nBlocks = (n + blockSize - 1) / (blockSize);
248 cdata.m_nWGs = NUM_WGS;
249 cdata.m_startBit = 0;
250 cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
251 if (nBlocks < NUM_WGS)
253 cdata.m_nBlocksPerWG = 1;
259 for (int ib = 0; ib < sortBits; ib += 4)
261 #ifdef DEBUG_RADIXSORT2
262 keyValuesInOut.copyToHost(test2);
263 printf("numElem = %d\n", test2.size());
264 for (int i = 0; i < test2.size(); i++)
266 if (test2[i].m_key != test2[i].m_value)
268 printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
269 printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
272 #endif //DEBUG_RADIXSORT2
274 cdata.m_startBit = ib;
278 b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
279 b3LauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel, "m_streamCountSortDataKernel");
281 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
282 launcher.setConst(cdata);
284 int num = NUM_WGS * WG_SIZE;
285 launcher.launch1D(num, WG_SIZE);
288 #ifdef DEBUG_RADIXSORT
289 b3AlignedObjectArray<unsigned int> testHist;
290 srcHisto->copyToHost(testHist);
291 printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
292 for (int i = 0; i < testHist.size(); i++)
294 if (testHist[i] != 0)
295 printf("testHist[%d]=%d\n", i, testHist[i]);
297 #endif //DEBUG_RADIXSORT
299 //fast prefix scan is not working properly on Mac OSX yet
301 bool fastScan = false;
303 bool fastScan = !m_deviceCPU; //only use fast scan on GPU
307 { // prefix scan group histogram
308 b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
309 b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
310 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
311 launcher.setConst(cdata);
312 launcher.launch1D(128, 128);
313 destHisto = srcHisto;
317 //unsigned int sum; //for debugging
318 m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum);
321 #ifdef DEBUG_RADIXSORT
322 destHisto->copyToHost(testHist);
323 printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
324 for (int i = 0; i < testHist.size(); i++)
326 if (testHist[i] != 0)
327 printf("testHist[%d]=%d\n", i, testHist[i]);
330 for (int i = 0; i < testHist.size(); i += NUM_WGS)
332 printf("testHist[%d]=%d\n", i / NUM_WGS, testHist[i]);
335 #endif //DEBUG_RADIXSORT
341 { // local sort and distribute
342 b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
343 b3LauncherCL launcher(m_commandQueue, m_sortAndScatterSortDataKernel, "m_sortAndScatterSortDataKernel");
344 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
345 launcher.setConst(cdata);
346 launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
350 #define NUM_TABLES 16
353 int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
354 int tables[NUM_TABLES];
357 destHisto->copyToHost(testHist);
358 b3AlignedObjectArray<b3SortData> srcHost;
359 b3AlignedObjectArray<b3SortData> dstHost;
360 dstHost.resize(src->size());
362 src->copyToHost(srcHost);
364 for (int i = 0; i < NUM_TABLES; i++)
366 tables[i] = testHist[i * NUM_WGS];
370 for (int i = 0; i < n; i++)
372 int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
374 dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
375 counter2[tableIdx]++;
380 int counter2[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
382 int tables[NUM_TABLES];
383 b3AlignedObjectArray<b3SortData> dstHostOK;
384 dstHostOK.resize(src->size());
386 destHisto->copyToHost(testHist);
387 b3AlignedObjectArray<b3SortData> srcHost;
388 src->copyToHost(srcHost);
391 int nBlocksPerWG = cdata.m_nBlocksPerWG;
395 for (int i = 0; i < NUM_TABLES; i++)
397 tables[i] = testHist[i * NUM_WGS];
401 for (int i = 0; i < n; i++)
403 int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
405 dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i];
406 counter2[tableIdx]++;
410 b3AlignedObjectArray<b3SortData> dstHost;
411 dstHost.resize(src->size());
413 int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
415 for (int wgIdx = 0; wgIdx < NUM_WGS; wgIdx++)
417 int counter[NUM_TABLES] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
419 int nBlocks = (n) / blockSize - nBlocksPerWG * wgIdx;
421 for (int iblock = 0; iblock < b3Min(cdata.m_nBlocksPerWG, nBlocks); iblock++)
423 for (int lIdx = 0; lIdx < 64; lIdx++)
425 int addr = iblock * blockSize + blockSize * cdata.m_nBlocksPerWG * wgIdx + ELEMENTS_PER_WORK_ITEM * lIdx;
427 // MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD
428 // Using registers didn't perform well. It seems like use localKeys to address requires a lot of alu ops
429 // AMD: AtomInc performs better while NV prefers ++
430 for (int j = 0; j < ELEMENTS_PER_WORK_ITEM; j++)
434 // printf ("addr+j=%d\n", addr+j);
438 int tableIdx = (srcHost[i].m_key >> startBit) & (NUM_TABLES - 1);
440 int destIndex = testHist[tableIdx * NUM_WGS + wgIdx] + counter[tableIdx];
442 b3SortData ok = dstHostOK[destIndex];
444 if (ok.m_key != srcHost[i].m_key)
446 printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key, srcHost[i].m_key);
447 printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value, srcHost[i].m_value);
449 if (ok.m_value != srcHost[i].m_value)
451 printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value, srcHost[i].m_value);
452 printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key, srcHost[i].m_key);
455 dstHost[destIndex] = srcHost[i];
465 dst->copyFromHost(dstHost);
469 #ifdef DEBUG_RADIXSORT
470 destHisto->copyToHost(testHist);
471 printf("ib = %d, testHist size = %d, non zero elements:\n", ib, testHist.size());
472 for (int i = 0; i < testHist.size(); i++)
474 if (testHist[i] != 0)
475 printf("testHist[%d]=%d\n", i, testHist[i]);
477 #endif //DEBUG_RADIXSORT
479 b3Swap(srcHisto, destHisto);
481 #ifdef DEBUG_RADIXSORT2
482 keyValuesInOut.copyToHost(test2);
483 printf("numElem = %d\n", test2.size());
484 for (int i = 0; i < test2.size(); i++)
486 if (test2[i].m_key != test2[i].m_value)
488 printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
489 printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
492 #endif //DEBUG_RADIXSORT2
499 b3Assert(0); //need to copy from workbuffer to keyValuesInOut
502 if (m_workBuffer4->size())
504 m_workBuffer4->resize(originalSize);
505 keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4);
508 #ifdef DEBUG_RADIXSORT
509 keyValuesInOut.copyToHost(test2);
511 printf("numElem = %d\n", test2.size());
512 for (int i = 0; i < test2.size(); i++)
514 printf("test2[%d].m_key=%d\n", i, test2[i].m_key);
515 printf("test2[%d].m_value=%d\n", i, test2[i].m_value);
520 void b3RadixSort32CL::execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits /* = 32 */)
522 int originalSize = keysInOut.size();
523 int workingSize = originalSize;
525 int dataAlignment = DATA_ALIGNMENT;
527 b3OpenCLArray<unsigned int>* src = 0;
529 if (workingSize % dataAlignment)
531 workingSize += dataAlignment - (workingSize % dataAlignment);
532 m_workBuffer4a->copyFromOpenCLArray(keysInOut);
533 m_workBuffer4a->resize(workingSize);
534 unsigned int fillValue = 0xffffffff;
536 m_fill->execute(*m_workBuffer4a, fillValue, workingSize - originalSize, originalSize);
538 src = m_workBuffer4a;
543 m_workBuffer4a->resize(0);
546 b3Assert(workingSize % DATA_ALIGNMENT == 0);
547 int minCap = NUM_BUCKET * NUM_WGS;
551 m_workBuffer1->resize(minCap);
552 m_workBuffer3->resize(workingSize);
553 m_workBuffer3a->resize(workingSize);
555 // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 );
556 b3Assert(BITS_PER_PASS == 4);
557 b3Assert(WG_SIZE == 64);
558 b3Assert((sortBits & 0x3) == 0);
560 b3OpenCLArray<unsigned int>* dst = m_workBuffer3a;
562 b3OpenCLArray<unsigned int>* srcHisto = m_workBuffer1;
563 b3OpenCLArray<unsigned int>* destHisto = m_workBuffer2;
569 int blockSize = ELEMENTS_PER_WORK_ITEM * WG_SIZE; //set at 256
570 int nBlocks = (n + blockSize - 1) / (blockSize);
572 cdata.m_nWGs = NUM_WGS;
573 cdata.m_startBit = 0;
574 cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1) / cdata.m_nWGs;
575 if (nBlocks < NUM_WGS)
577 cdata.m_nBlocksPerWG = 1;
583 for (int ib = 0; ib < sortBits; ib += 4)
585 cdata.m_startBit = ib;
589 b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(srcHisto->getBufferCL())};
590 b3LauncherCL launcher(m_commandQueue, m_streamCountKernel, "m_streamCountKernel");
592 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
593 launcher.setConst(cdata);
595 int num = NUM_WGS * WG_SIZE;
596 launcher.launch1D(num, WG_SIZE);
599 //fast prefix scan is not working properly on Mac OSX yet
601 bool fastScan = false;
603 bool fastScan = !m_deviceCPU;
607 { // prefix scan group histogram
608 b3BufferInfoCL bInfo[] = {b3BufferInfoCL(srcHisto->getBufferCL())};
609 b3LauncherCL launcher(m_commandQueue, m_prefixScanKernel, "m_prefixScanKernel");
610 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
611 launcher.setConst(cdata);
612 launcher.launch1D(128, 128);
613 destHisto = srcHisto;
617 //unsigned int sum; //for debugging
618 m_scan->execute(*srcHisto, *destHisto, 1920, 0); //,&sum);
622 { // local sort and distribute
623 b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src->getBufferCL(), true), b3BufferInfoCL(destHisto->getBufferCL(), true), b3BufferInfoCL(dst->getBufferCL())};
624 b3LauncherCL launcher(m_commandQueue, m_sortAndScatterKernel, "m_sortAndScatterKernel");
625 launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
626 launcher.setConst(cdata);
627 launcher.launch1D(nWGs * WG_SIZE, WG_SIZE);
631 b3Swap(srcHisto, destHisto);
638 b3Assert(0); //need to copy from workbuffer to keyValuesInOut
641 if (m_workBuffer4a->size())
643 m_workBuffer4a->resize(originalSize);
644 keysInOut.copyFromOpenCLArray(*m_workBuffer4a);