2 #ifndef B3_RADIXSORT32_H
3 #define B3_RADIXSORT32_H
5 #include "b3OpenCLArray.h"
19 #include "b3BufferInfoCL.h"
// Work buffers of unsigned int elements, held as raw pointers to b3OpenCLArray.
// Where they are allocated, sized and released is not visible in this chunk.
23 b3OpenCLArray<unsigned int>* m_workBuffer1;
24 b3OpenCLArray<unsigned int>* m_workBuffer2;
// Work buffers of b3SortData elements.
// NOTE(review): presumably scratch space for the b3SortData overload of execute() - confirm in the implementation.
26 b3OpenCLArray<b3SortData>* m_workBuffer3;
27 b3OpenCLArray<b3SortData>* m_workBuffer4;
// Work buffers of unsigned int elements.
// NOTE(review): the "3a"/"4a" names suggest unsigned-int counterparts of m_workBuffer3/4 - confirm.
29 b3OpenCLArray<unsigned int>* m_workBuffer3a;
30 b3OpenCLArray<unsigned int>* m_workBuffer4a;
// OpenCL command queue handle stored by this object.
// NOTE(review): presumably the queue passed to the constructor - confirm.
32 cl_command_queue m_commandQueue;
// OpenCL kernel handles. The kernel sources and the code that creates and
// releases these handles are not visible in this chunk.
34 cl_kernel m_streamCountSortDataKernel;
35 cl_kernel m_streamCountKernel;
37 cl_kernel m_prefixScanKernel;
38 cl_kernel m_sortAndScatterSortDataKernel;
39 cl_kernel m_sortAndScatterKernel;
// Pointers to helper objects. The `class` keyword in each declaration also acts
// as a forward declaration if the type has not been declared yet.
// NOTE(review): ownership of these objects is not visible here - confirm in the implementation.
43 class b3PrefixScanCL* m_scan;
44 class b3FillCL* m_fill;
// Integer quotient of BLOCK_SIZE and WG_SIZE (both defined outside this chunk).
59 ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE / WG_SIZE),
// 2 raised to BITS_PER_PASS (BITS_PER_PASS is defined outside this chunk).
61 NUM_BUCKET = (1 << BITS_PER_PASS),
62 // if you change this, change nPerWI in kernel as well
// Evaluates to 120. The two commented-out lines below are alternative values.
// NOTE(review): "cypress", "cayman" and "nv" look like GPU target names - confirm.
63 NUM_WGS = 20 * 6, // cypress
64 // NUM_WGS = 24*6, // cayman
65 // NUM_WGS = 32*4, // nv
// Constructor: takes an OpenCL context, device and command queue, plus an
// optional initialCapacity (defaults to 0). The definition is not in this chunk.
// NOTE(review): initialCapacity presumably pre-sizes the work buffers - confirm.
70 b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity = 0);
// Virtual destructor; declared here, defined elsewhere.
// NOTE(review): presumably releases the kernels and the pointer members - confirm.
72 virtual ~b3RadixSort32CL();
// Overload taking separate device arrays for input keys, output keys, input
// values and output values, an element count n, and sortBits (defaults to 32).
// All four arrays are passed by non-const reference. Definition not in this chunk.
// NOTE(review): presumably sorts n key/value pairs by the low sortBits bits of the key - confirm.
74 void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
75 b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
// Overload taking a single device array of unsigned int keys by non-const
// reference, and sortBits (defaults to 32). Definition not in this chunk.
// NOTE(review): the "InOut" name suggests the array is sorted in place - confirm.
78 void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32);
// Overload taking a device array of b3SortData by non-const reference, and
// sortBits (defaults to 32). Definition not in this chunk.
// NOTE(review): the "InOut" name suggests the array is sorted in place - confirm.
80 void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
// executeHost overload taking a b3OpenCLArray of b3SortData by non-const
// reference, and sortBits (defaults to 32). Definition not in this chunk.
// NOTE(review): presumably performs the sort on the CPU rather than with the OpenCL kernels - confirm.
81 void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
// executeHost overload taking a b3AlignedObjectArray of b3SortData by
// non-const reference, and sortBits (defaults to 32). Definition not in this chunk.
// NOTE(review): presumably sorts the host-side array on the CPU - confirm.
82 void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
84 #endif //B3_RADIXSORT32_H