1 static const char* radixSortSimpleKernelsCL = \
2 "#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
3 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
5 "typedef unsigned int u32;\n"
6 "#define GET_GROUP_IDX get_group_id(0)\n"
7 "#define GET_LOCAL_IDX get_local_id(0)\n"
8 "#define GET_GLOBAL_IDX get_global_id(0)\n"
9 "#define GET_GROUP_SIZE get_local_size(0)\n"
10 "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
11 "#define AtomInc(x) atom_inc(&(x))\n"
12 "#define AtomInc1(x, out) out = atom_inc(&(x))\n"
15 "#define WG_SIZE 128\n"
16 "#define NUM_PER_WI 4\n"
30 " u32 m_padding[2];\n"
35 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
36 "void LocalCountKernel(__global SortData* sortData,\n"
37 " __global u32* ldsHistogramOut,\n"
40 " __local u32 ldsHistogram[16][256];\n"
42 " int lIdx = GET_LOCAL_IDX;\n"
43 " int gIdx = GET_GLOBAL_IDX;\n"
45 " for(int i=0; i<16; i++)\n"
47 " ldsHistogram[i][lIdx] = 0.f;\n"
48 " ldsHistogram[i][lIdx+128] = 0.f;\n"
51 " GROUP_LDS_BARRIER;\n"
53 " SortData datas[NUM_PER_WI];\n"
54 " datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
55 " datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
56 " datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
57 " datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
59 " datas[0].m_key = (datas[0].m_key >> cb.m_startBit) & 0xff;\n"
60 " datas[1].m_key = (datas[1].m_key >> cb.m_startBit) & 0xff;\n"
61 " datas[2].m_key = (datas[2].m_key >> cb.m_startBit) & 0xff;\n"
62 " datas[3].m_key = (datas[3].m_key >> cb.m_startBit) & 0xff;\n"
64 " int tableIdx = lIdx%16;\n"
66 " AtomInc(ldsHistogram[tableIdx][datas[0].m_key]);\n"
67 " AtomInc(ldsHistogram[tableIdx][datas[1].m_key]);\n"
68 " AtomInc(ldsHistogram[tableIdx][datas[2].m_key]);\n"
69 " AtomInc(ldsHistogram[tableIdx][datas[3].m_key]);\n"
71 " GROUP_LDS_BARRIER;\n"
75 " for(int i=0; i<16; i++)\n"
77 " sum0 += ldsHistogram[i][lIdx];\n"
78 " sum1 += ldsHistogram[i][lIdx+128];\n"
81 " ldsHistogramOut[lIdx*cb.m_numGroups+GET_GROUP_IDX] = sum0;\n"
82 " ldsHistogramOut[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX] = sum1;\n"
86 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
87 "void ScatterKernel(__global SortData* sortData,\n"
88 " __global SortData* sortDataOut,\n"
89 " __global u32* scannedHistogram,\n"
92 " __local u32 ldsCurrentLocation[256];\n"
94 " int lIdx = GET_LOCAL_IDX;\n"
95 " int gIdx = GET_GLOBAL_IDX;\n"
98 " ldsCurrentLocation[lIdx] = scannedHistogram[lIdx*cb.m_numGroups+GET_GROUP_IDX];\n"
99 " ldsCurrentLocation[lIdx+128] = scannedHistogram[(lIdx+128)*cb.m_numGroups+GET_GROUP_IDX];\n"
102 " GROUP_LDS_BARRIER;\n"
104 " SortData datas[NUM_PER_WI];\n"
105 " int keys[NUM_PER_WI];\n"
106 " datas[0] = sortData[gIdx*NUM_PER_WI+0];\n"
107 " datas[1] = sortData[gIdx*NUM_PER_WI+1];\n"
108 " datas[2] = sortData[gIdx*NUM_PER_WI+2];\n"
109 " datas[3] = sortData[gIdx*NUM_PER_WI+3];\n"
111 " keys[0] = (datas[0].m_key >> cb.m_startBit) & 0xff;\n"
112 " keys[1] = (datas[1].m_key >> cb.m_startBit) & 0xff;\n"
113 " keys[2] = (datas[2].m_key >> cb.m_startBit) & 0xff;\n"
114 " keys[3] = (datas[3].m_key >> cb.m_startBit) & 0xff;\n"
116 " int dst[NUM_PER_WI];\n"
117 " for(int i=0; i<WG_SIZE; i++)\n"
121 " AtomInc1(ldsCurrentLocation[keys[0]], dst[0]);\n"
122 " AtomInc1(ldsCurrentLocation[keys[1]], dst[1]);\n"
123 " AtomInc1(ldsCurrentLocation[keys[2]], dst[2]);\n"
124 " AtomInc1(ldsCurrentLocation[keys[3]], dst[3]);\n"
126 " GROUP_LDS_BARRIER;\n"
128 " sortDataOut[dst[0]] = datas[0];\n"
129 " sortDataOut[dst[1]] = datas[1];\n"
130 " sortDataOut[dst[2]] = datas[2];\n"
131 " sortDataOut[dst[3]] = datas[3];\n"