1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
18 // Peng Xiao, pengxiao@outlook.com
20 // Redistribution and use in source and binary forms, with or without modification,
21 // are permitted provided that the following conditions are met:
23 // * Redistribution's of source code must retain the above copyright notice,
24 // this list of conditions and the following disclaimer.
26 // * Redistribution's in binary form must reproduce the above copyright notice,
27 // this list of conditions and the following disclaimer in the documentation
28 // and/or other oclMaterials provided with the distribution.
30 // * The name of the copyright holders may not be used to endorse or promote products
31 // derived from this software without specific prior written permission.
33 // This software is provided by the copyright holders and contributors as is and
34 // any express or implied warranties, including, but not limited to, the implied
35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
36 // In no event shall the Intel Corporation or contributors be liable for any direct,
37 // indirect, incidental, special, exemplary, or consequential damages
38 // (including, but not limited to, procurement of substitute goods or services;
39 // loss of use, data, or profits; or business interruption) however caused
40 // and on any theory of liability, whether in contract, strict liability,
41 // or tort (including negligence or otherwise) arising in any way out of
42 // the use of this software, even if advised of the possibility of such damage.
59 #define my_comp(x,y) ((x) > (y))
61 #define my_comp(x,y) ((x) < (y))
64 ///////////// parallel merge sort ///////////////
65 // ported from https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/stablesort_by_key_kernels.cl
66 uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal)
68 // The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence
69 uint firstIndex = left;
70 uint lastIndex = right;
72 // This loops through [firstIndex, lastIndex)
73 // Since firstIndex and lastIndex will be different for every thread depending on the nested branch,
74 // this while loop will be divergent within a wavefront
75 while( firstIndex < lastIndex )
77 K_T dataVal = data[ firstIndex ];
79 // This branch will create divergent wavefronts
80 if( my_comp( dataVal, searchVal ) )
82 firstIndex = firstIndex+1;
93 // This implements a binary search routine to look for an 'insertion point' in a sequence, denoted
94 // by a base pointer and left and right index for a particular candidate value. The comparison operator is
95 // passed as a functor parameter my_comp
96 // This function returns an index that is the first index whos value would be equal to the searched value
97 uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
99 // The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence
100 uint firstIndex = left;
101 uint lastIndex = right;
103 // This loops through [firstIndex, lastIndex)
104 // Since firstIndex and lastIndex will be different for every thread depending on the nested branch,
105 // this while loop will be divergent within a wavefront
106 while( firstIndex < lastIndex )
108 // midIndex is the average of first and last, rounded down
109 uint midIndex = ( firstIndex + lastIndex ) / 2;
110 K_T midValue = data[ midIndex ];
112 // This branch will create divergent wavefronts
113 if( my_comp( midValue, searchVal ) )
115 firstIndex = midIndex+1;
116 // printf( "lowerBound: lastIndex[ %i ]=%i\n", get_local_id( 0 ), lastIndex );
120 lastIndex = midIndex;
121 // printf( "lowerBound: firstIndex[ %i ]=%i\n", get_local_id( 0 ), firstIndex );
128 // This implements a binary search routine to look for an 'insertion point' in a sequence, denoted
129 // by a base pointer and left and right index for a particular candidate value. The comparison operator is
130 // passed as a functor parameter my_comp
131 // This function returns an index that is the first index whos value would be greater than the searched value
132 // If the search value is not found in the sequence, upperbound returns the same result as lowerbound
133 uint upperBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
135 uint upperBound = lowerBoundBinary( data, left, right, searchVal );
137 // printf( "upperBoundBinary: upperBound[ %i, %i ]= %i\n", left, right, upperBound );
138 // If upperBound == right, then searchVal was not found in the sequence. Just return.
139 if( upperBound != right )
141 // While the values are equal i.e. !(x < y) && !(y < x) increment the index
142 K_T upperValue = data[ upperBound ];
143 while( !my_comp( upperValue, searchVal ) && !my_comp( searchVal, upperValue) && (upperBound != right) )
146 upperValue = data[ upperBound ];
153 // This kernel implements merging of blocks of sorted data. The input to this kernel most likely is
154 // the output of blockInsertionSortTemplate. It is expected that the source array contains multiple
155 // blocks, each block is independently sorted. The goal is to write into the output buffer half as
156 // many blocks, of double the size. The even and odd blocks are stably merged together to form
157 // a new sorted block of twice the size. The algorithm is out-of-place.
159 global K_T* iKey_ptr,
160 global V_T* iValue_ptr,
161 global K_T* oKey_ptr,
162 global V_T* oValue_ptr,
163 const uint srcVecSize,
164 const uint srcLogicalBlockSize,
169 size_t globalID = get_global_id( 0 );
170 size_t groupID = get_group_id( 0 );
171 size_t localID = get_local_id( 0 );
172 size_t wgSize = get_local_size( 0 );
174 // Abort threads that are passed the end of the input vector
175 if( globalID >= srcVecSize )
176 return; // on SI this doesn't mess-up barriers
178 // For an element in sequence A, find the lowerbound index for it in sequence B
179 uint srcBlockNum = globalID / srcLogicalBlockSize;
180 uint srcBlockIndex = globalID % srcLogicalBlockSize;
182 // printf( "mergeTemplate: srcBlockNum[%i]=%i\n", srcBlockNum, srcBlockIndex );
184 // Pairs of even-odd blocks will be merged together
185 // An even block should search for an insertion point in the next odd block,
186 // and the odd block should look for an insertion point in the corresponding previous even block
187 uint dstLogicalBlockSize = srcLogicalBlockSize<<1;
188 uint leftBlockIndex = globalID & ~((dstLogicalBlockSize) - 1 );
189 leftBlockIndex += (srcBlockNum & 0x1) ? 0 : srcLogicalBlockSize;
190 leftBlockIndex = min( leftBlockIndex, srcVecSize );
191 uint rightBlockIndex = min( leftBlockIndex + srcLogicalBlockSize, srcVecSize );
193 // if( localID == 0 )
195 // printf( "mergeTemplate: wavefront[ %i ] logicalBlock[ %i ] logicalIndex[ %i ] leftBlockIndex[ %i ] <=> rightBlockIndex[ %i ]\n", groupID, srcBlockNum, srcBlockIndex, leftBlockIndex, rightBlockIndex );
198 // For a particular element in the input array, find the lowerbound index for it in the search sequence given by leftBlockIndex & rightBlockIndex
199 // uint insertionIndex = lowerBoundLinear( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ], my_comp ) - leftBlockIndex;
200 uint insertionIndex = 0;
201 if( (srcBlockNum & 0x1) == 0 )
203 insertionIndex = lowerBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex;
207 insertionIndex = upperBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex;
210 // The index of an element in the result sequence is the summation of it's indixes in the two input
212 uint dstBlockIndex = srcBlockIndex + insertionIndex;
213 uint dstBlockNum = srcBlockNum/2;
215 // if( (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex == 395 )
217 // printf( "mergeTemplate: (dstBlockNum[ %i ] * dstLogicalBlockSize[ %i ]) + dstBlockIndex[ %i ] = srcBlockIndex[ %i ] + insertionIndex[ %i ]\n", dstBlockNum, dstLogicalBlockSize, dstBlockIndex, srcBlockIndex, insertionIndex );
218 // printf( "mergeTemplate: dstBlockIndex[ %i ] = iKey_ptr[ %i ] ( %i )\n", (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex, globalID, iKey_ptr[ globalID ] );
220 oKey_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iKey_ptr[ globalID ];
221 oValue_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iValue_ptr[ globalID ];
222 // printf( "mergeTemplate: leftResultIndex[ %i ]=%i + %i\n", leftResultIndex, srcBlockIndex, leftInsertionIndex );
225 kernel void blockInsertionSort(
227 global V_T* value_ptr,
233 size_t gloId = get_global_id( 0 );
234 size_t groId = get_group_id( 0 );
235 size_t locId = get_local_id( 0 );
236 size_t wgSize = get_local_size( 0 );
238 bool in_range = gloId < vecSize;
241 // Abort threads that are passed the end of the input vector
244 // Make a copy of the entire input array into fast local memory
245 key = key_ptr[ gloId ];
246 val = value_ptr[ gloId ];
247 key_lds[ locId ] = key;
248 val_lds[ locId ] = val;
250 barrier( CLK_LOCAL_MEM_FENCE );
251 // Sorts a workgroup using a naive insertion sort
252 // The sort uses one thread within a workgroup to sort the entire workgroup
253 if( locId == 0 && in_range )
255 // The last workgroup may have an irregular size, so we calculate a per-block endIndex
256 // endIndex is essentially emulating a mod operator with subtraction and multiply
257 size_t endIndex = vecSize - ( groId * wgSize );
258 endIndex = min( endIndex, wgSize );
260 // printf( "Debug: endIndex[%i]=%i\n", groId, endIndex );
262 // Indices are signed because the while loop will generate a -1 index inside of the max function
263 for( int currIndex = 1; currIndex < endIndex; ++currIndex )
265 key = key_lds[ currIndex ];
266 val = val_lds[ currIndex ];
267 int scanIndex = currIndex;
268 K_T ldsKey = key_lds[scanIndex - 1];
269 while( scanIndex > 0 && my_comp( key, ldsKey ) )
271 V_T ldsVal = val_lds[scanIndex - 1];
273 // If the keys are being swapped, make sure the values are swapped identicaly
274 key_lds[ scanIndex ] = ldsKey;
275 val_lds[ scanIndex ] = ldsVal;
277 scanIndex = scanIndex - 1;
278 ldsKey = key_lds[ max( 0, scanIndex - 1 ) ]; // scanIndex-1 may be -1
280 key_lds[ scanIndex ] = key;
281 val_lds[ scanIndex ] = val;
284 barrier( CLK_LOCAL_MEM_FENCE );
288 key = key_lds[ locId ];
289 key_ptr[ gloId ] = key;
291 val = val_lds[ locId ];
292 value_ptr[ gloId ] = val;
296 ///////////// Radix sort from b40c library /////////////