modules/ocl/src/opencl/kernel_stablesort_by_key.cl

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Peng Xiao, pengxiao@outlook.com
  19 //
  20 // Redistribution and use in source and binary forms, with or without modification,
  21 // are permitted provided that the following conditions are met:
  22 //
  23 //   * Redistribution's of source code must retain the above copyright notice,
  24 //     this list of conditions and the following disclaimer.
  25 //
  26 //   * Redistribution's in binary form must reproduce the above copyright notice,
  27 //     this list of conditions and the following disclaimer in the documentation
  28 //     and/or other oclMaterials provided with the distribution.
  29 //
  30 //   * The name of the copyright holders may not be used to endorse or promote products
  31 //     derived from this software without specific prior written permission.
  32 //
  33 // This software is provided by the copyright holders and contributors as is and
  34 // any express or implied warranties, including, but not limited to, the implied
  35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  36 // In no event shall the Intel Corporation or contributors be liable for any direct,
  37 // indirect, incidental, special, exemplary, or consequential damages
  38 // (including, but not limited to, procurement of substitute goods or services;
  39 // loss of use, data, or profits; or business interruption) however caused
  40 // and on any theory of liability, whether in contract, strict liability,
  41 // or tort (including negligence or otherwise) arising in any way out of
  42 // the use of this software, even if advised of the possibility of such damage.
  43 //
  44 //M*/
  45
  46 #ifndef K_T
  47 #define K_T float
  48 #endif
  49
  50 #ifndef V_T
  51 #define V_T float
  52 #endif
  53
  54 #ifndef IS_GT
  55 #define IS_GT false
  56 #endif
  57
  58 #if IS_GT
  59 #define my_comp(x,y) ((x) > (y))
  60 #else
  61 #define my_comp(x,y) ((x) < (y))
  62 #endif
  63
  64 ///////////// parallel merge sort ///////////////
  65 // ported from https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/stablesort_by_key_kernels.cl
  66 uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal)
  67 {
  68     //  The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence
  69     uint firstIndex = left;
  70     uint lastIndex = right;
  71
  72     //  This loops through [firstIndex, lastIndex)
  73     //  Since firstIndex and lastIndex will be different for every thread depending on the nested branch,
  74     //  this while loop will be divergent within a wavefront
  75     while( firstIndex < lastIndex )
  76     {
  77         K_T dataVal = data[ firstIndex ];
  78
  79         //  This branch will create divergent wavefronts
  80         if( my_comp( dataVal, searchVal ) )
  81         {
  82             firstIndex = firstIndex+1;
  83         }
  84         else
  85         {
  86             break;
  87         }
  88     }
  89
  90     return firstIndex;
  91 }
  92
  93 //  This implements a binary search routine to look for an 'insertion point' in a sequence, denoted
  94 //  by a base pointer and left and right index for a particular candidate value.  The comparison operator is
  95 //  passed as a functor parameter my_comp
  96 //  This function returns an index that is the first index whos value would be equal to the searched value
  97 uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
  98 {
  99     //  The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence
 100     uint firstIndex = left;
 101     uint lastIndex = right;
 102
 103     //  This loops through [firstIndex, lastIndex)
 104     //  Since firstIndex and lastIndex will be different for every thread depending on the nested branch,
 105     //  this while loop will be divergent within a wavefront
 106     while( firstIndex < lastIndex )
 107     {
 108         //  midIndex is the average of first and last, rounded down
 109         uint midIndex = ( firstIndex + lastIndex ) / 2;
 110         K_T midValue = data[ midIndex ];
 111
 112         //  This branch will create divergent wavefronts
 113         if( my_comp( midValue, searchVal ) )
 114         {
 115             firstIndex = midIndex+1;
 116             // printf( "lowerBound: lastIndex[ %i ]=%i\n", get_local_id( 0 ), lastIndex );
 117         }
 118         else
 119         {
 120             lastIndex = midIndex;
 121             // printf( "lowerBound: firstIndex[ %i ]=%i\n", get_local_id( 0 ), firstIndex );
 122         }
 123     }
 124
 125     return firstIndex;
 126 }
 127
 128 //  This implements a binary search routine to look for an 'insertion point' in a sequence, denoted
 129 //  by a base pointer and left and right index for a particular candidate value.  The comparison operator is
 130 //  passed as a functor parameter my_comp
 131 //  This function returns an index that is the first index whos value would be greater than the searched value
 132 //  If the search value is not found in the sequence, upperbound returns the same result as lowerbound
 133 uint upperBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
 134 {
 135     uint upperBound = lowerBoundBinary( data, left, right, searchVal );
 136
 137     // printf( "upperBoundBinary: upperBound[ %i, %i ]= %i\n", left, right, upperBound );
 138     //  If upperBound == right, then  searchVal was not found in the sequence.  Just return.
 139     if( upperBound != right )
 140     {
 141         //  While the values are equal i.e. !(x < y) && !(y < x) increment the index
 142         K_T upperValue = data[ upperBound ];
 143         while( !my_comp( upperValue, searchVal ) && !my_comp( searchVal, upperValue) && (upperBound != right) )
 144         {
 145             upperBound++;
 146             upperValue = data[ upperBound ];
 147         }
 148     }
 149
 150     return upperBound;
 151 }
 152
 153 //  This kernel implements merging of blocks of sorted data.  The input to this kernel most likely is
 154 //  the output of blockInsertionSortTemplate.  It is expected that the source array contains multiple
 155 //  blocks, each block is independently sorted.  The goal is to write into the output buffer half as
 156 //  many blocks, of double the size.  The even and odd blocks are stably merged together to form
 157 //  a new sorted block of twice the size.  The algorithm is out-of-place.
 158 kernel void merge(
 159     global K_T*   iKey_ptr,
 160     global V_T*   iValue_ptr,
 161     global K_T*   oKey_ptr,
 162     global V_T*   oValue_ptr,
 163     const uint    srcVecSize,
 164     const uint    srcLogicalBlockSize,
 165     local K_T*    key_lds,
 166     local V_T*    val_lds
 167 )
 168 {
 169     size_t globalID     = get_global_id( 0 );
 170     size_t groupID      = get_group_id( 0 );
 171     size_t localID      = get_local_id( 0 );
 172     size_t wgSize       = get_local_size( 0 );
 173
 174     //  Abort threads that are passed the end of the input vector
 175     if( globalID >= srcVecSize )
 176         return; // on SI this doesn't mess-up barriers
 177
 178     //  For an element in sequence A, find the lowerbound index for it in sequence B
 179     uint srcBlockNum   = globalID / srcLogicalBlockSize;
 180     uint srcBlockIndex = globalID % srcLogicalBlockSize;
 181
 182     // printf( "mergeTemplate: srcBlockNum[%i]=%i\n", srcBlockNum, srcBlockIndex );
 183
 184     //  Pairs of even-odd blocks will be merged together
 185     //  An even block should search for an insertion point in the next odd block,
 186     //  and the odd block should look for an insertion point in the corresponding previous even block
 187     uint dstLogicalBlockSize = srcLogicalBlockSize<<1;
 188     uint leftBlockIndex = globalID & ~((dstLogicalBlockSize) - 1 );
 189     leftBlockIndex += (srcBlockNum & 0x1) ? 0 : srcLogicalBlockSize;
 190     leftBlockIndex = min( leftBlockIndex, srcVecSize );
 191     uint rightBlockIndex = min( leftBlockIndex + srcLogicalBlockSize, srcVecSize );
 192
 193     // if( localID == 0 )
 194     // {
 195     // printf( "mergeTemplate: wavefront[ %i ] logicalBlock[ %i ] logicalIndex[ %i ] leftBlockIndex[ %i ] <=> rightBlockIndex[ %i ]\n", groupID, srcBlockNum, srcBlockIndex, leftBlockIndex, rightBlockIndex );
 196     // }
 197
 198     //  For a particular element in the input array, find the lowerbound index for it in the search sequence given by leftBlockIndex & rightBlockIndex
 199     // uint insertionIndex = lowerBoundLinear( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ], my_comp ) - leftBlockIndex;
 200     uint insertionIndex = 0;
 201     if( (srcBlockNum & 0x1) == 0 )
 202     {
 203         insertionIndex = lowerBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex;
 204     }
 205     else
 206     {
 207         insertionIndex = upperBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex;
 208     }
 209
 210     //  The index of an element in the result sequence is the summation of it's indixes in the two input
 211     //  sequences
 212     uint dstBlockIndex = srcBlockIndex + insertionIndex;
 213     uint dstBlockNum = srcBlockNum/2;
 214
 215     // if( (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex == 395 )
 216     // {
 217     // printf( "mergeTemplate: (dstBlockNum[ %i ] * dstLogicalBlockSize[ %i ]) + dstBlockIndex[ %i ] = srcBlockIndex[ %i ] + insertionIndex[ %i ]\n", dstBlockNum, dstLogicalBlockSize, dstBlockIndex, srcBlockIndex, insertionIndex );
 218     // printf( "mergeTemplate: dstBlockIndex[ %i ] = iKey_ptr[ %i ] ( %i )\n", (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex, globalID, iKey_ptr[ globalID ] );
 219     // }
 220     oKey_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iKey_ptr[ globalID ];
 221     oValue_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iValue_ptr[ globalID ];
 222     // printf( "mergeTemplate: leftResultIndex[ %i ]=%i + %i\n", leftResultIndex, srcBlockIndex, leftInsertionIndex );
 223 }
 224
 225 kernel void blockInsertionSort(
 226     global K_T*   key_ptr,
 227     global V_T*   value_ptr,
 228     const uint    vecSize,
 229     local K_T*    key_lds,
 230     local V_T*    val_lds
 231 )
 232 {
 233     size_t gloId    = get_global_id( 0 );
 234     size_t groId    = get_group_id( 0 );
 235     size_t locId    = get_local_id( 0 );
 236     size_t wgSize   = get_local_size( 0 );
 237
 238     bool in_range = gloId < vecSize;
 239     K_T key;
 240     V_T val;
 241     //  Abort threads that are passed the end of the input vector
 242     if (in_range)
 243     {
 244         //  Make a copy of the entire input array into fast local memory
 245         key = key_ptr[ gloId ];
 246         val = value_ptr[ gloId ];
 247         key_lds[ locId ] = key;
 248         val_lds[ locId ] = val;
 249     }
 250     barrier( CLK_LOCAL_MEM_FENCE );
 251     //  Sorts a workgroup using a naive insertion sort
 252     //  The sort uses one thread within a workgroup to sort the entire workgroup
 253     if( locId == 0 && in_range )
 254     {
 255         //  The last workgroup may have an irregular size, so we calculate a per-block endIndex
 256         //  endIndex is essentially emulating a mod operator with subtraction and multiply
 257         size_t endIndex = vecSize - ( groId * wgSize );
 258         endIndex = min( endIndex, wgSize );
 259
 260         // printf( "Debug: endIndex[%i]=%i\n", groId, endIndex );
 261
 262         //  Indices are signed because the while loop will generate a -1 index inside of the max function
 263         for( int currIndex = 1; currIndex < endIndex; ++currIndex )
 264         {
 265             key = key_lds[ currIndex ];
 266             val = val_lds[ currIndex ];
 267             int scanIndex = currIndex;
 268             K_T ldsKey = key_lds[scanIndex - 1];
 269             while( scanIndex > 0 && my_comp( key, ldsKey ) )
 270             {
 271                 V_T ldsVal = val_lds[scanIndex - 1];
 272
 273                 //  If the keys are being swapped, make sure the values are swapped identicaly
 274                 key_lds[ scanIndex ] = ldsKey;
 275                 val_lds[ scanIndex ] = ldsVal;
 276
 277                 scanIndex = scanIndex - 1;
 278                 ldsKey = key_lds[ max( 0, scanIndex - 1 ) ];  // scanIndex-1 may be -1
 279             }
 280             key_lds[ scanIndex ] = key;
 281             val_lds[ scanIndex ] = val;
 282         }
 283     }
 284     barrier( CLK_LOCAL_MEM_FENCE );
 285
 286     if(in_range)
 287     {
 288         key = key_lds[ locId ];
 289         key_ptr[ gloId ] = key;
 290
 291         val = val_lds[ locId ];
 292         value_ptr[ gloId ] = val;
 293     }
 294 }
 295
 296 ///////////// Radix sort from b40c library /////////////