modules/ocl/src/opencl/kernel_radix_sort_by_key.cl

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Peng Xiao, pengxiao@outlook.com
  19 //
  20 // Redistribution and use in source and binary forms, with or without modification,
  21 // are permitted provided that the following conditions are met:
  22 //
  23 //   * Redistribution's of source code must retain the above copyright notice,
  24 //     this list of conditions and the following disclaimer.
  25 //
  26 //   * Redistribution's in binary form must reproduce the above copyright notice,
  27 //     this list of conditions and the following disclaimer in the documentation
  28 //     and/or other oclMaterials provided with the distribution.
  29 //
  30 //   * The name of the copyright holders may not be used to endorse or promote products
  31 //     derived from this software without specific prior written permission.
  32 //
  33 // This software is provided by the copyright holders and contributors as is and
  34 // any express or implied warranties, including, but not limited to, the implied
  35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  36 // In no event shall the Intel Corporation or contributors be liable for any direct,
  37 // indirect, incidental, special, exemplary, or consequential damages
  38 // (including, but not limited to, procurement of substitute goods or services;
  39 // loss of use, data, or profits; or business interruption) however caused
  40 // and on any theory of liability, whether in contract, strict liability,
  41 // or tort (including negligence or otherwise) arising in any way out of
  42 // the use of this software, even if advised of the possibility of such damage.
  43 //
  44 //M*/
  45
  46 #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
  47
  48 #ifndef N   // number of radices
  49 #define N 4
  50 #endif
  51
  52 #ifndef K_T
  53 #define K_T float
  54 #endif
  55
  56 #ifndef V_T
  57 #define V_T float
  58 #endif
  59
  60 #ifndef IS_GT
  61 #define IS_GT 0
  62 #endif
  63
  64
  65 // from Thrust::b40c, link:
  66 // https://github.com/thrust/thrust/blob/master/thrust/system/cuda/detail/detail/b40c/radixsort_key_conversion.h
  67 __inline uint convertKey(uint converted_key)
  68 {
  69 #ifdef K_FLT
  70     unsigned int mask = (converted_key & 0x80000000) ? 0xffffffff : 0x80000000;
  71     converted_key ^= mask;
  72 #elif defined(K_INT)
  73     const uint SIGN_MASK = 1u << ((sizeof(int) * 8) - 1);
  74     converted_key ^= SIGN_MASK;
  75 #else
  76
  77 #endif
  78     return converted_key;
  79 }
  80
  81 //FIXME(pengx17):
  82 // exclusive scan, need to be optimized as this is too naive...
  83 kernel
  84     void naiveScanAddition(
  85     __global int * input,
  86     __global int * output,
  87     int size
  88     )
  89 {
  90     if(get_global_id(0) == 0)
  91     {
  92         output[0] = 0;
  93         for(int i = 1; i < size; i ++)
  94         {
  95             output[i] = output[i - 1] + input[i - 1];
  96         }
  97     }
  98 }
  99
 100 // following is ported from
 101 // https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_uint_kernels.cl
 102 kernel
 103     void histogramRadixN (
 104     __global K_T* unsortedKeys,
 105     __global int * buckets,
 106     uint shiftCount
 107     )
 108 {
 109     const int RADIX_T     = N;
 110     const int RADICES_T   = (1 << RADIX_T);
 111     const int NUM_OF_ELEMENTS_PER_WORK_ITEM_T = RADICES_T;
 112     const int MASK_T      = (1 << RADIX_T) - 1;
 113     int localBuckets[16] = {0,0,0,0,0,0,0,0,
 114                             0,0,0,0,0,0,0,0};
 115     int globalId    = get_global_id(0);
 116     int numOfGroups = get_num_groups(0);
 117
 118     /* Calculate thread-histograms */
 119     for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i)
 120     {
 121         uint value = convertKey(as_uint(unsortedKeys[mad24(globalId, NUM_OF_ELEMENTS_PER_WORK_ITEM_T, i)]));
 122         value = (value >> shiftCount) & MASK_T;
 123 #if IS_GT
 124         localBuckets[RADICES_T - value - 1]++;
 125 #else
 126         localBuckets[value]++;
 127 #endif
 128     }
 129
 130     for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i)
 131     {
 132         buckets[mad24(i, RADICES_T * numOfGroups, globalId) ] = localBuckets[i];
 133     }
 134 }
 135
 136 kernel
 137     void permuteRadixN (
 138     __global K_T*  unsortedKeys,
 139     __global V_T*  unsortedVals,
 140     __global int* scanedBuckets,
 141     uint shiftCount,
 142     __global K_T*  sortedKeys,
 143     __global V_T*  sortedVals
 144     )
 145 {
 146     const int RADIX_T     = N;
 147     const int RADICES_T   = (1 << RADIX_T);
 148     const int MASK_T = (1<<RADIX_T)  -1;
 149
 150     int globalId  = get_global_id(0);
 151     int numOfGroups = get_num_groups(0);
 152     const int NUM_OF_ELEMENTS_PER_WORK_GROUP_T = numOfGroups << N;
 153     int  localIndex[16];
 154
 155     /*Load the index to local memory*/
 156     for(int i = 0; i < RADICES_T; ++i)
 157     {
 158 #if IS_GT
 159         localIndex[i] = scanedBuckets[mad24(RADICES_T - i - 1, NUM_OF_ELEMENTS_PER_WORK_GROUP_T, globalId)];
 160 #else
 161         localIndex[i] = scanedBuckets[mad24(i, NUM_OF_ELEMENTS_PER_WORK_GROUP_T, globalId)];
 162 #endif
 163     }
 164     /* Permute elements to appropriate location */
 165     for(int i = 0; i < RADICES_T; ++i)
 166     {
 167         int old_idx = mad24(globalId, RADICES_T, i);
 168         K_T  ovalue = unsortedKeys[old_idx];
 169         uint value = convertKey(as_uint(ovalue));
 170         uint maskedValue = (value >> shiftCount) & MASK_T;
 171         uint index = localIndex[maskedValue];
 172         sortedKeys[index] = ovalue;
 173         sortedVals[index] = unsortedVals[old_idx];
 174         localIndex[maskedValue] = index + 1;
 175     }
 176 }