modules/ocl/src/sort_by_key.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Peng Xiao, pengxiao@outlook.com
  19 //
  20 // Redistribution and use in source and binary forms, with or without modification,
  21 // are permitted provided that the following conditions are met:
  22 //
  23 //   * Redistribution's of source code must retain the above copyright notice,
  24 //     this list of conditions and the following disclaimer.
  25 //
  26 //   * Redistribution's in binary form must reproduce the above copyright notice,
  27 //     this list of conditions and the following disclaimer in the documentation
  28 //     and/or other oclMaterials provided with the distribution.
  29 //
  30 //   * The name of the copyright holders may not be used to endorse or promote products
  31 //     derived from this software without specific prior written permission.
  32 //
  33 // This software is provided by the copyright holders and contributors as is and
  34 // any express or implied warranties, including, but not limited to, the implied
  35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  36 // In no event shall the Intel Corporation or contributors be liable for any direct,
  37 // indirect, incidental, special, exemplary, or consequential damages
  38 // (including, but not limited to, procurement of substitute goods or services;
  39 // loss of use, data, or profits; or business interruption) however caused
  40 // and on any theory of liability, whether in contract, strict liability,
  41 // or tort (including negligence or otherwise) arising in any way out of
  42 // the use of this software, even if advised of the possibility of such damage.
  43 //
  44 //M*/
  45
  46 #include <iomanip>
  47 #include "precomp.hpp"
  48
  49 namespace cv
  50 {
  51 namespace ocl
  52 {
  53
  54 extern const char * kernel_sort_by_key;
  55 extern const char * kernel_stablesort_by_key;
  56 extern const char * kernel_radix_sort_by_key;
  57
  58 void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan);
  59
  60 //TODO(pengx17): change this value depending on device other than a constant
  61 const static unsigned int GROUP_SIZE = 256;
  62
  63 const char * depth_strings[] =
  64 {
  65     "uchar",   //CV_8U
  66     "char",    //CV_8S
  67     "ushort",  //CV_16U
  68     "short",   //CV_16S
  69     "int",     //CV_32S
  70     "float",   //CV_32F
  71     "double"   //CV_64F
  72 };
  73
  74 void static genSortBuildOption(const oclMat& keys, const oclMat& vals, bool isGreaterThan, char * build_opt_buf)
  75 {
  76     sprintf(build_opt_buf, "-D IS_GT=%d -D K_T=%s -D V_T=%s",
  77             isGreaterThan?1:0, depth_strings[keys.depth()], depth_strings[vals.depth()]);
  78     if(vals.oclchannels() > 1)
  79     {
  80         sprintf( build_opt_buf + strlen(build_opt_buf), "%d", vals.oclchannels());
  81     }
  82 }
  83 inline bool isSizePowerOf2(size_t size)
  84 {
  85     return ((size - 1) & (size)) == 0;
  86 }
  87
  88 namespace bitonic_sort
  89 {
  90 static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
  91 {
  92     CV_Assert(isSizePowerOf2(vecSize));
  93
  94     Context * cxt = Context::getContext();
  95     size_t globalThreads[3] = {vecSize / 2, 1, 1};
  96     size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
  97
  98     // 2^numStages should be equal to vecSize or the output is invalid
  99     int numStages = 0;
 100     for(int i = vecSize; i > 1; i >>= 1)
 101     {
 102         ++numStages;
 103     }
 104     char build_opt_buf [100];
 105     genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
 106     const int argc = 5;
 107     std::vector< std::pair<size_t, const void *> > args(argc);
 108     String kernelname = "bitonicSort";
 109
 110     args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
 111     args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
 112     args[2] = std::make_pair(sizeof(cl_int), (void *)&vecSize);
 113
 114     for(int stage = 0; stage < numStages; ++stage)
 115     {
 116         args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
 117         for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
 118         {
 119             args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
 120             openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
 121         }
 122     }
 123 }
 124 }  /* bitonic_sort */
 125
 126 namespace selection_sort
 127 {
 128 // FIXME:
 129 // This function cannot sort arrays with duplicated keys
 130 static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
 131 {
 132     CV_Error(-1, "This function is incorrect at the moment.");
 133     Context * cxt = Context::getContext();
 134
 135     size_t globalThreads[3] = {vecSize, 1, 1};
 136     size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
 137
 138     std::vector< std::pair<size_t, const void *> > args;
 139     char build_opt_buf [100];
 140     genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
 141
 142     //local
 143     String kernelname = "selectionSortLocal";
 144     int lds_size = GROUP_SIZE * keys.elemSize();
 145     args.push_back(std::make_pair(sizeof(cl_mem), (void *)&keys.data));
 146     args.push_back(std::make_pair(sizeof(cl_mem), (void *)&vals.data));
 147     args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize));
 148     args.push_back(std::make_pair(lds_size,       (void*)NULL));
 149
 150     openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
 151
 152     //final
 153     kernelname = "selectionSortFinal";
 154     args.pop_back();
 155     openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
 156 }
 157
 158 }  /* selection_sort */
 159
 160
 161 namespace radix_sort
 162 {
 163 //FIXME(pengx17):
 164 // exclusive scan, need to be optimized as this is too naive...
 165 //void naive_scan_addition(oclMat& input, oclMat& output)
 166 //{
 167 //    Context * cxt = Context::getContext();
 168 //    size_t vecSize = input.cols;
 169 //    size_t globalThreads[3] = {1, 1, 1};
 170 //    size_t localThreads[3]  = {1, 1, 1};
 171 //
 172 //    String kernelname = "naiveScanAddition";
 173 //
 174 //    std::vector< std::pair<size_t, const void *> > args;
 175 //    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data));
 176 //    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data));
 177 //    args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize));
 178 //    openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1);
 179 //}
 180
 181 void static naive_scan_addition_cpu(oclMat& input, oclMat& output)
 182 {
 183     Mat m_input = input, m_output(output.size(), output.type());
 184     MatIterator_<int> i_mit = m_input.begin<int>();
 185     MatIterator_<int> o_mit = m_output.begin<int>();
 186     *o_mit = 0;
 187     ++i_mit;
 188     ++o_mit;
 189     for(; i_mit != m_input.end<int>(); ++i_mit, ++o_mit)
 190     {
 191         *o_mit = *(o_mit - 1) + *(i_mit - 1);
 192     }
 193     output = m_output;
 194 }
 195
 196
 197 //radix sort ported from Bolt
 198 static void sortByKey(oclMat& keys, oclMat& vals, size_t origVecSize, bool isGreaterThan)
 199 {
 200     CV_Assert(keys.depth() == CV_32S || keys.depth() == CV_32F); // we assume keys are 4 bytes
 201
 202     bool isKeyFloat = keys.type() == CV_32F;
 203
 204     const int RADIX = 4; //Now you cannot replace this with Radix 8 since there is a
 205                          //local array of 16 elements in the histogram kernel.
 206     const int RADICES = (1 << RADIX); //Values handeled by each work-item?
 207
 208     bool  newBuffer = false;
 209     size_t vecSize = origVecSize;
 210
 211     unsigned int groupSize  = RADICES;
 212
 213     size_t mulFactor = groupSize * RADICES;
 214
 215     oclMat buffer_keys, buffer_vals;
 216
 217     if(origVecSize % mulFactor != 0)
 218     {
 219         vecSize = ((vecSize + mulFactor) / mulFactor) * mulFactor;
 220         buffer_keys.create(1, vecSize, keys.type());
 221         buffer_vals.create(1, vecSize, vals.type());
 222         Scalar padding_value;
 223         oclMat roi_buffer_vals = buffer_vals(Rect(0,0,origVecSize,1));
 224
 225         if(isGreaterThan)
 226         {
 227             switch(buffer_keys.depth())
 228             {
 229             case CV_32F:
 230                 padding_value = Scalar::all(-FLT_MAX);
 231                 break;
 232             case CV_32S:
 233                 padding_value = Scalar::all(INT_MIN);
 234                 break;
 235             }
 236         }
 237         else
 238         {
 239             switch(buffer_keys.depth())
 240             {
 241             case CV_32F:
 242                 padding_value = Scalar::all(FLT_MAX);
 243                 break;
 244             case CV_32S:
 245                 padding_value = Scalar::all(INT_MAX);
 246                 break;
 247             }
 248         }
 249         ocl::copyMakeBorder(
 250             keys(Rect(0,0,origVecSize,1)), buffer_keys,
 251             0, 0, 0, vecSize - origVecSize,
 252             BORDER_CONSTANT, padding_value);
 253         vals(Rect(0,0,origVecSize,1)).copyTo(roi_buffer_vals);
 254         newBuffer = true;
 255     }
 256     else
 257     {
 258         buffer_keys = keys;
 259         buffer_vals = vals;
 260         newBuffer = false;
 261     }
 262     oclMat swap_input_keys(1, vecSize, keys.type());
 263     oclMat swap_input_vals(1, vecSize, vals.type());
 264     oclMat hist_bin_keys(1, vecSize, CV_32SC1);
 265     oclMat hist_bin_dest_keys(1, vecSize, CV_32SC1);
 266
 267     Context * cxt = Context::getContext();
 268
 269     size_t globalThreads[3] = {vecSize / RADICES, 1, 1};
 270     size_t localThreads[3]  = {groupSize, 1, 1};
 271
 272     std::vector< std::pair<size_t, const void *> > args;
 273     char build_opt_buf [100];
 274     genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
 275
 276     //additional build option for radix sort
 277     sprintf(build_opt_buf + strlen(build_opt_buf), " -D K_%s", isKeyFloat?"FLT":"INT");
 278
 279     String kernelnames[2] = {String("histogramRadixN"), String("permuteRadixN")};
 280
 281     int swap = 0;
 282     for(int bits = 0; bits < (static_cast<int>(keys.elemSize()) * 8); bits += RADIX)
 283     {
 284         args.clear();
 285         //Do a histogram pass locally
 286         if(swap == 0)
 287         {
 288             args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
 289         }
 290         else
 291         {
 292             args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
 293         }
 294         args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_keys.data));
 295         args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits));
 296         openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[0], globalThreads, localThreads,
 297             args, -1, -1, build_opt_buf);
 298
 299         args.clear();
 300         //Perform a global scan
 301         naive_scan_addition_cpu(hist_bin_keys, hist_bin_dest_keys);
 302         // end of scan
 303         if(swap == 0)
 304         {
 305             args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
 306             args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data));
 307         }
 308         else
 309         {
 310             args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
 311             args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data));
 312         }
 313         args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_dest_keys.data));
 314         args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits));
 315
 316         if(swap == 0)
 317         {
 318             args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
 319             args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data));
 320         }
 321         else
 322         {
 323             args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
 324             args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data));
 325         }
 326         openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[1], globalThreads, localThreads,
 327             args, -1, -1, build_opt_buf);
 328         swap = swap ? 0 : 1;
 329     }
 330     if(newBuffer)
 331     {
 332         buffer_keys(Rect(0,0,origVecSize,1)).copyTo(keys);
 333         buffer_vals(Rect(0,0,origVecSize,1)).copyTo(vals);
 334     }
 335 }
 336
 337 }  /* radix_sort */
 338
 339 namespace merge_sort
 340 {
 341 static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
 342 {
 343     Context * cxt = Context::getContext();
 344
 345     size_t globalThreads[3] = {vecSize, 1, 1};
 346     size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
 347
 348     std::vector< std::pair<size_t, const void *> > args;
 349     char build_opt_buf [100];
 350     genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
 351
 352     String kernelname[] = {String("blockInsertionSort"), String("merge")};
 353     int keylds_size = GROUP_SIZE * keys.elemSize();
 354     int vallds_size = GROUP_SIZE * vals.elemSize();
 355     args.push_back(std::make_pair(sizeof(cl_mem),  (void *)&keys.data));
 356     args.push_back(std::make_pair(sizeof(cl_mem),  (void *)&vals.data));
 357     args.push_back(std::make_pair(sizeof(cl_uint), (void *)&vecSize));
 358     args.push_back(std::make_pair(keylds_size,     (void*)NULL));
 359     args.push_back(std::make_pair(vallds_size,     (void*)NULL));
 360
 361     openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[0], globalThreads, localThreads, args, -1, -1, build_opt_buf);
 362
 363     //  Early exit for the case of no merge passes, values are already in destination vector
 364     if(vecSize <= GROUP_SIZE)
 365     {
 366         return;
 367     }
 368
 369     //  An odd number of elements requires an extra merge pass to sort
 370     size_t numMerges = 0;
 371     //  Calculate the log2 of vecSize, taking into acvecSize our block size from kernel 1 is 64
 372     //  this is how many merge passes we want
 373     size_t log2BlockSize = vecSize >> 6;
 374     for( ; log2BlockSize > 1; log2BlockSize >>= 1 )
 375     {
 376         ++numMerges;
 377     }
 378     //  Check to see if the input vector size is a power of 2, if not we will need last merge pass
 379     numMerges += isSizePowerOf2(vecSize)? 1: 0;
 380
 381     //  Allocate a flipflop buffer because the merge passes are out of place
 382     oclMat tmpKeyBuffer(keys.size(), keys.type());
 383     oclMat tmpValBuffer(vals.size(), vals.type());
 384     args.resize(8);
 385
 386     args[4] = std::make_pair(sizeof(cl_uint), (void *)&vecSize);
 387     args[6] = std::make_pair(keylds_size,    (void*)NULL);
 388     args[7] = std::make_pair(vallds_size,    (void*)NULL);
 389
 390     for(size_t pass = 1; pass <= numMerges; ++pass )
 391     {
 392         //  For each pass, flip the input-output buffers
 393         if( pass & 0x1 )
 394         {
 395             args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
 396             args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
 397             args[2] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data);
 398             args[3] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data);
 399         }
 400         else
 401         {
 402             args[0] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data);
 403             args[1] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data);
 404             args[2] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
 405             args[3] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
 406         }
 407         //  For each pass, the merge window doubles
 408         unsigned int srcLogicalBlockSize = static_cast<unsigned int>( localThreads[0] << (pass-1) );
 409         args[5] = std::make_pair(sizeof(cl_uint), (void *)&srcLogicalBlockSize);
 410         openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[1], globalThreads, localThreads, args, -1, -1, build_opt_buf);
 411     }
 412     //  If there are an odd number of merges, then the output data is sitting in the temp buffer.  We need to copy
 413     //  the results back into the input array
 414     if( numMerges & 1 )
 415     {
 416         tmpKeyBuffer.copyTo(keys);
 417         tmpValBuffer.copyTo(vals);
 418     }
 419 }
 420 }  /* merge_sort */
 421
 422 }
 423 } /* namespace cv { namespace ocl */
 424
 425
 426 void cv::ocl::sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan)
 427 {
 428     CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
 429     CV_Assert( keys.channels() == 1 ); // we only allow one channel keys
 430     CV_Assert( vecSize <= static_cast<size_t>(keys.cols) );
 431     switch(method)
 432     {
 433     case SORT_BITONIC:
 434         bitonic_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
 435         break;
 436     case SORT_SELECTION:
 437         selection_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
 438         break;
 439     case SORT_RADIX:
 440         radix_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
 441         break;
 442     case SORT_MERGE:
 443         merge_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
 444         break;
 445     }
 446 }
 447
 448 void cv::ocl::sortByKey(oclMat& keys, oclMat& vals, int method, bool isGreaterThan)
 449 {
 450     CV_Assert( keys.size() == vals.size() );
 451     CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
 452     size_t vecSize = static_cast<size_t>(keys.cols);
 453     sortByKey(keys, vals, vecSize, method, isGreaterThan);
 454 }