1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
18 // Peng Xiao, pengxiao@outlook.com
20 // Redistribution and use in source and binary forms, with or without modification,
21 // are permitted provided that the following conditions are met:
23 // * Redistribution's of source code must retain the above copyright notice,
24 // this list of conditions and the following disclaimer.
26 // * Redistribution's in binary form must reproduce the above copyright notice,
27 // this list of conditions and the following disclaimer in the documentation
28 // and/or other oclMaterials provided with the distribution.
30 // * The name of the copyright holders may not be used to endorse or promote products
31 // derived from this software without specific prior written permission.
33 // This software is provided by the copyright holders and contributors as is and
34 // any express or implied warranties, including, but not limited to, the implied
35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
36 // In no event shall the Intel Corporation or contributors be liable for any direct,
37 // indirect, incidental, special, exemplary, or consequential damages
38 // (including, but not limited to, procurement of substitute goods or services;
39 // loss of use, data, or profits; or business interruption) however caused
40 // and on any theory of liability, whether in contract, strict liability,
41 // or tort (including negligence or otherwise) arising in any way out of
42 // the use of this software, even if advised of the possibility of such damage.
47 #include "precomp.hpp"
54 extern const char * kernel_sort_by_key;
55 extern const char * kernel_stablesort_by_key;
56 extern const char * kernel_radix_sort_by_key;
58 void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan);
60 //TODO(pengx17): change this value depending on device other than a constant
61 const static unsigned int GROUP_SIZE = 256;
63 const char * depth_strings[] =
74 void static genSortBuildOption(const oclMat& keys, const oclMat& vals, bool isGreaterThan, char * build_opt_buf)
76 sprintf(build_opt_buf, "-D IS_GT=%d -D K_T=%s -D V_T=%s",
77 isGreaterThan?1:0, depth_strings[keys.depth()], depth_strings[vals.depth()]);
78 if(vals.oclchannels() > 1)
80 sprintf( build_opt_buf + strlen(build_opt_buf), "%d", vals.oclchannels());
83 inline bool isSizePowerOf2(size_t size)
85 return ((size - 1) & (size)) == 0;
88 namespace bitonic_sort
90 static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
92 CV_Assert(isSizePowerOf2(vecSize));
94 Context * cxt = Context::getContext();
95 size_t globalThreads[3] = {vecSize / 2, 1, 1};
96 size_t localThreads[3] = {GROUP_SIZE, 1, 1};
98 // 2^numStages should be equal to vecSize or the output is invalid
100 for(int i = vecSize; i > 1; i >>= 1)
104 char build_opt_buf [100];
105 genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
107 std::vector< std::pair<size_t, const void *> > args(argc);
108 String kernelname = "bitonicSort";
110 args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
111 args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
112 args[2] = std::make_pair(sizeof(cl_int), (void *)&vecSize);
114 for(int stage = 0; stage < numStages; ++stage)
116 args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
117 for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
119 args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
120 openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
126 namespace selection_sort
129 // This function cannot sort arrays with duplicated keys
130 static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
132 CV_Error(-1, "This function is incorrect at the moment.");
133 Context * cxt = Context::getContext();
135 size_t globalThreads[3] = {vecSize, 1, 1};
136 size_t localThreads[3] = {GROUP_SIZE, 1, 1};
138 std::vector< std::pair<size_t, const void *> > args;
139 char build_opt_buf [100];
140 genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
143 String kernelname = "selectionSortLocal";
144 int lds_size = GROUP_SIZE * keys.elemSize();
145 args.push_back(std::make_pair(sizeof(cl_mem), (void *)&keys.data));
146 args.push_back(std::make_pair(sizeof(cl_mem), (void *)&vals.data));
147 args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize));
148 args.push_back(std::make_pair(lds_size, (void*)NULL));
150 openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
153 kernelname = "selectionSortFinal";
155 openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
158 } /* selection_sort */
164 // exclusive scan, need to be optimized as this is too naive...
165 //void naive_scan_addition(oclMat& input, oclMat& output)
167 // Context * cxt = Context::getContext();
168 // size_t vecSize = input.cols;
169 // size_t globalThreads[3] = {1, 1, 1};
170 // size_t localThreads[3] = {1, 1, 1};
172 // String kernelname = "naiveScanAddition";
174 // std::vector< std::pair<size_t, const void *> > args;
175 // args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data));
176 // args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data));
177 // args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize));
178 // openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1);
181 void static naive_scan_addition_cpu(oclMat& input, oclMat& output)
183 Mat m_input = input, m_output(output.size(), output.type());
184 MatIterator_<int> i_mit = m_input.begin<int>();
185 MatIterator_<int> o_mit = m_output.begin<int>();
189 for(; i_mit != m_input.end<int>(); ++i_mit, ++o_mit)
191 *o_mit = *(o_mit - 1) + *(i_mit - 1);
197 //radix sort ported from Bolt
198 static void sortByKey(oclMat& keys, oclMat& vals, size_t origVecSize, bool isGreaterThan)
200 CV_Assert(keys.depth() == CV_32S || keys.depth() == CV_32F); // we assume keys are 4 bytes
202 bool isKeyFloat = keys.type() == CV_32F;
204 const int RADIX = 4; //Now you cannot replace this with Radix 8 since there is a
205 //local array of 16 elements in the histogram kernel.
206 const int RADICES = (1 << RADIX); //Values handeled by each work-item?
208 bool newBuffer = false;
209 size_t vecSize = origVecSize;
211 unsigned int groupSize = RADICES;
213 size_t mulFactor = groupSize * RADICES;
215 oclMat buffer_keys, buffer_vals;
217 if(origVecSize % mulFactor != 0)
219 vecSize = ((vecSize + mulFactor) / mulFactor) * mulFactor;
220 buffer_keys.create(1, vecSize, keys.type());
221 buffer_vals.create(1, vecSize, vals.type());
222 Scalar padding_value;
223 oclMat roi_buffer_vals = buffer_vals(Rect(0,0,origVecSize,1));
227 switch(buffer_keys.depth())
230 padding_value = Scalar::all(-FLT_MAX);
233 padding_value = Scalar::all(INT_MIN);
239 switch(buffer_keys.depth())
242 padding_value = Scalar::all(FLT_MAX);
245 padding_value = Scalar::all(INT_MAX);
250 keys(Rect(0,0,origVecSize,1)), buffer_keys,
251 0, 0, 0, vecSize - origVecSize,
252 BORDER_CONSTANT, padding_value);
253 vals(Rect(0,0,origVecSize,1)).copyTo(roi_buffer_vals);
262 oclMat swap_input_keys(1, vecSize, keys.type());
263 oclMat swap_input_vals(1, vecSize, vals.type());
264 oclMat hist_bin_keys(1, vecSize, CV_32SC1);
265 oclMat hist_bin_dest_keys(1, vecSize, CV_32SC1);
267 Context * cxt = Context::getContext();
269 size_t globalThreads[3] = {vecSize / RADICES, 1, 1};
270 size_t localThreads[3] = {groupSize, 1, 1};
272 std::vector< std::pair<size_t, const void *> > args;
273 char build_opt_buf [100];
274 genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
276 //additional build option for radix sort
277 sprintf(build_opt_buf + strlen(build_opt_buf), " -D K_%s", isKeyFloat?"FLT":"INT");
279 String kernelnames[2] = {String("histogramRadixN"), String("permuteRadixN")};
282 for(int bits = 0; bits < (static_cast<int>(keys.elemSize()) * 8); bits += RADIX)
285 //Do a histogram pass locally
288 args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
292 args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
294 args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_keys.data));
295 args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits));
296 openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[0], globalThreads, localThreads,
297 args, -1, -1, build_opt_buf);
300 //Perform a global scan
301 naive_scan_addition_cpu(hist_bin_keys, hist_bin_dest_keys);
305 args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
306 args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data));
310 args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
311 args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data));
313 args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_dest_keys.data));
314 args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits));
318 args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
319 args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data));
323 args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
324 args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data));
326 openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[1], globalThreads, localThreads,
327 args, -1, -1, build_opt_buf);
332 buffer_keys(Rect(0,0,origVecSize,1)).copyTo(keys);
333 buffer_vals(Rect(0,0,origVecSize,1)).copyTo(vals);
341 static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
343 Context * cxt = Context::getContext();
345 size_t globalThreads[3] = {vecSize, 1, 1};
346 size_t localThreads[3] = {GROUP_SIZE, 1, 1};
348 std::vector< std::pair<size_t, const void *> > args;
349 char build_opt_buf [100];
350 genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
352 String kernelname[] = {String("blockInsertionSort"), String("merge")};
353 int keylds_size = GROUP_SIZE * keys.elemSize();
354 int vallds_size = GROUP_SIZE * vals.elemSize();
355 args.push_back(std::make_pair(sizeof(cl_mem), (void *)&keys.data));
356 args.push_back(std::make_pair(sizeof(cl_mem), (void *)&vals.data));
357 args.push_back(std::make_pair(sizeof(cl_uint), (void *)&vecSize));
358 args.push_back(std::make_pair(keylds_size, (void*)NULL));
359 args.push_back(std::make_pair(vallds_size, (void*)NULL));
361 openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[0], globalThreads, localThreads, args, -1, -1, build_opt_buf);
363 // Early exit for the case of no merge passes, values are already in destination vector
364 if(vecSize <= GROUP_SIZE)
369 // An odd number of elements requires an extra merge pass to sort
370 size_t numMerges = 0;
371 // Calculate the log2 of vecSize, taking into acvecSize our block size from kernel 1 is 64
372 // this is how many merge passes we want
373 size_t log2BlockSize = vecSize >> 6;
374 for( ; log2BlockSize > 1; log2BlockSize >>= 1 )
378 // Check to see if the input vector size is a power of 2, if not we will need last merge pass
379 numMerges += isSizePowerOf2(vecSize)? 1: 0;
381 // Allocate a flipflop buffer because the merge passes are out of place
382 oclMat tmpKeyBuffer(keys.size(), keys.type());
383 oclMat tmpValBuffer(vals.size(), vals.type());
386 args[4] = std::make_pair(sizeof(cl_uint), (void *)&vecSize);
387 args[6] = std::make_pair(keylds_size, (void*)NULL);
388 args[7] = std::make_pair(vallds_size, (void*)NULL);
390 for(size_t pass = 1; pass <= numMerges; ++pass )
392 // For each pass, flip the input-output buffers
395 args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
396 args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
397 args[2] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data);
398 args[3] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data);
402 args[0] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data);
403 args[1] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data);
404 args[2] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
405 args[3] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
407 // For each pass, the merge window doubles
408 unsigned int srcLogicalBlockSize = static_cast<unsigned int>( localThreads[0] << (pass-1) );
409 args[5] = std::make_pair(sizeof(cl_uint), (void *)&srcLogicalBlockSize);
410 openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[1], globalThreads, localThreads, args, -1, -1, build_opt_buf);
412 // If there are an odd number of merges, then the output data is sitting in the temp buffer. We need to copy
413 // the results back into the input array
416 tmpKeyBuffer.copyTo(keys);
417 tmpValBuffer.copyTo(vals);
423 } /* namespace cv { namespace ocl */
426 void cv::ocl::sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan)
428 CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
429 CV_Assert( keys.channels() == 1 ); // we only allow one channel keys
430 CV_Assert( vecSize <= static_cast<size_t>(keys.cols) );
434 bitonic_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
437 selection_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
440 radix_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
443 merge_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
448 void cv::ocl::sortByKey(oclMat& keys, oclMat& vals, int method, bool isGreaterThan)
450 CV_Assert( keys.size() == vals.size() );
451 CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
452 size_t vecSize = static_cast<size_t>(keys.cols);
453 sortByKey(keys, vals, vecSize, method, isGreaterThan);