Most codes are ported from AMD's Bolt library.
Four methods are implemented:
SORT_BITONIC, // only support power-of-2 buffer size
SORT_SELECTION, // cannot sort duplicate keys
SORT_MERGE,
SORT_RADIX // only support signed int/float keys
oclMat diff_buf;
oclMat norm_buf;
};
+ // current supported sorting methods
+ enum
+ {
+ SORT_BITONIC, // only support power-of-2 buffer size
+ SORT_SELECTION, // cannot sort duplicate keys
+ SORT_MERGE,
+ SORT_RADIX // only support signed int/float keys
+ };
+ //! Returns the sorted result of all the elements in input based on equivalent keys.
+ //
+ // The element unit in the values to be sorted is determined from the data type,
+ // i.e., a CV_32FC2 input {a1a2, b1b2} will be considered as two elements, regardless its
+ // matrix dimension.
+ // both keys and values will be sorted inplace
+ // Key needs to be single channel oclMat.
+ // TODO(pengx): add supported types for values
+ //
+ // Example:
+ // input -
+ // keys = {2, 3, 1} (CV_8UC1)
+ // values = {10,5, 4,3, 6,2} (CV_8UC2)
+ // sort_by_key(keys, values, SORT_SELECTION, false);
+ // output -
+ // keys = {1, 2, 3} (CV_8UC1)
+ // values = {6,2, 10,5, 4,3} (CV_8UC2)
+ void CV_EXPORTS sort_by_key(oclMat& keys, oclMat& values, int method, bool isGreaterThan = false);
+ void CV_EXPORTS sort_by_key(oclMat& keys, oclMat& values, size_t vecSize, int method, bool isGreaterThan = false);
}
}
#if defined _MSC_VER && _MSC_VER >= 1200
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+// Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other oclMaterials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+
+#ifndef N // number of radices
+#define N 4
+#endif
+
+#ifndef K_T
+#define K_T float
+#endif
+
+#ifndef V_T
+#define V_T float
+#endif
+
+#ifndef IS_GT
+#define IS_GT 0
+#endif
+
+
+// from Thrust::b40c, link:
+// https://github.com/thrust/thrust/blob/master/thrust/system/cuda/detail/detail/b40c/radixsort_key_conversion.h
+__inline uint convertKey(uint converted_key)
+{
+#ifdef K_FLT
+ unsigned int mask = (converted_key & 0x80000000) ? 0xffffffff : 0x80000000;
+ converted_key ^= mask;
+#elif defined(K_INT)
+ const uint SIGN_MASK = 1u << ((sizeof(int) * 8) - 1);
+ converted_key ^= SIGN_MASK;
+#else
+
+#endif
+ return converted_key;
+}
+
+//FIXME(pengx17):
+// exclusive scan, need to be optimized as this is too naive...
+kernel
+ void naiveScanAddition(
+ __global int * input,
+ __global int * output,
+ int size
+ )
+{
+ if(get_global_id(0) == 0)
+ {
+ output[0] = 0;
+ for(int i = 1; i < size; i ++)
+ {
+ output[i] = output[i - 1] + input[i - 1];
+ }
+ }
+}
+
+// following is ported from
+// https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_uint_kernels.cl
+kernel
+ void histogramRadixN (
+ __global K_T* unsortedKeys,
+ __global int * buckets,
+ uint shiftCount
+ )
+{
+ const int RADIX_T = N;
+ const int RADICES_T = (1 << RADIX_T);
+ const int NUM_OF_ELEMENTS_PER_WORK_ITEM_T = RADICES_T;
+ const int MASK_T = (1 << RADIX_T) - 1;
+ int localBuckets[16] = {0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0};
+ int globalId = get_global_id(0);
+ int numOfGroups = get_num_groups(0);
+
+ /* Calculate thread-histograms */
+ for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i)
+ {
+ uint value = convertKey(as_uint(unsortedKeys[mad24(globalId, NUM_OF_ELEMENTS_PER_WORK_ITEM_T, i)]));
+ value = (value >> shiftCount) & MASK_T;
+#if IS_GT
+ localBuckets[RADICES_T - value - 1]++;
+#else
+ localBuckets[value]++;
+#endif
+ }
+
+ for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i)
+ {
+ buckets[mad24(i, RADICES_T * numOfGroups, globalId) ] = localBuckets[i];
+ }
+}
+
+kernel
+ void permuteRadixN (
+ __global K_T* unsortedKeys,
+ __global V_T* unsortedVals,
+ __global int* scanedBuckets,
+ uint shiftCount,
+ __global K_T* sortedKeys,
+ __global V_T* sortedVals
+ )
+{
+ const int RADIX_T = N;
+ const int RADICES_T = (1 << RADIX_T);
+ const int MASK_T = (1<<RADIX_T) -1;
+
+ int globalId = get_global_id(0);
+ int numOfGroups = get_num_groups(0);
+ const int NUM_OF_ELEMENTS_PER_WORK_GROUP_T = numOfGroups << N;
+ int localIndex[16];
+
+ /*Load the index to local memory*/
+ for(int i = 0; i < RADICES_T; ++i)
+ {
+#if IS_GT
+ localIndex[i] = scanedBuckets[mad24(RADICES_T - i - 1, NUM_OF_ELEMENTS_PER_WORK_GROUP_T, globalId)];
+#else
+ localIndex[i] = scanedBuckets[mad24(i, NUM_OF_ELEMENTS_PER_WORK_GROUP_T, globalId)];
+#endif
+ }
+ /* Permute elements to appropriate location */
+ for(int i = 0; i < RADICES_T; ++i)
+ {
+ int old_idx = mad24(globalId, RADICES_T, i);
+ K_T ovalue = unsortedKeys[old_idx];
+ uint value = convertKey(as_uint(ovalue));
+ uint maskedValue = (value >> shiftCount) & MASK_T;
+ uint index = localIndex[maskedValue];
+ sortedKeys[index] = ovalue;
+ sortedVals[index] = unsortedVals[old_idx];
+ localIndex[maskedValue] = index + 1;
+ }
+}
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+// Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other oclMaterials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef K_T
+#define K_T float
+#endif
+
+#ifndef V_T
+#define V_T float
+#endif
+
+#ifndef IS_GT
+#define IS_GT false
+#endif
+
+#if IS_GT
+#define my_comp(x,y) ((x) > (y))
+#else
+#define my_comp(x,y) ((x) < (y))
+#endif
+
+/////////////////////// Bitonic sort ////////////////////////////
+// ported from
+// https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_by_key_kernels.cl
+__kernel
+ void bitonicSort
+ (
+ __global K_T * keys,
+ __global V_T * vals,
+ int count,
+ int stage,
+ int passOfStage
+ )
+{
+ const int threadId = get_global_id(0);
+ if(threadId >= count / 2)
+ {
+ return;
+ }
+ const int pairDistance = 1 << (stage - passOfStage);
+ const int blockWidth = 2 * pairDistance;
+
+ int leftId = min( (threadId % pairDistance)
+ + (threadId / pairDistance) * blockWidth, count );
+
+ int rightId = min( leftId + pairDistance, count );
+
+ int temp;
+
+ const V_T lval = vals[leftId];
+ const V_T rval = vals[rightId];
+
+ const K_T lkey = keys[leftId];
+ const K_T rkey = keys[rightId];
+
+ int sameDirectionBlockWidth = 1 << stage;
+
+ if((threadId/sameDirectionBlockWidth) % 2 == 1)
+ {
+ temp = rightId;
+ rightId = leftId;
+ leftId = temp;
+ }
+
+ const bool compareResult = my_comp(lkey, rkey);
+
+ if(compareResult)
+ {
+ keys[rightId] = rkey;
+ keys[leftId] = lkey;
+ vals[rightId] = rval;
+ vals[leftId] = lval;
+ }
+ else
+ {
+ keys[rightId] = lkey;
+ keys[leftId] = rkey;
+ vals[rightId] = lval;
+ vals[leftId] = rval;
+ }
+}
+
+/////////////////////// Selection sort ////////////////////////////
+//kernel is ported from Bolt library:
+//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
+__kernel
+ void selectionSortLocal
+ (
+ __global K_T * keys,
+ __global V_T * vals,
+ const int count,
+ __local K_T * scratch
+ )
+{
+ int i = get_local_id(0); // index in workgroup
+ int numOfGroups = get_num_groups(0); // index in workgroup
+ int groupID = get_group_id(0);
+ int wg = get_local_size(0); // workgroup size = block size
+ int n; // number of elements to be processed for this work group
+
+ int offset = groupID * wg;
+ int same = 0;
+
+ vals += offset;
+ keys += offset;
+ n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
+
+ int clamped_i= min(i, n - 1);
+
+ K_T key1 = keys[clamped_i], key2;
+ V_T val1 = vals[clamped_i];
+ scratch[i] = key1;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if(i >= n)
+ {
+ return;
+ }
+
+ int pos = 0;
+ for (int j=0;j<n;++j)
+ {
+ key2 = scratch[j];
+ if(my_comp(key2, key1))
+ pos++;//calculate the rank of this element in this work group
+ else
+ {
+ if(my_comp(key1, key2))
+ continue;
+ else
+ {
+ // key1 and key2 are same
+ same++;
+ }
+ }
+ }
+ for (int j=0; j< same; j++)
+ {
+ vals[pos + j] = val1;
+ keys[pos + j] = key1;
+ }
+}
+__kernel
+ void selectionSortFinal
+ (
+ __global K_T * keys,
+ __global V_T * vals,
+ const int count
+ )
+{
+ const int i = get_local_id(0); // index in workgroup
+ const int numOfGroups = get_num_groups(0); // index in workgroup
+ const int groupID = get_group_id(0);
+ const int wg = get_local_size(0); // workgroup size = block size
+ int pos = 0, same = 0;
+ const int offset = get_group_id(0) * wg;
+ const int remainder = count - wg*(numOfGroups-1);
+
+ if((offset + i ) >= count)
+ return;
+ V_T val1 = vals[offset + i];
+
+ K_T key1 = keys[offset + i];
+ K_T key2;
+
+ for(int j=0; j<numOfGroups-1; j++ )
+ {
+ for(int k=0; k<wg; k++)
+ {
+ key2 = keys[j*wg + k];
+ if(my_comp(key1, key2))
+ break;
+ else
+ {
+ //Increment only if the value is not the same.
+ if(my_comp(key2, key1))
+ pos++;
+ else
+ same++;
+ }
+ }
+ }
+
+ for(int k=0; k<remainder; k++)
+ {
+ key2 = keys[(numOfGroups-1)*wg + k];
+ if(my_comp(key1, key2))
+ break;
+ else
+ {
+ //Don't increment if the value is the same.
+ if(my_comp(key2, key1))
+ pos++;
+ else
+ same++;
+ }
+ }
+ for (int j=0; j< same; j++)
+ {
+ vals[pos + j] = val1;
+ keys[pos + j] = key1;
+ }
+}
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+// Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other oclMaterials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef K_T
+#define K_T float
+#endif
+
+#ifndef V_T
+#define V_T float
+#endif
+
+#ifndef IS_GT
+#define IS_GT false
+#endif
+
+#if IS_GT
+#define my_comp(x,y) ((x) > (y))
+#else
+#define my_comp(x,y) ((x) < (y))
+#endif
+
+///////////// parallel merge sort ///////////////
+// ported from https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/stablesort_by_key_kernels.cl
+uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal)
+{
+ // The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence
+ uint firstIndex = left;
+ uint lastIndex = right;
+
+ // This loops through [firstIndex, lastIndex)
+ // Since firstIndex and lastIndex will be different for every thread depending on the nested branch,
+ // this while loop will be divergent within a wavefront
+ while( firstIndex < lastIndex )
+ {
+ K_T dataVal = data[ firstIndex ];
+
+ // This branch will create divergent wavefronts
+ if( my_comp( dataVal, searchVal ) )
+ {
+ firstIndex = firstIndex+1;
+ }
+ else
+ {
+ break;
+ }
+ }
+
+ return firstIndex;
+}
+
+// This implements a binary search routine to look for an 'insertion point' in a sequence, denoted
+// by a base pointer and left and right index for a particular candidate value. The comparison operator is
+// passed as a functor parameter my_comp
+// This function returns an index that is the first index whos value would be equal to the searched value
+uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
+{
+ // The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence
+ uint firstIndex = left;
+ uint lastIndex = right;
+
+ // This loops through [firstIndex, lastIndex)
+ // Since firstIndex and lastIndex will be different for every thread depending on the nested branch,
+ // this while loop will be divergent within a wavefront
+ while( firstIndex < lastIndex )
+ {
+ // midIndex is the average of first and last, rounded down
+ uint midIndex = ( firstIndex + lastIndex ) / 2;
+ K_T midValue = data[ midIndex ];
+
+ // This branch will create divergent wavefronts
+ if( my_comp( midValue, searchVal ) )
+ {
+ firstIndex = midIndex+1;
+ // printf( "lowerBound: lastIndex[ %i ]=%i\n", get_local_id( 0 ), lastIndex );
+ }
+ else
+ {
+ lastIndex = midIndex;
+ // printf( "lowerBound: firstIndex[ %i ]=%i\n", get_local_id( 0 ), firstIndex );
+ }
+ }
+
+ return firstIndex;
+}
+
+// This implements a binary search routine to look for an 'insertion point' in a sequence, denoted
+// by a base pointer and left and right index for a particular candidate value. The comparison operator is
+// passed as a functor parameter my_comp
+// This function returns an index that is the first index whos value would be greater than the searched value
+// If the search value is not found in the sequence, upperbound returns the same result as lowerbound
+uint upperBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
+{
+ uint upperBound = lowerBoundBinary( data, left, right, searchVal );
+
+ // printf( "upperBoundBinary: upperBound[ %i, %i ]= %i\n", left, right, upperBound );
+ // If upperBound == right, then searchVal was not found in the sequence. Just return.
+ if( upperBound != right )
+ {
+ // While the values are equal i.e. !(x < y) && !(y < x) increment the index
+ K_T upperValue = data[ upperBound ];
+ while( !my_comp( upperValue, searchVal ) && !my_comp( searchVal, upperValue) && (upperBound != right) )
+ {
+ upperBound++;
+ upperValue = data[ upperBound ];
+ }
+ }
+
+ return upperBound;
+}
+
+// This kernel implements merging of blocks of sorted data. The input to this kernel most likely is
+// the output of blockInsertionSortTemplate. It is expected that the source array contains multiple
+// blocks, each block is independently sorted. The goal is to write into the output buffer half as
+// many blocks, of double the size. The even and odd blocks are stably merged together to form
+// a new sorted block of twice the size. The algorithm is out-of-place.
+kernel void merge(
+ global K_T* iKey_ptr,
+ global V_T* iValue_ptr,
+ global K_T* oKey_ptr,
+ global V_T* oValue_ptr,
+ const uint srcVecSize,
+ const uint srcLogicalBlockSize,
+ local K_T* key_lds,
+ local V_T* val_lds
+)
+{
+ size_t globalID = get_global_id( 0 );
+ size_t groupID = get_group_id( 0 );
+ size_t localID = get_local_id( 0 );
+ size_t wgSize = get_local_size( 0 );
+
+ // Abort threads that are passed the end of the input vector
+ if( globalID >= srcVecSize )
+ return; // on SI this doesn't mess-up barriers
+
+ // For an element in sequence A, find the lowerbound index for it in sequence B
+ uint srcBlockNum = globalID / srcLogicalBlockSize;
+ uint srcBlockIndex = globalID % srcLogicalBlockSize;
+
+ // printf( "mergeTemplate: srcBlockNum[%i]=%i\n", srcBlockNum, srcBlockIndex );
+
+ // Pairs of even-odd blocks will be merged together
+ // An even block should search for an insertion point in the next odd block,
+ // and the odd block should look for an insertion point in the corresponding previous even block
+ uint dstLogicalBlockSize = srcLogicalBlockSize<<1;
+ uint leftBlockIndex = globalID & ~((dstLogicalBlockSize) - 1 );
+ leftBlockIndex += (srcBlockNum & 0x1) ? 0 : srcLogicalBlockSize;
+ leftBlockIndex = min( leftBlockIndex, srcVecSize );
+ uint rightBlockIndex = min( leftBlockIndex + srcLogicalBlockSize, srcVecSize );
+
+ // if( localID == 0 )
+ // {
+ // printf( "mergeTemplate: wavefront[ %i ] logicalBlock[ %i ] logicalIndex[ %i ] leftBlockIndex[ %i ] <=> rightBlockIndex[ %i ]\n", groupID, srcBlockNum, srcBlockIndex, leftBlockIndex, rightBlockIndex );
+ // }
+
+ // For a particular element in the input array, find the lowerbound index for it in the search sequence given by leftBlockIndex & rightBlockIndex
+ // uint insertionIndex = lowerBoundLinear( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ], my_comp ) - leftBlockIndex;
+ uint insertionIndex = 0;
+ if( (srcBlockNum & 0x1) == 0 )
+ {
+ insertionIndex = lowerBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex;
+ }
+ else
+ {
+ insertionIndex = upperBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex;
+ }
+
+ // The index of an element in the result sequence is the summation of it's indixes in the two input
+ // sequences
+ uint dstBlockIndex = srcBlockIndex + insertionIndex;
+ uint dstBlockNum = srcBlockNum/2;
+
+ // if( (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex == 395 )
+ // {
+ // printf( "mergeTemplate: (dstBlockNum[ %i ] * dstLogicalBlockSize[ %i ]) + dstBlockIndex[ %i ] = srcBlockIndex[ %i ] + insertionIndex[ %i ]\n", dstBlockNum, dstLogicalBlockSize, dstBlockIndex, srcBlockIndex, insertionIndex );
+ // printf( "mergeTemplate: dstBlockIndex[ %i ] = iKey_ptr[ %i ] ( %i )\n", (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex, globalID, iKey_ptr[ globalID ] );
+ // }
+ oKey_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iKey_ptr[ globalID ];
+ oValue_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iValue_ptr[ globalID ];
+ // printf( "mergeTemplate: leftResultIndex[ %i ]=%i + %i\n", leftResultIndex, srcBlockIndex, leftInsertionIndex );
+}
+
+kernel void blockInsertionSort(
+ global K_T* key_ptr,
+ global V_T* value_ptr,
+ const uint vecSize,
+ local K_T* key_lds,
+ local V_T* val_lds
+)
+{
+ size_t gloId = get_global_id( 0 );
+ size_t groId = get_group_id( 0 );
+ size_t locId = get_local_id( 0 );
+ size_t wgSize = get_local_size( 0 );
+
+ bool in_range = gloId < vecSize;
+ K_T key;
+ V_T val;
+ // Abort threads that are passed the end of the input vector
+ if (in_range)
+ {
+ // Make a copy of the entire input array into fast local memory
+ key = key_ptr[ gloId ];
+ val = value_ptr[ gloId ];
+ key_lds[ locId ] = key;
+ val_lds[ locId ] = val;
+ }
+ barrier( CLK_LOCAL_MEM_FENCE );
+ // Sorts a workgroup using a naive insertion sort
+ // The sort uses one thread within a workgroup to sort the entire workgroup
+ if( locId == 0 && in_range )
+ {
+ // The last workgroup may have an irregular size, so we calculate a per-block endIndex
+ // endIndex is essentially emulating a mod operator with subtraction and multiply
+ size_t endIndex = vecSize - ( groId * wgSize );
+ endIndex = min( endIndex, wgSize );
+
+ // printf( "Debug: endIndex[%i]=%i\n", groId, endIndex );
+
+ // Indices are signed because the while loop will generate a -1 index inside of the max function
+ for( int currIndex = 1; currIndex < endIndex; ++currIndex )
+ {
+ key = key_lds[ currIndex ];
+ val = val_lds[ currIndex ];
+ int scanIndex = currIndex;
+ K_T ldsKey = key_lds[scanIndex - 1];
+ while( scanIndex > 0 && my_comp( key, ldsKey ) )
+ {
+ V_T ldsVal = val_lds[scanIndex - 1];
+
+ // If the keys are being swapped, make sure the values are swapped identicaly
+ key_lds[ scanIndex ] = ldsKey;
+ val_lds[ scanIndex ] = ldsVal;
+
+ scanIndex = scanIndex - 1;
+ ldsKey = key_lds[ max( 0, scanIndex - 1 ) ]; // scanIndex-1 may be -1
+ }
+ key_lds[ scanIndex ] = key;
+ val_lds[ scanIndex ] = val;
+ }
+ }
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if(in_range)
+ {
+ key = key_lds[ locId ];
+ key_ptr[ gloId ] = key;
+
+ val = val_lds[ locId ];
+ value_ptr[ gloId ] = val;
+ }
+}
+
+///////////// Radix sort from b40c library /////////////
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+// Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other oclMaterials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include <iomanip>
+#include "precomp.hpp"
+
+namespace cv
+{
+namespace ocl
+{
+
+extern const char * kernel_sort_by_key;
+extern const char * kernel_stablesort_by_key;
+extern const char * kernel_radix_sort_by_key;
+
+//TODO(pengx17): change this value depending on device other than a constant
+const static unsigned int GROUP_SIZE = 256;
+
+const char * depth_strings[] =
+{
+ "uchar", //CV_8U
+ "char", //CV_8S
+ "ushort", //CV_16U
+ "short", //CV_16S
+ "int", //CV_32S
+ "float", //CV_32F
+ "double" //CV_64F
+};
+
+void genSortBuildOption(const oclMat& keys, const oclMat& vals, bool isGreaterThan, char * build_opt_buf)
+{
+ sprintf(build_opt_buf, "-D IS_GT=%d -D K_T=%s -D V_T=%s",
+ isGreaterThan?1:0, depth_strings[keys.depth()], depth_strings[vals.depth()]);
+ if(vals.oclchannels() > 1)
+ {
+ sprintf( build_opt_buf + strlen(build_opt_buf), "%d", vals.oclchannels(), 10);
+ }
+}
+inline bool isSizePowerOf2(size_t size)
+{
+ return ((size - 1) & (size)) == 0;
+}
+
+namespace bitonic_sort
+{
+static void sort_by_key(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
+{
+ CV_Assert(isSizePowerOf2(vecSize));
+
+ Context * cxt = Context::getContext();
+ size_t globalThreads[3] = {vecSize / 2, 1, 1};
+ size_t localThreads[3] = {GROUP_SIZE, 1, 1};
+
+ // 2^numStages should be equal to vecSize or the output is invalid
+ int numStages = 0;
+ for(int i = vecSize; i > 1; i >>= 1)
+ {
+ ++numStages;
+ }
+ char build_opt_buf [100];
+ genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
+ const int argc = 5;
+ std::vector< std::pair<size_t, const void *> > args(argc);
+ String kernelname = "bitonicSort";
+
+ args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
+ args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
+ args[2] = std::make_pair(sizeof(cl_int), (void *)&vecSize);
+
+ for(int stage = 0; stage < numStages; ++stage)
+ {
+ args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
+ for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
+ {
+ args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
+ openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
+ }
+ }
+}
+} /* bitonic_sort */
+
+namespace selection_sort
+{
+// FIXME:
+// This function cannot sort arrays with duplicated keys
+static void sort_by_key(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
+{
+ CV_Error(-1, "This function is incorrect at the moment.");
+ Context * cxt = Context::getContext();
+
+ size_t globalThreads[3] = {vecSize, 1, 1};
+ size_t localThreads[3] = {GROUP_SIZE, 1, 1};
+
+ std::vector< std::pair<size_t, const void *> > args;
+ char build_opt_buf [100];
+ genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
+
+ //local
+ String kernelname = "selectionSortLocal";
+ int lds_size = GROUP_SIZE * keys.elemSize();
+ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&keys.data));
+ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&vals.data));
+ args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize));
+ args.push_back(std::make_pair(lds_size, (void*)NULL));
+
+ openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
+
+ //final
+ kernelname = "selectionSortFinal";
+ args.pop_back();
+ openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
+}
+
+} /* selection_sort */
+
+
+namespace radix_sort
+{
+//FIXME(pengx17):
+// exclusive scan, need to be optimized as this is too naive...
+//void naive_scan_addition(oclMat& input, oclMat& output)
+//{
+// Context * cxt = Context::getContext();
+// size_t vecSize = input.cols;
+// size_t globalThreads[3] = {1, 1, 1};
+// size_t localThreads[3] = {1, 1, 1};
+//
+// String kernelname = "naiveScanAddition";
+//
+// std::vector< std::pair<size_t, const void *> > args;
+// args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data));
+// args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data));
+// args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize));
+// openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1);
+//}
+
+void naive_scan_addition_cpu(oclMat& input, oclMat& output)
+{
+ Mat m_input = input, m_output(output.size(), output.type());
+ MatIterator_<int> i_mit = m_input.begin<int>();
+ MatIterator_<int> o_mit = m_output.begin<int>();
+ *o_mit = 0;
+ ++i_mit;
+ ++o_mit;
+ for(; i_mit != m_input.end<int>(); ++i_mit, ++o_mit)
+ {
+ *o_mit = *(o_mit - 1) + *(i_mit - 1);
+ }
+ output = m_output;
+}
+
+
+//radix sort ported from Bolt
+static void sort_by_key(oclMat& keys, oclMat& vals, size_t origVecSize, bool isGreaterThan)
+{
+ CV_Assert(keys.depth() == CV_32S || keys.depth() == CV_32F); // we assume keys are 4 bytes
+
+ bool isKeyFloat = keys.type() == CV_32F;
+
+ const int RADIX = 4; //Now you cannot replace this with Radix 8 since there is a
+ //local array of 16 elements in the histogram kernel.
+ const int RADICES = (1 << RADIX); //Values handeled by each work-item?
+
+ bool newBuffer = false;
+ size_t vecSize = origVecSize;
+
+ unsigned int groupSize = RADICES;
+
+ size_t mulFactor = groupSize * RADICES;
+
+ oclMat buffer_keys, buffer_vals;
+
+ if(origVecSize % mulFactor != 0)
+ {
+ vecSize = ((vecSize + mulFactor) / mulFactor) * mulFactor;
+ buffer_keys.create(1, vecSize, keys.type());
+ buffer_vals.create(1, vecSize, vals.type());
+ Scalar padding_value;
+ oclMat roi_buffer_vals = buffer_vals(Rect(0,0,origVecSize,1));
+
+ if(isGreaterThan)
+ {
+ switch(buffer_keys.depth())
+ {
+ case CV_32F:
+ padding_value = Scalar::all(-FLT_MAX);
+ break;
+ case CV_32S:
+ padding_value = Scalar::all(INT_MIN);
+ break;
+ }
+ }
+ else
+ {
+ switch(buffer_keys.depth())
+ {
+ case CV_32F:
+ padding_value = Scalar::all(FLT_MAX);
+ break;
+ case CV_32S:
+ padding_value = Scalar::all(INT_MAX);
+ break;
+ }
+ }
+ ocl::copyMakeBorder(
+ keys(Rect(0,0,origVecSize,1)), buffer_keys,
+ 0, 0, 0, vecSize - origVecSize,
+ BORDER_CONSTANT, padding_value);
+ vals(Rect(0,0,origVecSize,1)).copyTo(roi_buffer_vals);
+ newBuffer = true;
+ }
+ else
+ {
+ buffer_keys = keys;
+ buffer_vals = vals;
+ newBuffer = false;
+ }
+ oclMat swap_input_keys(1, vecSize, keys.type());
+ oclMat swap_input_vals(1, vecSize, vals.type());
+ oclMat hist_bin_keys(1, vecSize, CV_32SC1);
+ oclMat hist_bin_dest_keys(1, vecSize, CV_32SC1);
+
+ Context * cxt = Context::getContext();
+
+ size_t globalThreads[3] = {vecSize / RADICES, 1, 1};
+ size_t localThreads[3] = {groupSize, 1, 1};
+
+ std::vector< std::pair<size_t, const void *> > args;
+ char build_opt_buf [100];
+ genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
+
+ //additional build option for radix sort
+ sprintf(build_opt_buf + strlen(build_opt_buf), " -D K_%s", isKeyFloat?"FLT":"INT");
+
+ String kernelnames[2] = {String("histogramRadixN"), String("permuteRadixN")};
+
+ int swap = 0;
+ for(int bits = 0; bits < (keys.elemSize() * 8); bits += RADIX)
+ {
+ args.clear();
+ //Do a histogram pass locally
+ if(swap == 0)
+ {
+ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
+ }
+ else
+ {
+ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
+ }
+ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_keys.data));
+ args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits));
+ openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[0], globalThreads, localThreads,
+ args, -1, -1, build_opt_buf);
+
+ args.clear();
+ //Perform a global scan
+ naive_scan_addition_cpu(hist_bin_keys, hist_bin_dest_keys);
+ // end of scan
+ if(swap == 0)
+ {
+ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
+ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data));
+ }
+ else
+ {
+ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
+ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data));
+ }
+ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_dest_keys.data));
+ args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits));
+
+ if(swap == 0)
+ {
+ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
+ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data));
+ }
+ else
+ {
+ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
+ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data));
+ }
+ openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[1], globalThreads, localThreads,
+ args, -1, -1, build_opt_buf);
+ swap = swap ? 0 : 1;
+ }
+ if(newBuffer)
+ {
+ buffer_keys(Rect(0,0,origVecSize,1)).copyTo(keys);
+ buffer_vals(Rect(0,0,origVecSize,1)).copyTo(vals);
+ }
+}
+
+} /* radix_sort */
+
+namespace merge_sort
+{
+static void sort_by_key(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
+{
+ Context * cxt = Context::getContext();
+
+ size_t globalThreads[3] = {vecSize, 1, 1};
+ size_t localThreads[3] = {GROUP_SIZE, 1, 1};
+
+ std::vector< std::pair<size_t, const void *> > args;
+ char build_opt_buf [100];
+ genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
+
+ String kernelname[] = {String("blockInsertionSort"), String("merge")};
+ int keylds_size = GROUP_SIZE * keys.elemSize();
+ int vallds_size = GROUP_SIZE * vals.elemSize();
+ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&keys.data));
+ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&vals.data));
+ args.push_back(std::make_pair(sizeof(cl_uint), (void *)&vecSize));
+ args.push_back(std::make_pair(keylds_size, (void*)NULL));
+ args.push_back(std::make_pair(vallds_size, (void*)NULL));
+
+ openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[0], globalThreads, localThreads, args, -1, -1, build_opt_buf);
+
+ // Early exit for the case of no merge passes, values are already in destination vector
+ if(vecSize <= GROUP_SIZE)
+ {
+ return;
+ }
+
+ // An odd number of elements requires an extra merge pass to sort
+ size_t numMerges = 0;
+ // Calculate the log2 of vecSize, taking into acvecSize our block size from kernel 1 is 64
+ // this is how many merge passes we want
+ size_t log2BlockSize = vecSize >> 6;
+ for( ; log2BlockSize > 1; log2BlockSize >>= 1 )
+ {
+ ++numMerges;
+ }
+ // Check to see if the input vector size is a power of 2, if not we will need last merge pass
+ numMerges += isSizePowerOf2(vecSize)? 1: 0;
+
+ // Allocate a flipflop buffer because the merge passes are out of place
+ oclMat tmpKeyBuffer(keys.size(), keys.type());
+ oclMat tmpValBuffer(vals.size(), vals.type());
+ args.resize(8);
+
+ args[4] = std::make_pair(sizeof(cl_uint), (void *)&vecSize);
+ args[6] = std::make_pair(keylds_size, (void*)NULL);
+ args[7] = std::make_pair(vallds_size, (void*)NULL);
+
+ for(size_t pass = 1; pass <= numMerges; ++pass )
+ {
+ // For each pass, flip the input-output buffers
+ if( pass & 0x1 )
+ {
+ args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
+ args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
+ args[2] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data);
+ args[3] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data);
+ }
+ else
+ {
+ args[0] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data);
+ args[1] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data);
+ args[2] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
+ args[3] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
+ }
+ // For each pass, the merge window doubles
+ unsigned int srcLogicalBlockSize = static_cast<unsigned int>( localThreads[0] << (pass-1) );
+ args[5] = std::make_pair(sizeof(cl_uint), (void *)&srcLogicalBlockSize);
+ openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[1], globalThreads, localThreads, args, -1, -1, build_opt_buf);
+ }
+ // If there are an odd number of merges, then the output data is sitting in the temp buffer. We need to copy
+ // the results back into the input array
+ if( numMerges & 1 )
+ {
+ tmpKeyBuffer.copyTo(keys);
+ tmpValBuffer.copyTo(vals);
+ }
+}
+} /* merge_sort */
+
+}
+} /* namespace cv { namespace ocl */
+
+
+void cv::ocl::sort_by_key(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan)
+{
+ CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
+ CV_Assert( keys.channels() == 1 ); // we only allow one channel keys
+ CV_Assert( vecSize <= static_cast<size_t>(keys.cols) );
+ switch(method)
+ {
+ case SORT_BITONIC:
+ bitonic_sort::sort_by_key(keys, vals, vecSize, isGreaterThan);
+ break;
+ case SORT_SELECTION:
+ selection_sort::sort_by_key(keys, vals, vecSize, isGreaterThan);
+ break;
+ case SORT_RADIX:
+ radix_sort::sort_by_key(keys, vals, vecSize, isGreaterThan);
+ break;
+ case SORT_MERGE:
+ merge_sort::sort_by_key(keys, vals, vecSize, isGreaterThan);
+ break;
+ }
+}
+
+
+void cv::ocl::sort_by_key(oclMat& keys, oclMat& vals, int method, bool isGreaterThan)
+{
+ CV_Assert( keys.size() == vals.size() );
+ CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
+ size_t vecSize = static_cast<size_t>(keys.cols);
+ sort_by_key(keys, vals, vecSize, method, isGreaterThan);
+}
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+// Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other oclMaterials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include <map>
+#include <functional>
+#include "precomp.hpp"
+
+using namespace std;
+using namespace cvtest;
+using namespace testing;
+using namespace cv;
+
+
+namespace
+{
+IMPLEMENT_PARAM_CLASS(IsGreaterThan, bool)
+IMPLEMENT_PARAM_CLASS(InputSize, int)
+IMPLEMENT_PARAM_CLASS(SortMethod, int)
+
+
+template<class T>
+struct KV_CVTYPE{ static int toType() {return 0;} };
+
+template<> struct KV_CVTYPE<int> { static int toType() {return CV_32SC1;} };
+template<> struct KV_CVTYPE<float>{ static int toType() {return CV_32FC1;} };
+template<> struct KV_CVTYPE<Vec2i>{ static int toType() {return CV_32SC2;} };
+template<> struct KV_CVTYPE<Vec2f>{ static int toType() {return CV_32FC2;} };
+
+template<class key_type, class val_type>
+bool kvgreater(pair<key_type, val_type> p1, pair<key_type, val_type> p2)
+{
+ return p1.first > p2.first;
+}
+
+template<class key_type, class val_type>
+bool kvless(pair<key_type, val_type> p1, pair<key_type, val_type> p2)
+{
+ return p1.first < p2.first;
+}
+
+template<class key_type, class val_type>
+void toKVPair(
+ MatConstIterator_<key_type> kit,
+ MatConstIterator_<val_type> vit,
+ int vecSize,
+ vector<pair<key_type, val_type> >& kvres
+ )
+{
+ kvres.clear();
+ for(int i = 0; i < vecSize; i ++)
+ {
+ kvres.push_back(make_pair(*kit, *vit));
+ ++kit;
+ ++vit;
+ }
+}
+
+template<class key_type, class val_type>
+void kvquicksort(Mat& keys, Mat& vals, bool isGreater = false)
+{
+ vector<pair<key_type, val_type> > kvres;
+ toKVPair(keys.begin<key_type>(), vals.begin<val_type>(), keys.cols, kvres);
+
+ if(isGreater)
+ {
+ std::sort(kvres.begin(), kvres.end(), kvgreater<key_type, val_type>);
+ }
+ else
+ {
+ std::sort(kvres.begin(), kvres.end(), kvless<key_type, val_type>);
+ }
+ key_type * kptr = keys.ptr<key_type>();
+ val_type * vptr = vals.ptr<val_type>();
+ for(int i = 0; i < keys.cols; i ++)
+ {
+ kptr[i] = kvres[i].first;
+ vptr[i] = kvres[i].second;
+ }
+}
+
+class SortByKey_STL
+{
+public:
+ static void sort(cv::Mat&, cv::Mat&, bool is_gt);
+private:
+ typedef void (*quick_sorter)(cv::Mat&, cv::Mat&, bool);
+ SortByKey_STL();
+ quick_sorter quick_sorters[CV_64FC4][CV_64FC4];
+ static SortByKey_STL instance;
+};
+
+SortByKey_STL SortByKey_STL::instance = SortByKey_STL();
+
+SortByKey_STL::SortByKey_STL()
+{
+ memset(instance.quick_sorters, 0, sizeof(quick_sorters));
+#define NEW_SORTER(KT, VT) \
+ instance.quick_sorters[KV_CVTYPE<KT>::toType()][KV_CVTYPE<VT>::toType()] = kvquicksort<KT, VT>;
+
+ NEW_SORTER(int, int);
+ NEW_SORTER(int, Vec2i);
+ NEW_SORTER(int, float);
+ NEW_SORTER(int, Vec2f);
+
+ NEW_SORTER(float, int);
+ NEW_SORTER(float, Vec2i);
+ NEW_SORTER(float, float);
+ NEW_SORTER(float, Vec2f);
+#undef NEW_SORTER
+}
+
+void SortByKey_STL::sort(cv::Mat& keys, cv::Mat& vals, bool is_gt)
+{
+ instance.quick_sorters[keys.type()][vals.type()](keys, vals, is_gt);
+}
+
+bool checkUnstableSorterResult(const Mat& gkeys_, const Mat& gvals_,
+ const Mat& /*dkeys_*/, const Mat& dvals_)
+{
+ int cn_val = gvals_.channels();
+ int count = gkeys_.cols;
+
+ //for convenience we convert depth to float and channels to 1
+ Mat gkeys, gvals, dkeys, dvals;
+ gkeys_.reshape(1).convertTo(gkeys, CV_32F);
+ gvals_.reshape(1).convertTo(gvals, CV_32F);
+ //dkeys_.reshape(1).convertTo(dkeys, CV_32F);
+ dvals_.reshape(1).convertTo(dvals, CV_32F);
+ float * gkptr = gkeys.ptr<float>();
+ float * gvptr = gvals.ptr<float>();
+ //float * dkptr = dkeys.ptr<float>();
+ float * dvptr = dvals.ptr<float>();
+
+ for(int i = 0; i < count - 1; ++i)
+ {
+ int iden_count = 0;
+ // firstly calculate the number of identical keys
+ while(gkptr[i + iden_count] == gkptr[i + 1 + iden_count])
+ {
+ ++ iden_count;
+ }
+
+ // sort dv and gv
+ int num_of_val = (iden_count + 1) * cn_val;
+ std::sort(gvptr + i * cn_val, gvptr + i * cn_val + num_of_val);
+ std::sort(dvptr + i * cn_val, dvptr + i * cn_val + num_of_val);
+
+ // then check if [i, i + iden_count) is the same
+ for(int j = 0; j < num_of_val; ++j)
+ {
+ if(gvptr[i + j] != dvptr[i + j])
+ {
+ return false;
+ }
+ }
+ i += iden_count;
+ }
+ return true;
+}
+}
+
+#define INPUT_SIZES Values(InputSize(0x10), InputSize(0x100), InputSize(0x10000)) //2^4, 2^8, 2^16
+#define KEY_TYPES Values(MatType(CV_32SC1), MatType(CV_32FC1))
+#define VAL_TYPES Values(MatType(CV_32SC1), MatType(CV_32SC2), MatType(CV_32FC1), MatType(CV_32FC2))
+#define SORT_METHODS Values(SortMethod(cv::ocl::SORT_BITONIC),SortMethod(cv::ocl::SORT_MERGE),SortMethod(cv::ocl::SORT_RADIX)/*,SortMethod(cv::ocl::SORT_SELECTION)*/)
+#define F_OR_T Values(IsGreaterThan(false), IsGreaterThan(true))
+
+PARAM_TEST_CASE(SortByKey, InputSize, MatType, MatType, SortMethod, IsGreaterThan)
+{
+ InputSize input_size;
+ MatType key_type, val_type;
+ SortMethod method;
+ IsGreaterThan is_gt;
+
+ Mat mat_key, mat_val;
+ virtual void SetUp()
+ {
+ input_size = GET_PARAM(0);
+ key_type = GET_PARAM(1);
+ val_type = GET_PARAM(2);
+ method = GET_PARAM(3);
+ is_gt = GET_PARAM(4);
+
+ using namespace cv;
+ // fill key and val
+ mat_key = randomMat(Size(input_size, 1), key_type, INT_MIN, INT_MAX);
+ mat_val = randomMat(Size(input_size, 1), val_type, INT_MIN, INT_MAX);
+ }
+};
+
+TEST_P(SortByKey, Accuracy)
+{
+ using namespace cv;
+ ocl::oclMat oclmat_key(mat_key);
+ ocl::oclMat oclmat_val(mat_val);
+
+ ocl::sort_by_key(oclmat_key, oclmat_val, method, is_gt);
+ SortByKey_STL::sort(mat_key, mat_val, is_gt);
+
+ EXPECT_MAT_NEAR(mat_key, oclmat_key, 0.0);
+ EXPECT_TRUE(checkUnstableSorterResult(mat_key, mat_val, oclmat_key, oclmat_val));
+}
+INSTANTIATE_TEST_CASE_P(OCL_SORT, SortByKey, Combine(INPUT_SIZES, KEY_TYPES, VAL_TYPES, SORT_METHODS, F_OR_T));