PtrStep curMask;\r
};\r
\r
- class WithOutMask\r
+ struct WithOutMask\r
{\r
- public:\r
__device__ __forceinline__ void nextMask() const\r
{\r
}\r
///////////////////////////////////////////////////////////////////////////////\r
// Reduce Sum\r
\r
- template <int BLOCK_DIM_X> struct SumReductor; 
+ // Warp-synchronous tree reduction: sums the 16 per-lane partials held in one
+ // shared-memory row. Specialized for BLOCK_DIM_X == 16; the pointer is taken
+ // as volatile so shared-memory reads are not cached in registers between the
+ // unsynchronized steps (the old local `volatile T* smem` alias is now redundant).
+ template <int BLOCK_DIM_X> struct SumReductor;
template <> struct SumReductor<16>
{
+ // On exit, mySum of lane 0 (threadIdx.x == 0) holds the row total.
- template <typename T> static __device__ void reduce(T* sdiff_row, T& mySum)
+ template <typename T> static __device__ void reduce(volatile T* sdiff_row, T& mySum)
{
- volatile T* smem = sdiff_row;
-
- smem[threadIdx.x] = mySum;
+ sdiff_row[threadIdx.x] = mySum;

+ // Half-warp tree reduce: strides 8,4,2,1. All 16 lanes belong to one warp,
+ // so no __syncthreads() is needed between steps (pre-Volta implicit synchrony).
if (threadIdx.x < 8) 
{
- smem[threadIdx.x] = mySum += smem[threadIdx.x + 8]; 
- smem[threadIdx.x] = mySum += smem[threadIdx.x + 4]; 
- smem[threadIdx.x] = mySum += smem[threadIdx.x + 2];
- smem[threadIdx.x] = mySum += smem[threadIdx.x + 1]; 
+ sdiff_row[threadIdx.x] = mySum += sdiff_row[threadIdx.x + 8]; 
+ sdiff_row[threadIdx.x] = mySum += sdiff_row[threadIdx.x + 4]; 
+ sdiff_row[threadIdx.x] = mySum += sdiff_row[threadIdx.x + 2];
+ sdiff_row[threadIdx.x] = mySum += sdiff_row[threadIdx.x + 1]; 
}
}
};
///////////////////////////////////////////////////////////////////////////////\r
// warpReduceMinIdxIdx\r
\r
- template <int BLOCK_DIM_Y> struct MinIdxIdxWarpReductor;\r
+ template <int BLOCK_DIM_Y> struct MinIdxIdxWarpReductor; \r
template <> struct MinIdxIdxWarpReductor<16>\r
{\r
template <typename T> \r
__device__ __forceinline__ void prepare(const T* queryDescs, int desc_len, U* smem)\r
{\r
loadDescsVals<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, queryVals, smem);\r
+ __syncthreads();\r
}\r
\r
template <typename Dist>\r
///////////////////////////////////////////////////////////////////////////////////\r
//////////////////////////////////// Knn Match ////////////////////////////////////\r
///////////////////////////////////////////////////////////////////////////////////\r
+\r
+ // Scans all train descriptors for one query and tracks the two smallest
+ // distances (2-NN). Each block row (threadIdx.y) strides over train rows;
+ // within a row, BLOCK_DIM_X lanes cooperate in reduceDescCalc.calc, which
+ // reduces the per-element distance into `dist` using sdiffRow as scratch.
+ // Results (distMin1/distMin2 and matching train indices) are only valid in
+ // lanes with threadIdx.x == 0; other lanes keep their initial values.
+ template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename ReduceDescCalculator, typename T, typename Mask>
+ __device__ void distanceCalcLoop(const PtrStep_<T>& query, const DevMem2D_<T>& train, const Mask& m, int queryIdx,
+ typename Dist::ResultType& distMin1, typename Dist::ResultType& distMin2, int& bestTrainIdx1, int& bestTrainIdx2, 
+ typename Dist::ResultType* smem)
+ {
+ ReduceDescCalculator reduceDescCalc;
+
+ // Stage the query descriptor (e.g. into registers/shared mem, depending on
+ // the calculator); smem is reinterpreted as ValueType storage here.
+ reduceDescCalc.prepare(query.ptr(queryIdx), train.cols, (typename Dist::ValueType*)smem);
+ 
+ // Each block row gets its own BLOCK_DIM_X-wide slice of shared scratch.
+ typename Dist::ResultType* sdiffRow = smem + BLOCK_DIM_X * threadIdx.y;
+
+ for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += BLOCK_DIM_Y)
+ {
+ if (m(queryIdx, trainIdx))
+ {
+ Dist dist;
+
+ const T* trainRow = train.ptr(trainIdx);
+ 
+ reduceDescCalc.calc(trainRow, train.cols, dist, sdiffRow);
+
+ if (threadIdx.x == 0)
+ {
+ typename Dist::ResultType val = dist;
+
+ // NOTE(review): when val beats distMin1, the previous distMin1 is not
+ // demoted into distMin2, so this row's true second-best can be lost
+ // (e.g. distances arriving as 5 then 3 leave distMin2 untouched).
+ // Confirm against upstream fixes before relying on the 2nd match.
+ if (val < distMin1)
+ {
+ distMin1 = val;
+ bestTrainIdx1 = trainIdx;
+ }
+ else if (val < distMin2)
+ {
+ distMin2 = val;
+ bestTrainIdx2 = trainIdx;
+ }
+ }
+ }
+ }
+ }
+\r
+ // 2-NN kernel: one block per query descriptor (blockIdx.x == queryIdx).
+ // Phase 1: every block row scans a strided subset of train rows via
+ // distanceCalcLoop. Phase 2: each row's two best candidates are written to
+ // shared memory and thread (0,0) reduces them into the final pair, writing
+ // one int2 (indices) and one float2 (distances) per query.
+ // Shared memory (BLOCK_DIM_X * BLOCK_DIM_Y ResultType) is reused in phase 2
+ // as 2*BLOCK_DIM_Y distances followed by 2*BLOCK_DIM_Y int indices — this
+ // fits for the 16x16 launch configs used by the callers below; a smaller
+ // BLOCK_DIM_X could overflow it (assumption to confirm for other configs).
+ template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename ReduceDescCalculator, typename T, typename Mask>
+ __global__ void knnMatch2(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask m, PtrStep_<int2> trainIdx, PtrStep_<float2> distance)
+ {
+ typedef typename Dist::ResultType ResultType;
+ typedef typename Dist::ValueType ValueType;
+
+ __shared__ ResultType smem[BLOCK_DIM_X * BLOCK_DIM_Y];
+
+ const int queryIdx = blockIdx.x;
+
+ // Sentinels: "no match yet".
+ ResultType distMin1 = numeric_limits<ResultType>::max();
+ ResultType distMin2 = numeric_limits<ResultType>::max();
+
+ int bestTrainIdx1 = -1;
+ int bestTrainIdx2 = -1;
+
+ distanceCalcLoop<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, ReduceDescCalculator>(query, train, m, queryIdx, 
+ distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem);
+ // Barrier before smem is repurposed: phase-1 scratch reads must be done.
+ __syncthreads();
+
+ // Repurpose smem: [0, 2*BLOCK_DIM_Y) distances, then matching int indices.
+ volatile ResultType* sdistMinRow = smem;
+ volatile int* sbestTrainIdxRow = (int*)(sdistMinRow + 2 * BLOCK_DIM_Y);
+
+ // Only lane 0 of each row holds valid candidates (see distanceCalcLoop).
+ if (threadIdx.x == 0)
+ {
+ sdistMinRow[threadIdx.y] = distMin1;
+ sdistMinRow[threadIdx.y + BLOCK_DIM_Y] = distMin2;
+
+ sbestTrainIdxRow[threadIdx.y] = bestTrainIdx1; 
+ sbestTrainIdxRow[threadIdx.y + BLOCK_DIM_Y] = bestTrainIdx2;
+ }
+ __syncthreads();
+
+ // Final serial reduction over 2*BLOCK_DIM_Y candidates by a single thread.
+ if (threadIdx.x == 0 && threadIdx.y == 0)
+ {
+ distMin1 = numeric_limits<ResultType>::max();
+ distMin2 = numeric_limits<ResultType>::max();
+
+ bestTrainIdx1 = -1;
+ bestTrainIdx2 = -1;
+
+ // First pass: per-row best distances.
+ #pragma unroll
+ for (int i = 0; i < BLOCK_DIM_Y; ++i)
+ {
+ ResultType val = sdistMinRow[i];
+
+ // NOTE(review): same non-demotion pattern as in distanceCalcLoop — a new
+ // minimum does not push the old distMin1 down to distMin2, so the true
+ // second-best among per-row minima can be dropped. Confirm intent.
+ if (val < distMin1)
+ {
+ distMin1 = val;
+ bestTrainIdx1 = sbestTrainIdxRow[i];
+ }
+ else if (val < distMin2)
+ {
+ distMin2 = val;
+ bestTrainIdx2 = sbestTrainIdxRow[i];
+ }
+ }
+
+ // Second pass: per-row second-best distances can only improve distMin2.
+ #pragma unroll
+ for (int i = BLOCK_DIM_Y; i < 2 * BLOCK_DIM_Y; ++i)
+ {
+ ResultType val = sdistMinRow[i];
+
+ if (val < distMin2)
+ {
+ distMin2 = val;
+ bestTrainIdx2 = sbestTrainIdxRow[i];
+ }
+ }
+
+ trainIdx.ptr(queryIdx)[0] = make_int2(bestTrainIdx1, bestTrainIdx2);
+ distance.ptr(queryIdx)[0] = make_float2(distMin1, distMin2);
+ }
+ }
+\r
+ // Host launcher for the uncached 2-NN kernel: one block per query row,
+ // BLOCK_DIM_X x BLOCK_DIM_Y threads, descriptors re-read from global memory
+ // each iteration (no length restriction). Synchronizes only on the default
+ // stream so async callers keep their overlap.
+ template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
+ void knnMatch2Simple_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, const Mask& mask, 
+ const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, cudaStream_t stream)
+ {
+ dim3 grid(queryDescs.rows, 1, 1);
+ dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+
+ knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, ReduceDescCalculatorSimple<BLOCK_DIM_X, T>, T>
+ <<<grid, threads, 0, stream>>>(queryDescs, trainDescs, mask, trainIdx, distance);
+ // Catch launch-configuration errors immediately.
+ cudaSafeCall( cudaGetLastError() );
+
+ if (stream == 0)
+ cudaSafeCall( cudaDeviceSynchronize() );
+ }
+ // Host launcher for the cached 2-NN kernel: the query descriptor is staged
+ // once by ReduceDescCalculatorCached instead of being re-read per train row.
+ // Requires the descriptor length to fit the compile-time cache bound.
+ template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T, typename Mask>
+ void knnMatch2Cached_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, const Mask& mask, 
+ const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, cudaStream_t stream)
+ {
+ StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_DESCRIPTORS_LEN>::check(); // block size must be greater than or equal to the max descriptor length
+ StaticAssert<MAX_DESCRIPTORS_LEN % BLOCK_DIM_X == 0>::check(); // max descriptor length must be a multiple of BLOCK_DIM_X
+
+ dim3 grid(queryDescs.rows, 1, 1);
+ dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+
+ knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, ReduceDescCalculatorCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, T, typename Dist::ValueType>, T>
+ <<<grid, threads, 0, stream>>>(queryDescs, trainDescs, mask, trainIdx, distance);
+ // Catch launch-configuration errors immediately.
+ cudaSafeCall( cudaGetLastError() );
+
+ if (stream == 0)
+ cudaSafeCall( cudaDeviceSynchronize() );
+ }
+ \r
+ // Picks a 2-NN kernel variant by descriptor length (query.cols):
+ // - up to 128 elements: cached variant with the tightest fitting bound;
+ // - 128..256: cached only when cc_12 is set (presumably indicates compute
+ //   capability >= 1.2 with more shared memory/registers — confirm at caller);
+ // - anything else falls back to the uncached Simple variant.
+ template <typename Dist, typename T, typename Mask>
+ void knnMatch2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, 
+ const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, bool cc_12, cudaStream_t stream)
+ {
+ if (query.cols < 64)
+ knnMatch2Cached_caller<16, 16, 64, false, Dist>(query, train, mask, trainIdx, distance, stream);
+ else if (query.cols == 64)
+ knnMatch2Cached_caller<16, 16, 64, true, Dist>(query, train, mask, trainIdx, distance, stream);
+ else if (query.cols < 128)
+ knnMatch2Cached_caller<16, 16, 128, false, Dist>(query, train, mask, trainIdx, distance, stream);
+ else if (query.cols == 128 && cc_12)
+ knnMatch2Cached_caller<16, 16, 128, true, Dist>(query, train, mask, trainIdx, distance, stream);
+ else if (query.cols < 256 && cc_12)
+ knnMatch2Cached_caller<16, 16, 256, false, Dist>(query, train, mask, trainIdx, distance, stream);
+ else if (query.cols == 256 && cc_12)
+ knnMatch2Cached_caller<16, 16, 256, true, Dist>(query, train, mask, trainIdx, distance, stream);
+ else
+ knnMatch2Simple_caller<16, 16, Dist>(query, train, mask, trainIdx, distance, stream);
+ }
\r
///////////////////////////////////////////////////////////////////////////////\r
// Calc distance kernel\r
findKnnMatch_caller<256>(knn, trainIdx, distance, allDist, stream);\r
}\r
\r
- template <typename T>
- void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
- const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
+ // Distance-agnostic k-NN front end shared by the L1/L2/Hamming entry points.
+ // knn == 2 takes the fused single-pass path (knnMatch2Dispatcher, no allDist
+ // buffer needed); any other knn falls back to the two-pass path: full
+ // distance matrix (calcDistanceDispatcher) followed by a k-selection pass
+ // (findKnnMatchDispatcher). The mask check only picks the functor type.
+ template < typename Dist, typename T >
+ void knnMatchDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, int knn,
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream)
{
if (mask.data)
{
- calcDistanceDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, SingleMask(mask), allDist, stream);
+ if (knn == 2)
+ {
+ knnMatch2Dispatcher<Dist>(queryDescs, trainDescs, SingleMask(mask), (DevMem2D_<int2>)trainIdx, (DevMem2D_<float2>)distance, cc_12, stream);
+ return;
+ }
+
+ calcDistanceDispatcher<Dist>(queryDescs, trainDescs, SingleMask(mask), allDist, stream);
}
else
{
- calcDistanceDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, WithOutMask(), allDist, stream);
+ if (knn == 2)
+ {
+ knnMatch2Dispatcher<Dist>(queryDescs, trainDescs, WithOutMask(), (DevMem2D_<int2>)trainIdx, (DevMem2D_<float2>)distance, cc_12, stream);
+ return;
+ }
+
+ calcDistanceDispatcher<Dist>(queryDescs, trainDescs, WithOutMask(), allDist, stream);
}

findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream);
}
\r
- template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
- template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
- template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
- template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
- template void knnMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
- template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+ // Public L1 entry point: thin typed wrapper over knnMatchDispatcher.
+ // The cc_12 flag is threaded through to the 2-NN dispatcher (see above).
+ template <typename T>
+ void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream)
+ {
+ knnMatchDispatcher< L1Dist<T> >((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, knn, mask, trainIdx, distance, allDist, cc_12, stream);
+ }
+
+ // Explicit instantiations for every supported element type (compiled here,
+ // declared in the matcher header).
+ template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+ template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+ template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+ template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+ template void knnMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+ template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
\r
+ // Public L2 entry point: mask/knn routing now lives in knnMatchDispatcher,
+ // replacing the inlined two-pass body removed below.
template <typename T>
void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
- const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream)
{
- if (mask.data)
- {
- calcDistanceDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
- SingleMask(mask), allDist, stream);
- }
- else
- {
- calcDistanceDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
- WithOutMask(), allDist, stream);
- }
-
- findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream);
+ knnMatchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, knn, mask, trainIdx, distance, allDist, cc_12, stream);
}

+ // Explicit instantiations for every supported element type.
- template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
- template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
- template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
- template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
- template void knnMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
- template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+ template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+ template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+ template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+ template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+ template void knnMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+ template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
\r
+ // Public Hamming entry point: routing delegated to knnMatchDispatcher,
+ // mirroring the L1/L2 wrappers. Note: no float instantiation (Hamming is
+ // defined on integer descriptor types only, per the list below).
template <typename T>
void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
- const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
+ const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream)
{
- if (mask.data)
- {
- calcDistanceDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
- SingleMask(mask), allDist, stream);
- }
- else
- {
- calcDistanceDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, 
- WithOutMask(), allDist, stream);
- }
-
- findKnnMatchDispatcher(knn, trainIdx, distance, allDist, stream);
+ knnMatchDispatcher<HammingDist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs, knn, mask, trainIdx, distance, allDist, cc_12, stream);
}

+ // Explicit instantiations (integer element types only).
- template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
- template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
- template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
- template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
- template void knnMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream);
+ template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+ template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+ template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+ template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
+ template void knnMatchHamming_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
\r
///////////////////////////////////////////////////////////////////////////////////\r
/////////////////////////////////// Radius Match //////////////////////////////////\r