/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace bf_knnmatch
    {
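        // Brute-force k-nearest-neighbour descriptor matching. The kernels below
        // track the two best (smallest-distance) train descriptors per query
        // descriptor; for k == 2 those results are written out directly, while
        // for k > 2 a full distance matrix is computed and the k smallest entries
        // are extracted per row (see findKnnMatch further down).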
        ///////////////////////////////////////////////////////////////////////////////
        // Reduction

        template <int BLOCK_SIZE>
        __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
                                      int& bestTrainIdx1, int& bestTrainIdx2,
                                      float* s_distance, int* s_trainIdx)
        {
        #if __CUDA_ARCH__ >= 300
            // On SM 3.0+ the per-row reduction is done entirely with warp shuffles:
            // each step merges the (distance, index) pairs held by threads i and
            // i + offset while keeping the two smallest distances in order.
            (void) s_distance;
            (void) s_trainIdx;

            float d1, d2;
            int i1, i2;

            #pragma unroll
            for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
            {
                d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
                d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
                i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
                i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);

                if (bestDistance1 < d1)
                {
                    if (d1 < bestDistance2)
                    {
                        bestDistance2 = d1;
                        bestTrainIdx2 = i1;
                    }
                }
                else
                {
                    bestDistance2 = bestDistance1;
                    bestTrainIdx2 = bestTrainIdx1;

                    bestDistance1 = d1;
                    bestTrainIdx1 = i1;

                    if (d2 < bestDistance2)
                    {
                        bestDistance2 = d2;
                        bestTrainIdx2 = i2;
                    }
                }
            }
        #else
            // Pre-SM 3.0 fallback: stage the candidates in shared memory and let
            // lane 0 of each row scan them sequentially, first for the best match,
            // then for the second best.
            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;

            s_distance += threadIdx.y * BLOCK_SIZE;
            s_trainIdx += threadIdx.y * BLOCK_SIZE;

            s_distance[threadIdx.x] = bestDistance1;
            s_trainIdx[threadIdx.x] = bestTrainIdx1;

            __syncthreads();

            if (threadIdx.x == 0)
            {
                #pragma unroll
                for (int i = 0; i < BLOCK_SIZE; ++i)
                {
                    float val = s_distance[i];

                    if (val < myBestDistance1)
                    {
                        myBestDistance2 = myBestDistance1;
                        myBestTrainIdx2 = myBestTrainIdx1;

                        myBestDistance1 = val;
                        myBestTrainIdx1 = s_trainIdx[i];
                    }
                    else if (val < myBestDistance2)
                    {
                        myBestDistance2 = val;
                        myBestTrainIdx2 = s_trainIdx[i];
                    }
                }
            }

            __syncthreads();

            s_distance[threadIdx.x] = bestDistance2;
            s_trainIdx[threadIdx.x] = bestTrainIdx2;

            __syncthreads();

            if (threadIdx.x == 0)
            {
                #pragma unroll
                for (int i = 0; i < BLOCK_SIZE; ++i)
                {
                    float val = s_distance[i];

                    if (val < myBestDistance2)
                    {
                        myBestDistance2 = val;
                        myBestTrainIdx2 = s_trainIdx[i];
                    }
                }
            }

            bestDistance1 = myBestDistance1;
            bestDistance2 = myBestDistance2;

            bestTrainIdx1 = myBestTrainIdx1;
            bestTrainIdx2 = myBestTrainIdx2;
        #endif
        }
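        // Same two-best reduction, extended with a third key: each candidate also
        // carries the index of the train image it came from, for matching against
        // a collection of train images.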
        template <int BLOCK_SIZE>
        __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
                                      int& bestTrainIdx1, int& bestTrainIdx2,
                                      int& bestImgIdx1, int& bestImgIdx2,
                                      float* s_distance, int* s_trainIdx, int* s_imgIdx)
        {
        #if __CUDA_ARCH__ >= 300
            (void) s_distance;
            (void) s_trainIdx;
            (void) s_imgIdx;

            float d1, d2;
            int i1, i2;
            int j1, j2;

            #pragma unroll
            for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
            {
                d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
                d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
                i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
                i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
                j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE);
                j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE);

                if (bestDistance1 < d1)
                {
                    if (d1 < bestDistance2)
                    {
                        bestDistance2 = d1;
                        bestTrainIdx2 = i1;
                        bestImgIdx2 = j1;
                    }
                }
                else
                {
                    bestDistance2 = bestDistance1;
                    bestTrainIdx2 = bestTrainIdx1;
                    bestImgIdx2 = bestImgIdx1;

                    bestDistance1 = d1;
                    bestTrainIdx1 = i1;
                    bestImgIdx1 = j1;

                    if (d2 < bestDistance2)
                    {
                        bestDistance2 = d2;
                        bestTrainIdx2 = i2;
                        bestImgIdx2 = j2;
                    }
                }
            }
        #else
            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;
            int myBestImgIdx1 = -1;
            int myBestImgIdx2 = -1;

            s_distance += threadIdx.y * BLOCK_SIZE;
            s_trainIdx += threadIdx.y * BLOCK_SIZE;
            s_imgIdx += threadIdx.y * BLOCK_SIZE;

            s_distance[threadIdx.x] = bestDistance1;
            s_trainIdx[threadIdx.x] = bestTrainIdx1;
            s_imgIdx[threadIdx.x] = bestImgIdx1;

            __syncthreads();

            if (threadIdx.x == 0)
            {
                #pragma unroll
                for (int i = 0; i < BLOCK_SIZE; ++i)
                {
                    float val = s_distance[i];

                    if (val < myBestDistance1)
                    {
                        myBestDistance2 = myBestDistance1;
                        myBestTrainIdx2 = myBestTrainIdx1;
                        myBestImgIdx2 = myBestImgIdx1;

                        myBestDistance1 = val;
                        myBestTrainIdx1 = s_trainIdx[i];
                        myBestImgIdx1 = s_imgIdx[i];
                    }
                    else if (val < myBestDistance2)
                    {
                        myBestDistance2 = val;
                        myBestTrainIdx2 = s_trainIdx[i];
                        myBestImgIdx2 = s_imgIdx[i];
                    }
                }
            }

            __syncthreads();

            s_distance[threadIdx.x] = bestDistance2;
            s_trainIdx[threadIdx.x] = bestTrainIdx2;
            s_imgIdx[threadIdx.x] = bestImgIdx2;

            __syncthreads();

            if (threadIdx.x == 0)
            {
                #pragma unroll
                for (int i = 0; i < BLOCK_SIZE; ++i)
                {
                    float val = s_distance[i];

                    if (val < myBestDistance2)
                    {
                        myBestDistance2 = val;
                        myBestTrainIdx2 = s_trainIdx[i];
                        myBestImgIdx2 = s_imgIdx[i];
                    }
                }
            }

            bestDistance1 = myBestDistance1;
            bestDistance2 = myBestDistance2;

            bestTrainIdx1 = myBestTrainIdx1;
            bestTrainIdx2 = myBestTrainIdx2;

            bestImgIdx1 = myBestImgIdx1;
            bestImgIdx2 = myBestImgIdx2;
        #endif
        }
        ///////////////////////////////////////////////////////////////////////////////
        // Match Unrolled Cached
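        // "Unrolled Cached" strategy: each block handles BLOCK_SIZE query
        // descriptors. The query rows are loaded into shared memory once
        // (loadQueryToSmem) and reused for every tile of train descriptors, so
        // only the train matrix is re-read per tile. The descriptor length is a
        // compile-time constant (MAX_DESC_LEN), which lets the inner loops be
        // fully unrolled. The plain "Unrolled" and "Match" variants further down
        // trade this caching/unrolling away for smaller shared-memory footprints
        // and support of arbitrary descriptor lengths.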
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
        __device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
        {
            #pragma unroll
            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
            {
                const int loadX = threadIdx.x + i * BLOCK_SIZE;
                s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
            }
        }
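        // Note the ::min(queryIdx, query.rows - 1) clamp: threads whose query row
        // is out of range still load a valid row, so block-wide loads and barriers
        // stay uniform; their results are discarded at write-out.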
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                           typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                           float& bestDistance1, float& bestDistance2,
                                           int& bestTrainIdx1, int& bestTrainIdx2,
                                           int& bestImgIdx1, int& bestImgIdx2)
        {
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                #pragma unroll
                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    // The train tile is stored transposed so that each thread x
                    // owns one train row during the reduction and threads with
                    // consecutive threadIdx.x touch consecutive shared-memory words.
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < train.cols)
                    {
                        T val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    __syncthreads();

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads();
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
                {
                    if (distVal < bestDistance1)
                    {
                        bestImgIdx2 = bestImgIdx1;
                        bestDistance2 = bestDistance1;
                        bestTrainIdx2 = bestTrainIdx1;

                        bestImgIdx1 = imgIdx;
                        bestDistance1 = distVal;
                        bestTrainIdx1 = trainIdx;
                    }
                    else if (distVal < bestDistance2)
                    {
                        bestImgIdx2 = imgIdx;
                        bestDistance2 = distVal;
                        bestTrainIdx2 = trainIdx;
                    }
                }
            }
        }
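        // Kernel: one block of BLOCK_SIZE x BLOCK_SIZE threads matches BLOCK_SIZE
        // query descriptors against the whole train set. The dynamic shared-memory
        // buffer is reused under two different layouts: during the distance loop
        // it holds [query cache | train tile]; during the final reduction it is
        // reinterpreted as [s_distance | s_trainIdx].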
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;

            // Single-image variant: the imgIdx outputs are unused, so the trainIdx
            // variables are deliberately passed for both slots. The assignment
            // order inside loopUnrolledCached makes this aliasing safe (the
            // trainIdx writes always come last).
            loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                                 const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
                                 cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
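        // For the typical instantiation below (BLOCK_SIZE = 16, MAX_DESC_LEN = 64)
        // the launcher requests (16 * 64 + 16 * 16) * 4 = 5120 bytes of dynamic
        // shared memory; the max() against BLOCK_SIZE keeps the buffer large
        // enough for both the [query | train tile] phase and the reduction phase.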
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;
            int myBestImgIdx1 = -1;
            int myBestImgIdx2 = -1;

            Mask m = mask;

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
            }

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
                bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                                 const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
                                 cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
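        // The multi-image launcher sizes shared memory against 2 * BLOCK_SIZE
        // because the reduction phase now needs three BLOCK_SIZE x BLOCK_SIZE
        // arrays (distance, train index, image index):
        // max(B*MAX_DESC_LEN + B*B, 3*B*B) == B * max(MAX_DESC_LEN, 2*B) + B*B.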
        ///////////////////////////////////////////////////////////////////////////////
        // Match Unrolled
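        // Same as the cached variant, but the query tile is re-loaded for every
        // train tile instead of being cached whole; shared memory drops to two
        // BLOCK_SIZE x BLOCK_SIZE tiles while the loops stay fully unrolled.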
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                     typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                     float& bestDistance1, float& bestDistance2,
                                     int& bestTrainIdx1, int& bestTrainIdx2,
                                     int& bestImgIdx1, int& bestImgIdx2)
        {
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                #pragma unroll
                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < query.cols)
                    {
                        T val;

                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    __syncthreads();

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads();
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
                {
                    if (distVal < bestDistance1)
                    {
                        bestImgIdx2 = bestImgIdx1;
                        bestDistance2 = bestDistance1;
                        bestTrainIdx2 = bestTrainIdx1;

                        bestImgIdx1 = imgIdx;
                        bestDistance1 = distVal;
                        bestTrainIdx1 = trainIdx;
                    }
                    else if (distVal < bestDistance2)
                    {
                        bestImgIdx2 = imgIdx;
                        bestDistance2 = distVal;
                        bestTrainIdx2 = trainIdx;
                    }
                }
            }
        }
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;

            // imgIdx slots deliberately alias the trainIdx variables; see
            // matchUnrolledCached above.
            loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                           const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
                           cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;
            int myBestImgIdx1 = -1;
            int myBestImgIdx2 = -1;

            Mask m = mask;

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
            }

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
                bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                           const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
                           cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
        ///////////////////////////////////////////////////////////////////////////////
        // Match
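        // Fully general variant: the descriptor length is only known at run time,
        // so the tile loop over query.cols cannot be unrolled. Used whenever the
        // descriptor is wider than the largest compiled MAX_DESC_LEN.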
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __device__ void loop(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                             typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                             float& bestDistance1, float& bestDistance2,
                             int& bestTrainIdx1, int& bestTrainIdx2,
                             int& bestImgIdx1, int& bestImgIdx2)
        {
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < query.cols)
                    {
                        T val;

                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    __syncthreads();

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads();
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
                {
                    if (distVal < bestDistance1)
                    {
                        bestImgIdx2 = bestImgIdx1;
                        bestDistance2 = bestDistance1;
                        bestTrainIdx2 = bestTrainIdx1;

                        bestImgIdx1 = imgIdx;
                        bestDistance1 = distVal;
                        bestTrainIdx1 = trainIdx;
                    }
                    else if (distVal < bestDistance2)
                    {
                        bestImgIdx2 = imgIdx;
                        bestDistance2 = distVal;
                        bestTrainIdx2 = trainIdx;
                    }
                }
            }
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;

            // imgIdx slots deliberately alias the trainIdx variables; see
            // matchUnrolledCached above.
            loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
            }
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                   const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
                   cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;
            int myBestImgIdx1 = -1;
            int myBestImgIdx2 = -1;

            Mask m = mask;

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
            }

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
                bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
            }
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                   const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
                   cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
        ///////////////////////////////////////////////////////////////////////////////
        // knnMatch 2 dispatcher

        template <typename Dist, typename T, typename Mask>
        void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                              const PtrStepSzb& trainIdx, const PtrStepSzb& distance,
                              cudaStream_t stream)
        {
            if (query.cols <= 64)
            {
                matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 128)
            {
                matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            /*else if (query.cols <= 256)
            {
                matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 512)
            {
                matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 1024)
            {
                matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }*/
            else
            {
                match<16, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
        }

        template <typename Dist, typename T, typename Mask>
        void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                              const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                              cudaStream_t stream)
        {
            if (query.cols <= 64)
            {
                matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 128)
            {
                matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            /*else if (query.cols <= 256)
            {
                matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 512)
            {
                matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 1024)
            {
                matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }*/
            else
            {
                match<16, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
        }
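        // Dispatch is by descriptor width: 64 elements covers e.g. SURF-64, 128
        // covers SURF-128 and SIFT. The 256..1024 unrolled branches are commented
        // out (presumably to cut compile time and binary size), so anything wider
        // than 128 falls through to the general match kernel.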
        ///////////////////////////////////////////////////////////////////////////////
        // Calc distance kernel
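        // k > 2 path, stage 1: compute the full query x train distance matrix.
        // Each block computes one BLOCK_SIZE x BLOCK_SIZE tile of allDist; pairs
        // rejected by the mask are written as FLT_MAX so the selection stage
        // below skips them.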
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
            const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            Dist dist;

            #pragma unroll
            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
            {
                const int loadX = threadIdx.x + i * BLOCK_SIZE;

                if (loadX < query.cols)
                {
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
                }
                else
                {
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
                }

                __syncthreads();

                #pragma unroll
                for (int j = 0; j < BLOCK_SIZE; ++j)
                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                __syncthreads();
            }

            if (queryIdx < query.rows && trainIdx < train.rows)
            {
                float distVal = numeric_limits<float>::max();

                if (mask(queryIdx, trainIdx))
                    distVal = (typename Dist::result_type)dist;

                allDist.ptr(queryIdx)[trainIdx] = distVal;
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void calcDistanceUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
            const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            Dist dist;

            for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
            {
                const int loadX = threadIdx.x + i * BLOCK_SIZE;

                if (loadX < query.cols)
                {
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
                }
                else
                {
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
                }

                __syncthreads();

                #pragma unroll
                for (int j = 0; j < BLOCK_SIZE; ++j)
                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                __syncthreads();
            }

            if (queryIdx < query.rows && trainIdx < train.rows)
            {
                float distVal = numeric_limits<float>::max();

                if (mask(queryIdx, trainIdx))
                    distVal = (typename Dist::result_type)dist;

                allDist.ptr(queryIdx)[trainIdx] = distVal;
            }
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        void calcDistance(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Calc Distance dispatcher

        template <typename Dist, typename T, typename Mask>
        void calcDistanceDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                                    const PtrStepSzf& allDist,
                                    cudaStream_t stream)
        {
            if (query.cols <= 64)
            {
                calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
            }
            else if (query.cols <= 128)
            {
                calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);
            }
            /*else if (query.cols <= 256)
            {
                calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);
            }
            else if (query.cols <= 512)
            {
                calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);
            }
            else if (query.cols <= 1024)
            {
                calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);
            }*/
            else
            {
                calcDistance<16, Dist>(query, train, mask, allDist, stream);
            }
        }
        ///////////////////////////////////////////////////////////////////////////////
        // find knn match kernel
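        // k > 2 path, stage 2, a k-pass selection: the kernel below is launched
        // once per neighbour slot i. Each pass finds the row minimum of allDist,
        // records it in column i of trainIdx/distance, and overwrites the winning
        // entry with FLT_MAX so the next pass finds the next-smallest distance.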
        template <int BLOCK_SIZE>
        __global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
        {
            const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
            __shared__ float s_dist[SMEM_SIZE];
            __shared__ int s_trainIdx[SMEM_SIZE];

            const int queryIdx = blockIdx.x;

            float* allDistRow = allDist.ptr(queryIdx);

            float dist = numeric_limits<float>::max();
            int bestIdx = -1;

            // Note: the loop counter shadows the kernel parameter i (the output
            // column); the parameter becomes visible again after the loop.
            for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE)
            {
                float reg = allDistRow[i];
                if (reg < dist)
                {
                    dist = reg;
                    bestIdx = i;
                }
            }

            s_dist[threadIdx.x] = dist;
            s_trainIdx[threadIdx.x] = bestIdx;

            __syncthreads();

            reduceKeyVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<float>());

            if (threadIdx.x == 0)
            {
                if (dist < numeric_limits<float>::max())
                {
                    allDistRow[bestIdx] = numeric_limits<float>::max();
                    trainIdx.ptr(queryIdx)[i] = bestIdx;
                    distance.ptr(queryIdx)[i] = dist;
                }
            }
        }

        template <int BLOCK_SIZE>
        void findKnnMatch(int k, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzf& allDist, cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, 1, 1);
            const dim3 grid(trainIdx.rows, 1, 1);

            for (int i = 0; i < k; ++i)
            {
                findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);
                cudaSafeCall( cudaGetLastError() );
            }

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream)
        {
            findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
        }
        ///////////////////////////////////////////////////////////////////////////////
        // knn match Dispatcher

        template <typename Dist, typename T, typename Mask>
        void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, int k, const Mask& mask,
                             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                             cudaStream_t stream)
        {
            if (k == 2)
            {
                match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, stream);
            }
            else
            {
                calcDistanceDispatcher<Dist>(query, train, mask, allDist, stream);
                findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
            }
        }
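        // For k == 2 the results are packed as one int2/float2 pair per query row;
        // for k > 2 allDist must hold the full query.rows x train.rows distance
        // matrix and trainIdx/distance receive k columns per query.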
        ///////////////////////////////////////////////////////////////////////////////
        // knn match caller

        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
            cudaStream_t stream)
        {
            if (mask.data)
                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
            else
                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
        }

        template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);

        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
            cudaStream_t stream)
        {
            if (mask.data)
                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
            else
                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
        }

        //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);

        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
            cudaStream_t stream)
        {
            if (mask.data)
                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
            else
                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
        }

        template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);

        template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
            cudaStream_t stream)
        {
            if (masks.data)
                match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
            else
                match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
        }

        template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2L1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);

        template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
            cudaStream_t stream)
        {
            if (masks.data)
                match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
            else
                match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
        }

        //template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2L2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);

        template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
            cudaStream_t stream)
        {
            if (masks.data)
                match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
            else
                match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
        }

        template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2Hamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    } // namespace bf_knnmatch
}}} // namespace cv { namespace gpu { namespace device {

#endif /* CUDA_DISABLER */