1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
43 #if !defined CUDA_DISABLER
45 #include "opencv2/gpu/device/common.hpp"
46 #include "opencv2/gpu/device/utility.hpp"
47 #include "opencv2/gpu/device/reduce.hpp"
48 #include "opencv2/gpu/device/limits.hpp"
49 #include "opencv2/gpu/device/vec_distance.hpp"
50 #include "opencv2/gpu/device/datamov_utils.hpp"
52 namespace cv { namespace gpu { namespace device
56 ///////////////////////////////////////////////////////////////////////////////
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)
{
    // Each block row (threadIdx.y) owns a contiguous BLOCK_SIZE-wide slice of shared memory.
    const int rowOffset = threadIdx.y * BLOCK_SIZE;

    // Key/value min-reduction across the row: the smallest distance wins and its
    // train index travels with it into (bestDistance, bestTrainIdx).
    reduceKeyVal<BLOCK_SIZE>(s_distance + rowOffset, bestDistance, s_trainIdx + rowOffset, bestTrainIdx, threadIdx.x, less<float>());
}
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx)
{
    // Each block row (threadIdx.y) owns a contiguous BLOCK_SIZE-wide slice of shared memory.
    const int rowOffset = threadIdx.y * BLOCK_SIZE;

    // Key/value min-reduction with a two-part payload: smallest distance wins and
    // both its train index and image index follow it (thrust::tie bundles the refs).
    reduceKeyVal<BLOCK_SIZE>(s_distance + rowOffset, bestDistance,
                             smem_tuple(s_trainIdx + rowOffset, s_imgIdx + rowOffset),
                             thrust::tie(bestTrainIdx, bestImgIdx),
                             threadIdx.x, less<float>());
}
78 ///////////////////////////////////////////////////////////////////////////////
79 // Match Unrolled Cached
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
__device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
{
    // Cache this block row's query descriptor into shared memory,
    // BLOCK_SIZE contiguous columns per iteration (coalesced global loads).
    #pragma unroll
    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
    {
        const int loadX = threadIdx.x + i * BLOCK_SIZE;
        // Row index is clamped so threads past the last query row still read a valid
        // address; columns past the descriptor width are zero-filled.
        s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
    }
}
// Scan all train descriptors against the query descriptor already cached in s_query.
// Grid/block contract: blockDim = (BLOCK_SIZE, BLOCK_SIZE); s_query holds one
// MAX_DESC_LEN-wide descriptor per block row; s_train is a BLOCK_SIZE x BLOCK_SIZE
// staging tile. Updates (bestDistance, bestTrainIdx, bestImgIdx) when a closer,
// mask-accepted match is found.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query, volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                   typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                   float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
{
    // Process train descriptors in tiles of BLOCK_SIZE rows.
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist; // per-thread distance accumulator for this (query row, train column) pair

        #pragma unroll
        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            // Zero-fill so out-of-range columns contribute nothing to the distance.
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

            if (loadX < train.cols)
            {
                T val;

                // Clamp the row so tail threads issue a valid load; their results are
                // discarded by the trainIdx bound check below.
                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
            }

            __syncthreads(); // tile fully written before any thread reads it

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads(); // tile fully consumed before the next iteration overwrites it
        }

        typename Dist::result_type distVal = dist;

        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

        if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
        {
            bestImgIdx = imgIdx;
            bestDistance = distVal;
            bestTrainIdx = trainIdx;
        }
    }
}
// One-to-one matcher kernel (query cached in shared memory, descriptor length
// known at compile time). Launch: grid = ceil(query.rows / BLOCK_SIZE) blocks of
// (BLOCK_SIZE, BLOCK_SIZE) threads; dynamic shared memory sized by the host wrapper.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

    loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

    float myBestDistance = numeric_limits<float>::max();
    int myBestTrainIdx = -1;

    // Single-image variant: the imgIdx output is irrelevant, so myBestTrainIdx is
    // deliberately passed for both index slots (trainIdx is assigned last and wins).
    loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

    __syncthreads(); // everyone done with s_query/s_train before smem is recast below

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

    // Lane 0 of each row holds the reduced result for its query descriptor.
    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}
// Host wrapper: configures and launches the single-image unrolled-cached kernel.
// Only synchronizes when running on the default stream so user streams stay async.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                         const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                         cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    // Shared memory is reused: (query cache + train tile) during matching, then
    // (distance + trainIdx) arrays during the reduction — size for the larger layout.
    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// One-to-many matcher kernel: matches each query descriptor against n train
// images, tracking the best (distance, trainIdx, imgIdx) triple per query row.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                                    int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

    loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

    float myBestDistance = numeric_limits<float>::max();
    int myBestTrainIdx = -1;
    int myBestImgIdx = -1;

    Mask m = mask; // mutable copy: advanced to the next per-image mask each iteration

    for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    {
        const PtrStepSz<T> train = trains[imgIdx];
        m.next();
        loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
    }

    __syncthreads(); // finish all tile traffic before smem is recast for the reduction

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestImgIdx[queryIdx] = myBestImgIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}
// Host wrapper: configures and launches the multi-image unrolled-cached kernel.
// Only synchronizes on the default stream so user streams stay asynchronous.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                         const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                         cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    // Reduction phase needs three BLOCK_SIZE*BLOCK_SIZE arrays (distance, trainIdx,
    // imgIdx), hence the 2*BLOCK_SIZE floor on the query-cache portion.
    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
245 ///////////////////////////////////////////////////////////////////////////////
// Scan all train descriptors against the query, staging BLOCK_SIZE x BLOCK_SIZE
// tiles of BOTH matrices in shared memory each iteration (no persistent query
// cache; loop trip count unrolled from MAX_DESC_LEN). Updates the best-match
// triple when a closer, mask-accepted candidate is found.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query, volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                             typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                             float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
{
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist; // per-thread distance accumulator

        #pragma unroll
        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            // Zero-fill so out-of-range columns contribute nothing.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

            if (loadX < query.cols)
            {
                T val;

                // Rows clamped so tail threads load valid addresses; their results
                // are discarded by the bound checks below.
                ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
            }

            __syncthreads(); // tiles fully written before reading

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads(); // tiles fully consumed before being overwritten
        }

        typename Dist::result_type distVal = dist;

        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

        if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
        {
            bestImgIdx = imgIdx;
            bestDistance = distVal;
            bestTrainIdx = trainIdx;
        }
    }
}
// One-to-one matcher kernel (tiled loads, unrolled inner loops). Launch:
// grid = ceil(query.rows / BLOCK_SIZE) blocks of (BLOCK_SIZE, BLOCK_SIZE) threads.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    float myBestDistance = numeric_limits<float>::max();
    int myBestTrainIdx = -1;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    // Single-image variant: myBestTrainIdx stands in for the unused imgIdx slot.
    loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

    __syncthreads(); // finish tile traffic before smem is recast for the reduction

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}
// Host wrapper for the single-image unrolled kernel. Syncs only on the default stream.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                   const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                   cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    // Two BLOCK_SIZE*BLOCK_SIZE tiles (query + train), also sufficient for the
    // (distance + trainIdx) reduction layout.
    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// One-to-many matcher kernel (tiled loads, unrolled inner loops) over n train images.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                              int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    float myBestDistance = numeric_limits<float>::max();
    int myBestTrainIdx = -1;
    int myBestImgIdx = -1;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    Mask m = mask; // mutable copy: advanced to the next per-image mask each iteration

    for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    {
        const PtrStepSz<T> train = trains[imgIdx];
        m.next();
        loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
    }

    __syncthreads(); // finish tile traffic before smem is recast for the reduction

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestImgIdx[queryIdx] = myBestImgIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}
// Host wrapper for the multi-image unrolled kernel. Syncs only on the default stream.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                   const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                   cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    // Three BLOCK_SIZE*BLOCK_SIZE arrays: covers both the two matching tiles and
    // the (distance + trainIdx + imgIdx) reduction layout.
    const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
401 ///////////////////////////////////////////////////////////////////////////////
// Generic (non-unrolled) matching loop for descriptors of arbitrary width:
// identical to loopUnrolled except the column-tile count is computed from
// query.cols at runtime instead of from a compile-time MAX_DESC_LEN.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__device__ void loop(int queryIdx, const PtrStepSz<T>& query, volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                     typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                     float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
{
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist; // per-thread distance accumulator

        for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            // Zero-fill so out-of-range columns contribute nothing.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

            if (loadX < query.cols)
            {
                T val;

                // Rows clamped so tail threads load valid addresses; their results
                // are discarded by the bound checks below.
                ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
            }

            __syncthreads(); // tiles fully written before reading

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads(); // tiles fully consumed before being overwritten
        }

        typename Dist::result_type distVal = dist;

        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

        if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
        {
            bestImgIdx = imgIdx;
            bestDistance = distVal;
            bestTrainIdx = trainIdx;
        }
    }
}
// One-to-one matcher kernel for arbitrary descriptor widths (non-unrolled loop).
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    float myBestDistance = numeric_limits<float>::max();
    int myBestTrainIdx = -1;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    // Single-image variant: myBestTrainIdx stands in for the unused imgIdx slot.
    loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

    __syncthreads(); // finish tile traffic before smem is recast for the reduction

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}
// Host wrapper for the single-image generic kernel. Syncs only on the default stream.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
           const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
           cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    // Two BLOCK_SIZE*BLOCK_SIZE tiles (query + train), also sufficient for the
    // (distance + trainIdx) reduction layout.
    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// One-to-many matcher kernel for arbitrary descriptor widths over n train images.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                      int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    float myBestDistance = numeric_limits<float>::max();
    int myBestTrainIdx = -1;
    int myBestImgIdx = -1;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    Mask m = mask; // mutable copy: advanced to the next per-image mask each iteration

    for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    {
        const PtrStepSz<T> train = trains[imgIdx];
        m.next();
        loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
    }

    __syncthreads(); // finish tile traffic before smem is recast for the reduction

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestImgIdx[queryIdx] = myBestImgIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}
// Host wrapper for the multi-image generic kernel. Syncs only on the default stream.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
           const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
           cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    // Three BLOCK_SIZE*BLOCK_SIZE arrays: covers both the matching tiles and the
    // (distance + trainIdx + imgIdx) reduction layout.
    const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
555 ///////////////////////////////////////////////////////////////////////////////
// Selects a kernel variant by descriptor width: the cached-query kernels for
// short descriptors (<= 128 columns), the generic kernel otherwise. The 256/512/
// 1024 unrolled branches are kept disabled (commented) as in the upstream code.
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                     const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                     cudaStream_t stream)
{
    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);
    }
    else if (query.cols <= 128)
    {
        matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream);
    }
    /*else if (query.cols <= 256)
    {
        matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);
    }
    else if (query.cols <= 512)
    {
        matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);
    }
    else if (query.cols <= 1024)
    {
        matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);
    }*/
    else
    {
        match<16, Dist>(query, train, mask, trainIdx, distance, stream);
    }
}
// Multi-image counterpart of the dispatcher above: same width-based selection,
// forwarding the train-image array, count, and imgIdx output.
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                     const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                     cudaStream_t stream)
{
    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }
    else if (query.cols <= 128)
    {
        matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }
    /*else if (query.cols <= 256)
    {
        matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }
    else if (query.cols <= 512)
    {
        matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }
    else if (query.cols <= 1024)
    {
        matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }*/
    else
    {
        match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }
}
620 ///////////////////////////////////////////////////////////////////////////////
// Public entry point: L1-distance one-to-one matching. An empty mask (null data)
// selects the maskless fast path.
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
                                       const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                                       cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
            trainIdx, distance,
            stream);
    }
    else
    {
        matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
            trainIdx, distance,
            stream);
    }
}
// Explicit instantiations for the element types exposed by the GPU BruteForceMatcher
// (schar is deliberately disabled).
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
// Public entry point: L2-distance one-to-one matching. An empty mask (null data)
// selects the maskless fast path.
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
                                       const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                                       cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
            trainIdx, distance,
            stream);
    }
    else
    {
        matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
            trainIdx, distance,
            stream);
    }
}
// Explicit instantiations: only float descriptors are exposed for L2 matching;
// the integer variants are deliberately disabled.
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
// Public entry point: Hamming-distance one-to-one matching. An empty mask (null
// data) selects the maskless fast path.
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
                                            const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                                            cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
            trainIdx, distance,
            stream);
    }
    else
    {
        matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
            trainIdx, distance,
            stream);
    }
}
// Explicit instantiations for Hamming matching (signed types deliberately disabled).
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
// Public entry point: L1-distance matching of one query set against a collection
// of train images. `trains` is a row of PtrStepSz descriptors (trains.cols images);
// null masks.data selects the maskless fast path.
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                       const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                                       cudaStream_t stream)
{
    if (masks.data)
    {
        matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
            trainIdx, imgIdx, distance,
            stream);
    }
    else
    {
        matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
            trainIdx, imgIdx, distance,
            stream);
    }
}
// Explicit instantiations for the multi-image L1 matcher (schar deliberately disabled).
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
// Public entry point: L2-distance matching of one query set against a collection
// of train images; null masks.data selects the maskless fast path.
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                       const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                                       cudaStream_t stream)
{
    if (masks.data)
    {
        matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
            trainIdx, imgIdx, distance,
            stream);
    }
    else
    {
        matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
            trainIdx, imgIdx, distance,
            stream);
    }
}
// Explicit instantiations: only float descriptors exposed for multi-image L2 matching.
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
// Public entry point: Hamming-distance matching of one query set against a
// collection of train images; null masks.data selects the maskless fast path.
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                                            cudaStream_t stream)
{
    if (masks.data)
    {
        matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
            trainIdx, imgIdx, distance,
            stream);
    }
    else
    {
        matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
            trainIdx, imgIdx, distance,
            stream);
    }
}
// Explicit instantiations for the multi-image Hamming matcher (signed types disabled).
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
770 } // namespace bf_match
771 }}} // namespace cv { namespace gpu { namespace device {
774 #endif /* CUDA_DISABLER */