/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
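
// Brute-force k-nearest-neighbour matching of feature descriptors on the GPU.
// For k == 2, fused kernels track the best and second-best candidate per query
// descriptor in a single pass; for larger k, a full query-by-train distance
// matrix (allDist) is computed first and the k smallest entries of each row are
// then extracted one at a time.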

namespace cv { namespace gpu { namespace device
{
    namespace bf_knnmatch
    {
        ///////////////////////////////////////////////////////////////////////////////
        // Reduction

        template <int BLOCK_SIZE>
        __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
                                      int& bestTrainIdx1, int& bestTrainIdx2,
                                      float* s_distance, int* s_trainIdx)
        {
            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;

            s_distance += threadIdx.y * BLOCK_SIZE;
            s_trainIdx += threadIdx.y * BLOCK_SIZE;

            s_distance[threadIdx.x] = bestDistance1;
            s_trainIdx[threadIdx.x] = bestTrainIdx1;

            __syncthreads();

            if (threadIdx.x == 0)
            {
                #pragma unroll
                for (int i = 0; i < BLOCK_SIZE; ++i)
                {
                    float val = s_distance[i];

                    if (val < myBestDistance1)
                    {
                        myBestDistance2 = myBestDistance1;
                        myBestTrainIdx2 = myBestTrainIdx1;

                        myBestDistance1 = val;
                        myBestTrainIdx1 = s_trainIdx[i];
                    }
                    else if (val < myBestDistance2)
                    {
                        myBestDistance2 = val;
                        myBestTrainIdx2 = s_trainIdx[i];
                    }
                }
            }

            __syncthreads();

            s_distance[threadIdx.x] = bestDistance2;
            s_trainIdx[threadIdx.x] = bestTrainIdx2;

            __syncthreads();

            if (threadIdx.x == 0)
            {
                #pragma unroll
                for (int i = 0; i < BLOCK_SIZE; ++i)
                {
                    float val = s_distance[i];

                    if (val < myBestDistance2)
                    {
                        myBestDistance2 = val;
                        myBestTrainIdx2 = s_trainIdx[i];
                    }
                }
            }

            bestDistance1 = myBestDistance1;
            bestDistance2 = myBestDistance2;

            bestTrainIdx1 = myBestTrainIdx1;
            bestTrainIdx2 = myBestTrainIdx2;
        }
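
        // The reduction above runs in two passes over one BLOCK_SIZE-wide row of
        // shared memory: the first pass folds every thread's best candidate into
        // the (distance1, distance2) pair held by thread 0, and the second pass
        // folds the per-thread second-best candidates into distance2 alone, so
        // the true best and runner-up both survive.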

        template <int BLOCK_SIZE>
        __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
                                      int& bestTrainIdx1, int& bestTrainIdx2,
                                      int& bestImgIdx1, int& bestImgIdx2,
                                      float* s_distance, int* s_trainIdx, int* s_imgIdx)
        {
            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;
            int myBestImgIdx1 = -1;
            int myBestImgIdx2 = -1;

            s_distance += threadIdx.y * BLOCK_SIZE;
            s_trainIdx += threadIdx.y * BLOCK_SIZE;
            s_imgIdx   += threadIdx.y * BLOCK_SIZE;

            s_distance[threadIdx.x] = bestDistance1;
            s_trainIdx[threadIdx.x] = bestTrainIdx1;
            s_imgIdx[threadIdx.x]   = bestImgIdx1;

            __syncthreads();

            if (threadIdx.x == 0)
            {
                #pragma unroll
                for (int i = 0; i < BLOCK_SIZE; ++i)
                {
                    float val = s_distance[i];

                    if (val < myBestDistance1)
                    {
                        myBestDistance2 = myBestDistance1;
                        myBestTrainIdx2 = myBestTrainIdx1;
                        myBestImgIdx2   = myBestImgIdx1;

                        myBestDistance1 = val;
                        myBestTrainIdx1 = s_trainIdx[i];
                        myBestImgIdx1   = s_imgIdx[i];
                    }
                    else if (val < myBestDistance2)
                    {
                        myBestDistance2 = val;
                        myBestTrainIdx2 = s_trainIdx[i];
                        myBestImgIdx2   = s_imgIdx[i];
                    }
                }
            }

            __syncthreads();

            s_distance[threadIdx.x] = bestDistance2;
            s_trainIdx[threadIdx.x] = bestTrainIdx2;
            s_imgIdx[threadIdx.x]   = bestImgIdx2;

            __syncthreads();

            if (threadIdx.x == 0)
            {
                #pragma unroll
                for (int i = 0; i < BLOCK_SIZE; ++i)
                {
                    float val = s_distance[i];

                    if (val < myBestDistance2)
                    {
                        myBestDistance2 = val;
                        myBestTrainIdx2 = s_trainIdx[i];
                        myBestImgIdx2   = s_imgIdx[i];
                    }
                }
            }

            bestDistance1 = myBestDistance1;
            bestDistance2 = myBestDistance2;

            bestTrainIdx1 = myBestTrainIdx1;
            bestTrainIdx2 = myBestTrainIdx2;

            bestImgIdx1 = myBestImgIdx1;
            bestImgIdx2 = myBestImgIdx2;
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Match Unrolled Cached

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
        __device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
        {
            #pragma unroll
            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
            {
                const int loadX = threadIdx.x + i * BLOCK_SIZE;
                s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
            }
        }
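
        // loadQueryToSmem caches an entire query descriptor (up to MAX_DESC_LEN
        // elements) in shared memory once per block, so the cached match loop
        // below only has to stream train descriptors from global memory.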

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                           typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                           float& bestDistance1, float& bestDistance2,
                                           int& bestTrainIdx1, int& bestTrainIdx2,
                                           int& bestImgIdx1, int& bestImgIdx2)
        {
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                #pragma unroll
                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < train.cols)
                    {
                        T val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    __syncthreads();

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads();
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
                {
                    if (distVal < bestDistance1)
                    {
                        bestImgIdx2 = bestImgIdx1;
                        bestDistance2 = bestDistance1;
                        bestTrainIdx2 = bestTrainIdx1;

                        bestImgIdx1 = imgIdx;
                        bestDistance1 = distVal;
                        bestTrainIdx1 = trainIdx;
                    }
                    else if (distVal < bestDistance2)
                    {
                        bestImgIdx2 = imgIdx;
                        bestDistance2 = distVal;
                        bestTrainIdx2 = trainIdx;
                    }
                }
            }
        }
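
        // Note the transposed store into s_train (x * BLOCK_SIZE + y): inside
        // the reduceIter loop, consecutive threads (threadIdx.x) then read
        // consecutive shared-memory addresses, which avoids bank conflicts.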

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;

            loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                                 const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
                                 cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;
            int myBestImgIdx1 = -1;
            int myBestImgIdx2 = -1;

            Mask m = mask;

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
            }

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
                bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                                 const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
                                 cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Match Unrolled

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                     typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                     float& bestDistance1, float& bestDistance2,
                                     int& bestTrainIdx1, int& bestTrainIdx2,
                                     int& bestImgIdx1, int& bestImgIdx2)
        {
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                #pragma unroll
                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < query.cols)
                    {
                        T val;

                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    __syncthreads();

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads();
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
                {
                    if (distVal < bestDistance1)
                    {
                        bestImgIdx2 = bestImgIdx1;
                        bestDistance2 = bestDistance1;
                        bestTrainIdx2 = bestTrainIdx1;

                        bestImgIdx1 = imgIdx;
                        bestDistance1 = distVal;
                        bestTrainIdx1 = trainIdx;
                    }
                    else if (distVal < bestDistance2)
                    {
                        bestImgIdx2 = imgIdx;
                        bestDistance2 = distVal;
                        bestTrainIdx2 = trainIdx;
                    }
                }
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;

            loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                           const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
                           cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;
            int myBestImgIdx1 = -1;
            int myBestImgIdx2 = -1;

            Mask m = mask;

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
            }

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
                bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                           const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
                           cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Match

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __device__ void loop(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                             typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                             float& bestDistance1, float& bestDistance2,
                             int& bestTrainIdx1, int& bestTrainIdx2,
                             int& bestImgIdx1, int& bestImgIdx2)
        {
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < query.cols)
                    {
                        T val;

                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    __syncthreads();

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads();
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
                {
                    if (distVal < bestDistance1)
                    {
                        bestImgIdx2 = bestImgIdx1;
                        bestDistance2 = bestDistance1;
                        bestTrainIdx2 = bestTrainIdx1;

                        bestImgIdx1 = imgIdx;
                        bestDistance1 = distVal;
                        bestTrainIdx1 = trainIdx;
                    }
                    else if (distVal < bestDistance2)
                    {
                        bestImgIdx2 = imgIdx;
                        bestDistance2 = distVal;
                        bestTrainIdx2 = trainIdx;
                    }
                }
            }
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;

            loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
            }
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                   const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
                   cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;
            int myBestImgIdx1 = -1;
            int myBestImgIdx2 = -1;

            Mask m = mask;

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
            }

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
                bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
            }
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                   const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
                   cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        ///////////////////////////////////////////////////////////////////////////////
        // knnMatch 2 dispatcher

        template <typename Dist, typename T, typename Mask>
        void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                              const PtrStepSzb& trainIdx, const PtrStepSzb& distance,
                              int cc, cudaStream_t stream)
        {
            (void)cc; // compute capability is not consulted by this dispatcher

            if (query.cols <= 64)
            {
                matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 128)
            {
                matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            /*else if (query.cols <= 256)
            {
                matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 512)
            {
                matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 1024)
            {
                matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }*/
            else
            {
                match<16, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
        }

        template <typename Dist, typename T, typename Mask>
        void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                              const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                              int cc, cudaStream_t stream)
        {
            (void)cc; // compute capability is not consulted by this dispatcher

            if (query.cols <= 64)
            {
                matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 128)
            {
                matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            /*else if (query.cols <= 256)
            {
                matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 512)
            {
                matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 1024)
            {
                matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }*/
            else
            {
                match<16, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
        }
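
        // Descriptors of up to 64 or 128 elements take the cached-query unrolled
        // kernels; the 256/512/1024 specializations are compiled out above, so
        // longer descriptors fall through to the generic match kernel.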

        ///////////////////////////////////////////////////////////////////////////////
        // Calc distance kernel

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
            const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            Dist dist;

            #pragma unroll
            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
            {
                const int loadX = threadIdx.x + i * BLOCK_SIZE;

                if (loadX < query.cols)
                {
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
                }
                else
                {
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
                }

                __syncthreads();

                #pragma unroll
                for (int j = 0; j < BLOCK_SIZE; ++j)
                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                __syncthreads();
            }

            if (queryIdx < query.rows && trainIdx < train.rows)
            {
                float distVal = numeric_limits<float>::max();

                if (mask(queryIdx, trainIdx))
                    distVal = (typename Dist::result_type)dist;

                allDist.ptr(queryIdx)[trainIdx] = distVal;
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void calcDistanceUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
            const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            Dist dist;

            for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
            {
                const int loadX = threadIdx.x + i * BLOCK_SIZE;

                if (loadX < query.cols)
                {
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
                }
                else
                {
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
                }

                __syncthreads();

                #pragma unroll
                for (int j = 0; j < BLOCK_SIZE; ++j)
                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                __syncthreads();
            }

            if (queryIdx < query.rows && trainIdx < train.rows)
            {
                float distVal = numeric_limits<float>::max();

                if (mask(queryIdx, trainIdx))
                    distVal = (typename Dist::result_type)dist;

                allDist.ptr(queryIdx)[trainIdx] = distVal;
            }
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        void calcDistance(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Calc Distance dispatcher

        template <typename Dist, typename T, typename Mask>
        void calcDistanceDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                                    const PtrStepSzf& allDist,
                                    int cc, cudaStream_t stream)
        {
            (void)cc; // compute capability is not consulted by this dispatcher

            if (query.cols <= 64)
            {
                calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
            }
            else if (query.cols <= 128)
            {
                calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);
            }
            /*else if (query.cols <= 256)
            {
                calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);
            }
            else if (query.cols <= 512)
            {
                calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);
            }
            else if (query.cols <= 1024)
            {
                calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);
            }*/
            else
            {
                calcDistance<16, Dist>(query, train, mask, allDist, stream);
            }
        }
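
        // The calcDistance kernels fill one row of allDist per query descriptor;
        // pairs rejected by the mask are written as numeric_limits<float>::max()
        // so that the k-selection step below never picks them.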

        ///////////////////////////////////////////////////////////////////////////////
        // find knn match kernel

        template <int BLOCK_SIZE>
        __global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
        {
            const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
            __shared__ float s_dist[SMEM_SIZE];
            __shared__ int s_trainIdx[SMEM_SIZE];

            const int queryIdx = blockIdx.x;

            float* allDistRow = allDist.ptr(queryIdx);

            float dist = numeric_limits<float>::max();
            int bestIdx = -1;

            // note: the loop index shadows the kernel parameter i, which selects
            // the output column to write
            for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE)
            {
                float reg = allDistRow[i];
                if (reg < dist)
                {
                    dist = reg;
                    bestIdx = i;
                }
            }

            s_dist[threadIdx.x] = dist;
            s_trainIdx[threadIdx.x] = bestIdx;

            __syncthreads();

            reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());

            if (threadIdx.x == 0)
            {
                if (dist < numeric_limits<float>::max())
                {
                    allDistRow[bestIdx] = numeric_limits<float>::max();
                    trainIdx.ptr(queryIdx)[i] = bestIdx;
                    distance.ptr(queryIdx)[i] = dist;
                }
            }
        }
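
        // Each launch of findBestMatch extracts the current minimum of a query's
        // distance row and overwrites it with FLT_MAX, so k successive launches
        // (see findKnnMatch below) emit the k nearest neighbours in increasing
        // order of distance, effectively a selection sort across the row.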

        template <int BLOCK_SIZE>
        void findKnnMatch(int k, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzf& allDist, cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, 1, 1);
            const dim3 grid(trainIdx.rows, 1, 1);

            for (int i = 0; i < k; ++i)
            {
                findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);
                cudaSafeCall( cudaGetLastError() );
            }

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream)
        {
            (void)cc; // compute capability is not consulted here
            findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
        }

        ///////////////////////////////////////////////////////////////////////////////
        // knn match Dispatcher

        template <typename Dist, typename T, typename Mask>
        void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, int k, const Mask& mask,
                             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                             int cc, cudaStream_t stream)
        {
            if (k == 2)
            {
                match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream);
            }
            else
            {
                calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream);
                findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream);
            }
        }
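
        // A minimal host-side sketch of how a caller might drive this dispatcher
        // (hypothetical buffer setup; the exported *_gpu entry points below are
        // the real API):
        //
        //     // trainIdx/distance: one int2/float2 pair per query row (k == 2);
        //     // allDist: query.rows x train.rows CV_32F scratch (k > 2 only).
        //     matchDispatcher<L2Dist>(query, train, 2, WithOutMask(),
        //                             trainIdx, distance, allDist, cc, stream);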

        ///////////////////////////////////////////////////////////////////////////////
        // knn match caller

        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
            int cc, cudaStream_t stream)
        {
            if (mask.data)
                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
            else
                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
        }

        template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
        //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
        template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
        template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
        template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
        template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);

        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
            int cc, cudaStream_t stream)
        {
            if (mask.data)
                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
            else
                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
        }

        //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
        //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
        //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
        //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
        //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
        template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);

        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
            int cc, cudaStream_t stream)
        {
            if (mask.data)
                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
            else
                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
        }

        template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
        //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
        template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
        //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
        template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);

        template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
            int cc, cudaStream_t stream)
        {
            if (masks.data)
                match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
            else
                match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
        }

        template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
        //template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
        template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
        template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
        template void match2L1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
        template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);

        template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
            int cc, cudaStream_t stream)
        {
            if (masks.data)
                match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
            else
                match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
        }

        //template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
        //template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
        //template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
        //template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
        //template void match2L2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
        template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);

        template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
            int cc, cudaStream_t stream)
        {
            if (masks.data)
                match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
            else
                match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
        }

        template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
        //template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
        template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
        //template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
        template void match2Hamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
    } // namespace bf_knnmatch
}}} // namespace cv { namespace gpu { namespace device {

#endif /* CUDA_DISABLER */