1 /*M///////////////////////////////////////////////////////////////////////////////////////
\r
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
\r
5 // By downloading, copying, installing or using the software you agree to this license.
\r
6 // If you do not agree to this license, do not download, install,
\r
7 // copy or use the software.
\r
10 // License Agreement
\r
11 // For Open Source Computer Vision Library
\r
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
\r
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
\r
15 // Third party copyrights are property of their respective owners.
\r
17 // Redistribution and use in source and binary forms, with or without modification,
\r
18 // are permitted provided that the following conditions are met:
\r
20 // * Redistribution's of source code must retain the above copyright notice,
\r
21 // this list of conditions and the following disclaimer.
\r
23 // * Redistribution's in binary form must reproduce the above copyright notice,
\r
24 // this list of conditions and the following disclaimer in the documentation
\r
25 // and/or other materials provided with the distribution.
\r
27 // * The name of the copyright holders may not be used to endorse or promote products
\r
28 // derived from this software without specific prior written permission.
\r
30 // This software is provided by the copyright holders and contributors "as is" and
\r
31 // any express or implied warranties, including, but not limited to, the implied
\r
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
\r
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
\r
34 // indirect, incidental, special, exemplary, or consequential damages
\r
35 // (including, but not limited to, procurement of substitute goods or services;
\r
36 // loss of use, data, or profits; or business interruption) however caused
\r
37 // and on any theory of liability, whether in contract, strict liability,
\r
38 // or tort (including negligence or otherwise) arising in any way out of
\r
39 // the use of this software, even if advised of the possibility of such damage.
\r
43 #if !defined CUDA_DISABLER
\r
45 #include "internal_shared.hpp"
\r
46 #include "opencv2/gpu/device/limits.hpp"
\r
47 #include "opencv2/gpu/device/vec_distance.hpp"
\r
48 #include "opencv2/gpu/device/datamov_utils.hpp"
\r
50 namespace cv { namespace gpu { namespace device
\r
52 namespace bf_knnmatch
\r
54 ///////////////////////////////////////////////////////////////////////////////
\r
// Block-level reduction of per-thread 2-best candidates (distance, trainIdx).
// Each row of the BLOCK_SIZE x BLOCK_SIZE block reduces independently:
// every thread publishes its candidates through shared memory and the
// thread with threadIdx.x == 0 folds the two smallest distances (with their
// train indices) back into the by-reference in/out parameters.
// Requires BLOCK_SIZE floats + BLOCK_SIZE ints of shared memory per block row
// and must be reached by all threads of the block (contains __syncthreads()).
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
                              int& bestTrainIdx1, int& bestTrainIdx2,
                              float* s_distance, int* s_trainIdx)
{
    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    // Each block row (threadIdx.y) works on its own slice of shared memory.
    s_distance += threadIdx.y * BLOCK_SIZE;
    s_trainIdx += threadIdx.y * BLOCK_SIZE;

    // Round 1: publish every thread's best candidate.
    s_distance[threadIdx.x] = bestDistance1;
    s_trainIdx[threadIdx.x] = bestTrainIdx1;

    __syncthreads(); // all candidates visible before thread 0 scans them

    if (threadIdx.x == 0)
    {
        for (int i = 0; i < BLOCK_SIZE; ++i)
        {
            float val = s_distance[i];

            if (val < myBestDistance1)
            {
                // New best found: previous best is demoted to second best.
                myBestDistance2 = myBestDistance1;
                myBestTrainIdx2 = myBestTrainIdx1;

                myBestDistance1 = val;
                myBestTrainIdx1 = s_trainIdx[i];
            }
            else if (val < myBestDistance2)
            {
                myBestDistance2 = val;
                myBestTrainIdx2 = s_trainIdx[i];
            }
        }
    }

    __syncthreads(); // round-1 reads finished before buffers are reused

    // Round 2: publish every thread's second-best candidate.
    s_distance[threadIdx.x] = bestDistance2;
    s_trainIdx[threadIdx.x] = bestTrainIdx2;

    __syncthreads();

    if (threadIdx.x == 0)
    {
        for (int i = 0; i < BLOCK_SIZE; ++i)
        {
            float val = s_distance[i];

            if (val < myBestDistance2)
            {
                myBestDistance2 = val;
                myBestTrainIdx2 = s_trainIdx[i];
            }
        }
    }

    bestDistance1 = myBestDistance1;
    bestDistance2 = myBestDistance2;

    bestTrainIdx1 = myBestTrainIdx1;
    bestTrainIdx2 = myBestTrainIdx2;
}
// Overload of findBestMatch for the multi-image case: carries an image index
// alongside each (distance, trainIdx) candidate. Same reduction scheme as the
// 2-argument overload; requires BLOCK_SIZE floats + 2*BLOCK_SIZE ints of
// shared memory per block row and must be reached by all threads of the block.
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
                              int& bestTrainIdx1, int& bestTrainIdx2,
                              int& bestImgIdx1, int& bestImgIdx2,
                              float* s_distance, int* s_trainIdx, int* s_imgIdx)
{
    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    int myBestImgIdx1 = -1;
    int myBestImgIdx2 = -1;

    // Each block row (threadIdx.y) works on its own slice of shared memory.
    s_distance += threadIdx.y * BLOCK_SIZE;
    s_trainIdx += threadIdx.y * BLOCK_SIZE;
    s_imgIdx   += threadIdx.y * BLOCK_SIZE;

    // Round 1: publish every thread's best candidate.
    s_distance[threadIdx.x] = bestDistance1;
    s_trainIdx[threadIdx.x] = bestTrainIdx1;
    s_imgIdx[threadIdx.x]   = bestImgIdx1;

    __syncthreads(); // all candidates visible before thread 0 scans them

    if (threadIdx.x == 0)
    {
        for (int i = 0; i < BLOCK_SIZE; ++i)
        {
            float val = s_distance[i];

            if (val < myBestDistance1)
            {
                // New best found: previous best is demoted to second best.
                myBestDistance2 = myBestDistance1;
                myBestTrainIdx2 = myBestTrainIdx1;
                myBestImgIdx2 = myBestImgIdx1;

                myBestDistance1 = val;
                myBestTrainIdx1 = s_trainIdx[i];
                myBestImgIdx1 = s_imgIdx[i];
            }
            else if (val < myBestDistance2)
            {
                myBestDistance2 = val;
                myBestTrainIdx2 = s_trainIdx[i];
                myBestImgIdx2 = s_imgIdx[i];
            }
        }
    }

    __syncthreads(); // round-1 reads finished before buffers are reused

    // Round 2: publish every thread's second-best candidate.
    s_distance[threadIdx.x] = bestDistance2;
    s_trainIdx[threadIdx.x] = bestTrainIdx2;
    s_imgIdx[threadIdx.x]   = bestImgIdx2;

    __syncthreads();

    if (threadIdx.x == 0)
    {
        for (int i = 0; i < BLOCK_SIZE; ++i)
        {
            float val = s_distance[i];

            if (val < myBestDistance2)
            {
                myBestDistance2 = val;
                myBestTrainIdx2 = s_trainIdx[i];
                myBestImgIdx2 = s_imgIdx[i];
            }
        }
    }

    bestDistance1 = myBestDistance1;
    bestDistance2 = myBestDistance2;

    bestTrainIdx1 = myBestTrainIdx1;
    bestTrainIdx2 = myBestTrainIdx2;

    bestImgIdx1 = myBestImgIdx1;
    bestImgIdx2 = myBestImgIdx2;
}
210 ///////////////////////////////////////////////////////////////////////////////
\r
211 // Match Unrolled Cached
\r
// Caches one query descriptor per block row into shared memory.
// s_query must hold BLOCK_SIZE * MAX_DESC_LEN elements; columns beyond
// query.cols are zero-filled so the reduction loops can always iterate over
// the full MAX_DESC_LEN without affecting the distance. queryIdx is clamped
// to the last valid row so out-of-range rows still issue a legal load.
// No barrier here — callers must __syncthreads() before consuming s_query
// from other threads (loopUnrolledCached reads it only after its own barrier).
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
__device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
{
    #pragma unroll
    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
    {
        const int loadX = threadIdx.x + i * BLOCK_SIZE;
        s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
    }
}
// Scans all rows of one train image against the query descriptor already
// cached in s_query (see loadQueryToSmem), tiling the train matrix through
// s_train BLOCK_SIZE columns at a time, and updates the caller's running
// 2-best (distance, trainIdx, imgIdx) state.
// The descriptor-length loop is unrolled MAX_DESC_LEN / BLOCK_SIZE times.
// Contains __syncthreads(): must be reached by all threads of the block.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                   typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                   float& bestDistance1, float& bestDistance2,
                                   int& bestTrainIdx1, int& bestTrainIdx2,
                                   int& bestImgIdx1, int& bestImgIdx2)
{
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist; // per-tile-row distance accumulator

        #pragma unroll
        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            // Zero-fill so out-of-range columns contribute nothing.
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

            if (loadX < train.cols)
            {
                T val;

                // Row index clamped so the tail block still loads legally.
                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
            }

            __syncthreads(); // tile fully written before it is consumed

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads(); // tile fully consumed before it is overwritten
        }

        typename Dist::result_type distVal = dist;

        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
        {
            if (distVal < bestDistance1)
            {
                // New best: demote the previous best to second place.
                bestImgIdx2 = bestImgIdx1;
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;

                bestImgIdx1 = imgIdx;
                bestDistance1 = distVal;
                bestTrainIdx1 = trainIdx;
            }
            else if (distVal < bestDistance2)
            {
                bestImgIdx2 = imgIdx;
                bestDistance2 = distVal;
                bestTrainIdx2 = trainIdx;
            }
        }
    }
}
// knnMatch (k = 2) kernel, single train image, query cached in shared memory.
// Launch: block = (BLOCK_SIZE, BLOCK_SIZE), grid.x = ceil(query.rows / BLOCK_SIZE);
// one block row handles one query descriptor. Dynamic shared memory layout is
// set up by the host wrapper below.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

    loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    // Single image: the img-idx outputs are unused, so the train-idx
    // variables are passed for those slots as well.
    loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

    __syncthreads(); // descriptor tiles done; smem is re-purposed below

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}

// Host launcher for the single-image cached kernel. Shared memory must fit
// both the query cache (BLOCK_SIZE * MAX_DESC_LEN) plus a train tile and the
// later reduction buffers; synchronizes only on the default stream.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                         const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
                         cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// knnMatch (k = 2) kernel over a collection of train images, query cached in
// shared memory. Same launch layout as the single-image overload; also tracks
// the image index of each of the two best matches.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

    loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    int myBestImgIdx1 = -1;
    int myBestImgIdx2 = -1;

    // Local mask copy advanced once per image.
    Mask m = mask;

    for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    {
        const PtrStepSz<T> train = trains[imgIdx];
        m.next();
        loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
    }

    __syncthreads(); // descriptor tiles done; smem is re-purposed below

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}

// Host launcher for the multi-image cached kernel. The reduction phase needs
// three BLOCK_SIZE*BLOCK_SIZE int buffers, hence the 2*BLOCK_SIZE floor on the
// first term; synchronizes only on the default stream.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                         const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
                         cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
396 ///////////////////////////////////////////////////////////////////////////////
\r
// Scans all rows of one train image against the query, tiling BOTH query and
// train descriptors through shared memory (no persistent query cache), and
// updates the caller's running 2-best (distance, trainIdx, imgIdx) state.
// The descriptor-length loop is unrolled MAX_DESC_LEN / BLOCK_SIZE times.
// Contains __syncthreads(): must be reached by all threads of the block.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                             typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                             float& bestDistance1, float& bestDistance2,
                             int& bestTrainIdx1, int& bestTrainIdx2,
                             int& bestImgIdx1, int& bestImgIdx2)
{
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist; // per-tile-row distance accumulator

        #pragma unroll
        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            // Zero-fill so out-of-range columns contribute nothing.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

            if (loadX < query.cols)
            {
                T val;

                // Row indices clamped so tail blocks still load legally.
                ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
            }

            __syncthreads(); // tiles fully written before they are consumed

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads(); // tiles fully consumed before being overwritten
        }

        typename Dist::result_type distVal = dist;

        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
        {
            if (distVal < bestDistance1)
            {
                // New best: demote the previous best to second place.
                bestImgIdx2 = bestImgIdx1;
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;

                bestImgIdx1 = imgIdx;
                bestDistance1 = distVal;
                bestTrainIdx1 = trainIdx;
            }
            else if (distVal < bestDistance2)
            {
                bestImgIdx2 = imgIdx;
                bestDistance2 = distVal;
                bestTrainIdx2 = trainIdx;
            }
        }
    }
}
// knnMatch (k = 2) kernel, single train image, unrolled descriptor loop,
// no query caching. Launch: block = (BLOCK_SIZE, BLOCK_SIZE),
// grid.x = ceil(query.rows / BLOCK_SIZE); one block row per query descriptor.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    // Single image: the img-idx outputs are unused, so the train-idx
    // variables are passed for those slots as well.
    loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

    __syncthreads(); // descriptor tiles done; smem is re-purposed below

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}

// Host launcher for the single-image unrolled kernel; two BLOCK_SIZE^2 tiles
// of dynamic shared memory. Synchronizes only on the default stream.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                   const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
                   cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// knnMatch (k = 2) kernel over a collection of train images, unrolled
// descriptor loop, no query caching. Also tracks the image index of each of
// the two best matches.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    int myBestImgIdx1 = -1;
    int myBestImgIdx2 = -1;

    // Local mask copy advanced once per image.
    Mask m = mask;

    for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    {
        const PtrStepSz<T> train = trains[imgIdx];
        m.next();
        loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
    }

    __syncthreads(); // descriptor tiles done; smem is re-purposed below

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}

// Host launcher for the multi-image unrolled kernel; three BLOCK_SIZE^2
// buffers of dynamic shared memory (distance + trainIdx + imgIdx reduction).
// Synchronizes only on the default stream.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                   const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
                   cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
571 ///////////////////////////////////////////////////////////////////////////////
\r
// Generic (non-unrolled) variant of loopUnrolled for descriptors of arbitrary
// length: the descriptor-length loop runs ceil(query.cols / BLOCK_SIZE) times
// instead of a compile-time count. Updates the caller's running 2-best state.
// Contains __syncthreads(): must be reached by all threads of the block.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__device__ void loop(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                     typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                     float& bestDistance1, float& bestDistance2,
                     int& bestTrainIdx1, int& bestTrainIdx2,
                     int& bestImgIdx1, int& bestImgIdx2)
{
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist; // per-tile-row distance accumulator

        for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            // Zero-fill so out-of-range columns contribute nothing.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

            if (loadX < query.cols)
            {
                T val;

                // Row indices clamped so tail blocks still load legally.
                ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
            }

            __syncthreads(); // tiles fully written before they are consumed

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads(); // tiles fully consumed before being overwritten
        }

        typename Dist::result_type distVal = dist;

        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
        {
            if (distVal < bestDistance1)
            {
                // New best: demote the previous best to second place.
                bestImgIdx2 = bestImgIdx1;
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;

                bestImgIdx1 = imgIdx;
                bestDistance1 = distVal;
                bestTrainIdx1 = trainIdx;
            }
            else if (distVal < bestDistance2)
            {
                bestImgIdx2 = imgIdx;
                bestDistance2 = distVal;
                bestTrainIdx2 = trainIdx;
            }
        }
    }
}
// knnMatch (k = 2) kernel, single train image, generic descriptor length.
// Launch: block = (BLOCK_SIZE, BLOCK_SIZE), grid.x = ceil(query.rows / BLOCK_SIZE);
// one block row per query descriptor.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    // Single image: the img-idx outputs are unused, so the train-idx
    // variables are passed for those slots as well.
    loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

    __syncthreads(); // descriptor tiles done; smem is re-purposed below

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}

// Host launcher for the generic single-image kernel; two BLOCK_SIZE^2 tiles
// of dynamic shared memory. Synchronizes only on the default stream.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
           const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
           cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// knnMatch (k = 2) kernel over a collection of train images, generic
// descriptor length. Also tracks the image index of each of the two best
// matches.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    int myBestImgIdx1 = -1;
    int myBestImgIdx2 = -1;

    // Local mask copy advanced once per image.
    Mask m = mask;

    for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    {
        const PtrStepSz<T> train = trains[imgIdx];
        m.next();
        loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
    }

    __syncthreads(); // descriptor tiles done; smem is re-purposed below

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}

// Host launcher for the generic multi-image kernel; three BLOCK_SIZE^2
// buffers of dynamic shared memory (distance + trainIdx + imgIdx reduction).
// Synchronizes only on the default stream.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
           const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
           cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
745 ///////////////////////////////////////////////////////////////////////////////
\r
746 // knnMatch 2 dispatcher
\r
// Dispatches knnMatch (k = 2), single train image, to the kernel
// specialization matching the descriptor length; descriptors longer than 128
// elements fall through to the generic kernel. The 256/512/1024 unrolled
// branches are intentionally disabled (kept commented out, as upstream).
template <typename Dist, typename T, typename Mask>
void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                      const PtrStepSzb& trainIdx, const PtrStepSzb& distance,
                      int cc, cudaStream_t stream)
{
    (void)cc; // compute capability currently unused; kept for interface compatibility

    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
    else if (query.cols <= 128)
    {
        matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
    /*else if (query.cols <= 256)
    {
        matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
    else if (query.cols <= 512)
    {
        matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
    else if (query.cols <= 1024)
    {
        matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }*/
    else
    {
        match<16, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
}
// Dispatches knnMatch (k = 2) over a collection of train images to the kernel
// specialization matching the descriptor length; longer descriptors fall
// through to the generic kernel. The 256/512/1024 unrolled branches are
// intentionally disabled (kept commented out, as upstream).
template <typename Dist, typename T, typename Mask>
void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                      const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                      int cc, cudaStream_t stream)
{
    (void)cc; // compute capability currently unused; kept for interface compatibility

    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
    else if (query.cols <= 128)
    {
        matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
    /*else if (query.cols <= 256)
    {
        matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
    else if (query.cols <= 512)
    {
        matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
    else if (query.cols <= 1024)
    {
        matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }*/
    else
    {
        match<16, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
}
812 ///////////////////////////////////////////////////////////////////////////////
\r
813 // Calc distance kernel
\r
// Computes the full query x train distance matrix (unrolled descriptor loop).
// Launch: block = (BLOCK_SIZE, BLOCK_SIZE),
// grid = (ceil(train.rows / BLOCK_SIZE), ceil(query.rows / BLOCK_SIZE)).
// Masked-out pairs get FLT_MAX so they lose any subsequent k-selection.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    Dist dist; // distance accumulator for this (query, train) pair

    #pragma unroll
    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
    {
        const int loadX = threadIdx.x + i * BLOCK_SIZE;

        if (loadX < query.cols)
        {
            // Row indices clamped so edge blocks still load legally.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
        }
        else
        {
            // Zero-fill so out-of-range columns contribute nothing.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
        }

        __syncthreads(); // tiles fully written before they are consumed

        #pragma unroll
        for (int j = 0; j < BLOCK_SIZE; ++j)
            dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

        __syncthreads(); // tiles fully consumed before being overwritten
    }

    if (queryIdx < query.rows && trainIdx < train.rows)
    {
        float distVal = numeric_limits<float>::max();

        if (mask(queryIdx, trainIdx))
            distVal = (typename Dist::result_type)dist;

        allDist.ptr(queryIdx)[trainIdx] = distVal;
    }
}

// Host launcher for calcDistanceUnrolled; two BLOCK_SIZE^2 tiles of dynamic
// shared memory. Synchronizes only on the default stream.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void calcDistanceUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// Generic (non-unrolled) distance kernel: computes the distance between every
// (query row, train row) pair handled by this block and writes it to allDist.
// One thread per pair; descriptor columns are streamed through shared memory
// in BLOCK_SIZE-wide tiles.
//
// Launch layout: block = (BLOCK_SIZE, BLOCK_SIZE),
//                grid  = (divUp(train.rows, BS), divUp(query.rows, BS)).
// Dynamic shared memory: 2 * BLOCK_SIZE * BLOCK_SIZE * sizeof(int) bytes.
// Masked-out pairs get FLT_MAX so they can never win a k-NN selection pass.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

    // Carve the dynamic shared buffer into a query tile and a train tile.
    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    Dist dist;

    for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
    {
        const int loadX = threadIdx.x + i * BLOCK_SIZE;

        if (loadX < query.cols)
        {
            // Clamp row indices so threads past the last row still issue valid
            // loads; their accumulated result is discarded by the guard below.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
        }
        else
        {
            // Zero-pad the ragged last column tile: zeros contribute nothing
            // to any of the supported distance accumulators.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
        }

        __syncthreads();  // tiles fully written before any thread reads them

        for (int j = 0; j < BLOCK_SIZE; ++j)
            dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

        __syncthreads();  // all reads done before the next iteration overwrites
    }

    if (queryIdx < query.rows && trainIdx < train.rows)
    {
        float distVal = numeric_limits<float>::max();

        if (mask(queryIdx, trainIdx))
            distVal = (typename Dist::result_type)dist;

        allDist.ptr(queryIdx)[trainIdx] = distVal;
    }
}
\r
// Host launcher for the generic distance kernel. Same launch geometry and
// shared-memory sizing as the unrolled variant; see calcDistanceUnrolled.
// Synchronizes only on the default stream so user streams stay asynchronous.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void calcDistance(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

    // Two tiles of BLOCK_SIZE x BLOCK_SIZE elements each.
    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
///////////////////////////////////////////////////////////////////////////////
// Calc Distance dispatcher

// Selects the distance kernel by descriptor length: fully unrolled kernels
// for short descriptors (<= 128 columns), the generic loop kernel otherwise.
// The longer unrolled specializations are deliberately disabled (commented
// out) to limit compile time and binary size.
template <typename Dist, typename T, typename Mask>
void calcDistanceDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                            const PtrStepSzf& allDist,
                            int cc, cudaStream_t stream)
{
    (void)cc;  // compute capability no longer influences the kernel choice

    if (query.cols <= 64)
    {
        calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
    }
    else if (query.cols <= 128)
    {
        calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);
    }
    /*else if (query.cols <= 256)
    {
        calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);
    }
    else if (query.cols <= 512)
    {
        calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);
    }
    else if (query.cols <= 1024)
    {
        calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);
    }*/
    else
    {
        calcDistance<16, Dist>(query, train, mask, allDist, stream);
    }
}
\r
///////////////////////////////////////////////////////////////////////////////
// find knn match kernel

// For query row blockIdx.x, finds the current minimum of the allDist row,
// writes the winning train index / distance into output column i, then knocks
// the winner out by overwriting its cell with FLT_MAX so the next invocation
// yields the (i+1)-th best match.
//
// Launch layout: grid = (numQueries, 1, 1), block = (BLOCK_SIZE, 1, 1).
template <int BLOCK_SIZE>
__global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
{
    // At least 64 slots — presumably required by reducePredVal's reduction
    // steps (matches the original sizing); TODO confirm against its contract.
    const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
    __shared__ float s_dist[SMEM_SIZE];
    __shared__ int s_trainIdx[SMEM_SIZE];

    const int queryIdx = blockIdx.x;

    float* allDistRow = allDist.ptr(queryIdx);

    // Per-thread strided arg-min over this query's distance row.
    // NOTE: loop variable named 'c' (not 'i') to avoid shadowing the output
    // column parameter used in the final write below.
    float dist = numeric_limits<float>::max();
    int bestIdx = -1;

    for (int c = threadIdx.x; c < allDist.cols; c += BLOCK_SIZE)
    {
        float reg = allDistRow[c];
        if (reg < dist)
        {
            dist = reg;
            bestIdx = c;
        }
    }

    s_dist[threadIdx.x] = dist;
    s_trainIdx[threadIdx.x] = bestIdx;
    __syncthreads();  // all partial minima published before the reduction

    // Block-wide arg-min: leaves the winning (dist, bestIdx) in thread 0.
    reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());

    if (threadIdx.x == 0)
    {
        if (dist < numeric_limits<float>::max())
        {
            // Remove the winner so the next pass finds the runner-up.
            allDistRow[bestIdx] = numeric_limits<float>::max();
            trainIdx.ptr(queryIdx)[i] = bestIdx;
            distance.ptr(queryIdx)[i] = dist;
        }
    }
}
\r
// Runs k sequential extraction passes of findBestMatch: pass i pulls the
// i-th best match for every query row out of allDist. The passes are issued
// on the same stream, so stream ordering makes inter-pass syncs unnecessary.
// Synchronizes at the end only for the default stream (0).
template <int BLOCK_SIZE>
void findKnnMatch(int k, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzf& allDist, cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, 1, 1);
    const dim3 grid(trainIdx.rows, 1, 1);  // one block per query row

    for (int i = 0; i < k; ++i)
    {
        findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);
        cudaSafeCall( cudaGetLastError() );
    }

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
// Type-erased entry point: reinterprets the byte matrices as int/float
// results and dispatches with a fixed 256-thread block.
void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream)
{
    (void)cc;  // kept for interface compatibility; block size is fixed at 256

    findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
}
\r
///////////////////////////////////////////////////////////////////////////////
// knn match Dispatcher

// k == 2 uses the dedicated fused 2-NN path (match2Dispatcher); any other k
// falls back to the two-phase path: build the full query x train distance
// matrix, then run k best-match extraction passes over it.
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, int k, const Mask& mask,
                     const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                     int cc, cudaStream_t stream)
{
    if (k == 2)
    {
        match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream);
    }
    else
    {
        calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream);
        findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream);
    }
}
\r
///////////////////////////////////////////////////////////////////////////////
// knn match caller

// L1 (Manhattan) distance k-NN matcher. Wraps the raw byte matrices in typed
// views and dispatches with SingleMask when a mask is supplied, otherwise the
// no-op WithOutMask.
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                                       const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                                       int cc, cudaStream_t stream)
{
    if (mask.data)
        matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
    else
        matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
}

template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
\r
// L2 (Euclidean) distance k-NN matcher. Same dispatch pattern as matchL1_gpu.
// Only the float instantiation is enabled; the integer ones are deliberately
// disabled (commented out) to reduce binary size.
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                                       const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                                       int cc, cudaStream_t stream)
{
    if (mask.data)
        matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
    else
        matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
}

//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
\r
// Hamming distance k-NN matcher for binary descriptors. Same dispatch pattern
// as matchL1_gpu; instantiated only for integer element types.
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                                            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                                            int cc, cudaStream_t stream)
{
    if (mask.data)
        matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
    else
        matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
}

template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
\r
// L1-distance 2-NN matcher against a collection of train images.
// 'trains' packs an array of PtrStepSz<T> descriptors (one per image,
// trains.cols entries); 'masks' optionally packs one mask per image
// (MaskCollection), otherwise all pairs are considered (WithOutMask).
template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                        const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                                        int cc, cudaStream_t stream)
{
    if (masks.data)
        match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
    else
        match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}

template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
\r
// L2-distance 2-NN matcher against a collection of train images.
// Same layout conventions as match2L1_gpu; only the float instantiation is
// enabled.
template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                        const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                                        int cc, cudaStream_t stream)
{
    if (masks.data)
        match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
    else
        match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}

//template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
\r
// Hamming-distance 2-NN matcher against a collection of train images.
// Same layout conventions as match2L1_gpu; instantiated only for integer
// element types.
template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                                             int cc, cudaStream_t stream)
{
    if (masks.data)
        match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
    else
        match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}

template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
\r
1163 } // namespace bf_knnmatch
\r
1164 }}} // namespace cv { namespace gpu { namespace device {
\r
1167 #endif /* CUDA_DISABLER */