1 /*M///////////////////////////////////////////////////////////////////////////////////////
\r
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
\r
5 // By downloading, copying, installing or using the software you agree to this license.
\r
6 // If you do not agree to this license, do not download, install,
\r
7 // copy or use the software.
\r
10 // License Agreement
\r
11 // For Open Source Computer Vision Library
\r
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
\r
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
\r
15 // Third party copyrights are property of their respective owners.
\r
17 // Redistribution and use in source and binary forms, with or without modification,
\r
18 // are permitted provided that the following conditions are met:
\r
20 // * Redistribution's of source code must retain the above copyright notice,
\r
21 // this list of conditions and the following disclaimer.
\r
23 // * Redistribution's in binary form must reproduce the above copyright notice,
\r
24 // this list of conditions and the following disclaimer in the documentation
\r
25 // and/or other materials provided with the distribution.
\r
27 // * The name of the copyright holders may not be used to endorse or promote products
\r
28 // derived from this software without specific prior written permission.
\r
30 // This software is provided by the copyright holders and contributors "as is" and
\r
31 // any express or implied warranties, including, but not limited to, the implied
\r
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
\r
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
\r
34 // indirect, incidental, special, exemplary, or consequential damages
\r
35 // (including, but not limited to, procurement of substitute goods or services;
\r
36 // loss of use, data, or profits; or business interruption) however caused
\r
37 // and on any theory of liability, whether in contract, strict liability,
\r
38 // or tort (including negligence or otherwise) arising in any way out of
\r
39 // the use of this software, even if advised of the possibility of such damage.
\r
43 #include "internal_shared.hpp"
\r
44 #include "opencv2/gpu/device/limits.hpp"
\r
45 #include "opencv2/gpu/device/vec_distance.hpp"
\r
46 #include "opencv2/gpu/device/datamov_utils.hpp"
\r
48 namespace cv { namespace gpu { namespace device
\r
50 namespace bf_knnmatch
\r
52 ///////////////////////////////////////////////////////////////////////////////
\r
// Block-level 2-NN reduction. Each thread publishes its best / second-best
// candidate into a shared-memory row (one row per threadIdx.y), and lane 0
// of the row scans all BLOCK_SIZE entries to produce the two smallest
// distances with their train indices. Results are written back through the
// reference parameters (meaningful in lane 0; other lanes keep -1/FLT_MAX).
// Requires blockDim == (BLOCK_SIZE, BLOCK_SIZE); s_distance and s_trainIdx
// must each hold BLOCK_SIZE * BLOCK_SIZE elements.
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
                              int& bestTrainIdx1, int& bestTrainIdx2,
                              float* s_distance, int* s_trainIdx)
{
    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    // Each block row reduces independently.
    s_distance += threadIdx.y * BLOCK_SIZE;
    s_trainIdx += threadIdx.y * BLOCK_SIZE;

    // Round 1: publish per-thread best candidates.
    s_distance[threadIdx.x] = bestDistance1;
    s_trainIdx[threadIdx.x] = bestTrainIdx1;

    __syncthreads();

    if (threadIdx.x == 0)
    {
        #pragma unroll
        for (int i = 0; i < BLOCK_SIZE; ++i)
        {
            float val = s_distance[i];

            if (val < myBestDistance1)
            {
                myBestDistance2 = myBestDistance1;
                myBestTrainIdx2 = myBestTrainIdx1;

                myBestDistance1 = val;
                myBestTrainIdx1 = s_trainIdx[i];
            }
            else if (val < myBestDistance2)
            {
                myBestDistance2 = val;
                myBestTrainIdx2 = s_trainIdx[i];
            }
        }
    }

    __syncthreads();

    // Round 2: publish per-thread second-best candidates; these can only
    // improve the second-best result.
    s_distance[threadIdx.x] = bestDistance2;
    s_trainIdx[threadIdx.x] = bestTrainIdx2;

    __syncthreads();

    if (threadIdx.x == 0)
    {
        #pragma unroll
        for (int i = 0; i < BLOCK_SIZE; ++i)
        {
            float val = s_distance[i];

            if (val < myBestDistance2)
            {
                myBestDistance2 = val;
                myBestTrainIdx2 = s_trainIdx[i];
            }
        }
    }

    bestDistance1 = myBestDistance1;
    bestDistance2 = myBestDistance2;

    bestTrainIdx1 = myBestTrainIdx1;
    bestTrainIdx2 = myBestTrainIdx2;
}
// Multi-image variant of the block-level 2-NN reduction: candidates are
// (distance, trainIdx, imgIdx) triples so the two nearest neighbours can
// come from different train images. Same layout and synchronization
// requirements as the two-output overload; s_imgIdx must also hold
// BLOCK_SIZE * BLOCK_SIZE elements.
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
                              int& bestTrainIdx1, int& bestTrainIdx2,
                              int& bestImgIdx1, int& bestImgIdx2,
                              float* s_distance, int* s_trainIdx, int* s_imgIdx)
{
    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    int myBestImgIdx1 = -1;
    int myBestImgIdx2 = -1;

    // Each block row reduces independently.
    s_distance += threadIdx.y * BLOCK_SIZE;
    s_trainIdx += threadIdx.y * BLOCK_SIZE;
    s_imgIdx   += threadIdx.y * BLOCK_SIZE;

    // Round 1: publish per-thread best candidates.
    s_distance[threadIdx.x] = bestDistance1;
    s_trainIdx[threadIdx.x] = bestTrainIdx1;
    s_imgIdx[threadIdx.x] = bestImgIdx1;

    __syncthreads();

    if (threadIdx.x == 0)
    {
        #pragma unroll
        for (int i = 0; i < BLOCK_SIZE; ++i)
        {
            float val = s_distance[i];

            if (val < myBestDistance1)
            {
                myBestDistance2 = myBestDistance1;
                myBestTrainIdx2 = myBestTrainIdx1;
                myBestImgIdx2 = myBestImgIdx1;

                myBestDistance1 = val;
                myBestTrainIdx1 = s_trainIdx[i];
                myBestImgIdx1 = s_imgIdx[i];
            }
            else if (val < myBestDistance2)
            {
                myBestDistance2 = val;
                myBestTrainIdx2 = s_trainIdx[i];
                myBestImgIdx2 = s_imgIdx[i];
            }
        }
    }

    __syncthreads();

    // Round 2: publish per-thread second-best candidates.
    s_distance[threadIdx.x] = bestDistance2;
    s_trainIdx[threadIdx.x] = bestTrainIdx2;
    s_imgIdx[threadIdx.x] = bestImgIdx2;

    __syncthreads();

    if (threadIdx.x == 0)
    {
        #pragma unroll
        for (int i = 0; i < BLOCK_SIZE; ++i)
        {
            float val = s_distance[i];

            if (val < myBestDistance2)
            {
                myBestDistance2 = val;
                myBestTrainIdx2 = s_trainIdx[i];
                myBestImgIdx2 = s_imgIdx[i];
            }
        }
    }

    bestDistance1 = myBestDistance1;
    bestDistance2 = myBestDistance2;

    bestTrainIdx1 = myBestTrainIdx1;
    bestTrainIdx2 = myBestTrainIdx2;

    bestImgIdx1 = myBestImgIdx1;
    bestImgIdx2 = myBestImgIdx2;
}
208 ///////////////////////////////////////////////////////////////////////////////
\r
209 // Match Unrolled Cached
\r
// Caches one query descriptor per block row into shared memory.
// s_query is laid out as BLOCK_SIZE rows of MAX_DESC_LEN elements.
// queryIdx is clamped to the last row so out-of-range rows read in bounds;
// columns past query.cols are zero-filled. Callers must synchronize before
// other threads read s_query.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
__device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
{
    #pragma unroll
    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
    {
        const int loadX = threadIdx.x + i * BLOCK_SIZE;
        s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
    }
}
// Inner matching loop, "cached" variant: the query descriptor is already in
// shared memory (loadQueryToSmem); each outer iteration stages a BLOCK_SIZE
// tile of train descriptors (transposed, to coalesce the global loads) and
// accumulates the distance. Updates the running best / second-best
// (distance, trainIdx, imgIdx) in place. mask(queryIdx, trainIdx) gates
// which pairs may match.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                   typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                   float& bestDistance1, float& bestDistance2,
                                   int& bestTrainIdx1, int& bestTrainIdx2,
                                   int& bestImgIdx1, int& bestImgIdx2)
{
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist;

        #pragma unroll
        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            // Zero-fill first so columns past train.cols contribute nothing.
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

            if (loadX < train.cols)
            {
                T val;

                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
            }

            __syncthreads();

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads();
        }

        typename Dist::result_type distVal = dist;

        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
        {
            if (distVal < bestDistance1)
            {
                bestImgIdx2 = bestImgIdx1;
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;

                bestImgIdx1 = imgIdx;
                bestDistance1 = distVal;
                bestTrainIdx1 = trainIdx;
            }
            else if (distVal < bestDistance2)
            {
                bestImgIdx2 = imgIdx;
                bestDistance2 = distVal;
                bestTrainIdx2 = trainIdx;
            }
        }
    }
}
// 2-NN kernel, single train image, cached-query variant.
// Grid: one block per BLOCK_SIZE queries (1D); block: BLOCK_SIZE x BLOCK_SIZE.
// Dynamic shared memory: see the host wrapper's smemSize computation.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

    loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    // Single image: the imgIdx output slots are unused, so the trainIdx
    // variables are passed again as dummies.
    loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

    __syncthreads();

    // smem is re-purposed for the reduction; the barrier above protects it.
    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}
// Host launcher for the single-image cached kernel. Synchronizes only on the
// default stream so async callers keep overlap.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                         const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
                         cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    // Enough for the query cache + train tile, and for the reduction buffers
    // that later alias the same memory.
    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// 2-NN kernel, multiple train images, cached-query variant. Iterates the
// train collection, tracking best/second-best across all images.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

    loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    int myBestImgIdx1 = -1;
    int myBestImgIdx2 = -1;

    Mask m = mask;

    for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    {
        const PtrStepSz<T> train = trains[imgIdx];
        // Advance the mask to this train image's slice.
        m.next();
        loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
    }

    __syncthreads();

    // smem is re-purposed for the reduction; the barrier above protects it.
    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}
// Host launcher for the multi-image cached kernel. Shared memory must also
// fit the three reduction buffers (distance, trainIdx, imgIdx), hence the
// 2 * BLOCK_SIZE lower bound on the first term.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                         const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
                         cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
394 ///////////////////////////////////////////////////////////////////////////////
\r
// Inner matching loop, unrolled variant without a persistent query cache:
// each iteration stages BLOCK_SIZE-wide tiles of BOTH query and train
// descriptors (train transposed for coalescing) and accumulates the
// distance. Assumes query and train have the same descriptor length
// (the single loadX < query.cols guard covers both loads).
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                             typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                             float& bestDistance1, float& bestDistance2,
                             int& bestTrainIdx1, int& bestTrainIdx2,
                             int& bestImgIdx1, int& bestImgIdx2)
{
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist;

        #pragma unroll
        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            // Zero-fill so out-of-range columns contribute nothing.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

            if (loadX < query.cols)
            {
                T val;

                ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
            }

            __syncthreads();

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads();
        }

        typename Dist::result_type distVal = dist;

        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
        {
            if (distVal < bestDistance1)
            {
                bestImgIdx2 = bestImgIdx1;
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;

                bestImgIdx1 = imgIdx;
                bestDistance1 = distVal;
                bestTrainIdx1 = trainIdx;
            }
            else if (distVal < bestDistance2)
            {
                bestImgIdx2 = imgIdx;
                bestDistance2 = distVal;
                bestTrainIdx2 = trainIdx;
            }
        }
    }
}
// 2-NN kernel, single train image, unrolled (uncached-query) variant.
// Grid: one block per BLOCK_SIZE queries; block: BLOCK_SIZE x BLOCK_SIZE.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    // Single image: imgIdx output slots are unused dummies.
    loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

    __syncthreads();

    // Re-purpose smem for the block-wide reduction.
    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}
// Host launcher for the single-image unrolled kernel.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                   const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
                   cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    // Two BLOCK_SIZE x BLOCK_SIZE tiles (query + train / reduction buffers).
    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// 2-NN kernel, multiple train images, unrolled variant.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    int myBestImgIdx1 = -1;
    int myBestImgIdx2 = -1;

    Mask m = mask;

    for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    {
        const PtrStepSz<T> train = trains[imgIdx];
        // Advance the mask to this train image's slice.
        m.next();
        loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
    }

    __syncthreads();

    // Re-purpose smem for the block-wide reduction.
    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}
// Host launcher for the multi-image unrolled kernel; needs a third
// BLOCK_SIZE x BLOCK_SIZE buffer for the imgIdx reduction.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                   const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
                   cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
569 ///////////////////////////////////////////////////////////////////////////////
\r
// Inner matching loop, generic variant: like loopUnrolled but the inner tile
// count is computed from query.cols at runtime instead of being a compile-
// time constant, so it works for any descriptor length.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__device__ void loop(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                     typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                     float& bestDistance1, float& bestDistance2,
                     int& bestTrainIdx1, int& bestTrainIdx2,
                     int& bestImgIdx1, int& bestImgIdx2)
{
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist;

        for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            // Zero-fill so out-of-range columns contribute nothing.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

            if (loadX < query.cols)
            {
                T val;

                ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
            }

            __syncthreads();

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads();
        }

        typename Dist::result_type distVal = dist;

        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
        {
            if (distVal < bestDistance1)
            {
                bestImgIdx2 = bestImgIdx1;
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;

                bestImgIdx1 = imgIdx;
                bestDistance1 = distVal;
                bestTrainIdx1 = trainIdx;
            }
            else if (distVal < bestDistance2)
            {
                bestImgIdx2 = imgIdx;
                bestDistance2 = distVal;
                bestTrainIdx2 = trainIdx;
            }
        }
    }
}
// 2-NN kernel, single train image, generic (any descriptor length) variant.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    // Single image: imgIdx output slots are unused dummies.
    loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

    __syncthreads();

    // Re-purpose smem for the block-wide reduction.
    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}
// Host launcher for the single-image generic kernel.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
           const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
           cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// 2-NN kernel, multiple train images, generic variant.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    int myBestImgIdx1 = -1;
    int myBestImgIdx2 = -1;

    Mask m = mask;

    for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    {
        const PtrStepSz<T> train = trains[imgIdx];
        // Advance the mask to this train image's slice.
        m.next();
        loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
    }

    __syncthreads();

    // Re-purpose smem for the block-wide reduction.
    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}
// Host launcher for the multi-image generic kernel.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
           const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
           cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
743 ///////////////////////////////////////////////////////////////////////////////
\r
744 // knnMatch 2 dispatcher
\r
// Dispatches a k=2 match (single train image) to the kernel specialization
// matching the descriptor length. The 256/512/1024 branches are kept
// commented out (presumably disabled for shared-memory/register pressure —
// confirm before re-enabling); longer descriptors fall through to the
// generic kernel.
template <typename Dist, typename T, typename Mask>
void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                      const PtrStepSzb& trainIdx, const PtrStepSzb& distance,
                      int cc, cudaStream_t stream)
{
    (void)cc; // compute capability is not used to select a kernel here

    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
    else if (query.cols <= 128)
    {
        matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
    /*else if (query.cols <= 256)
    {
        matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
    else if (query.cols <= 512)
    {
        matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
    else if (query.cols <= 1024)
    {
        matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }*/
    else
    {
        match<16, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
}
// Multi-image counterpart of match2Dispatcher: same descriptor-length
// dispatch, with the additional per-match image-index output.
template <typename Dist, typename T, typename Mask>
void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                      const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                      int cc, cudaStream_t stream)
{
    (void)cc; // compute capability is not used to select a kernel here

    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
    else if (query.cols <= 128)
    {
        matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
    /*else if (query.cols <= 256)
    {
        matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
    else if (query.cols <= 512)
    {
        matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
    else if (query.cols <= 1024)
    {
        matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }*/
    else
    {
        match<16, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    }
}
810 ///////////////////////////////////////////////////////////////////////////////
\r
811 // Calc distance kernel
\r
// Computes the full query x train distance matrix (unrolled variant).
// One thread per (queryIdx, trainIdx) pair, 2D grid. Masked-out pairs get
// FLT_MAX so they never win a subsequent k-NN selection.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    Dist dist;

    #pragma unroll
    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
    {
        const int loadX = threadIdx.x + i * BLOCK_SIZE;

        if (loadX < query.cols)
        {
            // Row indices are clamped so edge blocks read in bounds.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
        }
        else
        {
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
        }

        __syncthreads();

        #pragma unroll
        for (int j = 0; j < BLOCK_SIZE; ++j)
            dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

        __syncthreads();
    }

    if (queryIdx < query.rows && trainIdx < train.rows)
    {
        float distVal = numeric_limits<float>::max();

        if (mask(queryIdx, trainIdx))
            distVal = (typename Dist::result_type)dist;

        allDist.ptr(queryIdx)[trainIdx] = distVal;
    }
}
// Host launcher for calcDistanceUnrolled: 2D grid covering the distance
// matrix (x over train rows, y over query rows).
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void calcDistanceUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
// Generic (any descriptor length) version of calcDistanceUnrolled: the tile
// count is computed from query.cols at runtime.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    Dist dist;

    for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
    {
        const int loadX = threadIdx.x + i * BLOCK_SIZE;

        if (loadX < query.cols)
        {
            // Row indices are clamped so edge blocks read in bounds.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
        }
        else
        {
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
        }

        __syncthreads();

        #pragma unroll
        for (int j = 0; j < BLOCK_SIZE; ++j)
            dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

        __syncthreads();
    }

    if (queryIdx < query.rows && trainIdx < train.rows)
    {
        float distVal = numeric_limits<float>::max();

        if (mask(queryIdx, trainIdx))
            distVal = (typename Dist::result_type)dist;

        allDist.ptr(queryIdx)[trainIdx] = distVal;
    }
}
// Host launcher for the generic (non-unrolled) distance kernel; used when the
// descriptor length exceeds the precompiled unrolled variants.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void calcDistance(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
    cudaSafeCall( cudaGetLastError() );

    // Block only on the default stream; a user-supplied stream stays asynchronous.
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
940 ///////////////////////////////////////////////////////////////////////////////
\r
941 // Calc Distance dispatcher
\r
// Selects an unrolled kernel instantiation by descriptor length, falling back
// to the generic kernel for long descriptors.  Branches are mutually
// exclusive: exactly one kernel launch happens per call.
template <typename Dist, typename T, typename Mask>
void calcDistanceDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                            const PtrStepSzf& allDist,
                            int cc, cudaStream_t stream)
{
    (void)cc; // compute capability is not consulted on this path

    if (query.cols <= 64)
    {
        calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
    }
    else if (query.cols <= 128)
    {
        calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);
    }
    /*else if (query.cols <= 256)
    {
        calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);
    }
    else if (query.cols <= 512)
    {
        calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);
    }
    else if (query.cols <= 1024)
    {
        calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);
    }*/
    else
    {
        calcDistance<16, Dist>(query, train, mask, allDist, stream);
    }
}
\r
975 ///////////////////////////////////////////////////////////////////////////////
\r
976 // find knn match kernel
\r
// Kernel: for query row blockIdx.x, find the minimum of allDist and record it
// as the i-th nearest neighbour; the winning slot is then overwritten with
// FLT_MAX so the next invocation finds the (i+1)-th neighbour.
// Launch: grid = (num queries), block = (BLOCK_SIZE).
template <int BLOCK_SIZE>
__global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
{
    // reducePredVal needs at least 64 shared slots regardless of BLOCK_SIZE.
    const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
    __shared__ float s_dist[SMEM_SIZE];
    __shared__ int s_trainIdx[SMEM_SIZE];

    const int queryIdx = blockIdx.x;

    float* allDistRow = allDist.ptr(queryIdx);

    float dist = numeric_limits<float>::max();
    int bestIdx = -1;

    // Strided per-thread scan for this thread's candidate minimum.
    for (int idx = threadIdx.x; idx < allDist.cols; idx += BLOCK_SIZE)
    {
        float reg = allDistRow[idx];
        if (reg < dist)
        {
            dist = reg;
            bestIdx = idx;
        }
    }

    s_dist[threadIdx.x] = dist;
    s_trainIdx[threadIdx.x] = bestIdx;
    __syncthreads();

    // Block-wide min-reduction over (dist, bestIdx) pairs.
    reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());

    if (threadIdx.x == 0)
    {
        if (dist < numeric_limits<float>::max())
        {
            // Poison the winner so subsequent passes pick the next-best entry.
            allDistRow[bestIdx] = numeric_limits<float>::max();
            trainIdx.ptr(queryIdx)[i] = bestIdx;
            distance.ptr(queryIdx)[i] = dist;
        }
    }
}
\r
// Host driver: k sequential selection passes over allDist, each extracting the
// current per-row minimum into trainIdx/distance column i.
template <int BLOCK_SIZE>
void findKnnMatch(int k, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzf& allDist, cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, 1, 1);
    const dim3 grid(trainIdx.rows, 1, 1);

    for (int i = 0; i < k; ++i)
    {
        findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);
        cudaSafeCall( cudaGetLastError() );
    }

    // Block only on the default stream; a user-supplied stream stays asynchronous.
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
// Thin dispatcher: fixes the block size at 256 threads and reinterprets the
// untyped output buffers as int / float matrices.  cc is accepted only for
// interface symmetry with the other dispatchers.
void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream)
{
    const PtrStepSzi trainIdx_ = static_cast<PtrStepSzi>(trainIdx);
    const PtrStepSzf distance_ = static_cast<PtrStepSzf>(distance);
    findKnnMatch<256>(k, trainIdx_, distance_, allDist, stream);
}
\r
1040 ///////////////////////////////////////////////////////////////////////////////
\r
1041 // knn match Dispatcher
\r
// Top-level k-NN dispatcher.  Exactly one of the two strategies runs:
// the fused 2-NN kernel for k == 2, or the two-phase (distance matrix +
// k selection passes) path for any other k.
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, int k, const Mask& mask,
                     const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                     int cc, cudaStream_t stream)
{
    if (k == 2)
    {
        // Dedicated kernel tracks the two best candidates in a single pass
        // and needs no allDist scratch buffer.
        match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream);
    }
    else
    {
        // General k: materialize the full distance matrix, then peel off the
        // k smallest entries per query row.
        calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream);
        findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream);
    }
}
\r
1059 ///////////////////////////////////////////////////////////////////////////////
\r
1060 // knn match caller
\r
// Public entry point: k-NN match with L1 distance.  Dispatches exactly once,
// masked or unmasked depending on whether a mask buffer was supplied.
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                                       const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                                       int cc, cudaStream_t stream)
{
    if (mask.data)
        matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
    else
        matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
}

template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
\r
// Public entry point: k-NN match with L2 distance.  Dispatches exactly once,
// masked or unmasked depending on whether a mask buffer was supplied.
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                                       const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                                       int cc, cudaStream_t stream)
{
    if (mask.data)
        matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
    else
        matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
}

//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
\r
// Public entry point: k-NN match with Hamming distance.  Dispatches exactly
// once, masked or unmasked depending on whether a mask buffer was supplied.
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                                            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                                            int cc, cudaStream_t stream)
{
    if (mask.data)
        matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
    else
        matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
}

template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
\r
// Public entry point: fused 2-NN match with L1 distance against a collection
// of train images.  trains is a row of PtrStepSz<T> descriptors; masks is an
// optional per-image mask collection.  Exactly one dispatch runs.
template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                        const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                                        int cc, cudaStream_t stream)
{
    if (masks.data)
        match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
    else
        match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}

template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
\r
// Public entry point: fused 2-NN match with L2 distance against a collection
// of train images.  Exactly one dispatch runs, masked or unmasked.
template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                        const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                                        int cc, cudaStream_t stream)
{
    if (masks.data)
        match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
    else
        match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}

//template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
\r
// Public entry point: fused 2-NN match with Hamming distance against a
// collection of train images.  Exactly one dispatch runs, masked or unmasked.
template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                                             int cc, cudaStream_t stream)
{
    if (masks.data)
        match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
    else
        match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}

template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
//template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
\r
1161 } // namespace bf_knnmatch
\r
1162 }}} // namespace cv { namespace gpu { namespace device {
\r