1 /*M///////////////////////////////////////////////////////////////////////////////////////
\r
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
\r
5 // By downloading, copying, installing or using the software you agree to this license.
\r
6 // If you do not agree to this license, do not download, install,
\r
7 // copy or use the software.
\r
10 // License Agreement
\r
11 // For Open Source Computer Vision Library
\r
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
\r
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
\r
15 // Third party copyrights are property of their respective owners.
\r
17 // Redistribution and use in source and binary forms, with or without modification,
\r
18 // are permitted provided that the following conditions are met:
\r
20 // * Redistribution's of source code must retain the above copyright notice,
\r
21 // this list of conditions and the following disclaimer.
\r
23 // * Redistribution's in binary form must reproduce the above copyright notice,
\r
24 // this list of conditions and the following disclaimer in the documentation
\r
25 // and/or other materials provided with the distribution.
\r
27 // * The name of the copyright holders may not be used to endorse or promote products
\r
28 // derived from this software without specific prior written permission.
\r
30 // This software is provided by the copyright holders and contributors "as is" and
\r
31 // any express or implied warranties, including, but not limited to, the implied
\r
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
\r
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
\r
34 // indirect, incidental, special, exemplary, or consequential damages
\r
35 // (including, but not limited to, procurement of substitute goods or services;
\r
36 // loss of use, data, or profits; or business interruption) however caused
\r
37 // and on any theory of liability, whether in contract, strict liability,
\r
38 // or tort (including negligence or otherwise) arising in any way out of
\r
39 // the use of this software, even if advised of the possibility of such damage.
\r
43 #include "internal_shared.hpp"
\r
44 #include "opencv2/gpu/device/limits.hpp"
\r
45 #include "opencv2/gpu/device/vec_distance.hpp"
\r
47 using namespace cv::gpu;
\r
48 using namespace cv::gpu::device;
\r
50 namespace cv { namespace gpu { namespace bf_knnmatch
\r
52 ///////////////////////////////////////////////////////////////////////////////
\r
// Per-row reduction: combine the per-thread best / second-best candidates of
// one block row (selected by threadIdx.y) into a single best / second-best
// pair.  s_distance / s_trainIdx must provide BLOCK_SIZE entries per row.
// Thread 0 of each row performs the scan; only its output parameters carry
// meaningful results afterwards (callers write results from threadIdx.x == 0).
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
                              int& bestTrainIdx1, int& bestTrainIdx2,
                              float* s_distance, int* s_trainIdx)
{
    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    // Each block row reduces into its own BLOCK_SIZE-wide slice.
    s_distance += threadIdx.y * BLOCK_SIZE;
    s_trainIdx += threadIdx.y * BLOCK_SIZE;

    // Pass 1: publish every thread's best candidate, scan for the two smallest.
    s_distance[threadIdx.x] = bestDistance1;
    s_trainIdx[threadIdx.x] = bestTrainIdx1;

    __syncthreads();

    if (threadIdx.x == 0)
    {
        for (int i = 0; i < BLOCK_SIZE; ++i)
        {
            float val = s_distance[i];

            if (val < myBestDistance1)
            {
                // New best: demote the previous best to second place.
                myBestDistance2 = myBestDistance1;
                myBestTrainIdx2 = myBestTrainIdx1;

                myBestDistance1 = val;
                myBestTrainIdx1 = s_trainIdx[i];
            }
            else if (val < myBestDistance2)
            {
                myBestDistance2 = val;
                myBestTrainIdx2 = s_trainIdx[i];
            }
        }
    }

    __syncthreads();

    // Pass 2: publish every thread's second-best candidate; it can only
    // improve the second-best slot, never the best.
    s_distance[threadIdx.x] = bestDistance2;
    s_trainIdx[threadIdx.x] = bestTrainIdx2;

    __syncthreads();

    if (threadIdx.x == 0)
    {
        for (int i = 0; i < BLOCK_SIZE; ++i)
        {
            float val = s_distance[i];

            if (val < myBestDistance2)
            {
                myBestDistance2 = val;
                myBestTrainIdx2 = s_trainIdx[i];
            }
        }
    }

    bestDistance1 = myBestDistance1;
    bestDistance2 = myBestDistance2;

    bestTrainIdx1 = myBestTrainIdx1;
    bestTrainIdx2 = myBestTrainIdx2;
}
\r
// Overload of the per-row reduction that also tracks the image index of each
// candidate (multi-train-image kNN match).  Same protocol as the two-output
// overload: shared arrays hold BLOCK_SIZE entries per block row, thread 0 of
// the row scans, and only its outputs are meaningful.
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
                              int& bestTrainIdx1, int& bestTrainIdx2,
                              int& bestImgIdx1, int& bestImgIdx2,
                              float* s_distance, int* s_trainIdx, int* s_imgIdx)
{
    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    int myBestImgIdx1 = -1;
    int myBestImgIdx2 = -1;

    // Each block row reduces into its own BLOCK_SIZE-wide slice.
    s_distance += threadIdx.y * BLOCK_SIZE;
    s_trainIdx += threadIdx.y * BLOCK_SIZE;
    s_imgIdx   += threadIdx.y * BLOCK_SIZE;

    // Pass 1: scan the per-thread best candidates for the two smallest.
    s_distance[threadIdx.x] = bestDistance1;
    s_trainIdx[threadIdx.x] = bestTrainIdx1;
    s_imgIdx[threadIdx.x]   = bestImgIdx1;

    __syncthreads();

    if (threadIdx.x == 0)
    {
        for (int i = 0; i < BLOCK_SIZE; ++i)
        {
            float val = s_distance[i];

            if (val < myBestDistance1)
            {
                // New best: demote the previous best to second place.
                myBestDistance2 = myBestDistance1;
                myBestTrainIdx2 = myBestTrainIdx1;
                myBestImgIdx2   = myBestImgIdx1;

                myBestDistance1 = val;
                myBestTrainIdx1 = s_trainIdx[i];
                myBestImgIdx1   = s_imgIdx[i];
            }
            else if (val < myBestDistance2)
            {
                myBestDistance2 = val;
                myBestTrainIdx2 = s_trainIdx[i];
                myBestImgIdx2   = s_imgIdx[i];
            }
        }
    }

    __syncthreads();

    // Pass 2: the per-thread second-best candidates can only improve the
    // second-best slot.
    s_distance[threadIdx.x] = bestDistance2;
    s_trainIdx[threadIdx.x] = bestTrainIdx2;
    s_imgIdx[threadIdx.x]   = bestImgIdx2;

    __syncthreads();

    if (threadIdx.x == 0)
    {
        for (int i = 0; i < BLOCK_SIZE; ++i)
        {
            float val = s_distance[i];

            if (val < myBestDistance2)
            {
                myBestDistance2 = val;
                myBestTrainIdx2 = s_trainIdx[i];
                myBestImgIdx2   = s_imgIdx[i];
            }
        }
    }

    bestDistance1 = myBestDistance1;
    bestDistance2 = myBestDistance2;

    bestTrainIdx1 = myBestTrainIdx1;
    bestTrainIdx2 = myBestTrainIdx2;

    bestImgIdx1 = myBestImgIdx1;
    bestImgIdx2 = myBestImgIdx2;
}
\r
208 ///////////////////////////////////////////////////////////////////////////////
\r
209 // Match Unrolled Cached
\r
// Stage one query descriptor per block row into shared memory.
// Each row (threadIdx.y) caches MAX_DESC_LEN elements of its query; the row
// index is clamped to the last valid query so out-of-range rows still perform
// safe reads, and columns past query.cols are zero-filled.
// NOTE: no trailing barrier here — callers must __syncthreads()-separate this
// staging from any later reuse of the same shared buffer.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
__device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)
{
    #pragma unroll
    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
    {
        const int loadX = threadIdx.x + i * BLOCK_SIZE;
        s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(min(queryIdx, query.rows - 1))[loadX] : 0;
    }
}
\r
// Core distance loop for the "cached" kernels: the query descriptor already
// sits in s_query (see loadQueryToSmem); walk the train matrix in
// BLOCK_SIZE-row tiles, accumulate the Dist metric per (query, train) pair,
// and keep the two smallest masked distances with their train/image indices.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
                                   typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                   float& bestDistance1, float& bestDistance2,
                                   int& bestTrainIdx1, int& bestTrainIdx2,
                                   int& bestImgIdx1, int& bestImgIdx2)
{
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist;

        #pragma unroll
        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            // Stage one train tile (stored transposed); row index is clamped
            // to the last valid descriptor, out-of-range columns read as 0.
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = loadX < train.cols ? train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX] : 0;

            __syncthreads();

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            // Barrier before the next iteration overwrites s_train.
            __syncthreads();
        }

        typename Dist::result_type distVal = dist;

        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
        {
            if (distVal < bestDistance1)
            {
                // New best: demote the previous best to second place.
                bestImgIdx2   = bestImgIdx1;
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;

                bestImgIdx1   = imgIdx;
                bestDistance1 = distVal;
                bestTrainIdx1 = trainIdx;
            }
            else if (distVal < bestDistance2)
            {
                bestImgIdx2   = imgIdx;
                bestDistance2 = distVal;
                bestTrainIdx2 = trainIdx;
            }
        }
    }
}
\r
// 2-NN match against a single train image, query descriptor cached in smem.
// Launch layout: block = (BLOCK_SIZE, BLOCK_SIZE); grid.x covers the query
// rows, one query per block row (threadIdx.y).  Dynamic shared memory must
// hold the cached query tile plus one train tile (see the host wrapper).
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

    loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    // Single-image case: image indices are irrelevant, so the train-index
    // variables also serve as dummy image-index outputs.
    loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

    // The reduction below reuses smem that still backs s_query/s_train.
    __syncthreads();

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}
\r
// Host wrapper: launch the cached single-image 2-NN kernel on `stream`.
// Grid covers query.rows in BLOCK_SIZE chunks; shared memory holds
// max(query cache, reduction arrays) plus one train tile.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
                         const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
                         cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    // Block only on the default stream; user streams stay asynchronous.
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
// 2-NN match across `n` train images, query descriptor cached in smem.
// Same launch layout as the single-image kernel; additionally records which
// image each of the two best matches came from.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

    loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    int myBestImgIdx1 = -1;
    int myBestImgIdx2 = -1;

    // Local mask copy advanced per train image via next().
    Mask m = mask;

    for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    {
        const DevMem2D_<T> train = trains[imgIdx];
        m.next();
        loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
    }

    // The reduction below reuses smem that still backs s_query/s_train.
    __syncthreads();

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}
\r
// Host wrapper: launch the cached multi-image 2-NN kernel on `stream`.
// Shared memory must fit max(query cache, three reduction arrays) plus one
// train tile — hence the 2 * BLOCK_SIZE floor on the cached width.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
                         const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance,
                         cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    // Block only on the default stream; user streams stay asynchronous.
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
386 ///////////////////////////////////////////////////////////////////////////////
\r
// Distance loop for the non-cached unrolled kernels: both query and train
// tiles are staged per iteration (descriptor length bounded by MAX_DESC_LEN
// so the column loop unrolls).  Tracks the two smallest masked distances.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
                             typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                             float& bestDistance1, float& bestDistance2,
                             int& bestTrainIdx1, int& bestTrainIdx2,
                             int& bestImgIdx1, int& bestImgIdx2)
{
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist;

        #pragma unroll
        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            if (loadX < query.cols)
            {
                // Row indices are clamped so out-of-range threads still read
                // valid memory; train tile is stored transposed.
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
            }
            else
            {
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
            }

            __syncthreads();

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            // Barrier before the next iteration overwrites the tiles.
            __syncthreads();
        }

        typename Dist::result_type distVal = dist;

        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
        {
            if (distVal < bestDistance1)
            {
                // New best: demote the previous best to second place.
                bestImgIdx2   = bestImgIdx1;
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;

                bestImgIdx1   = imgIdx;
                bestDistance1 = distVal;
                bestTrainIdx1 = trainIdx;
            }
            else if (distVal < bestDistance2)
            {
                bestImgIdx2   = imgIdx;
                bestDistance2 = distVal;
                bestTrainIdx2 = trainIdx;
            }
        }
    }
}
\r
// 2-NN match against a single train image, unrolled (non-cached) variant.
// Launch layout: block = (BLOCK_SIZE, BLOCK_SIZE); grid.x covers query rows.
// Shared memory holds two BLOCK_SIZE x BLOCK_SIZE tiles.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    // Single-image case: train-index variables double as dummy image indices.
    loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

    // The reduction below reuses smem that still backs s_query/s_train.
    __syncthreads();

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}
\r
// Host wrapper: launch the unrolled single-image 2-NN kernel on `stream`.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
                   const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
                   cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    // Two BLOCK_SIZE x BLOCK_SIZE tiles (query + train).
    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    // Block only on the default stream; user streams stay asynchronous.
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
// 2-NN match across `n` train images, unrolled (non-cached) variant.
// Also records the image index of each of the two best matches.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    int myBestImgIdx1 = -1;
    int myBestImgIdx2 = -1;

    // Local mask copy advanced per train image via next().
    Mask m = mask;

    for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    {
        const DevMem2D_<T> train = trains[imgIdx];
        m.next();
        loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
    }

    // The reduction below reuses smem that still backs s_query/s_train.
    __syncthreads();

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}
\r
// Host wrapper: launch the unrolled multi-image 2-NN kernel on `stream`.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
                   const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance,
                   cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    // Two tiles plus room for the third (image-index) reduction array.
    const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    // Block only on the default stream; user streams stay asynchronous.
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
558 ///////////////////////////////////////////////////////////////////////////////
\r
// Generic distance loop for arbitrary descriptor lengths: the column-tile
// count is computed at runtime from query.cols, so the outer tile loop cannot
// be unrolled.  Otherwise identical in behavior to loopUnrolled.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
                     typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                     float& bestDistance1, float& bestDistance2,
                     int& bestTrainIdx1, int& bestTrainIdx2,
                     int& bestImgIdx1, int& bestImgIdx2)
{
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist;

        for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            if (loadX < query.cols)
            {
                // Clamped row reads keep out-of-range threads safe; the train
                // tile is stored transposed.
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
            }
            else
            {
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
            }

            __syncthreads();

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            // Barrier before the next iteration overwrites the tiles.
            __syncthreads();
        }

        typename Dist::result_type distVal = dist;

        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
        {
            if (distVal < bestDistance1)
            {
                // New best: demote the previous best to second place.
                bestImgIdx2   = bestImgIdx1;
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;

                bestImgIdx1   = imgIdx;
                bestDistance1 = distVal;
                bestTrainIdx1 = trainIdx;
            }
            else if (distVal < bestDistance2)
            {
                bestImgIdx2   = imgIdx;
                bestDistance2 = distVal;
                bestTrainIdx2 = trainIdx;
            }
        }
    }
}
\r
// Generic 2-NN match against a single train image (no descriptor-length
// bound).  Launch layout: block = (BLOCK_SIZE, BLOCK_SIZE); grid.x covers
// query rows; shared memory holds two BLOCK_SIZE x BLOCK_SIZE tiles.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    // Single-image case: train-index variables double as dummy image indices.
    loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

    // The reduction below reuses smem that still backs s_query/s_train.
    __syncthreads();

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}
\r
// Host wrapper: launch the generic single-image 2-NN kernel on `stream`.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
           const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
           cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    // Two BLOCK_SIZE x BLOCK_SIZE tiles (query + train).
    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    // Block only on the default stream; user streams stay asynchronous.
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
// Generic 2-NN match across `n` train images; also records the image index
// of each of the two best matches.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    int myBestImgIdx1 = -1;
    int myBestImgIdx2 = -1;

    // Local mask copy advanced per train image via next().
    Mask m = mask;

    for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    {
        const DevMem2D_<T> train = trains[imgIdx];
        m.next();
        loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
    }

    // The reduction below reuses smem that still backs s_query/s_train.
    __syncthreads();

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}
\r
// Host wrapper: launch the generic multi-image 2-NN kernel on `stream`.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
           const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance,
           cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    // Two tiles plus room for the third (image-index) reduction array.
    const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    // Block only on the default stream; user streams stay asynchronous.
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
729 ///////////////////////////////////////////////////////////////////////////////
\r
730 // knnMatch 2 dispatcher
\r
// Select a kernel variant for 2-NN matching against a single train image,
// based on descriptor length: cached kernels for short descriptors, the
// generic kernel otherwise.  The wider unrolled variants are kept disabled
// (commented out) as in upstream.
template <typename Dist, typename T, typename Mask>
void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
                      const DevMem2Db& trainIdx, const DevMem2Db& distance,
                      int cc, cudaStream_t stream)
{
    (void)cc; // compute capability not consulted here; kept for interface parity

    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
    else if (query.cols <= 128)
    {
        matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
    /*else if (query.cols <= 256)
    {
        matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
    else if (query.cols <= 512)
    {
        matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
    else if (query.cols <= 1024)
    {
        matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }*/
    else
    {
        match<16, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
}
\r
// Multi-image counterpart of match2Dispatcher: pick a kernel variant by
// descriptor length and forward to it with the train-image collection.
template <typename Dist, typename T, typename Mask>
void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
                      const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
                      int cc, cudaStream_t stream)
{
    (void)cc; // compute capability not consulted here; kept for interface parity

    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
    else if (query.cols <= 128)
    {
        matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
    /*else if (query.cols <= 256)
    {
        matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
    else if (query.cols <= 512)
    {
        matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
    else if (query.cols <= 1024)
    {
        matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }*/
    else
    {
        match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
}
\r
794 ///////////////////////////////////////////////////////////////////////////////
\r
795 // Calc distance kernel
\r
// Compute the full query x train distance matrix (unrolled variant, used by
// the k > 2 path).  One thread per (queryIdx, trainIdx) cell; masked-out
// pairs receive FLT_MAX.  Launch: block = (BLOCK_SIZE, BLOCK_SIZE),
// grid = (ceil(train.rows / B), ceil(query.rows / B)); smem holds two tiles.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void calcDistanceUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    Dist dist;

    #pragma unroll
    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
    {
        const int loadX = threadIdx.x + i * BLOCK_SIZE;

        if (loadX < query.cols)
        {
            // Clamped row reads keep edge threads safe; train tile transposed.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
        }
        else
        {
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
        }

        __syncthreads();

        #pragma unroll
        for (int j = 0; j < BLOCK_SIZE; ++j)
            dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

        // Barrier before the next iteration overwrites the tiles.
        __syncthreads();
    }

    if (queryIdx < query.rows && trainIdx < train.rows)
    {
        float distVal = numeric_limits<float>::max();

        if (mask(queryIdx, trainIdx))
            distVal = (typename Dist::result_type)dist;

        allDist.ptr(queryIdx)[trainIdx] = distVal;
    }
}
\r
// Host wrapper: launch the unrolled distance-matrix kernel on `stream`.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void calcDistanceUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

    // Two BLOCK_SIZE x BLOCK_SIZE tiles (query + train).
    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
    cudaSafeCall( cudaGetLastError() );

    // Block only on the default stream; user streams stay asynchronous.
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
// Generic distance-matrix kernel for arbitrary descriptor lengths (the
// column-tile count is computed at runtime, so the outer loop is not
// unrolled).  Masked-out pairs receive FLT_MAX.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void calcDistance(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    Dist dist;

    for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
    {
        const int loadX = threadIdx.x + i * BLOCK_SIZE;

        if (loadX < query.cols)
        {
            // Clamped row reads keep edge threads safe; train tile transposed.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
        }
        else
        {
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
        }

        __syncthreads();

        #pragma unroll
        for (int j = 0; j < BLOCK_SIZE; ++j)
            dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

        // Barrier before the next iteration overwrites the tiles.
        __syncthreads();
    }

    if (queryIdx < query.rows && trainIdx < train.rows)
    {
        float distVal = numeric_limits<float>::max();

        if (mask(queryIdx, trainIdx))
            distVal = (typename Dist::result_type)dist;

        allDist.ptr(queryIdx)[trainIdx] = distVal;
    }
}
\r
// Host launcher for the generic (non-unrolled) distance kernel.
// Grid: one BLOCK_SIZE x BLOCK_SIZE block per (train-row tile, query-row tile).
// Dynamic shared memory: two BLOCK_SIZE x BLOCK_SIZE int tiles (query + train).
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void calcDistance(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
    cudaSafeCall( cudaGetLastError() );

    // Synchronize only on the legacy default stream; keep user streams async.
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
///////////////////////////////////////////////////////////////////////////////
// Calc Distance dispatcher
\r
// Picks the distance kernel variant by descriptor width: fully unrolled
// versions for narrow descriptors (<= 128 columns), the generic loop
// otherwise. Wider unrolled variants are disabled (kept for reference).
template <typename Dist, typename T, typename Mask>
void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
                            const DevMem2Df& allDist,
                            int cc, cudaStream_t stream)
{
    (void)cc; // compute capability is not consulted on this path

    if (query.cols <= 64)
    {
        calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
    }
    else if (query.cols <= 128)
    {
        calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);
    }
    /*else if (query.cols <= 256)
    {
        calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);
    }
    else if (query.cols <= 512)
    {
        calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);
    }
    else if (query.cols <= 1024)
    {
        calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);
    }*/
    else
    {
        calcDistance<16, Dist>(query, train, mask, allDist, stream);
    }
}
\r
///////////////////////////////////////////////////////////////////////////////
// find knn match kernel
\r
// One block per query row. Finds the smallest remaining entry of that row of
// allDist, records its column as the i-th best match (trainIdx/distance
// column i), then invalidates it (sets it to FLT_MAX) so the next pass
// yields the (i+1)-th best.
template <int BLOCK_SIZE>
__global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance)
{
    // reducePredVal presumably needs at least 64 slots — TODO confirm
    const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
    __shared__ float s_dist[SMEM_SIZE];
    __shared__ int s_trainIdx[SMEM_SIZE];

    const int queryIdx = blockIdx.x;

    float* allDistRow = allDist.ptr(queryIdx);

    float dist = numeric_limits<float>::max();
    int bestIdx = -1;

    // Strided scan: each thread keeps the minimum over its slice of the row.
    // (Counter named idx so it does not shadow the pass-index parameter i.)
    for (int idx = threadIdx.x; idx < allDist.cols; idx += BLOCK_SIZE)
    {
        float reg = allDistRow[idx];
        if (reg < dist)
        {
            dist = reg;
            bestIdx = idx;
        }
    }

    s_dist[threadIdx.x] = dist;
    s_trainIdx[threadIdx.x] = bestIdx;
    __syncthreads();

    // Block-wide argmin over the per-thread partial minima.
    reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());

    if (threadIdx.x == 0)
    {
        if (dist < numeric_limits<float>::max())
        {
            // Knock out the winner so the next pass finds the next-best match.
            allDistRow[bestIdx] = numeric_limits<float>::max();
            trainIdx.ptr(queryIdx)[i] = bestIdx;
            distance.ptr(queryIdx)[i] = dist;
        }
    }
}
\r
// Runs k sequential argmin passes over allDist; pass i fills column i of
// trainIdx/distance. Launch: one BLOCK_SIZE-thread block per query row.
template <int BLOCK_SIZE>
void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, 1, 1);
    const dim3 grid(trainIdx.rows, 1, 1);

    for (int i = 0; i < k; ++i)
    {
        findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);
        cudaSafeCall( cudaGetLastError() );
    }

    // Synchronize only on the legacy default stream; keep user streams async.
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
// Thin dispatcher: a single 256-thread configuration is used for every device.
void findKnnMatchDispatcher(int k, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream)
{
    (void)cc; // compute capability is not consulted on this path
    findKnnMatch<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), allDist, stream);
}
\r
///////////////////////////////////////////////////////////////////////////////
// knn match Dispatcher
\r
// kNN dispatcher: k == 2 uses the dedicated fused kernel (match2Dispatcher);
// general k computes the full distance matrix, then runs k argmin passes.
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const Mask& mask,
                     const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
                     int cc, cudaStream_t stream)
{
    if (k == 2)
    {
        match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream);
    }
    else
    {
        calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream);
        findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream);
    }
}
\r
///////////////////////////////////////////////////////////////////////////////
// knn match caller
\r
// kNN match with L1 distance. Uses SingleMask when a mask buffer was
// supplied, otherwise the no-op WithOutMask predicate.
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
                                       const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
                                       int cc, cudaStream_t stream)
{
    if (mask.data)
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
    else
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
}

template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
\r
// kNN match with L2 distance. Uses SingleMask when a mask buffer was
// supplied, otherwise the no-op WithOutMask predicate.
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
                                       const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
                                       int cc, cudaStream_t stream)
{
    if (mask.data)
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
    else
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
}

//template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
\r
// kNN match with Hamming distance (integer descriptor types only).
// Uses SingleMask when a mask buffer was supplied, otherwise WithOutMask.
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
                                            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
                                            int cc, cudaStream_t stream)
{
    if (mask.data)
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
    else
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
}

template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchHamming_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
\r
// 2-NN match with L1 distance against a collection of train images.
// trains holds DevMem2D_<T> descriptors per image; masks (optional) holds one
// mask per image — MaskCollection when supplied, WithOutMask otherwise.
template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
                                        const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
                                        int cc, cudaStream_t stream)
{
    if (masks.data)
        match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
    else
        match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}

template void match2L1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2L1_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
\r
// 2-NN match with L2 distance against a collection of train images.
// MaskCollection when per-image masks were supplied, WithOutMask otherwise.
template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
                                        const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
                                        int cc, cudaStream_t stream)
{
    if (masks.data)
        match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
    else
        match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}

//template void match2L2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2L2_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
\r
// 2-NN match with Hamming distance against a collection of train images
// (integer descriptor types only). MaskCollection when per-image masks were
// supplied, WithOutMask otherwise.
template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
                                             const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
                                             int cc, cudaStream_t stream)
{
    if (masks.data)
        match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
    else
        match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}

template void match2Hamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2Hamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2Hamming_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
\r