1 /*M///////////////////////////////////////////////////////////////////////////////////////
\r
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
\r
5 // By downloading, copying, installing or using the software you agree to this license.
\r
6 // If you do not agree to this license, do not download, install,
\r
7 // copy or use the software.
\r
10 // License Agreement
\r
11 // For Open Source Computer Vision Library
\r
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
\r
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
\r
15 // Third party copyrights are property of their respective owners.
\r
17 // Redistribution and use in source and binary forms, with or without modification,
\r
18 // are permitted provided that the following conditions are met:
\r
20 // * Redistribution's of source code must retain the above copyright notice,
\r
21 // this list of conditions and the following disclaimer.
\r
23 // * Redistribution's in binary form must reproduce the above copyright notice,
\r
24 // this list of conditions and the following disclaimer in the documentation
\r
25 // and/or other materials provided with the distribution.
\r
27 // * The name of the copyright holders may not be used to endorse or promote products
\r
28 // derived from this software without specific prior written permission.
\r
30 // This software is provided by the copyright holders and contributors "as is" and
\r
31 // any express or implied warranties, including, but not limited to, the implied
\r
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
\r
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
\r
34 // indirect, incidental, special, exemplary, or consequential damages
\r
35 // (including, but not limited to, procurement of substitute goods or services;
\r
36 // loss of use, data, or profits; or business interruption) however caused
\r
37 // and on any theory of liability, whether in contract, strict liability,
\r
38 // or tort (including negligence or otherwise) arising in any way out of
\r
39 // the use of this software, even if advised of the possibility of such damage.
\r
43 #include "internal_shared.hpp"
\r
44 #include "opencv2/gpu/device/limits.hpp"
\r
45 #include "opencv2/gpu/device/vec_distance.hpp"
\r
46 #include "opencv2/gpu/device/datamov_utils.hpp"
\r
48 namespace cv { namespace gpu { namespace device
\r
50 namespace bf_knnmatch
\r
52 ///////////////////////////////////////////////////////////////////////////////
\r
// Block-level 2-NN reduction (train index variant).
// On entry each thread holds its private (best, second-best) distance/index
// pair; on exit the thread with threadIdx.x == 0 of each block row holds the
// row-wide best and second-best. Other lanes' outputs are not meaningful —
// callers only write results from threadIdx.x == 0.
// s_distance / s_trainIdx: shared scratch, at least blockDim.y * BLOCK_SIZE
// elements each. Must be called by all threads of the block (uses
// __syncthreads()).
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
                              int& bestTrainIdx1, int& bestTrainIdx2,
                              float* s_distance, int* s_trainIdx)
{
    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    // Each block row (fixed threadIdx.y) reduces over its own smem slice.
    s_distance += threadIdx.y * BLOCK_SIZE;
    s_trainIdx += threadIdx.y * BLOCK_SIZE;

    // Round 1: publish each thread's best candidate, then let lane 0 scan.
    s_distance[threadIdx.x] = bestDistance1;
    s_trainIdx[threadIdx.x] = bestTrainIdx1;

    __syncthreads();

    if (threadIdx.x == 0)
    {
        for (int i = 0; i < BLOCK_SIZE; ++i)
        {
            float val = s_distance[i];

            if (val < myBestDistance1)
            {
                // New best: previous best becomes the runner-up.
                myBestDistance2 = myBestDistance1;
                myBestTrainIdx2 = myBestTrainIdx1;

                myBestDistance1 = val;
                myBestTrainIdx1 = s_trainIdx[i];
            }
            else if (val < myBestDistance2)
            {
                myBestDistance2 = val;
                myBestTrainIdx2 = s_trainIdx[i];
            }
        }
    }

    __syncthreads();

    // Round 2: publish each thread's second-best candidate; only the
    // runner-up slot can still be improved by these values.
    s_distance[threadIdx.x] = bestDistance2;
    s_trainIdx[threadIdx.x] = bestTrainIdx2;

    __syncthreads();

    if (threadIdx.x == 0)
    {
        for (int i = 0; i < BLOCK_SIZE; ++i)
        {
            float val = s_distance[i];

            if (val < myBestDistance2)
            {
                myBestDistance2 = val;
                myBestTrainIdx2 = s_trainIdx[i];
            }
        }
    }

    bestDistance1 = myBestDistance1;
    bestDistance2 = myBestDistance2;

    bestTrainIdx1 = myBestTrainIdx1;
    bestTrainIdx2 = myBestTrainIdx2;
}
\r
// Block-level 2-NN reduction (train index + image index variant).
// Same reduction as the two-array overload above, but each candidate also
// carries the index of the train image it came from (multi-image matching).
// s_distance / s_trainIdx / s_imgIdx: shared scratch, at least
// blockDim.y * BLOCK_SIZE elements each. Results are valid only in the
// threadIdx.x == 0 lane of each row. Must be called by the whole block.
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
                              int& bestTrainIdx1, int& bestTrainIdx2,
                              int& bestImgIdx1, int& bestImgIdx2,
                              float* s_distance, int* s_trainIdx, int* s_imgIdx)
{
    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    int myBestImgIdx1 = -1;
    int myBestImgIdx2 = -1;

    // Each block row (fixed threadIdx.y) reduces over its own smem slice.
    s_distance += threadIdx.y * BLOCK_SIZE;
    s_trainIdx += threadIdx.y * BLOCK_SIZE;
    s_imgIdx += threadIdx.y * BLOCK_SIZE;

    // Round 1: publish each thread's best candidate, then let lane 0 scan.
    s_distance[threadIdx.x] = bestDistance1;
    s_trainIdx[threadIdx.x] = bestTrainIdx1;
    s_imgIdx[threadIdx.x] = bestImgIdx1;

    __syncthreads();

    if (threadIdx.x == 0)
    {
        for (int i = 0; i < BLOCK_SIZE; ++i)
        {
            float val = s_distance[i];

            if (val < myBestDistance1)
            {
                // New best: previous best becomes the runner-up.
                myBestDistance2 = myBestDistance1;
                myBestTrainIdx2 = myBestTrainIdx1;
                myBestImgIdx2 = myBestImgIdx1;

                myBestDistance1 = val;
                myBestTrainIdx1 = s_trainIdx[i];
                myBestImgIdx1 = s_imgIdx[i];
            }
            else if (val < myBestDistance2)
            {
                myBestDistance2 = val;
                myBestTrainIdx2 = s_trainIdx[i];
                myBestImgIdx2 = s_imgIdx[i];
            }
        }
    }

    __syncthreads();

    // Round 2: publish second-best candidates; only the runner-up slot can
    // still be improved by these values.
    s_distance[threadIdx.x] = bestDistance2;
    s_trainIdx[threadIdx.x] = bestTrainIdx2;
    s_imgIdx[threadIdx.x] = bestImgIdx2;

    __syncthreads();

    if (threadIdx.x == 0)
    {
        for (int i = 0; i < BLOCK_SIZE; ++i)
        {
            float val = s_distance[i];

            if (val < myBestDistance2)
            {
                myBestDistance2 = val;
                myBestTrainIdx2 = s_trainIdx[i];
                myBestImgIdx2 = s_imgIdx[i];
            }
        }
    }

    bestDistance1 = myBestDistance1;
    bestDistance2 = myBestDistance2;

    bestTrainIdx1 = myBestTrainIdx1;
    bestTrainIdx2 = myBestTrainIdx2;

    bestImgIdx1 = myBestImgIdx1;
    bestImgIdx2 = myBestImgIdx2;
}
\r
208 ///////////////////////////////////////////////////////////////////////////////
\r
209 // Match Unrolled Cached
\r
// Copies one query descriptor per block row into shared memory, padded with
// zeros up to MAX_DESC_LEN elements. The row index is clamped to the last
// valid query row so out-of-range rows still perform safe (dummy) loads.
// s_query must hold blockDim.y * MAX_DESC_LEN elements. No barrier here —
// callers synchronize before the cached data is consumed.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
__device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)
{
    #pragma unroll
    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
    {
        const int loadX = threadIdx.x + i * BLOCK_SIZE;
        s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
    }
}
\r
// Scans all train descriptors in BLOCK_SIZE-row tiles, accumulating the
// distance between the (already smem-cached) query row and each train row,
// and updates the caller's running best / second-best matches.
// The bestImgIdx1/2 references may alias the bestTrainIdx1/2 references
// (single-image callers pass the same variables): imgIdx is written before
// trainIdx in each branch, so aliased calls still end up holding the train
// index. Unrolled over MAX_DESC_LEN / BLOCK_SIZE descriptor chunks.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
                                   typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                   float& bestDistance1, float& bestDistance2,
                                   int& bestTrainIdx1, int& bestTrainIdx2,
                                   int& bestImgIdx1, int& bestImgIdx2)
{
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist;

        #pragma unroll
        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            // Stage a BLOCK_SIZE x BLOCK_SIZE train tile, transposed so the
            // reduction below reads it with the threadIdx.x varying fastest.
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

            if (loadX < train.cols)
            {
                T val;

                // Clamp the row so the tail tile performs safe dummy loads.
                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
            }

            __syncthreads();

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads();
        }

        typename Dist::result_type distVal = dist;

        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
        {
            if (distVal < bestDistance1)
            {
                // Demote the current best to second place. imgIdx before
                // trainIdx: see aliasing note in the header comment.
                bestImgIdx2 = bestImgIdx1;
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;

                bestImgIdx1 = imgIdx;
                bestDistance1 = distVal;
                bestTrainIdx1 = trainIdx;
            }
            else if (distVal < bestDistance2)
            {
                bestImgIdx2 = imgIdx;
                bestDistance2 = distVal;
                bestTrainIdx2 = trainIdx;
            }
        }
    }
}
\r
// 2-NN kernel, single train image, query descriptor cached in shared memory.
// Launch: block (BLOCK_SIZE, BLOCK_SIZE), grid divUp(query.rows, BLOCK_SIZE),
// dynamic smem sized by the host wrapper below. The myBestTrainIdx1/2
// variables are deliberately passed again for the imgIdx slots of
// loopUnrolledCached — there is only one image, and the assignment order
// inside the loop makes the aliasing harmless.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

    loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

    __syncthreads();

    // Reuse the same shared memory for the block-level reduction.
    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

    // Only lane 0 of each row holds the reduced result.
    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}
\r
// Host wrapper: launches the cached single-image 2-NN kernel.
// Shared memory holds the query cache (at least BLOCK_SIZE ints wide) plus a
// BLOCK_SIZE x BLOCK_SIZE train tile. Synchronizes only on the default
// stream; on a user stream the launch is fully asynchronous.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
                         const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
                         cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
// 2-NN kernel, multiple train images, query descriptor cached in shared
// memory. Iterates over all n train images, advancing the mask per image
// via m.next(), and records both the train index and the image index of the
// two best matches. Launch configuration as in the host wrapper below.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

    loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    int myBestImgIdx1 = -1;
    int myBestImgIdx2 = -1;

    // Per-thread mutable copy of the mask; next() steps it to each image.
    Mask m = mask;

    for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    {
        const DevMem2D_<T> train = trains[imgIdx];
        m.next();
        loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
    }

    __syncthreads();

    // Reuse the same shared memory for the block-level reduction.
    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

    // Only lane 0 of each row holds the reduced result.
    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}
\r
// Host wrapper: launches the cached multi-image 2-NN kernel.
// Shared memory must additionally fit three BLOCK_SIZE x BLOCK_SIZE
// reduction arrays (distance, trainIdx, imgIdx), hence the 2 * BLOCK_SIZE
// lower bound on the query-cache width. Synchronizes only on the default
// stream.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
                         const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance,
                         cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
394 ///////////////////////////////////////////////////////////////////////////////
\r
// Scans all train descriptors in BLOCK_SIZE-row tiles, staging both the
// query and train rows into shared memory per tile (no persistent query
// cache), and updates the caller's running best / second-best matches.
// bestImgIdx1/2 may alias bestTrainIdx1/2 (see loopUnrolledCached): imgIdx
// is written before trainIdx in each branch, so aliased calls end up with
// the train index. Unrolled over MAX_DESC_LEN / BLOCK_SIZE chunks.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
                             typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                             float& bestDistance1, float& bestDistance2,
                             int& bestTrainIdx1, int& bestTrainIdx2,
                             int& bestImgIdx1, int& bestImgIdx2)
{
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist;

        #pragma unroll
        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            // Zero-fill so the tail chunk contributes nothing.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

            if (loadX < query.cols)
            {
                T val;

                // Rows clamped so out-of-range threads do safe dummy loads.
                ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
            }

            __syncthreads();

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads();
        }

        typename Dist::result_type distVal = dist;

        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
        {
            if (distVal < bestDistance1)
            {
                // Demote current best; imgIdx before trainIdx (aliasing).
                bestImgIdx2 = bestImgIdx1;
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;

                bestImgIdx1 = imgIdx;
                bestDistance1 = distVal;
                bestTrainIdx1 = trainIdx;
            }
            else if (distVal < bestDistance2)
            {
                bestImgIdx2 = imgIdx;
                bestDistance2 = distVal;
                bestTrainIdx2 = trainIdx;
            }
        }
    }
}
\r
// 2-NN kernel, single train image, no query cache (tiles staged per
// iteration by loopUnrolled). Launch: block (BLOCK_SIZE, BLOCK_SIZE), grid
// divUp(query.rows, BLOCK_SIZE), smem = 2 tiles (see host wrapper).
// myBestTrainIdx1/2 are intentionally reused for the imgIdx slots — single
// image, and the write order inside loopUnrolled makes aliasing harmless.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

    __syncthreads();

    // Reuse the same shared memory for the block-level reduction.
    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}
\r
// Host wrapper: launches the uncached single-image unrolled 2-NN kernel.
// Shared memory: two BLOCK_SIZE x BLOCK_SIZE tiles (query + train).
// Synchronizes only on the default stream.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
                   const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
                   cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
// 2-NN kernel, multiple train images, no query cache. Iterates over all n
// train images, advancing the mask per image via m.next(), and records both
// the train index and the image index of the two best matches.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    int myBestImgIdx1 = -1;
    int myBestImgIdx2 = -1;

    // Per-thread mutable copy of the mask; next() steps it to each image.
    Mask m = mask;

    for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    {
        const DevMem2D_<T> train = trains[imgIdx];
        m.next();
        loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
    }

    __syncthreads();

    // Reuse the same shared memory for the block-level reduction.
    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}
\r
// Host wrapper: launches the uncached multi-image unrolled 2-NN kernel.
// Shared memory: three BLOCK_SIZE x BLOCK_SIZE arrays (the third is the
// image-index slice of the reduction scratch). Synchronizes only on the
// default stream.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
                   const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance,
                   cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
569 ///////////////////////////////////////////////////////////////////////////////
\r
// Generic (non-unrolled) variant of loopUnrolled for arbitrary descriptor
// lengths: the chunk count is computed from query.cols at run time instead
// of the MAX_DESC_LEN template parameter. Same tiling, same best/second-best
// update rules, same bestImgIdx/bestTrainIdx aliasing contract (imgIdx is
// written before trainIdx in each branch).
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
                     typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                     float& bestDistance1, float& bestDistance2,
                     int& bestTrainIdx1, int& bestTrainIdx2,
                     int& bestImgIdx1, int& bestImgIdx2)
{
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist;

        for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            // Zero-fill so the tail chunk contributes nothing.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

            if (loadX < query.cols)
            {
                T val;

                // Rows clamped so out-of-range threads do safe dummy loads.
                ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
            }

            __syncthreads();

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads();
        }

        typename Dist::result_type distVal = dist;

        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
        {
            if (distVal < bestDistance1)
            {
                // Demote current best; imgIdx before trainIdx (aliasing).
                bestImgIdx2 = bestImgIdx1;
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;

                bestImgIdx1 = imgIdx;
                bestDistance1 = distVal;
                bestTrainIdx1 = trainIdx;
            }
            else if (distVal < bestDistance2)
            {
                bestImgIdx2 = imgIdx;
                bestDistance2 = distVal;
                bestTrainIdx2 = trainIdx;
            }
        }
    }
}
\r
// 2-NN kernel, single train image, generic descriptor length (uses loop()
// instead of a MAX_DESC_LEN-unrolled scan). myBestTrainIdx1/2 are reused
// for the imgIdx slots — single image, write order makes aliasing harmless.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

    __syncthreads();

    // Reuse the same shared memory for the block-level reduction.
    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}
\r
// Host wrapper: launches the generic single-image 2-NN kernel.
// Shared memory: two BLOCK_SIZE x BLOCK_SIZE tiles. Synchronizes only on
// the default stream.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
           const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
           cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
// 2-NN kernel, multiple train images, generic descriptor length. Iterates
// over all n train images, advancing the mask per image via m.next(), and
// records both the train index and image index of the two best matches.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    float myBestDistance1 = numeric_limits<float>::max();
    float myBestDistance2 = numeric_limits<float>::max();
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;
    int myBestImgIdx1 = -1;
    int myBestImgIdx2 = -1;

    // Per-thread mutable copy of the mask; next() steps it to each image.
    Mask m = mask;

    for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    {
        const DevMem2D_<T> train = trains[imgIdx];
        m.next();
        loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
    }

    __syncthreads();

    // Reuse the same shared memory for the block-level reduction.
    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
        bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
        bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    }
}
\r
// Host wrapper: launches the generic multi-image 2-NN kernel.
// Shared memory: three BLOCK_SIZE x BLOCK_SIZE arrays (reduction scratch
// includes the image-index slice). Synchronizes only on the default stream.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
           const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance,
           cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
743 ///////////////////////////////////////////////////////////////////////////////
\r
744 // knnMatch 2 dispatcher
\r
// Picks a kernel specialization for k = 2 matching against one train image
// based on descriptor length: the cached-unrolled path for short
// descriptors, the generic path otherwise. The wider unrolled
// specializations are disabled (kept commented out, as upstream). The cc
// (compute capability) parameter is currently unused in this path.
template <typename Dist, typename T, typename Mask>
void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
                      const DevMem2Db& trainIdx, const DevMem2Db& distance,
                      int cc, cudaStream_t stream)
{
    (void)cc;

    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
    else if (query.cols <= 128)
    {
        matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
    /*else if (query.cols <= 256)
    {
        matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
    else if (query.cols <= 512)
    {
        matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
    else if (query.cols <= 1024)
    {
        matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }*/
    else
    {
        match<16, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
}
\r
// Picks a kernel specialization for k = 2 matching against a collection of
// n train images, mirroring the single-image dispatcher above. The wider
// unrolled specializations are disabled (kept commented out, as upstream).
// The cc (compute capability) parameter is currently unused in this path.
template <typename Dist, typename T, typename Mask>
void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
                      const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
                      int cc, cudaStream_t stream)
{
    (void)cc;

    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
    else if (query.cols <= 128)
    {
        matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
    /*else if (query.cols <= 256)
    {
        matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
    else if (query.cols <= 512)
    {
        matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
    else if (query.cols <= 1024)
    {
        matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }*/
    else
    {
        match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
    }
}
\r
808 ///////////////////////////////////////////////////////////////////////////////
\r
809 // Calc distance kernel
\r
// Computes the full query x train distance matrix (used by the k > 2 path).
// One thread per (queryIdx, trainIdx) pair; descriptor scan unrolled over
// MAX_DESC_LEN / BLOCK_SIZE chunks. Masked-out pairs get FLT_MAX so they
// sort last. Launch: block (BLOCK_SIZE, BLOCK_SIZE), 2D grid (see wrapper).
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void calcDistanceUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    Dist dist;

    #pragma unroll
    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
    {
        const int loadX = threadIdx.x + i * BLOCK_SIZE;

        if (loadX < query.cols)
        {
            // Rows clamped so out-of-range threads do safe dummy loads.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
        }
        else
        {
            // Zero-fill past the descriptor end so the tail chunk is inert.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
        }

        __syncthreads();

        #pragma unroll
        for (int j = 0; j < BLOCK_SIZE; ++j)
            dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

        __syncthreads();
    }

    if (queryIdx < query.rows && trainIdx < train.rows)
    {
        float distVal = numeric_limits<float>::max();

        if (mask(queryIdx, trainIdx))
            distVal = (typename Dist::result_type)dist;

        allDist.ptr(queryIdx)[trainIdx] = distVal;
    }
}
\r
// Host wrapper: launches the unrolled distance-matrix kernel over a 2D grid
// covering (train.rows x query.rows). Shared memory: two tiles.
// Synchronizes only on the default stream.
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void calcDistanceUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
// Generic (non-unrolled) distance-matrix kernel for arbitrary descriptor
// lengths: the chunk count comes from query.cols at run time. Masked-out
// pairs get FLT_MAX so they sort last.
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void calcDistance(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    Dist dist;

    for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
    {
        const int loadX = threadIdx.x + i * BLOCK_SIZE;

        if (loadX < query.cols)
        {
            // Rows clamped so out-of-range threads do safe dummy loads.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
        }
        else
        {
            // Zero-fill past the descriptor end so the tail chunk is inert.
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
        }

        __syncthreads();

        #pragma unroll
        for (int j = 0; j < BLOCK_SIZE; ++j)
            dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

        __syncthreads();
    }

    if (queryIdx < query.rows && trainIdx < train.rows)
    {
        float distVal = numeric_limits<float>::max();

        if (mask(queryIdx, trainIdx))
            distVal = (typename Dist::result_type)dist;

        allDist.ptr(queryIdx)[trainIdx] = distVal;
    }
}
\r
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void calcDistance(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)
{
    // Host launcher for the generic (non-unrolled) distance kernel; used when
    // the descriptor length exceeds the largest unrolled specialization.
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

    // Two BLOCK_SIZE x BLOCK_SIZE shared-memory tiles (query + train).
    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
    cudaSafeCall( cudaGetLastError() );

    // Block only on the legacy null stream; async streams sync at the caller.
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
938 ///////////////////////////////////////////////////////////////////////////////
\r
939 // Calc Distance dispatcher
\r
template <typename Dist, typename T, typename Mask>
void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
                            const DevMem2Df& allDist,
                            int cc, cudaStream_t stream)
{
    // Picks the distance kernel specialization by descriptor width:
    // short descriptors use a fully unrolled kernel, everything else falls
    // back to the generic loop. 'cc' (compute capability) is currently unused
    // here but kept for interface parity with the other dispatchers.
    if (query.cols <= 64)
    {
        calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
    }
    else if (query.cols <= 128)
    {
        calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);
    }
    /*else if (query.cols <= 256)
    {
        calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);
    }
    else if (query.cols <= 512)
    {
        calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);
    }
    else if (query.cols <= 1024)
    {
        calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);
    }*/
    else
    {
        calcDistance<16, Dist>(query, train, mask, allDist, stream);
    }
}
\r
972 ///////////////////////////////////////////////////////////////////////////////
\r
973 // find knn match kernel
\r
template <int BLOCK_SIZE>
__global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance)
{
    // One block per query row: finds the current minimum of allDist's row,
    // records it as the i-th best match, then overwrites that cell with
    // FLT_MAX so the next pass (i+1) selects the next-best candidate.
    const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
    __shared__ float s_dist[SMEM_SIZE];
    __shared__ int s_trainIdx[SMEM_SIZE];

    const int queryIdx = blockIdx.x;

    float* allDistRow = allDist.ptr(queryIdx);

    float dist = numeric_limits<float>::max();
    int bestIdx = -1;

    // Each thread scans a strided slice of the row, keeping its local minimum.
    // Counter named 'c' to avoid shadowing the k-pass parameter 'i', which is
    // needed again after the reduction.
    for (int c = threadIdx.x; c < allDist.cols; c += BLOCK_SIZE)
    {
        float reg = allDistRow[c];
        if (reg < dist)
        {
            dist = reg;
            bestIdx = c;
        }
    }

    s_dist[threadIdx.x] = dist;
    s_trainIdx[threadIdx.x] = bestIdx;
    __syncthreads();

    // Block-wide argmin reduction over the per-thread partial minima;
    // thread 0 ends up holding the winning (dist, bestIdx) pair.
    reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());

    if (threadIdx.x == 0)
    {
        if (dist < numeric_limits<float>::max())
        {
            // Knock out the winner so the next pass finds the next-best match.
            allDistRow[bestIdx] = numeric_limits<float>::max();
            trainIdx.ptr(queryIdx)[i] = bestIdx;
            distance.ptr(queryIdx)[i] = dist;
        }
    }
}
\r
template <int BLOCK_SIZE>
void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
{
    // Runs k selection passes over the precomputed distance matrix: pass i
    // extracts the i-th smallest distance per query row (findBestMatch marks
    // each winner with FLT_MAX, so passes are ordered). One block per row.
    const dim3 block(BLOCK_SIZE, 1, 1);
    const dim3 grid(trainIdx.rows, 1, 1);

    for (int i = 0; i < k; ++i)
    {
        findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);
        cudaSafeCall( cudaGetLastError() );
    }

    // Block only on the legacy null stream; async streams sync at the caller.
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
\r
void findKnnMatchDispatcher(int k, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream)
{
    // Thin wrapper that fixes the block size at 256 threads and reinterprets
    // the untyped output buffers as int / float matrices. 'cc' is unused but
    // kept so all dispatchers share one signature.
    findKnnMatch<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), allDist, stream);
}
\r
1037 ///////////////////////////////////////////////////////////////////////////////
\r
1038 // knn match Dispatcher
\r
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const Mask& mask,
                     const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
                     int cc, cudaStream_t stream)
{
    if (k == 2)
    {
        // k == 2 has a dedicated fused path that finds the two best matches
        // in a single kernel, without materializing the distance matrix.
        match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream);
    }
    else
    {
        // General k: compute the full query x train distance matrix, then
        // peel off the k smallest entries per query row.
        calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream);
        findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream);
    }
}
\r
1056 ///////////////////////////////////////////////////////////////////////////////
\r
1057 // knn match caller
\r
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
                                       const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
                                       int cc, cudaStream_t stream)
{
    // Public k-NN entry point for the L1 (sum of absolute differences) metric.
    // Dispatches on whether a per-pair mask was supplied; exactly one branch runs.
    if (mask.data)
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
    else
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
}

template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
\r
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
                                       const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
                                       int cc, cudaStream_t stream)
{
    // Public k-NN entry point for the (squared) L2 metric.
    // Dispatches on whether a per-pair mask was supplied; exactly one branch runs.
    if (mask.data)
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
    else
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
}

//template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
\r
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
                                            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
                                            int cc, cudaStream_t stream)
{
    // Public k-NN entry point for the Hamming metric (binary descriptors).
    // Dispatches on whether a per-pair mask was supplied; exactly one branch runs.
    if (mask.data)
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
    else
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
}

template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchHamming_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
\r
template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
                                        const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
                                        int cc, cudaStream_t stream)
{
    // 2-NN match of one query set against a collection of train images
    // (L1 metric). 'trains' packs the per-image DevMem2D descriptors;
    // trains.cols is the image count. Exactly one branch runs.
    if (masks.data)
        match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
    else
        match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}

template void match2L1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2L1_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
\r
template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
                                        const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
                                        int cc, cudaStream_t stream)
{
    // 2-NN match of one query set against a collection of train images
    // ((squared) L2 metric). Exactly one branch runs.
    if (masks.data)
        match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
    else
        match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}

//template void match2L2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2L2_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
\r
template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
                                             const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
                                             int cc, cudaStream_t stream)
{
    // 2-NN match of one query set against a collection of train images
    // (Hamming metric, binary descriptors). Exactly one branch runs.
    if (masks.data)
        match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
    else
        match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}

template void match2Hamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2Hamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2Hamming_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
\r
1158 } // namespace bf_knnmatch
\r
1159 }}} // namespace cv { namespace gpu { namespace device {
\r