Merge pull request #328 from jet47:new-gpu-fixes
[profile/ivi/opencv.git] / modules / gpu / src / cuda / bf_knnmatch.cu
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or bpied warranties, including, but not limited to, the bpied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42
43 #if !defined CUDA_DISABLER
44
45 #include "opencv2/gpu/device/common.hpp"
46 #include "opencv2/gpu/device/utility.hpp"
47 #include "opencv2/gpu/device/reduce.hpp"
48 #include "opencv2/gpu/device/limits.hpp"
49 #include "opencv2/gpu/device/vec_distance.hpp"
50 #include "opencv2/gpu/device/datamov_utils.hpp"
51 #include "opencv2/gpu/device/warp_shuffle.hpp"
52
53 namespace cv { namespace gpu { namespace device
54 {
55     namespace bf_knnmatch
56     {
57         ///////////////////////////////////////////////////////////////////////////////
58         // Reduction
59
60         template <int BLOCK_SIZE>
61         __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
62                                       int& bestTrainIdx1, int& bestTrainIdx2,
63                                       float* s_distance, int* s_trainIdx)
64         {
65         #if __CUDA_ARCH__ >= 300
66             (void) s_distance;
67             (void) s_trainIdx;
68
69             float d1, d2;
70             int i1, i2;
71
72             #pragma unroll
73             for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
74             {
75                 d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
76                 d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
77                 i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
78                 i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
79
80                 if (bestDistance1 < d1)
81                 {
82                     if (d1 < bestDistance2)
83                     {
84                         bestDistance2 = d1;
85                         bestTrainIdx2 = i1;
86                     }
87                 }
88                 else
89                 {
90                     bestDistance2 = bestDistance1;
91                     bestTrainIdx2 = bestTrainIdx1;
92
93                     bestDistance1 = d1;
94                     bestTrainIdx1 = i1;
95
96                     if (d2 < bestDistance2)
97                     {
98                         bestDistance2 = d2;
99                         bestTrainIdx2 = i2;
100                     }
101                 }
102             }
103         #else
104             float myBestDistance1 = numeric_limits<float>::max();
105             float myBestDistance2 = numeric_limits<float>::max();
106             int myBestTrainIdx1 = -1;
107             int myBestTrainIdx2 = -1;
108
109             s_distance += threadIdx.y * BLOCK_SIZE;
110             s_trainIdx += threadIdx.y * BLOCK_SIZE;
111
112             s_distance[threadIdx.x] = bestDistance1;
113             s_trainIdx[threadIdx.x] = bestTrainIdx1;
114
115             __syncthreads();
116
117             if (threadIdx.x == 0)
118             {
119                 #pragma unroll
120                 for (int i = 0; i < BLOCK_SIZE; ++i)
121                 {
122                     float val = s_distance[i];
123
124                     if (val < myBestDistance1)
125                     {
126                         myBestDistance2 = myBestDistance1;
127                         myBestTrainIdx2 = myBestTrainIdx1;
128
129                         myBestDistance1 = val;
130                         myBestTrainIdx1 = s_trainIdx[i];
131                     }
132                     else if (val < myBestDistance2)
133                     {
134                         myBestDistance2 = val;
135                         myBestTrainIdx2 = s_trainIdx[i];
136                     }
137                 }
138             }
139
140             __syncthreads();
141
142             s_distance[threadIdx.x] = bestDistance2;
143             s_trainIdx[threadIdx.x] = bestTrainIdx2;
144
145             __syncthreads();
146
147             if (threadIdx.x == 0)
148             {
149                 #pragma unroll
150                 for (int i = 0; i < BLOCK_SIZE; ++i)
151                 {
152                     float val = s_distance[i];
153
154                     if (val < myBestDistance2)
155                     {
156                         myBestDistance2 = val;
157                         myBestTrainIdx2 = s_trainIdx[i];
158                     }
159                 }
160             }
161
162             bestDistance1 = myBestDistance1;
163             bestDistance2 = myBestDistance2;
164
165             bestTrainIdx1 = myBestTrainIdx1;
166             bestTrainIdx2 = myBestTrainIdx2;
167         #endif
168         }
169
170         template <int BLOCK_SIZE>
171         __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
172                                        int& bestTrainIdx1, int& bestTrainIdx2,
173                                        int& bestImgIdx1, int& bestImgIdx2,
174                                        float* s_distance, int* s_trainIdx, int* s_imgIdx)
175         {
176         #if __CUDA_ARCH__ >= 300
177             (void) s_distance;
178             (void) s_trainIdx;
179             (void) s_imgIdx;
180
181             float d1, d2;
182             int i1, i2;
183             int j1, j2;
184
185             #pragma unroll
186             for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
187             {
188                 d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
189                 d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
190                 i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
191                 i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
192                 j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE);
193                 j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE);
194
195                 if (bestDistance1 < d1)
196                 {
197                     if (d1 < bestDistance2)
198                     {
199                         bestDistance2 = d1;
200                         bestTrainIdx2 = i1;
201                         bestImgIdx2 = j1;
202                     }
203                 }
204                 else
205                 {
206                     bestDistance2 = bestDistance1;
207                     bestTrainIdx2 = bestTrainIdx1;
208                     bestImgIdx2 = bestImgIdx1;
209
210                     bestDistance1 = d1;
211                     bestTrainIdx1 = i1;
212                     bestImgIdx1 = j1;
213
214                     if (d2 < bestDistance2)
215                     {
216                         bestDistance2 = d2;
217                         bestTrainIdx2 = i2;
218                         bestImgIdx2 = j2;
219                     }
220                 }
221             }
222         #else
223             float myBestDistance1 = numeric_limits<float>::max();
224             float myBestDistance2 = numeric_limits<float>::max();
225             int myBestTrainIdx1 = -1;
226             int myBestTrainIdx2 = -1;
227             int myBestImgIdx1 = -1;
228             int myBestImgIdx2 = -1;
229
230             s_distance += threadIdx.y * BLOCK_SIZE;
231             s_trainIdx += threadIdx.y * BLOCK_SIZE;
232             s_imgIdx   += threadIdx.y * BLOCK_SIZE;
233
234             s_distance[threadIdx.x] = bestDistance1;
235             s_trainIdx[threadIdx.x] = bestTrainIdx1;
236             s_imgIdx[threadIdx.x]   = bestImgIdx1;
237
238             __syncthreads();
239
240             if (threadIdx.x == 0)
241             {
242                 #pragma unroll
243                 for (int i = 0; i < BLOCK_SIZE; ++i)
244                 {
245                     float val = s_distance[i];
246
247                     if (val < myBestDistance1)
248                     {
249                         myBestDistance2 = myBestDistance1;
250                         myBestTrainIdx2 = myBestTrainIdx1;
251                         myBestImgIdx2   = myBestImgIdx1;
252
253                         myBestDistance1 = val;
254                         myBestTrainIdx1 = s_trainIdx[i];
255                         myBestImgIdx1   = s_imgIdx[i];
256                     }
257                     else if (val < myBestDistance2)
258                     {
259                         myBestDistance2 = val;
260                         myBestTrainIdx2 = s_trainIdx[i];
261                         myBestImgIdx2   = s_imgIdx[i];
262                     }
263                 }
264             }
265
266             __syncthreads();
267
268             s_distance[threadIdx.x] = bestDistance2;
269             s_trainIdx[threadIdx.x] = bestTrainIdx2;
270             s_imgIdx[threadIdx.x]   = bestImgIdx2;
271
272             __syncthreads();
273
274             if (threadIdx.x == 0)
275             {
276                 #pragma unroll
277                 for (int i = 0; i < BLOCK_SIZE; ++i)
278                 {
279                     float val = s_distance[i];
280
281                     if (val < myBestDistance2)
282                     {
283                         myBestDistance2 = val;
284                         myBestTrainIdx2 = s_trainIdx[i];
285                         myBestImgIdx2   = s_imgIdx[i];
286                     }
287                 }
288             }
289
290             bestDistance1 = myBestDistance1;
291             bestDistance2 = myBestDistance2;
292
293             bestTrainIdx1 = myBestTrainIdx1;
294             bestTrainIdx2 = myBestTrainIdx2;
295
296             bestImgIdx1 = myBestImgIdx1;
297             bestImgIdx2 = myBestImgIdx2;
298         #endif
299         }
300
301         ///////////////////////////////////////////////////////////////////////////////
302         // Match Unrolled Cached
303
304         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
305         __device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
306         {
307             #pragma unroll
308             for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
309             {
310                 const int loadX = threadIdx.x + i * BLOCK_SIZE;
311                 s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
312             }
313         }
314
315         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
316         __device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
317                                            typename Dist::value_type* s_query, typename Dist::value_type* s_train,
318                                            float& bestDistance1, float& bestDistance2,
319                                            int& bestTrainIdx1, int& bestTrainIdx2,
320                                            int& bestImgIdx1, int& bestImgIdx2)
321         {
322             for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
323             {
324                 Dist dist;
325
326                 #pragma unroll
327                 for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
328                 {
329                     const int loadX = threadIdx.x + i * BLOCK_SIZE;
330
331                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
332
333                     if (loadX < train.cols)
334                     {
335                         T val;
336
337                         ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
338                         s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
339                     }
340
341                     __syncthreads();
342
343                     #pragma unroll
344                     for (int j = 0; j < BLOCK_SIZE; ++j)
345                         dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
346
347                     __syncthreads();
348                 }
349
350                 typename Dist::result_type distVal = dist;
351
352                 const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
353
354                 if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
355                 {
356                     if (distVal < bestDistance1)
357                     {
358                         bestImgIdx2   = bestImgIdx1;
359                         bestDistance2 = bestDistance1;
360                         bestTrainIdx2 = bestTrainIdx1;
361
362                         bestImgIdx1   = imgIdx;
363                         bestDistance1 = distVal;
364                         bestTrainIdx1 = trainIdx;
365                     }
366                     else if (distVal < bestDistance2)
367                     {
368                         bestImgIdx2   = imgIdx;
369                         bestDistance2 = distVal;
370                         bestTrainIdx2 = trainIdx;
371                     }
372                 }
373             }
374         }
375
376         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
377         __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
378         {
379             extern __shared__ int smem[];
380
381             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
382
383             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
384             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
385
386             loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
387
388             float myBestDistance1 = numeric_limits<float>::max();
389             float myBestDistance2 = numeric_limits<float>::max();
390             int myBestTrainIdx1 = -1;
391             int myBestTrainIdx2 = -1;
392
393             loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
394
395             __syncthreads();
396
397             float* s_distance = (float*)(smem);
398             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
399
400             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
401
402             if (queryIdx < query.rows && threadIdx.x == 0)
403             {
404                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
405                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
406             }
407         }
408
409         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
410         void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
411                                  const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
412                                  cudaStream_t stream)
413         {
414             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
415             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
416
417             const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
418
419             matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
420             cudaSafeCall( cudaGetLastError() );
421
422             if (stream == 0)
423                 cudaSafeCall( cudaDeviceSynchronize() );
424         }
425
426         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
427         __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
428         {
429             extern __shared__ int smem[];
430
431             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
432
433             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
434             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
435
436             loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
437
438             float myBestDistance1 = numeric_limits<float>::max();
439             float myBestDistance2 = numeric_limits<float>::max();
440             int myBestTrainIdx1 = -1;
441             int myBestTrainIdx2 = -1;
442             int myBestImgIdx1 = -1;
443             int myBestImgIdx2 = -1;
444
445             Mask m = mask;
446
447             for (int imgIdx = 0; imgIdx < n; ++imgIdx)
448             {
449                 const PtrStepSz<T> train = trains[imgIdx];
450                 m.next();
451                 loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
452             }
453
454             __syncthreads();
455
456             float* s_distance = (float*)(smem);
457             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
458             int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
459
460             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
461
462             if (queryIdx < query.rows && threadIdx.x == 0)
463             {
464                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
465                 bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
466                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
467             }
468         }
469
470         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
471         void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
472                                  const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
473                                  cudaStream_t stream)
474         {
475             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
476             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
477
478             const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
479
480             matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
481             cudaSafeCall( cudaGetLastError() );
482
483             if (stream == 0)
484                 cudaSafeCall( cudaDeviceSynchronize() );
485         }
486
487         ///////////////////////////////////////////////////////////////////////////////
488         // Match Unrolled
489
490         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
491         __device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
492                                      typename Dist::value_type* s_query, typename Dist::value_type* s_train,
493                                      float& bestDistance1, float& bestDistance2,
494                                      int& bestTrainIdx1, int& bestTrainIdx2,
495                                      int& bestImgIdx1, int& bestImgIdx2)
496         {
497             for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
498             {
499                 Dist dist;
500
501                 #pragma unroll
502                 for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
503                 {
504                     const int loadX = threadIdx.x + i * BLOCK_SIZE;
505
506                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
507                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
508
509                     if (loadX < query.cols)
510                     {
511                         T val;
512
513                         ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
514                         s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
515
516                         ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
517                         s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
518                     }
519
520                     __syncthreads();
521
522                     #pragma unroll
523                     for (int j = 0; j < BLOCK_SIZE; ++j)
524                         dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
525
526                     __syncthreads();
527                 }
528
529                 typename Dist::result_type distVal = dist;
530
531                 const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
532
533                 if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
534                 {
535                     if (distVal < bestDistance1)
536                     {
537                         bestImgIdx2   = bestImgIdx1;
538                         bestDistance2 = bestDistance1;
539                         bestTrainIdx2 = bestTrainIdx1;
540
541                         bestImgIdx1   = imgIdx;
542                         bestDistance1 = distVal;
543                         bestTrainIdx1 = trainIdx;
544                     }
545                     else if (distVal < bestDistance2)
546                     {
547                         bestImgIdx2   = imgIdx;
548                         bestDistance2 = distVal;
549                         bestTrainIdx2 = trainIdx;
550                     }
551                 }
552             }
553         }
554
555         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
556         __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
557         {
558             extern __shared__ int smem[];
559
560             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
561
562             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
563             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
564
565             float myBestDistance1 = numeric_limits<float>::max();
566             float myBestDistance2 = numeric_limits<float>::max();
567             int myBestTrainIdx1 = -1;
568             int myBestTrainIdx2 = -1;
569
570             loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
571
572             __syncthreads();
573
574             float* s_distance = (float*)(smem);
575             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
576
577             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
578
579             if (queryIdx < query.rows && threadIdx.x == 0)
580             {
581                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
582                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
583             }
584         }
585
586         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
587         void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
588                            const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
589                            cudaStream_t stream)
590         {
591             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
592             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
593
594             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
595
596             matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
597             cudaSafeCall( cudaGetLastError() );
598
599             if (stream == 0)
600                 cudaSafeCall( cudaDeviceSynchronize() );
601         }
602
603         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
604         __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
605         {
606             extern __shared__ int smem[];
607
608             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
609
610             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
611             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
612
613             float myBestDistance1 = numeric_limits<float>::max();
614             float myBestDistance2 = numeric_limits<float>::max();
615             int myBestTrainIdx1 = -1;
616             int myBestTrainIdx2 = -1;
617             int myBestImgIdx1 = -1;
618             int myBestImgIdx2 = -1;
619
620             Mask m = mask;
621
622             for (int imgIdx = 0; imgIdx < n; ++imgIdx)
623             {
624                 const PtrStepSz<T> train = trains[imgIdx];
625                 m.next();
626                 loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
627             }
628
629             __syncthreads();
630
631             float* s_distance = (float*)(smem);
632             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
633             int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
634
635             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
636
637             if (queryIdx < query.rows && threadIdx.x == 0)
638             {
639                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
640                 bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
641                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
642             }
643         }
644
645         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
646         void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
647                            const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
648                            cudaStream_t stream)
649         {
650             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
651             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
652
653             const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
654
655             matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
656             cudaSafeCall( cudaGetLastError() );
657
658             if (stream == 0)
659                 cudaSafeCall( cudaDeviceSynchronize() );
660         }
661
662         ///////////////////////////////////////////////////////////////////////////////
663         // Match
664
665         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
666         __device__ void loop(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
667                              typename Dist::value_type* s_query, typename Dist::value_type* s_train,
668                              float& bestDistance1, float& bestDistance2,
669                              int& bestTrainIdx1, int& bestTrainIdx2,
670                              int& bestImgIdx1, int& bestImgIdx2)
671         {
672             for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
673             {
674                 Dist dist;
675
676                 for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
677                 {
678                     const int loadX = threadIdx.x + i * BLOCK_SIZE;
679
680                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
681                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
682
683                     if (loadX < query.cols)
684                     {
685                         T val;
686
687                         ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
688                         s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
689
690                         ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
691                         s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
692                     }
693
694                     __syncthreads();
695
696                     #pragma unroll
697                     for (int j = 0; j < BLOCK_SIZE; ++j)
698                         dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
699
700                     __syncthreads();
701                 }
702
703                 typename Dist::result_type distVal = dist;
704
705                 const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
706
707                 if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
708                 {
709                     if (distVal < bestDistance1)
710                     {
711                         bestImgIdx2   = bestImgIdx1;
712                         bestDistance2 = bestDistance1;
713                         bestTrainIdx2 = bestTrainIdx1;
714
715                         bestImgIdx1   = imgIdx;
716                         bestDistance1 = distVal;
717                         bestTrainIdx1 = trainIdx;
718                     }
719                     else if (distVal < bestDistance2)
720                     {
721                         bestImgIdx2   = imgIdx;
722                         bestDistance2 = distVal;
723                         bestTrainIdx2 = trainIdx;
724                     }
725                 }
726             }
727         }
728
729         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
730         __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
731         {
732             extern __shared__ int smem[];
733
734             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
735
736             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
737             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
738
739             float myBestDistance1 = numeric_limits<float>::max();
740             float myBestDistance2 = numeric_limits<float>::max();
741             int myBestTrainIdx1 = -1;
742             int myBestTrainIdx2 = -1;
743
744             loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
745
746             __syncthreads();
747
748             float* s_distance = (float*)(smem);
749             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
750
751             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
752
753             if (queryIdx < query.rows && threadIdx.x == 0)
754             {
755                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
756                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
757             }
758         }
759
760         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
761         void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
762                    const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
763                    cudaStream_t stream)
764         {
765             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
766             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
767
768             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
769
770             match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
771             cudaSafeCall( cudaGetLastError() );
772
773             if (stream == 0)
774                 cudaSafeCall( cudaDeviceSynchronize() );
775         }
776
777         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
778         __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
779         {
780             extern __shared__ int smem[];
781
782             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
783
784             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
785             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
786
787             float myBestDistance1 = numeric_limits<float>::max();
788             float myBestDistance2 = numeric_limits<float>::max();
789             int myBestTrainIdx1 = -1;
790             int myBestTrainIdx2 = -1;
791             int myBestImgIdx1 = -1;
792             int myBestImgIdx2 = -1;
793
794             Mask m = mask;
795
796             for (int imgIdx = 0; imgIdx < n; ++imgIdx)
797             {
798                 const PtrStepSz<T> train = trains[imgIdx];
799                 m.next();
800                 loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
801             }
802
803             __syncthreads();
804
805             float* s_distance = (float*)(smem);
806             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
807             int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
808
809             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
810
811             if (queryIdx < query.rows && threadIdx.x == 0)
812             {
813                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
814                 bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
815                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
816             }
817         }
818
819         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
820         void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
821                    const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
822                    cudaStream_t stream)
823         {
824             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
825             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
826
827             const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
828
829             match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
830             cudaSafeCall( cudaGetLastError() );
831
832             if (stream == 0)
833                 cudaSafeCall( cudaDeviceSynchronize() );
834         }
835
836         ///////////////////////////////////////////////////////////////////////////////
837         // knnMatch 2 dispatcher
838
839         template <typename Dist, typename T, typename Mask>
840         void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
841                               const PtrStepSzb& trainIdx, const PtrStepSzb& distance,
842                               cudaStream_t stream)
843         {
844             if (query.cols <= 64)
845             {
846                 matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
847             }
848             else if (query.cols <= 128)
849             {
850                 matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
851             }
852             /*else if (query.cols <= 256)
853             {
854                 matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
855             }
856             else if (query.cols <= 512)
857             {
858                 matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
859             }
860             else if (query.cols <= 1024)
861             {
862                 matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
863             }*/
864             else
865             {
866                 match<16, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
867             }
868         }
869
870         template <typename Dist, typename T, typename Mask>
871         void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
872                               const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
873                               cudaStream_t stream)
874         {
875             if (query.cols <= 64)
876             {
877                 matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
878             }
879             else if (query.cols <= 128)
880             {
881                 matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
882             }
883             /*else if (query.cols <= 256)
884             {
885                 matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
886             }
887             else if (query.cols <= 512)
888             {
889                 matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
890             }
891             else if (query.cols <= 1024)
892             {
893                 matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
894             }*/
895             else
896             {
897                 match<16, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
898             }
899         }
900
901         ///////////////////////////////////////////////////////////////////////////////
902         // Calc distance kernel
903
904         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
905         __global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
906         {
907             extern __shared__ int smem[];
908
909             const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
910             const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
911
912             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
913             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
914
915             Dist dist;
916
917             #pragma unroll
918             for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
919             {
920                 const int loadX = threadIdx.x + i * BLOCK_SIZE;
921
922                 if (loadX < query.cols)
923                 {
924                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
925                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
926                 }
927                 else
928                 {
929                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
930                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
931                 }
932
933                 __syncthreads();
934
935                 #pragma unroll
936                 for (int j = 0; j < BLOCK_SIZE; ++j)
937                     dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
938
939                 __syncthreads();
940             }
941
942             if (queryIdx < query.rows && trainIdx < train.rows)
943             {
944                 float distVal = numeric_limits<float>::max();
945
946                 if (mask(queryIdx, trainIdx))
947                     distVal = (typename Dist::result_type)dist;
948
949                 allDist.ptr(queryIdx)[trainIdx] = distVal;
950             }
951         }
952
953         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
954         void calcDistanceUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
955         {
956             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
957             const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
958
959             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
960
961             calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
962             cudaSafeCall( cudaGetLastError() );
963
964             if (stream == 0)
965                 cudaSafeCall( cudaDeviceSynchronize() );
966         }
967
968         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
969         __global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
970         {
971             extern __shared__ int smem[];
972
973             const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
974             const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
975
976             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
977             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
978
979             Dist dist;
980
981             for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
982             {
983                 const int loadX = threadIdx.x + i * BLOCK_SIZE;
984
985                 if (loadX < query.cols)
986                 {
987                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
988                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
989                 }
990                 else
991                 {
992                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
993                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
994                 }
995
996                 __syncthreads();
997
998                 #pragma unroll
999                 for (int j = 0; j < BLOCK_SIZE; ++j)
1000                     dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
1001
1002                 __syncthreads();
1003             }
1004
1005             if (queryIdx < query.rows && trainIdx < train.rows)
1006             {
1007                 float distVal = numeric_limits<float>::max();
1008
1009                 if (mask(queryIdx, trainIdx))
1010                     distVal = (typename Dist::result_type)dist;
1011
1012                 allDist.ptr(queryIdx)[trainIdx] = distVal;
1013             }
1014         }
1015
1016         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
1017         void calcDistance(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
1018         {
1019             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
1020             const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
1021
1022             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
1023
1024             calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
1025             cudaSafeCall( cudaGetLastError() );
1026
1027             if (stream == 0)
1028                 cudaSafeCall( cudaDeviceSynchronize() );
1029         }
1030
1031         ///////////////////////////////////////////////////////////////////////////////
1032         // Calc Distance dispatcher
1033
1034         template <typename Dist, typename T, typename Mask>
1035         void calcDistanceDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
1036                                     const PtrStepSzf& allDist,
1037                                     cudaStream_t stream)
1038         {
1039             if (query.cols <= 64)
1040             {
1041                 calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
1042             }
1043             else if (query.cols <= 128)
1044             {
1045                 calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);
1046             }
1047             /*else if (query.cols <= 256)
1048             {
1049                 calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);
1050             }
1051             else if (query.cols <= 512)
1052             {
1053                 calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);
1054             }
1055             else if (query.cols <= 1024)
1056             {
1057                 calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);
1058             }*/
1059             else
1060             {
1061                 calcDistance<16, Dist>(query, train, mask, allDist, stream);
1062             }
1063         }
1064
1065         ///////////////////////////////////////////////////////////////////////////////
1066         // find knn match kernel
1067
1068         template <int BLOCK_SIZE>
1069         __global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
1070         {
1071             const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
1072             __shared__ float s_dist[SMEM_SIZE];
1073             __shared__ int s_trainIdx[SMEM_SIZE];
1074
1075             const int queryIdx = blockIdx.x;
1076
1077             float* allDistRow = allDist.ptr(queryIdx);
1078
1079             float dist = numeric_limits<float>::max();
1080             int bestIdx = -1;
1081
1082             for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE)
1083             {
1084                 float reg = allDistRow[i];
1085                 if (reg < dist)
1086                 {
1087                     dist = reg;
1088                     bestIdx = i;
1089                 }
1090             }
1091
1092             s_dist[threadIdx.x] = dist;
1093             s_trainIdx[threadIdx.x] = bestIdx;
1094             __syncthreads();
1095
1096             reduceKeyVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<float>());
1097
1098             if (threadIdx.x == 0)
1099             {
1100                 if (dist < numeric_limits<float>::max())
1101                 {
1102                     allDistRow[bestIdx] = numeric_limits<float>::max();
1103                     trainIdx.ptr(queryIdx)[i] = bestIdx;
1104                     distance.ptr(queryIdx)[i] = dist;
1105                 }
1106             }
1107         }
1108
1109         template <int BLOCK_SIZE>
1110         void findKnnMatch(int k, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzf& allDist, cudaStream_t stream)
1111         {
1112             const dim3 block(BLOCK_SIZE, 1, 1);
1113             const dim3 grid(trainIdx.rows, 1, 1);
1114
1115             for (int i = 0; i < k; ++i)
1116             {
1117                 findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);
1118                 cudaSafeCall( cudaGetLastError() );
1119             }
1120
1121             if (stream == 0)
1122                 cudaSafeCall( cudaDeviceSynchronize() );
1123         }
1124
1125         void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream)
1126         {
1127             findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
1128         }
1129
1130         ///////////////////////////////////////////////////////////////////////////////
1131         // knn match Dispatcher
1132
1133         template <typename Dist, typename T, typename Mask>
1134         void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, int k, const Mask& mask,
1135             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
1136             cudaStream_t stream)
1137         {
1138             if (k == 2)
1139             {
1140                 match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, stream);
1141             }
1142             else
1143             {
1144                 calcDistanceDispatcher<Dist>(query, train, mask, allDist, stream);
1145                 findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
1146             }
1147         }
1148
1149         ///////////////////////////////////////////////////////////////////////////////
1150         // knn match caller
1151
1152         template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
1153             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
1154             cudaStream_t stream)
1155         {
1156             if (mask.data)
1157                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
1158             else
1159                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
1160         }
1161
1162         template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1163         //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1164         template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1165         template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1166         template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1167         template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1168
1169         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
1170             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
1171             cudaStream_t stream)
1172         {
1173             if (mask.data)
1174                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
1175             else
1176                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
1177         }
1178
1179         //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1180         //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1181         //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1182         //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1183         //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1184         template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1185
1186         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
1187             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
1188             cudaStream_t stream)
1189         {
1190             if (mask.data)
1191                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
1192             else
1193                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
1194         }
1195
1196         template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1197         //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1198         template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1199         //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1200         template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1201
1202         template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
1203             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
1204             cudaStream_t stream)
1205         {
1206             if (masks.data)
1207                 match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
1208             else
1209                 match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance,  stream);
1210         }
1211
1212         template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1213         //template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1214         template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1215         template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1216         template void match2L1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1217         template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1218
1219         template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
1220             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
1221             cudaStream_t stream)
1222         {
1223             if (masks.data)
1224                 match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
1225             else
1226                 match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
1227         }
1228
1229         //template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1230         //template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1231         //template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1232         //template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1233         //template void match2L2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1234         template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1235
1236         template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
1237             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
1238             cudaStream_t stream)
1239         {
1240             if (masks.data)
1241                 match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
1242             else
1243                 match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
1244         }
1245
1246         template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1247         //template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1248         template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1249         //template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1250         template void match2Hamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1251     } // namespace bf_knnmatch
1252 }}} // namespace cv { namespace gpu { namespace device {
1253
1254
1255 #endif /* CUDA_DISABLER */