added dual tvl1 optical flow gpu implementation
[profile/ivi/opencv.git] / modules / gpu / src / cuda / bf_match.cu
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42
43 #if !defined CUDA_DISABLER
44
45 #include "opencv2/gpu/device/common.hpp"
46 #include "opencv2/gpu/device/utility.hpp"
47 #include "opencv2/gpu/device/reduce.hpp"
48 #include "opencv2/gpu/device/limits.hpp"
49 #include "opencv2/gpu/device/vec_distance.hpp"
50 #include "opencv2/gpu/device/datamov_utils.hpp"
51
52 namespace cv { namespace gpu { namespace device
53 {
54     namespace bf_match
55     {
56         ///////////////////////////////////////////////////////////////////////////////
57         // Reduction
58
59         template <int BLOCK_SIZE>
60         __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)
61         {
62             s_distance += threadIdx.y * BLOCK_SIZE;
63             s_trainIdx += threadIdx.y * BLOCK_SIZE;
64
65             reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<float>());
66         }
67
68         template <int BLOCK_SIZE>
69         __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx)
70         {
71             s_distance += threadIdx.y * BLOCK_SIZE;
72             s_trainIdx += threadIdx.y * BLOCK_SIZE;
73             s_imgIdx   += threadIdx.y * BLOCK_SIZE;
74
75             reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less<float>());
76         }
77
78         ///////////////////////////////////////////////////////////////////////////////
79         // Match Unrolled Cached
80
81         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
82         __device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
83         {
84             #pragma unroll
85             for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
86             {
87                 const int loadX = threadIdx.x + i * BLOCK_SIZE;
88                 s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
89             }
90         }
91
        // Scans every row of 'train' in BLOCK_SIZE-row tiles and, for this thread's
        // query row, keeps the best (smallest) masked distance seen so far.
        // Preconditions: s_query already holds BLOCK_SIZE cached query rows of
        // MAX_DESC_LEN elements each; s_train is a BLOCK_SIZE x BLOCK_SIZE staging tile.
        // Updates bestDistance / bestTrainIdx / bestImgIdx in place.
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query,volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                           typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                           float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
        {
            // Ceil-divided tile count over the train rows.
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                #pragma unroll
                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    // Zero first so padded columns contribute nothing to the distance.
                    // The tile is stored transposed (x-major) to match the read
                    // pattern s_train[j * BLOCK_SIZE + threadIdx.x] below.
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < train.cols)
                    {
                        T val;

                        // Row index clamped so out-of-range threads issue a safe load.
                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    __syncthreads(); // tile fully staged before any thread reads it

                    // Accumulate the partial distance over this BLOCK_SIZE-wide slice.
                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads(); // tile fully consumed before the next overwrite
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                // Accept only strictly better, in-range, mask-approved candidates.
                if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
                {
                    bestImgIdx = imgIdx;
                    bestDistance = distVal;
                    bestTrainIdx = trainIdx;
                }
            }
        }
137
        // Single-image match kernel with a cached query tile.
        // Launch: block = (BLOCK_SIZE, BLOCK_SIZE), grid.x = ceil(query.rows / BLOCK_SIZE);
        // dynamic smem >= (BLOCK_SIZE * max(MAX_DESC_LEN, BLOCK_SIZE) + BLOCK_SIZE^2) ints
        // (see the host launcher below).
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            // Partition smem: query cache first, then the train staging tile.
            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;

            // Single-image variant: myBestTrainIdx is deliberately passed for both the
            // trainIdx and imgIdx outputs; the imgIdx store (always 0) is immediately
            // overwritten by the trainIdx store, so the aliasing is harmless.
            loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

            __syncthreads(); // done with s_query/s_train before smem is re-partitioned

            // The same shared memory is overlaid with the reduction buffers.
            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

            // Lane 0 of each row publishes the winner for its query descriptor.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }
168
169         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
170         void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
171                                  const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
172                                  cudaStream_t stream)
173         {
174             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
175             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
176
177             const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
178
179             matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
180             cudaSafeCall( cudaGetLastError() );
181
182             if (stream == 0)
183                 cudaSafeCall( cudaDeviceSynchronize() );
184         }
185
        // Multi-image variant: scans n train matrices, also tracking which image
        // produced the best match. Layout mirrors the single-image kernel, plus a
        // third reduction buffer for image indices.
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                                            int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;
            int myBestImgIdx = -1;

            // Local copy so the per-image mask cursor can be advanced via next().
            Mask m = mask;

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next(); // advance the mask to the current train image
                loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
            }

            __syncthreads(); // done with s_query/s_train before smem is re-partitioned

            // Three reduction buffers overlay the same shared memory.
            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);

            // Lane 0 of each row publishes the result for its query descriptor.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestImgIdx[queryIdx] = myBestImgIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }
227
228         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
229         void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
230                                  const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
231                                  cudaStream_t stream)
232         {
233             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
234             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
235
236             const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
237
238             matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
239             cudaSafeCall( cudaGetLastError() );
240
241             if (stream == 0)
242                 cudaSafeCall( cudaDeviceSynchronize() );
243         }
244
245         ///////////////////////////////////////////////////////////////////////////////
246         // Match Unrolled
247
        // As loopUnrolledCached, but without a pre-cached query: both the query and
        // train tiles are re-staged from global memory on every unrolled step.
        // s_query and s_train are each BLOCK_SIZE x BLOCK_SIZE staging tiles.
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query,volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                     typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                     float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
        {
            // Ceil-divided tile count over the train rows.
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                #pragma unroll
                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    // Zero both tiles so padded columns contribute nothing.
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    // NOTE(review): the single query.cols bound guards the train load
                    // too — assumes train.cols == query.cols; confirm against callers.
                    if (loadX < query.cols)
                    {
                        T val;

                        // Row indices clamped so threads past the last row load safely.
                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; // stored transposed
                    }

                    __syncthreads(); // tiles fully staged before any thread reads them

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads(); // tiles fully consumed before the next overwrite
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                // Accept only strictly better, in-range, mask-approved candidates.
                if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
                {
                    bestImgIdx = imgIdx;
                    bestDistance = distVal;
                    bestTrainIdx = trainIdx;
                }
            }
        }
297
        // Single-image match kernel without the query cache.
        // Launch: block = (BLOCK_SIZE, BLOCK_SIZE), grid.x = ceil(query.rows / BLOCK_SIZE);
        // dynamic smem: 2 * BLOCK_SIZE * BLOCK_SIZE ints (two staging tiles, later
        // reused as the reduction buffers).
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            // myBestTrainIdx is passed for both the trainIdx and imgIdx outputs; the
            // imgIdx store (always 0) is overwritten by the trainIdx store, so the
            // aliasing is harmless in this single-image variant.
            loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

            __syncthreads(); // done with the staging tiles before smem is re-partitioned

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

            // Lane 0 of each row publishes the winner for its query descriptor.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }
326
327         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
328         void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
329                            const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
330                            cudaStream_t stream)
331         {
332             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
333             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
334
335             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
336
337             matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
338             cudaSafeCall( cudaGetLastError() );
339
340             if (stream == 0)
341                 cudaSafeCall( cudaDeviceSynchronize() );
342         }
343
        // Multi-image variant of the unrolled (non-cached) kernel: scans n train
        // matrices and tracks which image produced the best match.
        // Dynamic smem: 3 * BLOCK_SIZE * BLOCK_SIZE ints (reduce phase needs three buffers).
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                                      int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;
            int myBestImgIdx = -1;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            // Local copy so the per-image mask cursor can be advanced via next().
            Mask m = mask;

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next(); // advance the mask to the current train image
                loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
            }

            __syncthreads(); // done with the staging tiles before smem is re-partitioned

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);

            // Lane 0 of each row publishes the result for its query descriptor.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestImgIdx[queryIdx] = myBestImgIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }
383
384         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
385         void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
386                            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
387                            cudaStream_t stream)
388         {
389             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
390             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
391
392             const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
393
394             matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
395             cudaSafeCall( cudaGetLastError() );
396
397             if (stream == 0)
398                 cudaSafeCall( cudaDeviceSynchronize() );
399         }
400
401         ///////////////////////////////////////////////////////////////////////////////
402         // Match
403
        // Fully general tile loop: iterates over the descriptor columns at run time
        // (no compile-time unrolling), used when query.cols exceeds every
        // precompiled MAX_DESC_LEN. Otherwise identical in structure to loopUnrolled.
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __device__ void loop(int queryIdx, const PtrStepSz<T>& query, volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                             typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                             float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
        {
            // Ceil-divided tile count over the train rows.
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                // Ceil-divided slice count over the descriptor columns.
                for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    // Zero both tiles so padded columns contribute nothing.
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    // NOTE(review): the single query.cols bound guards the train load
                    // too — assumes train.cols == query.cols; confirm against callers.
                    if (loadX < query.cols)
                    {
                        T val;

                        // Row indices clamped so threads past the last row load safely.
                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; // stored transposed
                    }

                    __syncthreads(); // tiles fully staged before any thread reads them

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads(); // tiles fully consumed before the next overwrite
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                // Accept only strictly better, in-range, mask-approved candidates.
                if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
                {
                    bestImgIdx = imgIdx;
                    bestDistance = distVal;
                    bestTrainIdx = trainIdx;
                }
            }
        }
452
        // Single-image match kernel for arbitrary descriptor length.
        // Launch: block = (BLOCK_SIZE, BLOCK_SIZE), grid.x = ceil(query.rows / BLOCK_SIZE);
        // dynamic smem: 2 * BLOCK_SIZE * BLOCK_SIZE ints.
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            // myBestTrainIdx is passed for both the trainIdx and imgIdx outputs; the
            // imgIdx store (always 0) is overwritten by the trainIdx store, so the
            // aliasing is harmless in this single-image variant.
            loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

            __syncthreads(); // done with the staging tiles before smem is re-partitioned

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

            // Lane 0 of each row publishes the winner for its query descriptor.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }
481
482         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
483         void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
484                    const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
485                    cudaStream_t stream)
486         {
487             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
488             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
489
490             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
491
492             match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
493             cudaSafeCall( cudaGetLastError() );
494
495             if (stream == 0)
496                 cudaSafeCall( cudaDeviceSynchronize() );
497         }
498
        // Multi-image generic kernel: scans n train matrices of arbitrary descriptor
        // length, tracking which image produced the best match.
        // Dynamic smem: 3 * BLOCK_SIZE * BLOCK_SIZE ints.
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                              int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;
            int myBestImgIdx = -1;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            // Local copy so the per-image mask cursor can be advanced via next().
            Mask m = mask;
            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next(); // advance the mask to the current train image
                loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
            }

            __syncthreads(); // done with the staging tiles before smem is re-partitioned

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);

            // Lane 0 of each row publishes the result for its query descriptor.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestImgIdx[queryIdx] = myBestImgIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }
537
538         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
539         void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
540                    const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
541                    cudaStream_t stream)
542         {
543             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
544             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
545
546             const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
547
548             match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
549             cudaSafeCall( cudaGetLastError() );
550
551             if (stream == 0)
552                 cudaSafeCall( cudaDeviceSynchronize() );
553         }
554
555         ///////////////////////////////////////////////////////////////////////////////
556         // Match dispatcher
557
558         template <typename Dist, typename T, typename Mask>
559         void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
560                              const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
561                              cudaStream_t stream)
562         {
563             if (query.cols <= 64)
564             {
565                 matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);
566             }
567             else if (query.cols <= 128)
568             {
569                 matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream);
570             }
571             /*else if (query.cols <= 256)
572             {
573                 matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);
574             }
575             else if (query.cols <= 512)
576             {
577                 matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);
578             }
579             else if (query.cols <= 1024)
580             {
581                 matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);
582             }*/
583             else
584             {
585                 match<16, Dist>(query, train, mask, trainIdx, distance, stream);
586             }
587         }
588
589         template <typename Dist, typename T, typename Mask>
590         void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
591                              const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
592                              cudaStream_t stream)
593         {
594             if (query.cols <= 64)
595             {
596                 matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
597             }
598             else if (query.cols <= 128)
599             {
600                 matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
601             }
602             /*else if (query.cols <= 256)
603             {
604                 matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
605             }
606             else if (query.cols <= 512)
607             {
608                 matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
609             }
610             else if (query.cols <= 1024)
611             {
612                 matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
613             }*/
614             else
615             {
616                 match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
617             }
618         }
619
620         ///////////////////////////////////////////////////////////////////////////////
621         // Match caller
622
623         template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
624                                                const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
625                                                cudaStream_t stream)
626         {
627             if (mask.data)
628             {
629                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
630                     trainIdx, distance,
631                     stream);
632             }
633             else
634             {
635                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
636                     trainIdx, distance,
637                     stream);
638             }
639         }
640
        // Explicit instantiations for the element types exposed by the GPU BFMatcher
        // (the schar variant is intentionally left disabled).
        template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
647
648         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
649                                                const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
650                                                cudaStream_t stream)
651         {
652             if (mask.data)
653             {
654                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
655                     trainIdx, distance,
656                     stream);
657             }
658             else
659             {
660                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
661                     trainIdx, distance,
662                     stream);
663             }
664         }
665
        // Explicit instantiations for L2 matching: only the float variant is
        // enabled; the integer-typed variants are left disabled.
        //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
672
673         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
674                                                     const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
675                                                     cudaStream_t stream)
676         {
677             if (mask.data)
678             {
679                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
680                     trainIdx, distance,
681                     stream);
682             }
683             else
684             {
685                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
686                     trainIdx, distance,
687                     stream);
688             }
689         }
690
        // Explicit instantiations for Hamming matching: unsigned/int integer
        // element types are enabled; schar and short are left disabled.
        template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
696
697         template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
698                                                const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
699                                                 cudaStream_t stream)
700         {
701             if (masks.data)
702             {
703                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
704                     trainIdx, imgIdx, distance,
705                     stream);
706             }
707             else
708             {
709                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
710                     trainIdx, imgIdx, distance,
711                     stream);
712             }
713         }
714
        // Explicit instantiations of the collection overload for L1 matching
        // (the schar variant is left disabled, matching the single-train set).
        template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
721
722         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
723                                                const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
724                                                cudaStream_t stream)
725         {
726             if (masks.data)
727             {
728                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
729                     trainIdx, imgIdx, distance,
730                     stream);
731             }
732             else
733             {
734                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
735                     trainIdx, imgIdx, distance,
736                     stream);
737             }
738         }
739
740         //template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
741         //template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
742         //template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
743         //template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
744         //template void matchL2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
745         template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
746
747         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
748                                                     const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
749                                                     cudaStream_t stream)
750         {
751             if (masks.data)
752             {
753                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
754                     trainIdx, imgIdx, distance,
755                     stream);
756             }
757             else
758             {
759                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
760                     trainIdx, imgIdx, distance,
761                     stream);
762             }
763         }
764
        // Explicit instantiations of the collection overload for Hamming
        // matching: same enabled/disabled set as the single-train variant.
        template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
770     } // namespace bf_match
771 }}} // namespace cv { namespace gpu { namespace device {
772
773
774 #endif /* CUDA_DISABLER */