added dual tvl1 optical flow gpu implementation
[profile/ivi/opencv.git] / modules / gpu / src / cuda / bf_match.cu
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42
43 #if !defined CUDA_DISABLER
44
45 #include "opencv2/gpu/device/common.hpp"
46 #include "opencv2/gpu/device/utility.hpp"
47 #include "opencv2/gpu/device/reduce.hpp"
48 #include "opencv2/gpu/device/limits.hpp"
49 #include "opencv2/gpu/device/vec_distance.hpp"
50 #include "opencv2/gpu/device/datamov_utils.hpp"
51
52 namespace cv { namespace gpu { namespace device
53 {
54     namespace bf_match
55     {
56         ///////////////////////////////////////////////////////////////////////////////
57         // Reduction
58
59         template <int BLOCK_SIZE>
60         __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)
61         {
62             s_distance += threadIdx.y * BLOCK_SIZE;
63             s_trainIdx += threadIdx.y * BLOCK_SIZE;
64
65             reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<float>());
66         }
67
68         template <int BLOCK_SIZE>
69         __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx)
70         {
71             s_distance += threadIdx.y * BLOCK_SIZE;
72             s_trainIdx += threadIdx.y * BLOCK_SIZE;
73             s_imgIdx   += threadIdx.y * BLOCK_SIZE;
74
75             reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less<float>());
76         }
77
78         ///////////////////////////////////////////////////////////////////////////////
79         // Match Unrolled Cached
80
81         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
82         __device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
83         {
84             #pragma unroll
85             for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
86             {
87                 const int loadX = threadIdx.x + i * BLOCK_SIZE;
88                 s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
89             }
90         }
91
        // Scans every row of 'train' in BLOCK_SIZE-row tiles and, for this thread's
        // query row, keeps the best (smallest) masked distance seen so far.
        // Preconditions: s_query already holds BLOCK_SIZE cached query rows of
        // MAX_DESC_LEN elements each; s_train is a BLOCK_SIZE x BLOCK_SIZE staging tile.
        // Updates bestDistance / bestTrainIdx / bestImgIdx in place.
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query,volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                           typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                           float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
        {
            // Ceil-divided tile count over the train rows.
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                #pragma unroll
                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    // Zero first so padded columns contribute nothing to the distance.
                    // The tile is stored transposed (x-major) to match the read
                    // pattern s_train[j * BLOCK_SIZE + threadIdx.x] below.
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < train.cols)
                    {
                        T val;

                        // Row index clamped so out-of-range threads issue a safe load.
                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    __syncthreads(); // tile fully staged before any thread reads it

                    // Accumulate the partial distance over this BLOCK_SIZE-wide slice.
                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads(); // tile fully consumed before the next overwrite
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                // Accept only strictly better, in-range, mask-approved candidates.
                if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
                {
                    bestImgIdx = imgIdx;
                    bestDistance = distVal;
                    bestTrainIdx = trainIdx;
                }
            }
        }
137
        // Single-image match kernel with a cached query tile.
        // Launch: block = (BLOCK_SIZE, BLOCK_SIZE), grid.x = ceil(query.rows / BLOCK_SIZE);
        // dynamic smem >= (BLOCK_SIZE * max(MAX_DESC_LEN, BLOCK_SIZE) + BLOCK_SIZE^2) ints
        // (see the host launcher below).
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            // Partition smem: query cache first, then the train staging tile.
            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;

            // Single-image variant: myBestTrainIdx is deliberately passed for both the
            // trainIdx and imgIdx outputs; the imgIdx store (always 0) is immediately
            // overwritten by the trainIdx store, so the aliasing is harmless.
            loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

            __syncthreads(); // done with s_query/s_train before smem is re-partitioned

            // The same shared memory is overlaid with the reduction buffers.
            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

            // Lane 0 of each row publishes the winner for its query descriptor.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }
168
169         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
170         void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
171                                  const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
172                                  cudaStream_t stream)
173         {
174             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
175             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
176
177             const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
178
179             matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
180             cudaSafeCall( cudaGetLastError() );
181
182             if (stream == 0)
183                 cudaSafeCall( cudaDeviceSynchronize() );
184         }
185
        // Multi-image variant: scans n train matrices, also tracking which image
        // produced the best match. Layout mirrors the single-image kernel, plus a
        // third reduction buffer for image indices.
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                                            int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;
            int myBestImgIdx = -1;

            // Local copy so the per-image mask cursor can be advanced via next().
            Mask m = mask;

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next(); // advance the mask to the current train image
                loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
            }

            __syncthreads(); // done with s_query/s_train before smem is re-partitioned

            // Three reduction buffers overlay the same shared memory.
            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);

            // Lane 0 of each row publishes the result for its query descriptor.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestImgIdx[queryIdx] = myBestImgIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }
227
228         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
229         void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
230                                  const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
231                                  cudaStream_t stream)
232         {
233             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
234             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
235
236             const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
237
238             matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
239             cudaSafeCall( cudaGetLastError() );
240
241             if (stream == 0)
242                 cudaSafeCall( cudaDeviceSynchronize() );
243         }
244
245         ///////////////////////////////////////////////////////////////////////////////
246         // Match Unrolled
247
        // As loopUnrolledCached, but without a pre-cached query: both the query and
        // train tiles are re-staged from global memory on every unrolled step.
        // s_query and s_train are each BLOCK_SIZE x BLOCK_SIZE staging tiles.
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query,volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                     typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                     float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
        {
            // Ceil-divided tile count over the train rows.
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                #pragma unroll
                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    // Zero both tiles so padded columns contribute nothing.
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    // NOTE(review): the single query.cols bound guards the train load
                    // too — assumes train.cols == query.cols; confirm against callers.
                    if (loadX < query.cols)
                    {
                        T val;

                        // Row indices clamped so threads past the last row load safely.
                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; // stored transposed
                    }

                    __syncthreads(); // tiles fully staged before any thread reads them

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads(); // tiles fully consumed before the next overwrite
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                // Accept only strictly better, in-range, mask-approved candidates.
                if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
                {
                    bestImgIdx = imgIdx;
                    bestDistance = distVal;
                    bestTrainIdx = trainIdx;
                }
            }
        }
297
        // Single-image match kernel without the query cache.
        // Launch: block = (BLOCK_SIZE, BLOCK_SIZE), grid.x = ceil(query.rows / BLOCK_SIZE);
        // dynamic smem: 2 * BLOCK_SIZE * BLOCK_SIZE ints (two staging tiles, later
        // reused as the reduction buffers).
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            // myBestTrainIdx is passed for both the trainIdx and imgIdx outputs; the
            // imgIdx store (always 0) is overwritten by the trainIdx store, so the
            // aliasing is harmless in this single-image variant.
            loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

            __syncthreads(); // done with the staging tiles before smem is re-partitioned

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

            // Lane 0 of each row publishes the winner for its query descriptor.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }
326
327         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
328         void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
329                            const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
330                            cudaStream_t stream)
331         {
332             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
333             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
334
335             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
336
337             matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
338             cudaSafeCall( cudaGetLastError() );
339
340             if (stream == 0)
341                 cudaSafeCall( cudaDeviceSynchronize() );
342         }
343
        // Multi-image variant of the unrolled (non-cached) kernel: scans n train
        // matrices and tracks which image produced the best match.
        // Dynamic smem: 3 * BLOCK_SIZE * BLOCK_SIZE ints (reduce phase needs three buffers).
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                                      int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;
            int myBestImgIdx = -1;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            // Local copy so the per-image mask cursor can be advanced via next().
            Mask m = mask;

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next(); // advance the mask to the current train image
                loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
            }

            __syncthreads(); // done with the staging tiles before smem is re-partitioned

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);

            // Lane 0 of each row publishes the result for its query descriptor.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestImgIdx[queryIdx] = myBestImgIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }
383
384         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
385         void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
386                            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
387                            cudaStream_t stream)
388         {
389             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
390             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
391
392             const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
393
394             matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
395             cudaSafeCall( cudaGetLastError() );
396
397             if (stream == 0)
398                 cudaSafeCall( cudaDeviceSynchronize() );
399         }
400
401         ///////////////////////////////////////////////////////////////////////////////
402         // Match
403
        // Fully general tile loop: iterates over the descriptor columns at run time
        // (no compile-time unrolling), used when query.cols exceeds every
        // precompiled MAX_DESC_LEN. Otherwise identical in structure to loopUnrolled.
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __device__ void loop(int queryIdx, const PtrStepSz<T>& query, volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                             typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                             float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
        {
            // Ceil-divided tile count over the train rows.
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                // Ceil-divided slice count over the descriptor columns.
                for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    // Zero both tiles so padded columns contribute nothing.
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    // NOTE(review): the single query.cols bound guards the train load
                    // too — assumes train.cols == query.cols; confirm against callers.
                    if (loadX < query.cols)
                    {
                        T val;

                        // Row indices clamped so threads past the last row load safely.
                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; // stored transposed
                    }

                    __syncthreads(); // tiles fully staged before any thread reads them

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads(); // tiles fully consumed before the next overwrite
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                // Accept only strictly better, in-range, mask-approved candidates.
                if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
                {
                    bestImgIdx = imgIdx;
                    bestDistance = distVal;
                    bestTrainIdx = trainIdx;
                }
            }
        }
452
        // Single-image match kernel for arbitrary descriptor length.
        // Launch: block = (BLOCK_SIZE, BLOCK_SIZE), grid.x = ceil(query.rows / BLOCK_SIZE);
        // dynamic smem: 2 * BLOCK_SIZE * BLOCK_SIZE ints.
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            // myBestTrainIdx is passed for both the trainIdx and imgIdx outputs; the
            // imgIdx store (always 0) is overwritten by the trainIdx store, so the
            // aliasing is harmless in this single-image variant.
            loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

            __syncthreads(); // done with the staging tiles before smem is re-partitioned

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

            // Lane 0 of each row publishes the winner for its query descriptor.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }
481
482         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
483         void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
484                    const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
485                    cudaStream_t stream)
486         {
487             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
488             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
489
490             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
491
492             match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
493             cudaSafeCall( cudaGetLastError() );
494
495             if (stream == 0)
496                 cudaSafeCall( cudaDeviceSynchronize() );
497         }
498
        // Multi-image generic kernel: scans n train matrices of arbitrary descriptor
        // length, tracking which image produced the best match.
        // Dynamic smem: 3 * BLOCK_SIZE * BLOCK_SIZE ints.
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                              int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;
            int myBestImgIdx = -1;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            // Local copy so the per-image mask cursor can be advanced via next().
            Mask m = mask;
            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next(); // advance the mask to the current train image
                loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
            }

            __syncthreads(); // done with the staging tiles before smem is re-partitioned

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);

            // Lane 0 of each row publishes the result for its query descriptor.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestImgIdx[queryIdx] = myBestImgIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }
537
538         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
539         void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
540                    const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
541                    cudaStream_t stream)
542         {
543             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
544             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
545
546             const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
547
548             match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
549             cudaSafeCall( cudaGetLastError() );
550
551             if (stream == 0)
552                 cudaSafeCall( cudaDeviceSynchronize() );
553         }
554
555         ///////////////////////////////////////////////////////////////////////////////
556         // Match dispatcher
557
558         template <typename Dist, typename T, typename Mask>
559         void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
560                              const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
561                              cudaStream_t stream)
562         {
563             if (query.cols <= 64)
564             {
565                 matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);
566             }
567             else if (query.cols <= 128)
568             {
569                 matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream);
570             }
571             /*else if (query.cols <= 256)
572             {
573                 matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);
574             }
575             else if (query.cols <= 512)
576             {
577                 matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);
578             }
579             else if (query.cols <= 1024)
580             {
581                 matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);
582             }*/
583             else
584             {
585                 match<16, Dist>(query, train, mask, trainIdx, distance, stream);
586             }
587         }
588
589         template <typename Dist, typename T, typename Mask>
590         void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
591                              const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
592                              cudaStream_t stream)
593         {
594             if (query.cols <= 64)
595             {
596                 matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
597             }
598             else if (query.cols <= 128)
599             {
600                 matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
601             }
602             /*else if (query.cols <= 256)
603             {
604                 matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
605             }
606             else if (query.cols <= 512)
607             {
608                 matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
609             }
610             else if (query.cols <= 1024)
611             {
612                 matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
613             }*/
614             else
615             {
616                 match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
617             }
618         }
619
620         ///////////////////////////////////////////////////////////////////////////////
621         // Match caller
622
623         template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
624                                                const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
625                                                cudaStream_t stream)
626         {
627             if (mask.data)
628             {
629                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
630                     trainIdx, distance,
631                     stream);
632             }
633             else
634             {
635                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
636                     trainIdx, distance,
637                     stream);
638             }
639         }
640
        // Explicit instantiations for the element types exposed by the GPU BFMatcher
        // (the schar variant is intentionally left disabled).
        template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
647
648         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
649                                                const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
650                                                cudaStream_t stream)
651         {
652             if (mask.data)
653             {
654                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
655                     trainIdx, distance,
656                     stream);
657             }
658             else
659             {
660                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
661                     trainIdx, distance,
662                     stream);
663             }
664         }
665
        // Explicit instantiations for L2 matching: only the float variant is
        // enabled; the integer-typed variants are left disabled.
        //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
672
673         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
674                                                     const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
675                                                     cudaStream_t stream)
676         {
677             if (mask.data)
678             {
679                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
680                     trainIdx, distance,
681                     stream);
682             }
683             else
684             {
685                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
686                     trainIdx, distance,
687                     stream);
688             }
689         }
690
        // Explicit instantiations for Hamming matching: unsigned/int integer
        // element types are enabled; schar and short are left disabled.
        template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
696
697         template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
698                                                const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
699                                                 cudaStream_t stream)
700         {
701             if (masks.data)
702             {
703                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
704                     trainIdx, imgIdx, distance,
705                     stream);
706             }
707             else
708             {
709                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
710                     trainIdx, imgIdx, distance,
711                     stream);
712             }
713         }
714
        // Explicit instantiations of the collection overload for L1 matching
        // (the schar variant is left disabled, matching the single-train set).
        template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
721
722         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
723                                                const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
724                                                cudaStream_t stream)
725         {
726             if (masks.data)
727             {
728                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
729                     trainIdx, imgIdx, distance,
730                     stream);
731             }
732             else
733             {
734                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
735                     trainIdx, imgIdx, distance,
736                     stream);
737             }
738         }
739
740         //template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
741         //template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
742         //template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
743         //template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
744         //template void matchL2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
745         template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
746
747         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
748                                                     const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
749                                                     cudaStream_t stream)
750         {
751             if (masks.data)
752             {
753                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
754                     trainIdx, imgIdx, distance,
755                     stream);
756             }
757             else
758             {
759                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
760                     trainIdx, imgIdx, distance,
761                     stream);
762             }
763         }
764
        // Explicit instantiations of the collection overload for Hamming
        // matching: same enabled/disabled set as the single-train variant.
        template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
770     } // namespace bf_match
771 }}} // namespace cv { namespace gpu { namespace device {
772
773
774 #endif /* CUDA_DISABLER */