modules/gpu/src/cuda/bf_knnmatch.cu

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or bpied warranties, including, but not limited to, the bpied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 #if !defined CUDA_DISABLER
  44
  45 #include "internal_shared.hpp"
  46 #include "opencv2/gpu/device/limits.hpp"
  47 #include "opencv2/gpu/device/vec_distance.hpp"
  48 #include "opencv2/gpu/device/datamov_utils.hpp"
  49
  50 namespace cv { namespace gpu { namespace device
  51 {
  52     namespace bf_knnmatch
  53     {
  54         ///////////////////////////////////////////////////////////////////////////////
  55         // Reduction
  56
  57         template <int BLOCK_SIZE>
  58         __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
  59                                       int& bestTrainIdx1, int& bestTrainIdx2,
  60                                       float* s_distance, int* s_trainIdx)
  61         {
  62             float myBestDistance1 = numeric_limits<float>::max();
  63             float myBestDistance2 = numeric_limits<float>::max();
  64             int myBestTrainIdx1 = -1;
  65             int myBestTrainIdx2 = -1;
  66
  67             s_distance += threadIdx.y * BLOCK_SIZE;
  68             s_trainIdx += threadIdx.y * BLOCK_SIZE;
  69
  70             s_distance[threadIdx.x] = bestDistance1;
  71             s_trainIdx[threadIdx.x] = bestTrainIdx1;
  72
  73             __syncthreads();
  74
  75             if (threadIdx.x == 0)
  76             {
  77                 #pragma unroll
  78                 for (int i = 0; i < BLOCK_SIZE; ++i)
  79                 {
  80                     float val = s_distance[i];
  81
  82                     if (val < myBestDistance1)
  83                     {
  84                         myBestDistance2 = myBestDistance1;
  85                         myBestTrainIdx2 = myBestTrainIdx1;
  86
  87                         myBestDistance1 = val;
  88                         myBestTrainIdx1 = s_trainIdx[i];
  89                     }
  90                     else if (val < myBestDistance2)
  91                     {
  92                         myBestDistance2 = val;
  93                         myBestTrainIdx2 = s_trainIdx[i];
  94                     }
  95                 }
  96             }
  97
  98             __syncthreads();
  99
 100             s_distance[threadIdx.x] = bestDistance2;
 101             s_trainIdx[threadIdx.x] = bestTrainIdx2;
 102
 103             __syncthreads();
 104
 105             if (threadIdx.x == 0)
 106             {
 107                 #pragma unroll
 108                 for (int i = 0; i < BLOCK_SIZE; ++i)
 109                 {
 110                     float val = s_distance[i];
 111
 112                     if (val < myBestDistance2)
 113                     {
 114                         myBestDistance2 = val;
 115                         myBestTrainIdx2 = s_trainIdx[i];
 116                     }
 117                 }
 118             }
 119
 120             bestDistance1 = myBestDistance1;
 121             bestDistance2 = myBestDistance2;
 122
 123             bestTrainIdx1 = myBestTrainIdx1;
 124             bestTrainIdx2 = myBestTrainIdx2;
 125         }
 126
 127         template <int BLOCK_SIZE>
 128         __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
 129                                        int& bestTrainIdx1, int& bestTrainIdx2,
 130                                        int& bestImgIdx1, int& bestImgIdx2,
 131                                        float* s_distance, int* s_trainIdx, int* s_imgIdx)
 132         {
 133             float myBestDistance1 = numeric_limits<float>::max();
 134             float myBestDistance2 = numeric_limits<float>::max();
 135             int myBestTrainIdx1 = -1;
 136             int myBestTrainIdx2 = -1;
 137             int myBestImgIdx1 = -1;
 138             int myBestImgIdx2 = -1;
 139
 140             s_distance += threadIdx.y * BLOCK_SIZE;
 141             s_trainIdx += threadIdx.y * BLOCK_SIZE;
 142             s_imgIdx   += threadIdx.y * BLOCK_SIZE;
 143
 144             s_distance[threadIdx.x] = bestDistance1;
 145             s_trainIdx[threadIdx.x] = bestTrainIdx1;
 146             s_imgIdx[threadIdx.x]   = bestImgIdx1;
 147
 148             __syncthreads();
 149
 150             if (threadIdx.x == 0)
 151             {
 152                 #pragma unroll
 153                 for (int i = 0; i < BLOCK_SIZE; ++i)
 154                 {
 155                     float val = s_distance[i];
 156
 157                     if (val < myBestDistance1)
 158                     {
 159                         myBestDistance2 = myBestDistance1;
 160                         myBestTrainIdx2 = myBestTrainIdx1;
 161                         myBestImgIdx2   = myBestImgIdx1;
 162
 163                         myBestDistance1 = val;
 164                         myBestTrainIdx1 = s_trainIdx[i];
 165                         myBestImgIdx1   = s_imgIdx[i];
 166                     }
 167                     else if (val < myBestDistance2)
 168                     {
 169                         myBestDistance2 = val;
 170                         myBestTrainIdx2 = s_trainIdx[i];
 171                         myBestImgIdx2   = s_imgIdx[i];
 172                     }
 173                 }
 174             }
 175
 176             __syncthreads();
 177
 178             s_distance[threadIdx.x] = bestDistance2;
 179             s_trainIdx[threadIdx.x] = bestTrainIdx2;
 180             s_imgIdx[threadIdx.x]   = bestImgIdx2;
 181
 182             __syncthreads();
 183
 184             if (threadIdx.x == 0)
 185             {
 186                 #pragma unroll
 187                 for (int i = 0; i < BLOCK_SIZE; ++i)
 188                 {
 189                     float val = s_distance[i];
 190
 191                     if (val < myBestDistance2)
 192                     {
 193                         myBestDistance2 = val;
 194                         myBestTrainIdx2 = s_trainIdx[i];
 195                         myBestImgIdx2   = s_imgIdx[i];
 196                     }
 197                 }
 198             }
 199
 200             bestDistance1 = myBestDistance1;
 201             bestDistance2 = myBestDistance2;
 202
 203             bestTrainIdx1 = myBestTrainIdx1;
 204             bestTrainIdx2 = myBestTrainIdx2;
 205
 206             bestImgIdx1 = myBestImgIdx1;
 207             bestImgIdx2 = myBestImgIdx2;
 208         }
 209
 210         ///////////////////////////////////////////////////////////////////////////////
 211         // Match Unrolled Cached
 212
 213         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
 214         __device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
 215         {
 216             #pragma unroll
 217             for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
 218             {
 219                 const int loadX = threadIdx.x + i * BLOCK_SIZE;
 220                 s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
 221             }
 222         }
 223
 224         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 225         __device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
 226                                            typename Dist::value_type* s_query, typename Dist::value_type* s_train,
 227                                            float& bestDistance1, float& bestDistance2,
 228                                            int& bestTrainIdx1, int& bestTrainIdx2,
 229                                            int& bestImgIdx1, int& bestImgIdx2)
 230         {
 231             for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
 232             {
 233                 Dist dist;
 234
 235                 #pragma unroll
 236                 for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
 237                 {
 238                     const int loadX = threadIdx.x + i * BLOCK_SIZE;
 239
 240                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
 241
 242                     if (loadX < train.cols)
 243                     {
 244                         T val;
 245
 246                         ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
 247                         s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
 248                     }
 249
 250                     __syncthreads();
 251
 252                     #pragma unroll
 253                     for (int j = 0; j < BLOCK_SIZE; ++j)
 254                         dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
 255
 256                     __syncthreads();
 257                 }
 258
 259                 typename Dist::result_type distVal = dist;
 260
 261                 const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
 262
 263                 if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
 264                 {
 265                     if (distVal < bestDistance1)
 266                     {
 267                         bestImgIdx2   = bestImgIdx1;
 268                         bestDistance2 = bestDistance1;
 269                         bestTrainIdx2 = bestTrainIdx1;
 270
 271                         bestImgIdx1   = imgIdx;
 272                         bestDistance1 = distVal;
 273                         bestTrainIdx1 = trainIdx;
 274                     }
 275                     else if (distVal < bestDistance2)
 276                     {
 277                         bestImgIdx2   = imgIdx;
 278                         bestDistance2 = distVal;
 279                         bestTrainIdx2 = trainIdx;
 280                     }
 281                 }
 282             }
 283         }
 284
 285         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 286         __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
 287         {
 288             extern __shared__ int smem[];
 289
 290             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
 291
 292             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
 293             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
 294
 295             loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
 296
 297             float myBestDistance1 = numeric_limits<float>::max();
 298             float myBestDistance2 = numeric_limits<float>::max();
 299             int myBestTrainIdx1 = -1;
 300             int myBestTrainIdx2 = -1;
 301
 302             loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
 303
 304             __syncthreads();
 305
 306             float* s_distance = (float*)(smem);
 307             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 308
 309             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
 310
 311             if (queryIdx < query.rows && threadIdx.x == 0)
 312             {
 313                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
 314                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
 315             }
 316         }
 317
 318         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 319         void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
 320                                  const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
 321                                  cudaStream_t stream)
 322         {
 323             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
 324             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
 325
 326             const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
 327
 328             matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
 329             cudaSafeCall( cudaGetLastError() );
 330
 331             if (stream == 0)
 332                 cudaSafeCall( cudaDeviceSynchronize() );
 333         }
 334
 335         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 336         __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
 337         {
 338             extern __shared__ int smem[];
 339
 340             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
 341
 342             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
 343             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
 344
 345             loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
 346
 347             float myBestDistance1 = numeric_limits<float>::max();
 348             float myBestDistance2 = numeric_limits<float>::max();
 349             int myBestTrainIdx1 = -1;
 350             int myBestTrainIdx2 = -1;
 351             int myBestImgIdx1 = -1;
 352             int myBestImgIdx2 = -1;
 353
 354             Mask m = mask;
 355
 356             for (int imgIdx = 0; imgIdx < n; ++imgIdx)
 357             {
 358                 const PtrStepSz<T> train = trains[imgIdx];
 359                 m.next();
 360                 loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
 361             }
 362
 363             __syncthreads();
 364
 365             float* s_distance = (float*)(smem);
 366             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 367             int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
 368
 369             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
 370
 371             if (queryIdx < query.rows && threadIdx.x == 0)
 372             {
 373                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
 374                 bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
 375                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
 376             }
 377         }
 378
 379         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 380         void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
 381                                  const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
 382                                  cudaStream_t stream)
 383         {
 384             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
 385             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
 386
 387             const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
 388
 389             matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
 390             cudaSafeCall( cudaGetLastError() );
 391
 392             if (stream == 0)
 393                 cudaSafeCall( cudaDeviceSynchronize() );
 394         }
 395
 396         ///////////////////////////////////////////////////////////////////////////////
 397         // Match Unrolled
 398
 399         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 400         __device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
 401                                      typename Dist::value_type* s_query, typename Dist::value_type* s_train,
 402                                      float& bestDistance1, float& bestDistance2,
 403                                      int& bestTrainIdx1, int& bestTrainIdx2,
 404                                      int& bestImgIdx1, int& bestImgIdx2)
 405         {
 406             for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
 407             {
 408                 Dist dist;
 409
 410                 #pragma unroll
 411                 for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
 412                 {
 413                     const int loadX = threadIdx.x + i * BLOCK_SIZE;
 414
 415                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
 416                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
 417
 418                     if (loadX < query.cols)
 419                     {
 420                         T val;
 421
 422                         ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
 423                         s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
 424
 425                         ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
 426                         s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
 427                     }
 428
 429                     __syncthreads();
 430
 431                     #pragma unroll
 432                     for (int j = 0; j < BLOCK_SIZE; ++j)
 433                         dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
 434
 435                     __syncthreads();
 436                 }
 437
 438                 typename Dist::result_type distVal = dist;
 439
 440                 const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
 441
 442                 if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
 443                 {
 444                     if (distVal < bestDistance1)
 445                     {
 446                         bestImgIdx2   = bestImgIdx1;
 447                         bestDistance2 = bestDistance1;
 448                         bestTrainIdx2 = bestTrainIdx1;
 449
 450                         bestImgIdx1   = imgIdx;
 451                         bestDistance1 = distVal;
 452                         bestTrainIdx1 = trainIdx;
 453                     }
 454                     else if (distVal < bestDistance2)
 455                     {
 456                         bestImgIdx2   = imgIdx;
 457                         bestDistance2 = distVal;
 458                         bestTrainIdx2 = trainIdx;
 459                     }
 460                 }
 461             }
 462         }
 463
 464         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 465         __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
 466         {
 467             extern __shared__ int smem[];
 468
 469             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
 470
 471             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
 472             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 473
 474             float myBestDistance1 = numeric_limits<float>::max();
 475             float myBestDistance2 = numeric_limits<float>::max();
 476             int myBestTrainIdx1 = -1;
 477             int myBestTrainIdx2 = -1;
 478
 479             loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
 480
 481             __syncthreads();
 482
 483             float* s_distance = (float*)(smem);
 484             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 485
 486             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
 487
 488             if (queryIdx < query.rows && threadIdx.x == 0)
 489             {
 490                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
 491                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
 492             }
 493         }
 494
 495         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 496         void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
 497                            const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
 498                            cudaStream_t stream)
 499         {
 500             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
 501             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
 502
 503             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
 504
 505             matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
 506             cudaSafeCall( cudaGetLastError() );
 507
 508             if (stream == 0)
 509                 cudaSafeCall( cudaDeviceSynchronize() );
 510         }
 511
 512         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 513         __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
 514         {
 515             extern __shared__ int smem[];
 516
 517             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
 518
 519             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
 520             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 521
 522             float myBestDistance1 = numeric_limits<float>::max();
 523             float myBestDistance2 = numeric_limits<float>::max();
 524             int myBestTrainIdx1 = -1;
 525             int myBestTrainIdx2 = -1;
 526             int myBestImgIdx1 = -1;
 527             int myBestImgIdx2 = -1;
 528
 529             Mask m = mask;
 530
 531             for (int imgIdx = 0; imgIdx < n; ++imgIdx)
 532             {
 533                 const PtrStepSz<T> train = trains[imgIdx];
 534                 m.next();
 535                 loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
 536             }
 537
 538             __syncthreads();
 539
 540             float* s_distance = (float*)(smem);
 541             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 542             int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
 543
 544             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
 545
 546             if (queryIdx < query.rows && threadIdx.x == 0)
 547             {
 548                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
 549                 bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
 550                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
 551             }
 552         }
 553
 554         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 555         void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
 556                            const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
 557                            cudaStream_t stream)
 558         {
 559             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
 560             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
 561
 562             const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
 563
 564             matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
 565             cudaSafeCall( cudaGetLastError() );
 566
 567             if (stream == 0)
 568                 cudaSafeCall( cudaDeviceSynchronize() );
 569         }
 570
 571         ///////////////////////////////////////////////////////////////////////////////
 572         // Match
 573
 574         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
 575         __device__ void loop(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
 576                              typename Dist::value_type* s_query, typename Dist::value_type* s_train,
 577                              float& bestDistance1, float& bestDistance2,
 578                              int& bestTrainIdx1, int& bestTrainIdx2,
 579                              int& bestImgIdx1, int& bestImgIdx2)
 580         {
 581             for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
 582             {
 583                 Dist dist;
 584
 585                 for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
 586                 {
 587                     const int loadX = threadIdx.x + i * BLOCK_SIZE;
 588
 589                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
 590                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
 591
 592                     if (loadX < query.cols)
 593                     {
 594                         T val;
 595
 596                         ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
 597                         s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
 598
 599                         ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
 600                         s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
 601                     }
 602
 603                     __syncthreads();
 604
 605                     #pragma unroll
 606                     for (int j = 0; j < BLOCK_SIZE; ++j)
 607                         dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
 608
 609                     __syncthreads();
 610                 }
 611
 612                 typename Dist::result_type distVal = dist;
 613
 614                 const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
 615
 616                 if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
 617                 {
 618                     if (distVal < bestDistance1)
 619                     {
 620                         bestImgIdx2   = bestImgIdx1;
 621                         bestDistance2 = bestDistance1;
 622                         bestTrainIdx2 = bestTrainIdx1;
 623
 624                         bestImgIdx1   = imgIdx;
 625                         bestDistance1 = distVal;
 626                         bestTrainIdx1 = trainIdx;
 627                     }
 628                     else if (distVal < bestDistance2)
 629                     {
 630                         bestImgIdx2   = imgIdx;
 631                         bestDistance2 = distVal;
 632                         bestTrainIdx2 = trainIdx;
 633                     }
 634                 }
 635             }
 636         }
 637
 638         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
 639         __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
 640         {
 641             extern __shared__ int smem[];
 642
 643             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
 644
 645             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
 646             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 647
 648             float myBestDistance1 = numeric_limits<float>::max();
 649             float myBestDistance2 = numeric_limits<float>::max();
 650             int myBestTrainIdx1 = -1;
 651             int myBestTrainIdx2 = -1;
 652
 653             loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
 654
 655             __syncthreads();
 656
 657             float* s_distance = (float*)(smem);
 658             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 659
 660             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
 661
 662             if (queryIdx < query.rows && threadIdx.x == 0)
 663             {
 664                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
 665                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
 666             }
 667         }
 668
 669         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
 670         void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
 671                    const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
 672                    cudaStream_t stream)
 673         {
 674             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
 675             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
 676
 677             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
 678
 679             match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
 680             cudaSafeCall( cudaGetLastError() );
 681
 682             if (stream == 0)
 683                 cudaSafeCall( cudaDeviceSynchronize() );
 684         }
 685
 686         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
 687         __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
 688         {
 689             extern __shared__ int smem[];
 690
 691             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
 692
 693             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
 694             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 695
 696             float myBestDistance1 = numeric_limits<float>::max();
 697             float myBestDistance2 = numeric_limits<float>::max();
 698             int myBestTrainIdx1 = -1;
 699             int myBestTrainIdx2 = -1;
 700             int myBestImgIdx1 = -1;
 701             int myBestImgIdx2 = -1;
 702
 703             Mask m = mask;
 704
 705             for (int imgIdx = 0; imgIdx < n; ++imgIdx)
 706             {
 707                 const PtrStepSz<T> train = trains[imgIdx];
 708                 m.next();
 709                 loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
 710             }
 711
 712             __syncthreads();
 713
 714             float* s_distance = (float*)(smem);
 715             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 716             int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
 717
 718             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
 719
 720             if (queryIdx < query.rows && threadIdx.x == 0)
 721             {
 722                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
 723                 bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
 724                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
 725             }
 726         }
 727
 728         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
 729         void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
 730                    const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
 731                    cudaStream_t stream)
 732         {
 733             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
 734             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
 735
 736             const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
 737
 738             match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
 739             cudaSafeCall( cudaGetLastError() );
 740
 741             if (stream == 0)
 742                 cudaSafeCall( cudaDeviceSynchronize() );
 743         }
 744
 745         ///////////////////////////////////////////////////////////////////////////////
 746         // knnMatch 2 dispatcher
 747
 748         template <typename Dist, typename T, typename Mask>
 749         void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
 750                               const PtrStepSzb& trainIdx, const PtrStepSzb& distance,
 751                               int cc, cudaStream_t stream)
 752         {
 753             (void)cc;
 754             if (query.cols <= 64)
 755             {
 756                 matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 757             }
 758             else if (query.cols <= 128)
 759             {
 760                 matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 761             }
 762             /*else if (query.cols <= 256)
 763             {
 764                 matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 765             }
 766             else if (query.cols <= 512)
 767             {
 768                 matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 769             }
 770             else if (query.cols <= 1024)
 771             {
 772                 matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 773             }*/
 774             else
 775             {
 776                 match<16, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 777             }
 778         }
 779
 780         template <typename Dist, typename T, typename Mask>
 781         void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
 782                               const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
 783                               int cc, cudaStream_t stream)
 784         {
 785             (void)cc;
 786             if (query.cols <= 64)
 787             {
 788                 matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 789             }
 790             else if (query.cols <= 128)
 791             {
 792                 matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 793             }
 794             /*else if (query.cols <= 256)
 795             {
 796                 matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 797             }
 798             else if (query.cols <= 512)
 799             {
 800                 matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 801             }
 802             else if (query.cols <= 1024)
 803             {
 804                 matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 805             }*/
 806             else
 807             {
 808                 match<16, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 809             }
 810         }
 811
 812         ///////////////////////////////////////////////////////////////////////////////
 813         // Calc distance kernel
 814
 815         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 816         __global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
 817         {
 818             extern __shared__ int smem[];
 819
 820             const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
 821             const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
 822
 823             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
 824             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 825
 826             Dist dist;
 827
 828             #pragma unroll
 829             for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
 830             {
 831                 const int loadX = threadIdx.x + i * BLOCK_SIZE;
 832
 833                 if (loadX < query.cols)
 834                 {
 835                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
 836                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
 837                 }
 838                 else
 839                 {
 840                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
 841                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
 842                 }
 843
 844                 __syncthreads();
 845
 846                 #pragma unroll
 847                 for (int j = 0; j < BLOCK_SIZE; ++j)
 848                     dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
 849
 850                 __syncthreads();
 851             }
 852
 853             if (queryIdx < query.rows && trainIdx < train.rows)
 854             {
 855                 float distVal = numeric_limits<float>::max();
 856
 857                 if (mask(queryIdx, trainIdx))
 858                     distVal = (typename Dist::result_type)dist;
 859
 860                 allDist.ptr(queryIdx)[trainIdx] = distVal;
 861             }
 862         }
 863
 864         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 865         void calcDistanceUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
 866         {
 867             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
 868             const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
 869
 870             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
 871
 872             calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
 873             cudaSafeCall( cudaGetLastError() );
 874
 875             if (stream == 0)
 876                 cudaSafeCall( cudaDeviceSynchronize() );
 877         }
 878
 879         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
 880         __global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
 881         {
 882             extern __shared__ int smem[];
 883
 884             const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
 885             const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
 886
 887             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
 888             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 889
 890             Dist dist;
 891
 892             for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
 893             {
 894                 const int loadX = threadIdx.x + i * BLOCK_SIZE;
 895
 896                 if (loadX < query.cols)
 897                 {
 898                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
 899                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
 900                 }
 901                 else
 902                 {
 903                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
 904                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
 905                 }
 906
 907                 __syncthreads();
 908
 909                 #pragma unroll
 910                 for (int j = 0; j < BLOCK_SIZE; ++j)
 911                     dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
 912
 913                 __syncthreads();
 914             }
 915
 916             if (queryIdx < query.rows && trainIdx < train.rows)
 917             {
 918                 float distVal = numeric_limits<float>::max();
 919
 920                 if (mask(queryIdx, trainIdx))
 921                     distVal = (typename Dist::result_type)dist;
 922
 923                 allDist.ptr(queryIdx)[trainIdx] = distVal;
 924             }
 925         }
 926
 927         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
 928         void calcDistance(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
 929         {
 930             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
 931             const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
 932
 933             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
 934
 935             calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
 936             cudaSafeCall( cudaGetLastError() );
 937
 938             if (stream == 0)
 939                 cudaSafeCall( cudaDeviceSynchronize() );
 940         }
 941
 942         ///////////////////////////////////////////////////////////////////////////////
 943         // Calc Distance dispatcher
 944
 945         template <typename Dist, typename T, typename Mask>
 946         void calcDistanceDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
 947                                     const PtrStepSzf& allDist,
 948                                     int cc, cudaStream_t stream)
 949         {
 950             (void)cc;
 951             if (query.cols <= 64)
 952             {
 953                 calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
 954             }
 955             else if (query.cols <= 128)
 956             {
 957                 calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);
 958             }
 959             /*else if (query.cols <= 256)
 960             {
 961                 calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);
 962             }
 963             else if (query.cols <= 512)
 964             {
 965                 calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);
 966             }
 967             else if (query.cols <= 1024)
 968             {
 969                 calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);
 970             }*/
 971             else
 972             {
 973                 calcDistance<16, Dist>(query, train, mask, allDist, stream);
 974             }
 975         }
 976
 977         ///////////////////////////////////////////////////////////////////////////////
 978         // find knn match kernel
 979
 980         template <int BLOCK_SIZE>
 981         __global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
 982         {
 983             const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
 984             __shared__ float s_dist[SMEM_SIZE];
 985             __shared__ int s_trainIdx[SMEM_SIZE];
 986
 987             const int queryIdx = blockIdx.x;
 988
 989             float* allDistRow = allDist.ptr(queryIdx);
 990
 991             float dist = numeric_limits<float>::max();
 992             int bestIdx = -1;
 993
 994             for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE)
 995             {
 996                 float reg = allDistRow[i];
 997                 if (reg < dist)
 998                 {
 999                     dist = reg;
1000                     bestIdx = i;
1001                 }
1002             }
1003
1004             s_dist[threadIdx.x] = dist;
1005             s_trainIdx[threadIdx.x] = bestIdx;
1006             __syncthreads();
1007
1008             reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());
1009
1010             if (threadIdx.x == 0)
1011             {
1012                 if (dist < numeric_limits<float>::max())
1013                 {
1014                     allDistRow[bestIdx] = numeric_limits<float>::max();
1015                     trainIdx.ptr(queryIdx)[i] = bestIdx;
1016                     distance.ptr(queryIdx)[i] = dist;
1017                 }
1018             }
1019         }
1020
1021         template <int BLOCK_SIZE>
1022         void findKnnMatch(int k, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzf& allDist, cudaStream_t stream)
1023         {
1024             const dim3 block(BLOCK_SIZE, 1, 1);
1025             const dim3 grid(trainIdx.rows, 1, 1);
1026
1027             for (int i = 0; i < k; ++i)
1028             {
1029                 findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);
1030                 cudaSafeCall( cudaGetLastError() );
1031             }
1032
1033             if (stream == 0)
1034                 cudaSafeCall( cudaDeviceSynchronize() );
1035         }
1036
1037         void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream)
1038         {
1039             findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
1040         }
1041
1042         ///////////////////////////////////////////////////////////////////////////////
1043         // knn match Dispatcher
1044
1045         template <typename Dist, typename T, typename Mask>
1046         void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, int k, const Mask& mask,
1047             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
1048             int cc, cudaStream_t stream)
1049         {
1050             if (k == 2)
1051             {
1052                 match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream);
1053             }
1054             else
1055             {
1056                 calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream);
1057                 findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream);
1058             }
1059         }
1060
1061         ///////////////////////////////////////////////////////////////////////////////
1062         // knn match caller
1063
1064         template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
1065             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
1066             int cc, cudaStream_t stream)
1067         {
1068             if (mask.data)
1069                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
1070             else
1071                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
1072         }
1073
1074         template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
1075         //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
1076         template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
1077         template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
1078         template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
1079         template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
1080
1081         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
1082             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
1083             int cc, cudaStream_t stream)
1084         {
1085             if (mask.data)
1086                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
1087             else
1088                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
1089         }
1090
1091         //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
1092         //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
1093         //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
1094         //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
1095         //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
1096         template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
1097
1098         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
1099             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
1100             int cc, cudaStream_t stream)
1101         {
1102             if (mask.data)
1103                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
1104             else
1105                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
1106         }
1107
1108         template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
1109         //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
1110         template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
1111         //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
1112         template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream);
1113
1114         template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
1115             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
1116             int cc, cudaStream_t stream)
1117         {
1118             if (masks.data)
1119                 match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
1120             else
1121                 match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
1122         }
1123
1124         template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
1125         //template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
1126         template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
1127         template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
1128         template void match2L1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
1129         template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
1130
1131         template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
1132             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
1133             int cc, cudaStream_t stream)
1134         {
1135             if (masks.data)
1136                 match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
1137             else
1138                 match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
1139         }
1140
1141         //template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
1142         //template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
1143         //template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
1144         //template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
1145         //template void match2L2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
1146         template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
1147
1148         template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
1149             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
1150             int cc, cudaStream_t stream)
1151         {
1152             if (masks.data)
1153                 match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
1154             else
1155                 match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
1156         }
1157
1158         template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
1159         //template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
1160         template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
1161         //template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
1162         template void match2Hamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream);
1163     } // namespace bf_knnmatch
1164 }}} // namespace cv { namespace gpu { namespace device {
1165
1166
1167 #endif /* CUDA_DISABLER */