modules/gpu/src/cuda/bf_knnmatch.cu

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or bpied warranties, including, but not limited to, the bpied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 #if !defined CUDA_DISABLER
  44
  45 #include "opencv2/gpu/device/common.hpp"
  46 #include "opencv2/gpu/device/utility.hpp"
  47 #include "opencv2/gpu/device/reduce.hpp"
  48 #include "opencv2/gpu/device/limits.hpp"
  49 #include "opencv2/gpu/device/vec_distance.hpp"
  50 #include "opencv2/gpu/device/datamov_utils.hpp"
  51 #include "opencv2/gpu/device/warp_shuffle.hpp"
  52
  53 namespace cv { namespace gpu { namespace device
  54 {
  55     namespace bf_knnmatch
  56     {
  57         ///////////////////////////////////////////////////////////////////////////////
  58         // Reduction
  59
  60         template <int BLOCK_SIZE>
  61         __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
  62                                       int& bestTrainIdx1, int& bestTrainIdx2,
  63                                       float* s_distance, int* s_trainIdx)
  64         {
  65         #if __CUDA_ARCH__ >= 300
  66             (void) s_distance;
  67             (void) s_trainIdx;
  68
  69             float d1, d2;
  70             int i1, i2;
  71
  72             #pragma unroll
  73             for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
  74             {
  75                 d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
  76                 d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
  77                 i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
  78                 i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
  79
  80                 if (bestDistance1 < d1)
  81                 {
  82                     if (d1 < bestDistance2)
  83                     {
  84                         bestDistance2 = d1;
  85                         bestTrainIdx2 = i1;
  86                     }
  87                 }
  88                 else
  89                 {
  90                     bestDistance2 = bestDistance1;
  91                     bestTrainIdx2 = bestTrainIdx1;
  92
  93                     bestDistance1 = d1;
  94                     bestTrainIdx1 = i1;
  95
  96                     if (d2 < bestDistance2)
  97                     {
  98                         bestDistance2 = d2;
  99                         bestTrainIdx2 = i2;
 100                     }
 101                 }
 102             }
 103         #else
 104             float myBestDistance1 = numeric_limits<float>::max();
 105             float myBestDistance2 = numeric_limits<float>::max();
 106             int myBestTrainIdx1 = -1;
 107             int myBestTrainIdx2 = -1;
 108
 109             s_distance += threadIdx.y * BLOCK_SIZE;
 110             s_trainIdx += threadIdx.y * BLOCK_SIZE;
 111
 112             s_distance[threadIdx.x] = bestDistance1;
 113             s_trainIdx[threadIdx.x] = bestTrainIdx1;
 114
 115             __syncthreads();
 116
 117             if (threadIdx.x == 0)
 118             {
 119                 #pragma unroll
 120                 for (int i = 0; i < BLOCK_SIZE; ++i)
 121                 {
 122                     float val = s_distance[i];
 123
 124                     if (val < myBestDistance1)
 125                     {
 126                         myBestDistance2 = myBestDistance1;
 127                         myBestTrainIdx2 = myBestTrainIdx1;
 128
 129                         myBestDistance1 = val;
 130                         myBestTrainIdx1 = s_trainIdx[i];
 131                     }
 132                     else if (val < myBestDistance2)
 133                     {
 134                         myBestDistance2 = val;
 135                         myBestTrainIdx2 = s_trainIdx[i];
 136                     }
 137                 }
 138             }
 139
 140             __syncthreads();
 141
 142             s_distance[threadIdx.x] = bestDistance2;
 143             s_trainIdx[threadIdx.x] = bestTrainIdx2;
 144
 145             __syncthreads();
 146
 147             if (threadIdx.x == 0)
 148             {
 149                 #pragma unroll
 150                 for (int i = 0; i < BLOCK_SIZE; ++i)
 151                 {
 152                     float val = s_distance[i];
 153
 154                     if (val < myBestDistance2)
 155                     {
 156                         myBestDistance2 = val;
 157                         myBestTrainIdx2 = s_trainIdx[i];
 158                     }
 159                 }
 160             }
 161
 162             bestDistance1 = myBestDistance1;
 163             bestDistance2 = myBestDistance2;
 164
 165             bestTrainIdx1 = myBestTrainIdx1;
 166             bestTrainIdx2 = myBestTrainIdx2;
 167         #endif
 168         }
 169
 170         template <int BLOCK_SIZE>
 171         __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
 172                                        int& bestTrainIdx1, int& bestTrainIdx2,
 173                                        int& bestImgIdx1, int& bestImgIdx2,
 174                                        float* s_distance, int* s_trainIdx, int* s_imgIdx)
 175         {
 176         #if __CUDA_ARCH__ >= 300
 177             (void) s_distance;
 178             (void) s_trainIdx;
 179             (void) s_imgIdx;
 180
 181             float d1, d2;
 182             int i1, i2;
 183             int j1, j2;
 184
 185             #pragma unroll
 186             for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
 187             {
 188                 d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
 189                 d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
 190                 i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
 191                 i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
 192                 j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE);
 193                 j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE);
 194
 195                 if (bestDistance1 < d1)
 196                 {
 197                     if (d1 < bestDistance2)
 198                     {
 199                         bestDistance2 = d1;
 200                         bestTrainIdx2 = i1;
 201                         bestImgIdx2 = j1;
 202                     }
 203                 }
 204                 else
 205                 {
 206                     bestDistance2 = bestDistance1;
 207                     bestTrainIdx2 = bestTrainIdx1;
 208                     bestImgIdx2 = bestImgIdx1;
 209
 210                     bestDistance1 = d1;
 211                     bestTrainIdx1 = i1;
 212                     bestImgIdx1 = j1;
 213
 214                     if (d2 < bestDistance2)
 215                     {
 216                         bestDistance2 = d2;
 217                         bestTrainIdx2 = i2;
 218                         bestImgIdx2 = j2;
 219                     }
 220                 }
 221             }
 222         #else
 223             float myBestDistance1 = numeric_limits<float>::max();
 224             float myBestDistance2 = numeric_limits<float>::max();
 225             int myBestTrainIdx1 = -1;
 226             int myBestTrainIdx2 = -1;
 227             int myBestImgIdx1 = -1;
 228             int myBestImgIdx2 = -1;
 229
 230             s_distance += threadIdx.y * BLOCK_SIZE;
 231             s_trainIdx += threadIdx.y * BLOCK_SIZE;
 232             s_imgIdx   += threadIdx.y * BLOCK_SIZE;
 233
 234             s_distance[threadIdx.x] = bestDistance1;
 235             s_trainIdx[threadIdx.x] = bestTrainIdx1;
 236             s_imgIdx[threadIdx.x]   = bestImgIdx1;
 237
 238             __syncthreads();
 239
 240             if (threadIdx.x == 0)
 241             {
 242                 #pragma unroll
 243                 for (int i = 0; i < BLOCK_SIZE; ++i)
 244                 {
 245                     float val = s_distance[i];
 246
 247                     if (val < myBestDistance1)
 248                     {
 249                         myBestDistance2 = myBestDistance1;
 250                         myBestTrainIdx2 = myBestTrainIdx1;
 251                         myBestImgIdx2   = myBestImgIdx1;
 252
 253                         myBestDistance1 = val;
 254                         myBestTrainIdx1 = s_trainIdx[i];
 255                         myBestImgIdx1   = s_imgIdx[i];
 256                     }
 257                     else if (val < myBestDistance2)
 258                     {
 259                         myBestDistance2 = val;
 260                         myBestTrainIdx2 = s_trainIdx[i];
 261                         myBestImgIdx2   = s_imgIdx[i];
 262                     }
 263                 }
 264             }
 265
 266             __syncthreads();
 267
 268             s_distance[threadIdx.x] = bestDistance2;
 269             s_trainIdx[threadIdx.x] = bestTrainIdx2;
 270             s_imgIdx[threadIdx.x]   = bestImgIdx2;
 271
 272             __syncthreads();
 273
 274             if (threadIdx.x == 0)
 275             {
 276                 #pragma unroll
 277                 for (int i = 0; i < BLOCK_SIZE; ++i)
 278                 {
 279                     float val = s_distance[i];
 280
 281                     if (val < myBestDistance2)
 282                     {
 283                         myBestDistance2 = val;
 284                         myBestTrainIdx2 = s_trainIdx[i];
 285                         myBestImgIdx2   = s_imgIdx[i];
 286                     }
 287                 }
 288             }
 289
 290             bestDistance1 = myBestDistance1;
 291             bestDistance2 = myBestDistance2;
 292
 293             bestTrainIdx1 = myBestTrainIdx1;
 294             bestTrainIdx2 = myBestTrainIdx2;
 295
 296             bestImgIdx1 = myBestImgIdx1;
 297             bestImgIdx2 = myBestImgIdx2;
 298         #endif
 299         }
 300
 301         ///////////////////////////////////////////////////////////////////////////////
 302         // Match Unrolled Cached
 303
 304         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
 305         __device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
 306         {
 307             #pragma unroll
 308             for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
 309             {
 310                 const int loadX = threadIdx.x + i * BLOCK_SIZE;
 311                 s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
 312             }
 313         }
 314
 315         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 316         __device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
 317                                            typename Dist::value_type* s_query, typename Dist::value_type* s_train,
 318                                            float& bestDistance1, float& bestDistance2,
 319                                            int& bestTrainIdx1, int& bestTrainIdx2,
 320                                            int& bestImgIdx1, int& bestImgIdx2)
 321         {
 322             for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
 323             {
 324                 Dist dist;
 325
 326                 #pragma unroll
 327                 for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
 328                 {
 329                     const int loadX = threadIdx.x + i * BLOCK_SIZE;
 330
 331                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
 332
 333                     if (loadX < train.cols)
 334                     {
 335                         T val;
 336
 337                         ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
 338                         s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
 339                     }
 340
 341                     __syncthreads();
 342
 343                     #pragma unroll
 344                     for (int j = 0; j < BLOCK_SIZE; ++j)
 345                         dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
 346
 347                     __syncthreads();
 348                 }
 349
 350                 typename Dist::result_type distVal = dist;
 351
 352                 const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
 353
 354                 if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
 355                 {
 356                     if (distVal < bestDistance1)
 357                     {
 358                         bestImgIdx2   = bestImgIdx1;
 359                         bestDistance2 = bestDistance1;
 360                         bestTrainIdx2 = bestTrainIdx1;
 361
 362                         bestImgIdx1   = imgIdx;
 363                         bestDistance1 = distVal;
 364                         bestTrainIdx1 = trainIdx;
 365                     }
 366                     else if (distVal < bestDistance2)
 367                     {
 368                         bestImgIdx2   = imgIdx;
 369                         bestDistance2 = distVal;
 370                         bestTrainIdx2 = trainIdx;
 371                     }
 372                 }
 373             }
 374         }
 375
 376         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 377         __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
 378         {
 379             extern __shared__ int smem[];
 380
 381             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
 382
 383             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
 384             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
 385
 386             loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
 387
 388             float myBestDistance1 = numeric_limits<float>::max();
 389             float myBestDistance2 = numeric_limits<float>::max();
 390             int myBestTrainIdx1 = -1;
 391             int myBestTrainIdx2 = -1;
 392
 393             loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
 394
 395             __syncthreads();
 396
 397             float* s_distance = (float*)(smem);
 398             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 399
 400             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
 401
 402             if (queryIdx < query.rows && threadIdx.x == 0)
 403             {
 404                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
 405                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
 406             }
 407         }
 408
 409         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 410         void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
 411                                  const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
 412                                  cudaStream_t stream)
 413         {
 414             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
 415             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
 416
 417             const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
 418
 419             matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
 420             cudaSafeCall( cudaGetLastError() );
 421
 422             if (stream == 0)
 423                 cudaSafeCall( cudaDeviceSynchronize() );
 424         }
 425
 426         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 427         __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
 428         {
 429             extern __shared__ int smem[];
 430
 431             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
 432
 433             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
 434             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
 435
 436             loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
 437
 438             float myBestDistance1 = numeric_limits<float>::max();
 439             float myBestDistance2 = numeric_limits<float>::max();
 440             int myBestTrainIdx1 = -1;
 441             int myBestTrainIdx2 = -1;
 442             int myBestImgIdx1 = -1;
 443             int myBestImgIdx2 = -1;
 444
 445             Mask m = mask;
 446
 447             for (int imgIdx = 0; imgIdx < n; ++imgIdx)
 448             {
 449                 const PtrStepSz<T> train = trains[imgIdx];
 450                 m.next();
 451                 loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
 452             }
 453
 454             __syncthreads();
 455
 456             float* s_distance = (float*)(smem);
 457             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 458             int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
 459
 460             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
 461
 462             if (queryIdx < query.rows && threadIdx.x == 0)
 463             {
 464                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
 465                 bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
 466                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
 467             }
 468         }
 469
 470         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 471         void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
 472                                  const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
 473                                  cudaStream_t stream)
 474         {
 475             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
 476             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
 477
 478             const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
 479
 480             matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
 481             cudaSafeCall( cudaGetLastError() );
 482
 483             if (stream == 0)
 484                 cudaSafeCall( cudaDeviceSynchronize() );
 485         }
 486
 487         ///////////////////////////////////////////////////////////////////////////////
 488         // Match Unrolled
 489
 490         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 491         __device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
 492                                      typename Dist::value_type* s_query, typename Dist::value_type* s_train,
 493                                      float& bestDistance1, float& bestDistance2,
 494                                      int& bestTrainIdx1, int& bestTrainIdx2,
 495                                      int& bestImgIdx1, int& bestImgIdx2)
 496         {
 497             for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
 498             {
 499                 Dist dist;
 500
 501                 #pragma unroll
 502                 for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
 503                 {
 504                     const int loadX = threadIdx.x + i * BLOCK_SIZE;
 505
 506                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
 507                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
 508
 509                     if (loadX < query.cols)
 510                     {
 511                         T val;
 512
 513                         ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
 514                         s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
 515
 516                         ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
 517                         s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
 518                     }
 519
 520                     __syncthreads();
 521
 522                     #pragma unroll
 523                     for (int j = 0; j < BLOCK_SIZE; ++j)
 524                         dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
 525
 526                     __syncthreads();
 527                 }
 528
 529                 typename Dist::result_type distVal = dist;
 530
 531                 const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
 532
 533                 if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
 534                 {
 535                     if (distVal < bestDistance1)
 536                     {
 537                         bestImgIdx2   = bestImgIdx1;
 538                         bestDistance2 = bestDistance1;
 539                         bestTrainIdx2 = bestTrainIdx1;
 540
 541                         bestImgIdx1   = imgIdx;
 542                         bestDistance1 = distVal;
 543                         bestTrainIdx1 = trainIdx;
 544                     }
 545                     else if (distVal < bestDistance2)
 546                     {
 547                         bestImgIdx2   = imgIdx;
 548                         bestDistance2 = distVal;
 549                         bestTrainIdx2 = trainIdx;
 550                     }
 551                 }
 552             }
 553         }
 554
 555         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 556         __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
 557         {
 558             extern __shared__ int smem[];
 559
 560             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
 561
 562             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
 563             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 564
 565             float myBestDistance1 = numeric_limits<float>::max();
 566             float myBestDistance2 = numeric_limits<float>::max();
 567             int myBestTrainIdx1 = -1;
 568             int myBestTrainIdx2 = -1;
 569
 570             loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
 571
 572             __syncthreads();
 573
 574             float* s_distance = (float*)(smem);
 575             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 576
 577             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
 578
 579             if (queryIdx < query.rows && threadIdx.x == 0)
 580             {
 581                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
 582                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
 583             }
 584         }
 585
 586         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 587         void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
 588                            const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
 589                            cudaStream_t stream)
 590         {
 591             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
 592             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
 593
 594             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
 595
 596             matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
 597             cudaSafeCall( cudaGetLastError() );
 598
 599             if (stream == 0)
 600                 cudaSafeCall( cudaDeviceSynchronize() );
 601         }
 602
 603         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 604         __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
 605         {
 606             extern __shared__ int smem[];
 607
 608             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
 609
 610             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
 611             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 612
 613             float myBestDistance1 = numeric_limits<float>::max();
 614             float myBestDistance2 = numeric_limits<float>::max();
 615             int myBestTrainIdx1 = -1;
 616             int myBestTrainIdx2 = -1;
 617             int myBestImgIdx1 = -1;
 618             int myBestImgIdx2 = -1;
 619
 620             Mask m = mask;
 621
 622             for (int imgIdx = 0; imgIdx < n; ++imgIdx)
 623             {
 624                 const PtrStepSz<T> train = trains[imgIdx];
 625                 m.next();
 626                 loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
 627             }
 628
 629             __syncthreads();
 630
 631             float* s_distance = (float*)(smem);
 632             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 633             int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
 634
 635             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
 636
 637             if (queryIdx < query.rows && threadIdx.x == 0)
 638             {
 639                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
 640                 bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
 641                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
 642             }
 643         }
 644
 645         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 646         void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
 647                            const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
 648                            cudaStream_t stream)
 649         {
 650             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
 651             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
 652
 653             const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
 654
 655             matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
 656             cudaSafeCall( cudaGetLastError() );
 657
 658             if (stream == 0)
 659                 cudaSafeCall( cudaDeviceSynchronize() );
 660         }
 661
 662         ///////////////////////////////////////////////////////////////////////////////
 663         // Match
 664
 665         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
 666         __device__ void loop(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
 667                              typename Dist::value_type* s_query, typename Dist::value_type* s_train,
 668                              float& bestDistance1, float& bestDistance2,
 669                              int& bestTrainIdx1, int& bestTrainIdx2,
 670                              int& bestImgIdx1, int& bestImgIdx2)
 671         {
 672             for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
 673             {
 674                 Dist dist;
 675
 676                 for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
 677                 {
 678                     const int loadX = threadIdx.x + i * BLOCK_SIZE;
 679
 680                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
 681                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
 682
 683                     if (loadX < query.cols)
 684                     {
 685                         T val;
 686
 687                         ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
 688                         s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
 689
 690                         ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
 691                         s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
 692                     }
 693
 694                     __syncthreads();
 695
 696                     #pragma unroll
 697                     for (int j = 0; j < BLOCK_SIZE; ++j)
 698                         dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
 699
 700                     __syncthreads();
 701                 }
 702
 703                 typename Dist::result_type distVal = dist;
 704
 705                 const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
 706
 707                 if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
 708                 {
 709                     if (distVal < bestDistance1)
 710                     {
 711                         bestImgIdx2   = bestImgIdx1;
 712                         bestDistance2 = bestDistance1;
 713                         bestTrainIdx2 = bestTrainIdx1;
 714
 715                         bestImgIdx1   = imgIdx;
 716                         bestDistance1 = distVal;
 717                         bestTrainIdx1 = trainIdx;
 718                     }
 719                     else if (distVal < bestDistance2)
 720                     {
 721                         bestImgIdx2   = imgIdx;
 722                         bestDistance2 = distVal;
 723                         bestTrainIdx2 = trainIdx;
 724                     }
 725                 }
 726             }
 727         }
 728
 729         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
 730         __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
 731         {
 732             extern __shared__ int smem[];
 733
 734             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
 735
 736             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
 737             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 738
 739             float myBestDistance1 = numeric_limits<float>::max();
 740             float myBestDistance2 = numeric_limits<float>::max();
 741             int myBestTrainIdx1 = -1;
 742             int myBestTrainIdx2 = -1;
 743
 744             loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
 745
 746             __syncthreads();
 747
 748             float* s_distance = (float*)(smem);
 749             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 750
 751             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
 752
 753             if (queryIdx < query.rows && threadIdx.x == 0)
 754             {
 755                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
 756                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
 757             }
 758         }
 759
 760         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
 761         void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
 762                    const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
 763                    cudaStream_t stream)
 764         {
 765             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
 766             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
 767
 768             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
 769
 770             match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
 771             cudaSafeCall( cudaGetLastError() );
 772
 773             if (stream == 0)
 774                 cudaSafeCall( cudaDeviceSynchronize() );
 775         }
 776
 777         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
 778         __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
 779         {
 780             extern __shared__ int smem[];
 781
 782             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
 783
 784             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
 785             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 786
 787             float myBestDistance1 = numeric_limits<float>::max();
 788             float myBestDistance2 = numeric_limits<float>::max();
 789             int myBestTrainIdx1 = -1;
 790             int myBestTrainIdx2 = -1;
 791             int myBestImgIdx1 = -1;
 792             int myBestImgIdx2 = -1;
 793
 794             Mask m = mask;
 795
 796             for (int imgIdx = 0; imgIdx < n; ++imgIdx)
 797             {
 798                 const PtrStepSz<T> train = trains[imgIdx];
 799                 m.next();
 800                 loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
 801             }
 802
 803             __syncthreads();
 804
 805             float* s_distance = (float*)(smem);
 806             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 807             int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
 808
 809             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
 810
 811             if (queryIdx < query.rows && threadIdx.x == 0)
 812             {
 813                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
 814                 bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
 815                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
 816             }
 817         }
 818
 819         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
 820         void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
 821                    const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
 822                    cudaStream_t stream)
 823         {
 824             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
 825             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
 826
 827             const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
 828
 829             match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
 830             cudaSafeCall( cudaGetLastError() );
 831
 832             if (stream == 0)
 833                 cudaSafeCall( cudaDeviceSynchronize() );
 834         }
 835
 836         ///////////////////////////////////////////////////////////////////////////////
 837         // knnMatch 2 dispatcher
 838
 839         template <typename Dist, typename T, typename Mask>
 840         void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
 841                               const PtrStepSzb& trainIdx, const PtrStepSzb& distance,
 842                               cudaStream_t stream)
 843         {
 844             if (query.cols <= 64)
 845             {
 846                 matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 847             }
 848             else if (query.cols <= 128)
 849             {
 850                 matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 851             }
 852             /*else if (query.cols <= 256)
 853             {
 854                 matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 855             }
 856             else if (query.cols <= 512)
 857             {
 858                 matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 859             }
 860             else if (query.cols <= 1024)
 861             {
 862                 matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 863             }*/
 864             else
 865             {
 866                 match<16, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 867             }
 868         }
 869
 870         template <typename Dist, typename T, typename Mask>
 871         void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
 872                               const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
 873                               cudaStream_t stream)
 874         {
 875             if (query.cols <= 64)
 876             {
 877                 matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 878             }
 879             else if (query.cols <= 128)
 880             {
 881                 matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 882             }
 883             /*else if (query.cols <= 256)
 884             {
 885                 matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 886             }
 887             else if (query.cols <= 512)
 888             {
 889                 matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 890             }
 891             else if (query.cols <= 1024)
 892             {
 893                 matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 894             }*/
 895             else
 896             {
 897                 match<16, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
 898             }
 899         }
 900
 901         ///////////////////////////////////////////////////////////////////////////////
 902         // Calc distance kernel
 903
 904         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 905         __global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
 906         {
 907             extern __shared__ int smem[];
 908
 909             const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
 910             const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
 911
 912             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
 913             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 914
 915             Dist dist;
 916
 917             #pragma unroll
 918             for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
 919             {
 920                 const int loadX = threadIdx.x + i * BLOCK_SIZE;
 921
 922                 if (loadX < query.cols)
 923                 {
 924                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
 925                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
 926                 }
 927                 else
 928                 {
 929                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
 930                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
 931                 }
 932
 933                 __syncthreads();
 934
 935                 #pragma unroll
 936                 for (int j = 0; j < BLOCK_SIZE; ++j)
 937                     dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
 938
 939                 __syncthreads();
 940             }
 941
 942             if (queryIdx < query.rows && trainIdx < train.rows)
 943             {
 944                 float distVal = numeric_limits<float>::max();
 945
 946                 if (mask(queryIdx, trainIdx))
 947                     distVal = (typename Dist::result_type)dist;
 948
 949                 allDist.ptr(queryIdx)[trainIdx] = distVal;
 950             }
 951         }
 952
 953         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
 954         void calcDistanceUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
 955         {
 956             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
 957             const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
 958
 959             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
 960
 961             calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
 962             cudaSafeCall( cudaGetLastError() );
 963
 964             if (stream == 0)
 965                 cudaSafeCall( cudaDeviceSynchronize() );
 966         }
 967
 968         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
 969         __global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
 970         {
 971             extern __shared__ int smem[];
 972
 973             const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
 974             const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
 975
 976             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
 977             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
 978
 979             Dist dist;
 980
 981             for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
 982             {
 983                 const int loadX = threadIdx.x + i * BLOCK_SIZE;
 984
 985                 if (loadX < query.cols)
 986                 {
 987                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
 988                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
 989                 }
 990                 else
 991                 {
 992                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
 993                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
 994                 }
 995
 996                 __syncthreads();
 997
 998                 #pragma unroll
 999                 for (int j = 0; j < BLOCK_SIZE; ++j)
1000                     dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
1001
1002                 __syncthreads();
1003             }
1004
1005             if (queryIdx < query.rows && trainIdx < train.rows)
1006             {
1007                 float distVal = numeric_limits<float>::max();
1008
1009                 if (mask(queryIdx, trainIdx))
1010                     distVal = (typename Dist::result_type)dist;
1011
1012                 allDist.ptr(queryIdx)[trainIdx] = distVal;
1013             }
1014         }
1015
1016         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
1017         void calcDistance(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
1018         {
1019             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
1020             const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
1021
1022             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
1023
1024             calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
1025             cudaSafeCall( cudaGetLastError() );
1026
1027             if (stream == 0)
1028                 cudaSafeCall( cudaDeviceSynchronize() );
1029         }
1030
1031         ///////////////////////////////////////////////////////////////////////////////
1032         // Calc Distance dispatcher
1033
1034         template <typename Dist, typename T, typename Mask>
1035         void calcDistanceDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
1036                                     const PtrStepSzf& allDist,
1037                                     cudaStream_t stream)
1038         {
1039             if (query.cols <= 64)
1040             {
1041                 calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
1042             }
1043             else if (query.cols <= 128)
1044             {
1045                 calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);
1046             }
1047             /*else if (query.cols <= 256)
1048             {
1049                 calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);
1050             }
1051             else if (query.cols <= 512)
1052             {
1053                 calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);
1054             }
1055             else if (query.cols <= 1024)
1056             {
1057                 calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);
1058             }*/
1059             else
1060             {
1061                 calcDistance<16, Dist>(query, train, mask, allDist, stream);
1062             }
1063         }
1064
1065         ///////////////////////////////////////////////////////////////////////////////
1066         // find knn match kernel
1067
1068         template <int BLOCK_SIZE>
1069         __global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
1070         {
1071             const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
1072             __shared__ float s_dist[SMEM_SIZE];
1073             __shared__ int s_trainIdx[SMEM_SIZE];
1074
1075             const int queryIdx = blockIdx.x;
1076
1077             float* allDistRow = allDist.ptr(queryIdx);
1078
1079             float dist = numeric_limits<float>::max();
1080             int bestIdx = -1;
1081
1082             for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE)
1083             {
1084                 float reg = allDistRow[i];
1085                 if (reg < dist)
1086                 {
1087                     dist = reg;
1088                     bestIdx = i;
1089                 }
1090             }
1091
1092             s_dist[threadIdx.x] = dist;
1093             s_trainIdx[threadIdx.x] = bestIdx;
1094             __syncthreads();
1095
1096             reduceKeyVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<float>());
1097
1098             if (threadIdx.x == 0)
1099             {
1100                 if (dist < numeric_limits<float>::max())
1101                 {
1102                     allDistRow[bestIdx] = numeric_limits<float>::max();
1103                     trainIdx.ptr(queryIdx)[i] = bestIdx;
1104                     distance.ptr(queryIdx)[i] = dist;
1105                 }
1106             }
1107         }
1108
1109         template <int BLOCK_SIZE>
1110         void findKnnMatch(int k, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzf& allDist, cudaStream_t stream)
1111         {
1112             const dim3 block(BLOCK_SIZE, 1, 1);
1113             const dim3 grid(trainIdx.rows, 1, 1);
1114
1115             for (int i = 0; i < k; ++i)
1116             {
1117                 findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);
1118                 cudaSafeCall( cudaGetLastError() );
1119             }
1120
1121             if (stream == 0)
1122                 cudaSafeCall( cudaDeviceSynchronize() );
1123         }
1124
1125         void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream)
1126         {
1127             findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
1128         }
1129
1130         ///////////////////////////////////////////////////////////////////////////////
1131         // knn match Dispatcher
1132
1133         template <typename Dist, typename T, typename Mask>
1134         void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, int k, const Mask& mask,
1135             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
1136             cudaStream_t stream)
1137         {
1138             if (k == 2)
1139             {
1140                 match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, stream);
1141             }
1142             else
1143             {
1144                 calcDistanceDispatcher<Dist>(query, train, mask, allDist, stream);
1145                 findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
1146             }
1147         }
1148
1149         ///////////////////////////////////////////////////////////////////////////////
1150         // knn match caller
1151
1152         template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
1153             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
1154             cudaStream_t stream)
1155         {
1156             if (mask.data)
1157                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
1158             else
1159                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
1160         }
1161
1162         template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1163         //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1164         template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1165         template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1166         template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1167         template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1168
1169         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
1170             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
1171             cudaStream_t stream)
1172         {
1173             if (mask.data)
1174                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
1175             else
1176                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
1177         }
1178
1179         //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1180         //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1181         //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1182         //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1183         //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1184         template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1185
1186         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
1187             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
1188             cudaStream_t stream)
1189         {
1190             if (mask.data)
1191                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
1192             else
1193                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
1194         }
1195
1196         template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1197         //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1198         template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1199         //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1200         template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
1201
1202         template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
1203             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
1204             cudaStream_t stream)
1205         {
1206             if (masks.data)
1207                 match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
1208             else
1209                 match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance,  stream);
1210         }
1211
1212         template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1213         //template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1214         template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1215         template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1216         template void match2L1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1217         template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1218
1219         template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
1220             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
1221             cudaStream_t stream)
1222         {
1223             if (masks.data)
1224                 match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
1225             else
1226                 match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
1227         }
1228
1229         //template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1230         //template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1231         //template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1232         //template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1233         //template void match2L2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1234         template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1235
1236         template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
1237             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
1238             cudaStream_t stream)
1239         {
1240             if (masks.data)
1241                 match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
1242             else
1243                 match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
1244         }
1245
1246         template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1247         //template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1248         template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1249         //template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1250         template void match2Hamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
1251     } // namespace bf_knnmatch
1252 }}} // namespace cv { namespace gpu { namespace device {
1253
1254
1255 #endif /* CUDA_DISABLER */