added dual tvl1 optical flow gpu implementation

[profile/ivi/opencv.git] / modules / gpu / src / cuda / bf_knnmatch.cu
diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu

index f53af9e..49bc1df 100644 (file)
--- a/modules/gpu/src/cuda/bf_knnmatch.cu
+++ b/modules/gpu/src/cuda/bf_knnmatch.cu
@@ -1,1159 +1,1255 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////\r
-//\r
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.\r
-//\r
-//  By downloading, copying, installing or using the software you agree to this license.\r
-//  If you do not agree to this license, do not download, install,\r
-//  copy or use the software.\r
-//\r
-//\r
-//                           License Agreement\r
-//                For Open Source Computer Vision Library\r
-//\r
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.\r
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.\r
-// Third party copyrights are property of their respective owners.\r
-//\r
-// Redistribution and use in source and binary forms, with or without modification,\r
-// are permitted provided that the following conditions are met:\r
-//\r
-//   * Redistribution's of source code must retain the above copyright notice,\r
-//     this list of conditions and the following disclaimer.\r
-//\r
-//   * Redistribution's in binary form must reproduce the above copyright notice,\r
-//     this list of conditions and the following disclaimer in the documentation\r
-//     and/or other materials provided with the distribution.\r
-//\r
-//   * The name of the copyright holders may not be used to endorse or promote products\r
-//     derived from this software without specific prior written permission.\r
-//\r
-// This software is provided by the copyright holders and contributors "as is" and\r
-// any express or bpied warranties, including, but not limited to, the bpied\r
-// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
-// In no event shall the Intel Corporation or contributors be liable for any direct,\r
-// indirect, incidental, special, exemplary, or consequential damages\r
-// (including, but not limited to, procurement of substitute goods or services;\r
-// loss of use, data, or profits; or business interruption) however caused\r
-// and on any theory of liability, whether in contract, strict liability,\r
-// or tort (including negligence or otherwise) arising in any way out of\r
-// the use of this software, even if advised of the possibility of such damage.\r
-//\r
-//M*/\r
-\r
-#include "internal_shared.hpp"\r
-#include "opencv2/gpu/device/limits.hpp"\r
-#include "opencv2/gpu/device/vec_distance.hpp"\r
-#include "opencv2/gpu/device/datamov_utils.hpp"\r
-\r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
-\r
-namespace cv { namespace gpu { namespace bf_knnmatch\r
-{\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Reduction\r
-\r
-    template <int BLOCK_SIZE> \r
-    __device__ void findBestMatch(float& bestDistance1, float& bestDistance2, \r
-                                  int& bestTrainIdx1, int& bestTrainIdx2, \r
-                                  float* s_distance, int* s_trainIdx)\r
-    {\r
-        float myBestDistance1 = numeric_limits<float>::max(); \r
-        float myBestDistance2 = numeric_limits<float>::max();\r
-        int myBestTrainIdx1 = -1;\r
-        int myBestTrainIdx2 = -1;\r
-\r
-        s_distance += threadIdx.y * BLOCK_SIZE;\r
-        s_trainIdx += threadIdx.y * BLOCK_SIZE;\r
-\r
-        s_distance[threadIdx.x] = bestDistance1;\r
-        s_trainIdx[threadIdx.x] = bestTrainIdx1;\r
-\r
-        __syncthreads();\r
-\r
-        if (threadIdx.x == 0)\r
-        {\r
-            #pragma unroll\r
-            for (int i = 0; i < BLOCK_SIZE; ++i)\r
-            {\r
-                float val = s_distance[i];\r
-\r
-                if (val < myBestDistance1)\r
-                {\r
-                    myBestDistance2 = myBestDistance1;\r
-                    myBestTrainIdx2 = myBestTrainIdx1;\r
-\r
-                    myBestDistance1 = val;\r
-                    myBestTrainIdx1 = s_trainIdx[i];\r
-                }\r
-                else if (val < myBestDistance2)\r
-                {\r
-                    myBestDistance2 = val;\r
-                    myBestTrainIdx2 = s_trainIdx[i];\r
-                }\r
-            }\r
-        }\r
-\r
-        __syncthreads();\r
-\r
-        s_distance[threadIdx.x] = bestDistance2;\r
-        s_trainIdx[threadIdx.x] = bestTrainIdx2;\r
-\r
-        __syncthreads();\r
-\r
-        if (threadIdx.x == 0)\r
-        {\r
-            #pragma unroll\r
-            for (int i = 0; i < BLOCK_SIZE; ++i)\r
-            {\r
-                float val = s_distance[i];\r
-\r
-                if (val < myBestDistance2)\r
-                {\r
-                    myBestDistance2 = val;\r
-                    myBestTrainIdx2 = s_trainIdx[i];\r
-                }\r
-            }\r
-        }\r
-\r
-        bestDistance1 = myBestDistance1;\r
-        bestDistance2 = myBestDistance2;\r
-\r
-        bestTrainIdx1 = myBestTrainIdx1;\r
-        bestTrainIdx2 = myBestTrainIdx2;\r
-    }\r
-\r
-    template <int BLOCK_SIZE> \r
-    __device__ void findBestMatch(float& bestDistance1, float& bestDistance2, \r
-                                   int& bestTrainIdx1, int& bestTrainIdx2, \r
-                                   int& bestImgIdx1, int& bestImgIdx2, \r
-                                   float* s_distance, int* s_trainIdx, int* s_imgIdx)\r
-    {\r
-        float myBestDistance1 = numeric_limits<float>::max(); \r
-        float myBestDistance2 = numeric_limits<float>::max();\r
-        int myBestTrainIdx1 = -1;\r
-        int myBestTrainIdx2 = -1;\r
-        int myBestImgIdx1 = -1;\r
-        int myBestImgIdx2 = -1;\r
-\r
-        s_distance += threadIdx.y * BLOCK_SIZE;\r
-        s_trainIdx += threadIdx.y * BLOCK_SIZE;\r
-        s_imgIdx   += threadIdx.y * BLOCK_SIZE;\r
-\r
-        s_distance[threadIdx.x] = bestDistance1;\r
-        s_trainIdx[threadIdx.x] = bestTrainIdx1;\r
-        s_imgIdx[threadIdx.x]   = bestImgIdx1;\r
-\r
-        __syncthreads();\r
-\r
-        if (threadIdx.x == 0)\r
-        {\r
-            #pragma unroll\r
-            for (int i = 0; i < BLOCK_SIZE; ++i)\r
-            {\r
-                float val = s_distance[i];\r
-\r
-                if (val < myBestDistance1)\r
-                {\r
-                    myBestDistance2 = myBestDistance1;\r
-                    myBestTrainIdx2 = myBestTrainIdx1;\r
-                    myBestImgIdx2   = myBestImgIdx1;\r
-\r
-                    myBestDistance1 = val;\r
-                    myBestTrainIdx1 = s_trainIdx[i];\r
-                    myBestImgIdx1   = s_imgIdx[i];\r
-                }\r
-                else if (val < myBestDistance2)\r
-                {\r
-                    myBestDistance2 = val;\r
-                    myBestTrainIdx2 = s_trainIdx[i];\r
-                    myBestImgIdx2   = s_imgIdx[i];\r
-                }\r
-            }\r
-        }\r
-\r
-        __syncthreads();\r
-\r
-        s_distance[threadIdx.x] = bestDistance2;\r
-        s_trainIdx[threadIdx.x] = bestTrainIdx2;\r
-        s_imgIdx[threadIdx.x]   = bestImgIdx2;\r
-\r
-        __syncthreads();\r
-\r
-        if (threadIdx.x == 0)\r
-        {\r
-            #pragma unroll\r
-            for (int i = 0; i < BLOCK_SIZE; ++i)\r
-            {\r
-                float val = s_distance[i];\r
-\r
-                if (val < myBestDistance2)\r
-                {\r
-                    myBestDistance2 = val;\r
-                    myBestTrainIdx2 = s_trainIdx[i];\r
-                    myBestImgIdx2   = s_imgIdx[i];\r
-                }\r
-            }\r
-        }\r
-\r
-        bestDistance1 = myBestDistance1;\r
-        bestDistance2 = myBestDistance2;\r
-\r
-        bestTrainIdx1 = myBestTrainIdx1;\r
-        bestTrainIdx2 = myBestTrainIdx2;\r
-\r
-        bestImgIdx1 = myBestImgIdx1;\r
-        bestImgIdx2 = myBestImgIdx2;\r
-    }\r
-\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Match Unrolled Cached\r
-\r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U> \r
-    __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)\r
-    {\r
-        #pragma unroll\r
-        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
-        {\r
-            const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
-            s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(min(queryIdx, query.rows - 1))[loadX] : 0;\r
-        }\r
-    }\r
-\r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
-                                       typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
-                                       float& bestDistance1, float& bestDistance2, \r
-                                       int& bestTrainIdx1, int& bestTrainIdx2, \r
-                                       int& bestImgIdx1, int& bestImgIdx2)\r
-    {\r
-        for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
-        {\r
-            Dist dist;\r
-\r
-            #pragma unroll\r
-            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
-            {\r
-                const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
-\r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
-\r
-                if (loadX < train.cols)\r
-                {\r
-                    T val;\r
-\r
-                    ForceGlob<T>::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
-                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
-                }\r
-\r
-                __syncthreads();\r
-\r
-                #pragma unroll\r
-                for (int j = 0; j < BLOCK_SIZE; ++j)\r
-                    dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
-\r
-                __syncthreads();\r
-            }\r
-\r
-            typename Dist::result_type distVal = dist;\r
-\r
-            const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
-\r
-            if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))\r
-            {\r
-                if (distVal < bestDistance1)\r
-                {\r
-                    bestImgIdx2   = bestImgIdx1;\r
-                    bestDistance2 = bestDistance1;\r
-                    bestTrainIdx2 = bestTrainIdx1;\r
-\r
-                    bestImgIdx1   = imgIdx;\r
-                    bestDistance1 = distVal;\r
-                    bestTrainIdx1 = trainIdx;\r
-                }\r
-                else if (distVal < bestDistance2)\r
-                {\r
-                    bestImgIdx2   = imgIdx;\r
-                    bestDistance2 = distVal;\r
-                    bestTrainIdx2 = trainIdx;\r
-                }\r
-            }\r
-        }\r
-    }\r
-\r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)\r
-    {\r
-        extern __shared__ int smem[];\r
-\r
-        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
-\r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);\r
-\r
-        loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);\r
-\r
-        float myBestDistance1 = numeric_limits<float>::max();\r
-        float myBestDistance2 = numeric_limits<float>::max();\r
-        int myBestTrainIdx1 = -1;\r
-        int myBestTrainIdx2 = -1;\r
-\r
-        loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);\r
-\r
-        __syncthreads();\r
-\r
-        float* s_distance = (float*)(smem);\r
-        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-\r
-        findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);\r
-\r
-        if (queryIdx < query.rows && threadIdx.x == 0)\r
-        {\r
-            bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
-            bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
-        }\r
-    }\r
-\r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
-                             const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, \r
-                             cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
-\r
-        const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
-\r
-        matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
-\r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)\r
-    {\r
-        extern __shared__ int smem[];\r
-\r
-        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
-\r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);\r
-\r
-        loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);\r
-\r
-        float myBestDistance1 = numeric_limits<float>::max();\r
-        float myBestDistance2 = numeric_limits<float>::max();\r
-        int myBestTrainIdx1 = -1;\r
-        int myBestTrainIdx2 = -1;\r
-        int myBestImgIdx1 = -1;\r
-        int myBestImgIdx2 = -1;\r
-\r
-        Mask m = mask;\r
-\r
-        for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
-        {\r
-            const DevMem2D_<T> train = trains[imgIdx];\r
-            m.next();\r
-            loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);\r
-        }\r
-\r
-        __syncthreads();\r
-\r
-        float* s_distance = (float*)(smem);\r
-        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-        int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
-\r
-        findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);\r
-\r
-        if (queryIdx < query.rows && threadIdx.x == 0)\r
-        {\r
-            bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
-            bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);\r
-            bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
-        }\r
-    }\r
-\r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
-                             const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance, \r
-                             cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
-\r
-        const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
-\r
-        matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
-\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Match Unrolled\r
-\r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    __device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
-                                 typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
-                                 float& bestDistance1, float& bestDistance2, \r
-                                 int& bestTrainIdx1, int& bestTrainIdx2, \r
-                                 int& bestImgIdx1, int& bestImgIdx2)\r
-    {\r
-        for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
-        {\r
-            Dist dist;\r
-\r
-            #pragma unroll\r
-            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
-            {\r
-                const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
-\r
-                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
-\r
-                if (loadX < query.cols)\r
-                {\r
-                    T val;\r
-\r
-                    ForceGlob<T>::Load(query.ptr(min(queryIdx, query.rows - 1)), loadX, val);\r
-                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
-\r
-                    ForceGlob<T>::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
-                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
-                }\r
-\r
-                __syncthreads();\r
-\r
-                #pragma unroll\r
-                for (int j = 0; j < BLOCK_SIZE; ++j)\r
-                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
-\r
-                __syncthreads();\r
-            }\r
-\r
-            typename Dist::result_type distVal = dist;\r
-\r
-            const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
-\r
-            if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))\r
-            {\r
-                if (distVal < bestDistance1)\r
-                {\r
-                    bestImgIdx2   = bestImgIdx1;\r
-                    bestDistance2 = bestDistance1;\r
-                    bestTrainIdx2 = bestTrainIdx1;\r
-\r
-                    bestImgIdx1   = imgIdx;\r
-                    bestDistance1 = distVal;\r
-                    bestTrainIdx1 = trainIdx;\r
-                }\r
-                else if (distVal < bestDistance2)\r
-                {\r
-                    bestImgIdx2   = imgIdx;\r
-                    bestDistance2 = distVal;\r
-                    bestTrainIdx2 = trainIdx;\r
-                }\r
-            }\r
-        }\r
-    }\r
-\r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)\r
-    {\r
-        extern __shared__ int smem[];\r
-\r
-        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
-\r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-\r
-        float myBestDistance1 = numeric_limits<float>::max();\r
-        float myBestDistance2 = numeric_limits<float>::max();\r
-        int myBestTrainIdx1 = -1;\r
-        int myBestTrainIdx2 = -1;\r
-\r
-        loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);\r
-\r
-        __syncthreads();\r
-\r
-        float* s_distance = (float*)(smem);\r
-        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-\r
-        findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);\r
-\r
-        if (queryIdx < query.rows && threadIdx.x == 0)\r
-        {\r
-            bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
-            bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
-        }\r
-    }\r
-\r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
-                       const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, \r
-                       cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
-\r
-        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
-\r
-        matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
-\r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)\r
-    {\r
-        extern __shared__ int smem[];\r
-\r
-        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
-\r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-\r
-        float myBestDistance1 = numeric_limits<float>::max();\r
-        float myBestDistance2 = numeric_limits<float>::max();\r
-        int myBestTrainIdx1 = -1;\r
-        int myBestTrainIdx2 = -1;\r
-        int myBestImgIdx1 = -1;\r
-        int myBestImgIdx2 = -1;\r
-\r
-        Mask m = mask;\r
-\r
-        for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
-        {\r
-            const DevMem2D_<T> train = trains[imgIdx];\r
-            m.next();\r
-            loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);\r
-        }\r
-\r
-        __syncthreads();\r
-\r
-        float* s_distance = (float*)(smem);\r
-        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-        int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
-\r
-        findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);\r
-\r
-        if (queryIdx < query.rows && threadIdx.x == 0)\r
-        {\r
-            bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
-            bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);\r
-            bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
-        }\r
-    }\r
-\r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
-                       const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance, \r
-                       cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
-\r
-        const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
-\r
-        matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
-\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Match\r
-\r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-    __device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, \r
-                         typename Dist::value_type* s_query, typename Dist::value_type* s_train, \r
-                         float& bestDistance1, float& bestDistance2, \r
-                         int& bestTrainIdx1, int& bestTrainIdx2, \r
-                         int& bestImgIdx1, int& bestImgIdx2)\r
-    {\r
-        for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)\r
-        {\r
-            Dist dist;\r
-\r
-            for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)\r
-            {\r
-                const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
-\r
-                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
-\r
-                if (loadX < query.cols)\r
-                {\r
-                    T val;\r
-\r
-                    ForceGlob<T>::Load(query.ptr(min(queryIdx, query.rows - 1)), loadX, val);\r
-                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;\r
-\r
-                    ForceGlob<T>::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);\r
-                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;\r
-                }\r
-\r
-                __syncthreads();\r
-\r
-                #pragma unroll\r
-                for (int j = 0; j < BLOCK_SIZE; ++j)\r
-                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
-\r
-                __syncthreads();\r
-            }\r
-\r
-            typename Dist::result_type distVal = dist;\r
-\r
-            const int trainIdx = t * BLOCK_SIZE + threadIdx.x;\r
-\r
-            if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))\r
-            {\r
-                if (distVal < bestDistance1)\r
-                {\r
-                    bestImgIdx2   = bestImgIdx1;\r
-                    bestDistance2 = bestDistance1;\r
-                    bestTrainIdx2 = bestTrainIdx1;\r
-\r
-                    bestImgIdx1   = imgIdx;\r
-                    bestDistance1 = distVal;\r
-                    bestTrainIdx1 = trainIdx;\r
-                }\r
-                else if (distVal < bestDistance2)\r
-                {\r
-                    bestImgIdx2   = imgIdx;\r
-                    bestDistance2 = distVal;\r
-                    bestTrainIdx2 = trainIdx;\r
-                }\r
-            }\r
-        }\r
-    }\r
-\r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-    __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)\r
-    {\r
-        extern __shared__ int smem[];\r
-\r
-        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
-\r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-\r
-        float myBestDistance1 = numeric_limits<float>::max();\r
-        float myBestDistance2 = numeric_limits<float>::max();\r
-        int myBestTrainIdx1 = -1;\r
-        int myBestTrainIdx2 = -1;\r
-\r
-        loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);\r
-\r
-        __syncthreads();\r
-\r
-        float* s_distance = (float*)(smem);\r
-        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-\r
-        findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);\r
-\r
-        if (queryIdx < query.rows && threadIdx.x == 0)\r
-        {\r
-            bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
-            bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
-        }\r
-    }\r
-\r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-    void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
-               const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, \r
-               cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
-\r
-        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
-\r
-        match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
-\r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-    __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)\r
-    {\r
-        extern __shared__ int smem[];\r
-\r
-        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;\r
-\r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-\r
-        float myBestDistance1 = numeric_limits<float>::max();\r
-        float myBestDistance2 = numeric_limits<float>::max();\r
-        int myBestTrainIdx1 = -1;\r
-        int myBestTrainIdx2 = -1;\r
-        int myBestImgIdx1 = -1;\r
-        int myBestImgIdx2 = -1;\r
-\r
-        Mask m = mask;\r
-\r
-        for (int imgIdx = 0; imgIdx < n; ++imgIdx)\r
-        {\r
-            const DevMem2D_<T> train = trains[imgIdx];\r
-            m.next();\r
-            loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);\r
-        }\r
-\r
-        __syncthreads();\r
-\r
-        float* s_distance = (float*)(smem);\r
-        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-        int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);\r
-\r
-        findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);\r
-\r
-        if (queryIdx < query.rows && threadIdx.x == 0)\r
-        {\r
-            bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);\r
-            bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);\r
-            bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);\r
-        }\r
-    }\r
-\r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-    void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
-               const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance, \r
-               cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(query.rows, BLOCK_SIZE));\r
-\r
-        const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
-\r
-        match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
-\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // knnMatch 2 dispatcher\r
-\r
-    template <typename Dist, typename T, typename Mask> \r
-    void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
-                          const DevMem2Db& trainIdx, const DevMem2Db& distance, \r
-                          int cc, cudaStream_t stream)\r
-    {\r
-        if (query.cols <= 64)\r
-        {\r
-            matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
-        else if (query.cols <= 128)\r
-        {\r
-            matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
-        /*else if (query.cols <= 256)\r
-        {\r
-            matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
-        else if (query.cols <= 512)\r
-        {            \r
-            matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
-        else if (query.cols <= 1024)\r
-        {            \r
-            matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }*/\r
-        else\r
-        {\r
-            match<16, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
-    }\r
-\r
-    template <typename Dist, typename T, typename Mask> \r
-    void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, \r
-                          const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
-                          int cc, cudaStream_t stream)\r
-    {\r
-        if (query.cols <= 64)\r
-        {\r
-            matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
-        else if (query.cols <= 128)\r
-        {\r
-            matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
-        /*else if (query.cols <= 256)\r
-        {\r
-            matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
-        else if (query.cols <= 512)\r
-        {            \r
-            matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
-        else if (query.cols <= 1024)\r
-        {            \r
-            matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }*/\r
-        else\r
-        {\r
-            match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);\r
-        }\r
-    }\r
-\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Calc distance kernel\r
-\r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>\r
-    __global__ void calcDistanceUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)\r
-    {\r
-        extern __shared__ int smem[];\r
-\r
-        const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;\r
-        const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;\r
-\r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-\r
-        Dist dist;\r
-\r
-        #pragma unroll\r
-        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)\r
-        {\r
-            const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
-\r
-            if (loadX < query.cols)\r
-            {\r
-                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];\r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];\r
-            }\r
-            else\r
-            {                \r
-                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
-            }\r
-\r
-            __syncthreads();\r
-\r
-            #pragma unroll\r
-            for (int j = 0; j < BLOCK_SIZE; ++j)\r
-                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
-\r
-            __syncthreads();\r
-        }\r
-\r
-        if (queryIdx < query.rows && trainIdx < train.rows)\r
-        {\r
-            float distVal = numeric_limits<float>::max();\r
-\r
-            if (mask(queryIdx, trainIdx))\r
-                distVal = (typename Dist::result_type)dist;\r
-\r
-            allDist.ptr(queryIdx)[trainIdx] = distVal;\r
-        }\r
-    }\r
-\r
-    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> \r
-    void calcDistanceUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
-\r
-        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
-\r
-        calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
-\r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>\r
-    __global__ void calcDistance(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)\r
-    {\r
-        extern __shared__ int smem[];\r
-\r
-        const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;\r
-        const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;\r
-\r
-        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);\r
-        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);\r
-\r
-        Dist dist;\r
-\r
-        for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)\r
-        {\r
-            const int loadX = threadIdx.x + i * BLOCK_SIZE;\r
-\r
-            if (loadX < query.cols)\r
-            {\r
-                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];\r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];\r
-            }\r
-            else\r
-            {                \r
-                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;\r
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;\r
-            }\r
-\r
-            __syncthreads();\r
-\r
-            #pragma unroll\r
-            for (int j = 0; j < BLOCK_SIZE; ++j)\r
-                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);\r
-\r
-            __syncthreads();\r
-        }\r
-\r
-        if (queryIdx < query.rows && trainIdx < train.rows)\r
-        {\r
-            float distVal = numeric_limits<float>::max();\r
-\r
-            if (mask(queryIdx, trainIdx))\r
-                distVal = (typename Dist::result_type)dist;\r
-\r
-            allDist.ptr(queryIdx)[trainIdx] = distVal;\r
-        }\r
-    }\r
-\r
-    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> \r
-    void calcDistance(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);\r
-        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));\r
-\r
-        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);\r
-\r
-        calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
-\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Calc Distance dispatcher\r
-\r
-    template <typename Dist, typename T, typename Mask> \r
-    void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
-                                const DevMem2Df& allDist, \r
-                                int cc, cudaStream_t stream)\r
-    {\r
-        if (query.cols <= 64)\r
-        {\r
-            calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);\r
-        }\r
-        else if (query.cols <= 128)\r
-        {\r
-            calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);\r
-        }\r
-        /*else if (query.cols <= 256)\r
-        {\r
-            calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);\r
-        }\r
-        else if (query.cols <= 512)\r
-        {            \r
-            calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);\r
-        }\r
-        else if (query.cols <= 1024)\r
-        {            \r
-            calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);\r
-        }*/\r
-        else\r
-        {\r
-            calcDistance<16, Dist>(query, train, mask, allDist, stream);\r
-        }\r
-    }\r
-\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // find knn match kernel\r
-\r
-    template <int BLOCK_SIZE> \r
-    __global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance)\r
-    {\r
-        const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;\r
-        __shared__ float s_dist[SMEM_SIZE];\r
-        __shared__ int s_trainIdx[SMEM_SIZE];\r
-\r
-        const int queryIdx = blockIdx.x;\r
-\r
-        float* allDistRow = allDist.ptr(queryIdx);\r
-\r
-        float dist = numeric_limits<float>::max();\r
-        int bestIdx = -1;\r
-        \r
-        for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE)\r
-        {\r
-            float reg = allDistRow[i];\r
-            if (reg < dist)\r
-            {\r
-                dist = reg;\r
-                bestIdx = i;\r
-            }\r
-        }\r
-\r
-        s_dist[threadIdx.x] = dist;\r
-        s_trainIdx[threadIdx.x] = bestIdx;\r
-        __syncthreads();\r
-\r
-        reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());\r
-\r
-        if (threadIdx.x == 0)\r
-        {\r
-            if (dist < numeric_limits<float>::max())\r
-            {\r
-                allDistRow[bestIdx] = numeric_limits<float>::max();\r
-                trainIdx.ptr(queryIdx)[i] = bestIdx;\r
-                distance.ptr(queryIdx)[i] = dist;\r
-            }\r
-        }\r
-    }\r
-\r
-    template <int BLOCK_SIZE> \r
-    void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)\r
-    {\r
-        const dim3 block(BLOCK_SIZE, 1, 1);\r
-        const dim3 grid(trainIdx.rows, 1, 1);\r
-\r
-        for (int i = 0; i < k; ++i)\r
-        {\r
-            findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);\r
-            cudaSafeCall( cudaGetLastError() );\r
-        }\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
-\r
-    void findKnnMatchDispatcher(int k, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream)\r
-    {\r
-        findKnnMatch<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), allDist, stream);\r
-    }\r
-\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // knn match Dispatcher\r
-\r
-    template <typename Dist, typename T, typename Mask>\r
-    void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const Mask& mask, \r
-        const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
-        int cc, cudaStream_t stream)\r
-    {\r
-        if (k == 2)\r
-        {\r
-            match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream);\r
-        }\r
-        else\r
-        {\r
-            calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream);\r
-            findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream);\r
-        }\r
-    }     \r
-    \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // knn match caller\r
-\r
-    template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
-        const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
-        int cc, cudaStream_t stream)\r
-    {\r
-        if (mask.data)\r
-            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);\r
-        else\r
-            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);\r
-    }\r
-\r
-    template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    //template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-\r
-    template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, \r
-        const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,\r
-        int cc, cudaStream_t stream)\r
-    {\r
-        if (mask.data)\r
-            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);\r
-        else\r
-            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);\r
-    }\r
-\r
-    //template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    //template void matchL2_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-\r
-    template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,\r
-        const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, \r
-        int cc, cudaStream_t stream)\r
-    {\r
-        if (mask.data)\r
-            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);\r
-        else\r
-            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);\r
-    }\r
-\r
-    template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    //template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    //template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-    template void matchHamming_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);\r
-\r
-    template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
-        const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
-        int cc, cudaStream_t stream)\r
-    {\r
-        if (masks.data)\r
-            match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);\r
-        else\r
-            match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);\r
-    }\r
-    \r
-    template void match2L1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    //template void match2L1_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    template void match2L1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    template void match2L1_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    template void match2L1_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    template void match2L1_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-\r
-    template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
-        const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
-        int cc, cudaStream_t stream)\r
-    {\r
-        if (masks.data)\r
-            match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);\r
-        else\r
-            match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);\r
-    }\r
-    \r
-    //template void match2L2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    //template void match2L2_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    //template void match2L2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    //template void match2L2_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    //template void match2L2_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Di& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    template void match2L2_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    \r
-    template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, \r
-        const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, \r
-        int cc, cudaStream_t stream)\r
-    {\r
-        if (masks.data)\r
-            match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);\r
-        else\r
-            match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);\r
-    }\r
-    \r
-    template void match2Hamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    //template void match2Hamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    template void match2Hamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    //template void match2Hamming_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-    template void match2Hamming_gpu<int   >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);\r
-}}}\r
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or bpied warranties, including, but not limited to, the bpied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/utility.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
+#include "opencv2/gpu/device/limits.hpp"
+#include "opencv2/gpu/device/vec_distance.hpp"
+#include "opencv2/gpu/device/datamov_utils.hpp"
+#include "opencv2/gpu/device/warp_shuffle.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+    namespace bf_knnmatch
+    {
+        ///////////////////////////////////////////////////////////////////////////////
+        // Reduction
+
+        template <int BLOCK_SIZE>
+        __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
+                                      int& bestTrainIdx1, int& bestTrainIdx2,
+                                      float* s_distance, int* s_trainIdx)
+        {
+        #if __CUDA_ARCH__ >= 300
+            (void) s_distance;
+            (void) s_trainIdx;
+
+            float d1, d2;
+            int i1, i2;
+
+            #pragma unroll
+            for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
+            {
+                d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
+                d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
+                i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
+                i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
+
+                if (bestDistance1 < d1)
+                {
+                    if (d1 < bestDistance2)
+                    {
+                        bestDistance2 = d1;
+                        bestTrainIdx2 = i1;
+                    }
+                }
+                else
+                {
+                    bestDistance2 = bestDistance1;
+                    bestTrainIdx2 = bestTrainIdx1;
+
+                    bestDistance1 = d1;
+                    bestTrainIdx1 = i1;
+
+                    if (d2 < bestDistance2)
+                    {
+                        bestDistance2 = d2;
+                        bestTrainIdx2 = i2;
+                    }
+                }
+            }
+        #else
+            float myBestDistance1 = numeric_limits<float>::max();
+            float myBestDistance2 = numeric_limits<float>::max();
+            int myBestTrainIdx1 = -1;
+            int myBestTrainIdx2 = -1;
+
+            s_distance += threadIdx.y * BLOCK_SIZE;
+            s_trainIdx += threadIdx.y * BLOCK_SIZE;
+
+            s_distance[threadIdx.x] = bestDistance1;
+            s_trainIdx[threadIdx.x] = bestTrainIdx1;
+
+            __syncthreads();
+
+            if (threadIdx.x == 0)
+            {
+                #pragma unroll
+                for (int i = 0; i < BLOCK_SIZE; ++i)
+                {
+                    float val = s_distance[i];
+
+                    if (val < myBestDistance1)
+                    {
+                        myBestDistance2 = myBestDistance1;
+                        myBestTrainIdx2 = myBestTrainIdx1;
+
+                        myBestDistance1 = val;
+                        myBestTrainIdx1 = s_trainIdx[i];
+                    }
+                    else if (val < myBestDistance2)
+                    {
+                        myBestDistance2 = val;
+                        myBestTrainIdx2 = s_trainIdx[i];
+                    }
+                }
+            }
+
+            __syncthreads();
+
+            s_distance[threadIdx.x] = bestDistance2;
+            s_trainIdx[threadIdx.x] = bestTrainIdx2;
+
+            __syncthreads();
+
+            if (threadIdx.x == 0)
+            {
+                #pragma unroll
+                for (int i = 0; i < BLOCK_SIZE; ++i)
+                {
+                    float val = s_distance[i];
+
+                    if (val < myBestDistance2)
+                    {
+                        myBestDistance2 = val;
+                        myBestTrainIdx2 = s_trainIdx[i];
+                    }
+                }
+            }
+
+            bestDistance1 = myBestDistance1;
+            bestDistance2 = myBestDistance2;
+
+            bestTrainIdx1 = myBestTrainIdx1;
+            bestTrainIdx2 = myBestTrainIdx2;
+        #endif
+        }
+
+        template <int BLOCK_SIZE>
+        __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
+                                       int& bestTrainIdx1, int& bestTrainIdx2,
+                                       int& bestImgIdx1, int& bestImgIdx2,
+                                       float* s_distance, int* s_trainIdx, int* s_imgIdx)
+        {
+        #if __CUDA_ARCH__ >= 300
+            (void) s_distance;
+            (void) s_trainIdx;
+            (void) s_imgIdx;
+
+            float d1, d2;
+            int i1, i2;
+            int j1, j2;
+
+            #pragma unroll
+            for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
+            {
+                d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
+                d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
+                i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
+                i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
+                j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE);
+                j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE);
+
+                if (bestDistance1 < d1)
+                {
+                    if (d1 < bestDistance2)
+                    {
+                        bestDistance2 = d1;
+                        bestTrainIdx2 = i1;
+                        bestImgIdx2 = j1;
+                    }
+                }
+                else
+                {
+                    bestDistance2 = bestDistance1;
+                    bestTrainIdx2 = bestTrainIdx1;
+                    bestImgIdx2 = bestImgIdx1;
+
+                    bestDistance1 = d1;
+                    bestTrainIdx1 = i1;
+                    bestImgIdx1 = j1;
+
+                    if (d2 < bestDistance2)
+                    {
+                        bestDistance2 = d2;
+                        bestTrainIdx2 = i2;
+                        bestImgIdx2 = j2;
+                    }
+                }
+            }
+        #else
+            float myBestDistance1 = numeric_limits<float>::max();
+            float myBestDistance2 = numeric_limits<float>::max();
+            int myBestTrainIdx1 = -1;
+            int myBestTrainIdx2 = -1;
+            int myBestImgIdx1 = -1;
+            int myBestImgIdx2 = -1;
+
+            s_distance += threadIdx.y * BLOCK_SIZE;
+            s_trainIdx += threadIdx.y * BLOCK_SIZE;
+            s_imgIdx   += threadIdx.y * BLOCK_SIZE;
+
+            s_distance[threadIdx.x] = bestDistance1;
+            s_trainIdx[threadIdx.x] = bestTrainIdx1;
+            s_imgIdx[threadIdx.x]   = bestImgIdx1;
+
+            __syncthreads();
+
+            if (threadIdx.x == 0)
+            {
+                #pragma unroll
+                for (int i = 0; i < BLOCK_SIZE; ++i)
+                {
+                    float val = s_distance[i];
+
+                    if (val < myBestDistance1)
+                    {
+                        myBestDistance2 = myBestDistance1;
+                        myBestTrainIdx2 = myBestTrainIdx1;
+                        myBestImgIdx2   = myBestImgIdx1;
+
+                        myBestDistance1 = val;
+                        myBestTrainIdx1 = s_trainIdx[i];
+                        myBestImgIdx1   = s_imgIdx[i];
+                    }
+                    else if (val < myBestDistance2)
+                    {
+                        myBestDistance2 = val;
+                        myBestTrainIdx2 = s_trainIdx[i];
+                        myBestImgIdx2   = s_imgIdx[i];
+                    }
+                }
+            }
+
+            __syncthreads();
+
+            s_distance[threadIdx.x] = bestDistance2;
+            s_trainIdx[threadIdx.x] = bestTrainIdx2;
+            s_imgIdx[threadIdx.x]   = bestImgIdx2;
+
+            __syncthreads();
+
+            if (threadIdx.x == 0)
+            {
+                #pragma unroll
+                for (int i = 0; i < BLOCK_SIZE; ++i)
+                {
+                    float val = s_distance[i];
+
+                    if (val < myBestDistance2)
+                    {
+                        myBestDistance2 = val;
+                        myBestTrainIdx2 = s_trainIdx[i];
+                        myBestImgIdx2   = s_imgIdx[i];
+                    }
+                }
+            }
+
+            bestDistance1 = myBestDistance1;
+            bestDistance2 = myBestDistance2;
+
+            bestTrainIdx1 = myBestTrainIdx1;
+            bestTrainIdx2 = myBestTrainIdx2;
+
+            bestImgIdx1 = myBestImgIdx1;
+            bestImgIdx2 = myBestImgIdx2;
+        #endif
+        }
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // Match Unrolled Cached
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
+        __device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
+        {
+            #pragma unroll
+            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
+            {
+                const int loadX = threadIdx.x + i * BLOCK_SIZE;
+                s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
+            }
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
+                                           typename Dist::value_type* s_query, typename Dist::value_type* s_train,
+                                           float& bestDistance1, float& bestDistance2,
+                                           int& bestTrainIdx1, int& bestTrainIdx2,
+                                           int& bestImgIdx1, int& bestImgIdx2)
+        {
+            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
+            {
+                Dist dist;
+
+                #pragma unroll
+                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
+                {
+                    const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
+
+                    if (loadX < train.cols)
+                    {
+                        T val;
+
+                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
+                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
+                    }
+
+                    __syncthreads();
+
+                    #pragma unroll
+                    for (int j = 0; j < BLOCK_SIZE; ++j)
+                        dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+                    __syncthreads();
+                }
+
+                typename Dist::result_type distVal = dist;
+
+                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
+
+                if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
+                {
+                    if (distVal < bestDistance1)
+                    {
+                        bestImgIdx2   = bestImgIdx1;
+                        bestDistance2 = bestDistance1;
+                        bestTrainIdx2 = bestTrainIdx1;
+
+                        bestImgIdx1   = imgIdx;
+                        bestDistance1 = distVal;
+                        bestTrainIdx1 = trainIdx;
+                    }
+                    else if (distVal < bestDistance2)
+                    {
+                        bestImgIdx2   = imgIdx;
+                        bestDistance2 = distVal;
+                        bestTrainIdx2 = trainIdx;
+                    }
+                }
+            }
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
+        {
+            extern __shared__ int smem[];
+
+            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
+
+            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
+
+            float myBestDistance1 = numeric_limits<float>::max();
+            float myBestDistance2 = numeric_limits<float>::max();
+            int myBestTrainIdx1 = -1;
+            int myBestTrainIdx2 = -1;
+
+            loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
+
+            __syncthreads();
+
+            float* s_distance = (float*)(smem);
+            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
+
+            if (queryIdx < query.rows && threadIdx.x == 0)
+            {
+                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
+                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
+            }
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                                 const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
+                                 cudaStream_t stream)
+        {
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+            const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+            const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+            matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
+        {
+            extern __shared__ int smem[];
+
+            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
+
+            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
+
+            float myBestDistance1 = numeric_limits<float>::max();
+            float myBestDistance2 = numeric_limits<float>::max();
+            int myBestTrainIdx1 = -1;
+            int myBestTrainIdx2 = -1;
+            int myBestImgIdx1 = -1;
+            int myBestImgIdx2 = -1;
+
+            Mask m = mask;
+
+            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
+            {
+                const PtrStepSz<T> train = trains[imgIdx];
+                m.next();
+                loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
+            }
+
+            __syncthreads();
+
+            float* s_distance = (float*)(smem);
+            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
+
+            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
+
+            if (queryIdx < query.rows && threadIdx.x == 0)
+            {
+                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
+                bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
+                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
+            }
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
+                                 const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
+                                 cudaStream_t stream)
+        {
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+            const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+            const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+            matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // Match Unrolled
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
+                                     typename Dist::value_type* s_query, typename Dist::value_type* s_train,
+                                     float& bestDistance1, float& bestDistance2,
+                                     int& bestTrainIdx1, int& bestTrainIdx2,
+                                     int& bestImgIdx1, int& bestImgIdx2)
+        {
+            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
+            {
+                Dist dist;
+
+                #pragma unroll
+                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
+                {
+                    const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
+
+                    if (loadX < query.cols)
+                    {
+                        T val;
+
+                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
+                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
+
+                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
+                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
+                    }
+
+                    __syncthreads();
+
+                    #pragma unroll
+                    for (int j = 0; j < BLOCK_SIZE; ++j)
+                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+                    __syncthreads();
+                }
+
+                typename Dist::result_type distVal = dist;
+
+                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
+
+                if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
+                {
+                    if (distVal < bestDistance1)
+                    {
+                        bestImgIdx2   = bestImgIdx1;
+                        bestDistance2 = bestDistance1;
+                        bestTrainIdx2 = bestTrainIdx1;
+
+                        bestImgIdx1   = imgIdx;
+                        bestDistance1 = distVal;
+                        bestTrainIdx1 = trainIdx;
+                    }
+                    else if (distVal < bestDistance2)
+                    {
+                        bestImgIdx2   = imgIdx;
+                        bestDistance2 = distVal;
+                        bestTrainIdx2 = trainIdx;
+                    }
+                }
+            }
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
+        {
+            extern __shared__ int smem[];
+
+            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            float myBestDistance1 = numeric_limits<float>::max();
+            float myBestDistance2 = numeric_limits<float>::max();
+            int myBestTrainIdx1 = -1;
+            int myBestTrainIdx2 = -1;
+
+            loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
+
+            __syncthreads();
+
+            float* s_distance = (float*)(smem);
+            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
+
+            if (queryIdx < query.rows && threadIdx.x == 0)
+            {
+                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
+                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
+            }
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                           const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
+                           cudaStream_t stream)
+        {
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+            const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
+        {
+            extern __shared__ int smem[];
+
+            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            float myBestDistance1 = numeric_limits<float>::max();
+            float myBestDistance2 = numeric_limits<float>::max();
+            int myBestTrainIdx1 = -1;
+            int myBestTrainIdx2 = -1;
+            int myBestImgIdx1 = -1;
+            int myBestImgIdx2 = -1;
+
+            Mask m = mask;
+
+            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
+            {
+                const PtrStepSz<T> train = trains[imgIdx];
+                m.next();
+                loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
+            }
+
+            __syncthreads();
+
+            float* s_distance = (float*)(smem);
+            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
+
+            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
+
+            if (queryIdx < query.rows && threadIdx.x == 0)
+            {
+                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
+                bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
+                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
+            }
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
+                           const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
+                           cudaStream_t stream)
+        {
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+            const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+            const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // Match
+
+        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        __device__ void loop(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
+                             typename Dist::value_type* s_query, typename Dist::value_type* s_train,
+                             float& bestDistance1, float& bestDistance2,
+                             int& bestTrainIdx1, int& bestTrainIdx2,
+                             int& bestImgIdx1, int& bestImgIdx2)
+        {
+            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
+            {
+                Dist dist;
+
+                for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
+                {
+                    const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
+
+                    if (loadX < query.cols)
+                    {
+                        T val;
+
+                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
+                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
+
+                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
+                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
+                    }
+
+                    __syncthreads();
+
+                    #pragma unroll
+                    for (int j = 0; j < BLOCK_SIZE; ++j)
+                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+                    __syncthreads();
+                }
+
+                typename Dist::result_type distVal = dist;
+
+                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
+
+                if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
+                {
+                    if (distVal < bestDistance1)
+                    {
+                        bestImgIdx2   = bestImgIdx1;
+                        bestDistance2 = bestDistance1;
+                        bestTrainIdx2 = bestTrainIdx1;
+
+                        bestImgIdx1   = imgIdx;
+                        bestDistance1 = distVal;
+                        bestTrainIdx1 = trainIdx;
+                    }
+                    else if (distVal < bestDistance2)
+                    {
+                        bestImgIdx2   = imgIdx;
+                        bestDistance2 = distVal;
+                        bestTrainIdx2 = trainIdx;
+                    }
+                }
+            }
+        }
+
+        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
+        {
+            extern __shared__ int smem[];
+
+            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            float myBestDistance1 = numeric_limits<float>::max();
+            float myBestDistance2 = numeric_limits<float>::max();
+            int myBestTrainIdx1 = -1;
+            int myBestTrainIdx2 = -1;
+
+            loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
+
+            __syncthreads();
+
+            float* s_distance = (float*)(smem);
+            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
+
+            if (queryIdx < query.rows && threadIdx.x == 0)
+            {
+                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
+                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
+            }
+        }
+
+        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                   const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
+                   cudaStream_t stream)
+        {
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+            const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+            match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
+        {
+            extern __shared__ int smem[];
+
+            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            float myBestDistance1 = numeric_limits<float>::max();
+            float myBestDistance2 = numeric_limits<float>::max();
+            int myBestTrainIdx1 = -1;
+            int myBestTrainIdx2 = -1;
+            int myBestImgIdx1 = -1;
+            int myBestImgIdx2 = -1;
+
+            Mask m = mask;
+
+            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
+            {
+                const PtrStepSz<T> train = trains[imgIdx];
+                m.next();
+                loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
+            }
+
+            __syncthreads();
+
+            float* s_distance = (float*)(smem);
+            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
+
+            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
+
+            if (queryIdx < query.rows && threadIdx.x == 0)
+            {
+                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
+                bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
+                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
+            }
+        }
+
+        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
+                   const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
+                   cudaStream_t stream)
+        {
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+            const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+            const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+            match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // knnMatch 2 dispatcher
+
+        template <typename Dist, typename T, typename Mask>
+        void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                              const PtrStepSzb& trainIdx, const PtrStepSzb& distance,
+                              cudaStream_t stream)
+        {
+            if (query.cols <= 64)
+            {
+                matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+            else if (query.cols <= 128)
+            {
+                matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+            /*else if (query.cols <= 256)
+            {
+                matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+            else if (query.cols <= 512)
+            {
+                matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+            else if (query.cols <= 1024)
+            {
+                matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }*/
+            else
+            {
+                match<16, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+        }
+
+        template <typename Dist, typename T, typename Mask>
+        void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
+                              const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
+                              cudaStream_t stream)
+        {
+            if (query.cols <= 64)
+            {
+                matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+            else if (query.cols <= 128)
+            {
+                matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+            /*else if (query.cols <= 256)
+            {
+                matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+            else if (query.cols <= 512)
+            {
+                matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+            else if (query.cols <= 1024)
+            {
+                matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }*/
+            else
+            {
+                match<16, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+        }
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // Calc distance kernel
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
+        {
+            extern __shared__ int smem[];
+
+            const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
+            const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+
+            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            Dist dist;
+
+            #pragma unroll
+            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
+            {
+                const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+                if (loadX < query.cols)
+                {
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
+                }
+                else
+                {
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
+                }
+
+                __syncthreads();
+
+                #pragma unroll
+                for (int j = 0; j < BLOCK_SIZE; ++j)
+                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+                __syncthreads();
+            }
+
+            if (queryIdx < query.rows && trainIdx < train.rows)
+            {
+                float distVal = numeric_limits<float>::max();
+
+                if (mask(queryIdx, trainIdx))
+                    distVal = (typename Dist::result_type)dist;
+
+                allDist.ptr(queryIdx)[trainIdx] = distVal;
+            }
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        void calcDistanceUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
+        {
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
+
+            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+            calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        __global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
+        {
+            extern __shared__ int smem[];
+
+            const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
+            const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+
+            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            Dist dist;
+
+            for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
+            {
+                const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+                if (loadX < query.cols)
+                {
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
+                }
+                else
+                {
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
+                }
+
+                __syncthreads();
+
+                #pragma unroll
+                for (int j = 0; j < BLOCK_SIZE; ++j)
+                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+                __syncthreads();
+            }
+
+            if (queryIdx < query.rows && trainIdx < train.rows)
+            {
+                float distVal = numeric_limits<float>::max();
+
+                if (mask(queryIdx, trainIdx))
+                    distVal = (typename Dist::result_type)dist;
+
+                allDist.ptr(queryIdx)[trainIdx] = distVal;
+            }
+        }
+
+        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        void calcDistance(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
+        {
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
+
+            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+            calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // Calc Distance dispatcher
+
+        template <typename Dist, typename T, typename Mask>
+        void calcDistanceDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                                    const PtrStepSzf& allDist,
+                                    cudaStream_t stream)
+        {
+            if (query.cols <= 64)
+            {
+                calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
+            }
+            else if (query.cols <= 128)
+            {
+                calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);
+            }
+            /*else if (query.cols <= 256)
+            {
+                calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);
+            }
+            else if (query.cols <= 512)
+            {
+                calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);
+            }
+            else if (query.cols <= 1024)
+            {
+                calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);
+            }*/
+            else
+            {
+                calcDistance<16, Dist>(query, train, mask, allDist, stream);
+            }
+        }
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // find knn match kernel
+
+        template <int BLOCK_SIZE>
+        __global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
+        {
+            const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
+            __shared__ float s_dist[SMEM_SIZE];
+            __shared__ int s_trainIdx[SMEM_SIZE];
+
+            const int queryIdx = blockIdx.x;
+
+            float* allDistRow = allDist.ptr(queryIdx);
+
+            float dist = numeric_limits<float>::max();
+            int bestIdx = -1;
+
+            for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE)
+            {
+                float reg = allDistRow[i];
+                if (reg < dist)
+                {
+                    dist = reg;
+                    bestIdx = i;
+                }
+            }
+
+            s_dist[threadIdx.x] = dist;
+            s_trainIdx[threadIdx.x] = bestIdx;
+            __syncthreads();
+
+            reduceKeyVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<float>());
+
+            if (threadIdx.x == 0)
+            {
+                if (dist < numeric_limits<float>::max())
+                {
+                    allDistRow[bestIdx] = numeric_limits<float>::max();
+                    trainIdx.ptr(queryIdx)[i] = bestIdx;
+                    distance.ptr(queryIdx)[i] = dist;
+                }
+            }
+        }
+
+        template <int BLOCK_SIZE>
+        void findKnnMatch(int k, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzf& allDist, cudaStream_t stream)
+        {
+            const dim3 block(BLOCK_SIZE, 1, 1);
+            const dim3 grid(trainIdx.rows, 1, 1);
+
+            for (int i = 0; i < k; ++i)
+            {
+                findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);
+                cudaSafeCall( cudaGetLastError() );
+            }
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream)
+        {
+            findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
+        }
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // knn match Dispatcher
+
+        template <typename Dist, typename T, typename Mask>
+        void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, int k, const Mask& mask,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
+            cudaStream_t stream)
+        {
+            if (k == 2)
+            {
+                match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, stream);
+            }
+            else
+            {
+                calcDistanceDispatcher<Dist>(query, train, mask, allDist, stream);
+                findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
+            }
+        }
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // knn match caller
+
+        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
+            cudaStream_t stream)
+        {
+            if (mask.data)
+                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
+            else
+                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
+        }
+
+        template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+
+        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
+            cudaStream_t stream)
+        {
+            if (mask.data)
+                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
+            else
+                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
+        }
+
+        //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+
+        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
+            cudaStream_t stream)
+        {
+            if (mask.data)
+                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
+            else
+                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
+        }
+
+        template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+
+        template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
+            cudaStream_t stream)
+        {
+            if (masks.data)
+                match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
+            else
+                match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance,  stream);
+        }
+
+        template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        //template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        template void match2L1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+
+        template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
+            cudaStream_t stream)
+        {
+            if (masks.data)
+                match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
+            else
+                match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
+        }
+
+        //template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        //template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        //template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        //template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        //template void match2L2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+
+        template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
+            cudaStream_t stream)
+        {
+            if (masks.data)
+                match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
+            else
+                match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
+        }
+
+        template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        //template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        //template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        template void match2Hamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+    } // namespace bf_knnmatch
+}}} // namespace cv { namespace gpu { namespace device {
+
+
+#endif /* CUDA_DISABLER */