added dual tvl1 optical flow gpu implementation

[profile/ivi/opencv.git] / modules / gpu / src / cuda / bf_knnmatch.cu
diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu

index d67c92a..49bc1df 100644 (file)
--- a/modules/gpu/src/cuda/bf_knnmatch.cu
+++ b/modules/gpu/src/cuda/bf_knnmatch.cu
@@ -1,464 +1,1255 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////\r
-//\r
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.\r
-//\r
-//  By downloading, copying, installing or using the software you agree to this license.\r
-//  If you do not agree to this license, do not download, install,\r
-//  copy or use the software.\r
-//\r
-//\r
-//                           License Agreement\r
-//                For Open Source Computer Vision Library\r
-//\r
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.\r
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.\r
-// Third party copyrights are property of their respective owners.\r
-//\r
-// Redistribution and use in source and binary forms, with or without modification,\r
-// are permitted provided that the following conditions are met:\r
-//\r
-//   * Redistribution's of source code must retain the above copyright notice,\r
-//     this list of conditions and the following disclaimer.\r
-//\r
-//   * Redistribution's in binary form must reproduce the above copyright notice,\r
-//     this list of conditions and the following disclaimer in the documentation\r
-//     and/or other materials provided with the distribution.\r
-//\r
-//   * The name of the copyright holders may not be used to endorse or promote products\r
-//     derived from this software without specific prior written permission.\r
-//\r
-// This software is provided by the copyright holders and contributors "as is" and\r
-// any express or bpied warranties, including, but not limited to, the bpied\r
-// warranties of merchantability and fitness for a particular purpose are disclaimed.\r
-// In no event shall the Intel Corporation or contributors be liable for any direct,\r
-// indirect, incidental, special, exemplary, or consequential damages\r
-// (including, but not limited to, procurement of substitute goods or services;\r
-// loss of use, data, or profits; or business interruption) however caused\r
-// and on any theory of liability, whether in contract, strict liability,\r
-// or tort (including negligence or otherwise) arising in any way out of\r
-// the use of this software, even if advised of the possibility of such damage.\r
-//\r
-//M*/\r
-\r
-#include "internal_shared.hpp"\r
-#include "opencv2/gpu/device/limits.hpp"\r
-#include "opencv2/gpu/device/vec_distance.hpp"\r
-\r
-using namespace cv::gpu;\r
-using namespace cv::gpu::device;\r
-\r
-namespace cv { namespace gpu { namespace bf_knnmatch\r
-{\r
-    template <typename VecDiff, typename Dist, typename T, typename Mask>\r
-    __device__ void distanceCalcLoop(const PtrStep_<T>& query, const DevMem2D_<T>& train, const Mask& m, int queryIdx,\r
-        typename Dist::result_type& distMin1, typename Dist::result_type& distMin2, int& bestTrainIdx1, int& bestTrainIdx2, \r
-        typename Dist::result_type* smem)\r
-    {\r
-        const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x);\r
-        \r
-        typename Dist::result_type* sdiffRow = smem + blockDim.x * threadIdx.y;\r
-        \r
-        distMin1 = numeric_limits<typename Dist::result_type>::max();\r
-        distMin2 = numeric_limits<typename Dist::result_type>::max();\r
-\r
-        bestTrainIdx1 = -1;\r
-        bestTrainIdx2 = -1;\r
-\r
-        for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y)\r
-        {\r
-            if (m(queryIdx, trainIdx))\r
-            {\r
-                Dist dist;\r
-\r
-                const T* trainRow = train.ptr(trainIdx);\r
-                \r
-                vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x);\r
-\r
-                const typename Dist::result_type val = dist;\r
-\r
-                if (val < distMin1)\r
-                {\r
-                    distMin1 = val;\r
-                    bestTrainIdx1 = trainIdx;\r
-                }\r
-                else if (val < distMin2)\r
-                {\r
-                    distMin2 = val;\r
-                    bestTrainIdx2 = trainIdx;\r
-                }\r
-            }\r
-        }\r
-    }\r
-\r
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename VecDiff, typename Dist, typename T, typename Mask>\r
-    __global__ void knnMatch2(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask m, int2* trainIdx, float2* distance)\r
-    {\r
-        typedef typename Dist::result_type result_type;\r
-        typedef typename Dist::value_type value_type;\r
-\r
-        __shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];\r
-\r
-        const int queryIdx = blockIdx.x;\r
-\r
-        result_type distMin1;\r
-        result_type distMin2;\r
-\r
-        int bestTrainIdx1;\r
-        int bestTrainIdx2;\r
-\r
-        distanceCalcLoop<VecDiff, Dist>(query, train, m, queryIdx, distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem);\r
-        __syncthreads();\r
-\r
-        volatile result_type* sdistMinRow = smem;\r
-        volatile int* sbestTrainIdxRow = (int*)(sdistMinRow + 2 * BLOCK_DIM_Y);\r
-\r
-        if (threadIdx.x == 0)\r
-        {\r
-            sdistMinRow[threadIdx.y] = distMin1;\r
-            sdistMinRow[threadIdx.y + BLOCK_DIM_Y] = distMin2;\r
-\r
-            sbestTrainIdxRow[threadIdx.y] = bestTrainIdx1;            \r
-            sbestTrainIdxRow[threadIdx.y + BLOCK_DIM_Y] = bestTrainIdx2;\r
-        }\r
-        __syncthreads();\r
-\r
-        if (threadIdx.x == 0 && threadIdx.y == 0)\r
-        {\r
-            distMin1 = numeric_limits<result_type>::max();\r
-            distMin2 = numeric_limits<result_type>::max();\r
-\r
-            bestTrainIdx1 = -1;\r
-            bestTrainIdx2 = -1;\r
-\r
-            #pragma unroll\r
-            for (int i = 0; i < BLOCK_DIM_Y; ++i)\r
-            {\r
-                result_type val = sdistMinRow[i];\r
-\r
-                if (val < distMin1)\r
-                {\r
-                    distMin1 = val;\r
-                    bestTrainIdx1 = sbestTrainIdxRow[i];\r
-                }\r
-                else if (val < distMin2)\r
-                {\r
-                    distMin2 = val;\r
-                    bestTrainIdx2 = sbestTrainIdxRow[i];\r
-                }\r
-            }\r
-\r
-            #pragma unroll\r
-            for (int i = BLOCK_DIM_Y; i < 2 * BLOCK_DIM_Y; ++i)\r
-            {\r
-                result_type val = sdistMinRow[i];\r
-\r
-                if (val < distMin2)\r
-                {\r
-                    distMin2 = val;\r
-                    bestTrainIdx2 = sbestTrainIdxRow[i];\r
-                }\r
-            }\r
-\r
-            trainIdx[queryIdx] = make_int2(bestTrainIdx1, bestTrainIdx2);\r
-            distance[queryIdx] = make_float2(distMin1, distMin2);\r
-        }\r
-    }\r
-\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Knn 2 Match kernel caller\r
-\r
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>\r
-    void knnMatch2Simple_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
-        const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, \r
-        cudaStream_t stream)\r
-    {\r
-        const dim3 grid(query.rows, 1, 1);\r
-        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
-\r
-        knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T>\r
-            <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx, distance);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
-\r
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Mask>\r
-    void knnMatch2Cached_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
-        const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, \r
-        cudaStream_t stream)\r
-    {\r
-        StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check(); // block size must be greter than descriptors length\r
-        StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check();           // max descriptors length must divide to blockDimX\r
-\r
-        const dim3 grid(query.rows, 1, 1);\r
-        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
-\r
-        knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T>\r
-              <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
-\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Knn 2 Match Dispatcher\r
-    \r
-    template <typename Dist, typename T, typename Mask>\r
-    void knnMatch2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, \r
-        const DevMem2D& trainIdx, const DevMem2D& distance, \r
-        int cc, cudaStream_t stream)\r
-    {\r
-        if (query.cols < 64)\r
-        {\r
-            knnMatch2Cached_caller<16, 16, 64, false, Dist>(\r
-                query, train, mask, \r
-                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),\r
-                stream);\r
-        }\r
-        else if (query.cols == 64)\r
-        {\r
-            knnMatch2Cached_caller<16, 16, 64, true, Dist>(\r
-                query, train, mask, \r
-                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), \r
-                stream);\r
-        }\r
-        else if (query.cols < 128)\r
-        {\r
-            knnMatch2Cached_caller<16, 16, 128, false, Dist>(\r
-                query, train, mask, \r
-                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), \r
-                stream);\r
-        }\r
-        else if (query.cols == 128 && cc >= 12)\r
-        {\r
-            knnMatch2Cached_caller<16, 16, 128, true, Dist>(\r
-                query, train, mask, \r
-                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), \r
-                stream);\r
-        }\r
-        else if (query.cols < 256 && cc >= 12)\r
-        {\r
-            knnMatch2Cached_caller<16, 16, 256, false, Dist>(\r
-                query, train, mask, \r
-                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), \r
-                stream);\r
-        }\r
-        else if (query.cols == 256 && cc >= 12)\r
-        {\r
-            knnMatch2Cached_caller<16, 16, 256, true, Dist>(\r
-                query, train, mask, \r
-                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), \r
-                stream);\r
-        }\r
-        else\r
-        {\r
-            knnMatch2Simple_caller<16, 16, Dist>(\r
-                query, train, mask, \r
-                static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),\r
-                stream);\r
-        }\r
-    }\r
-    \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Calc distance kernel\r
-\r
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>\r
-    __global__ void calcDistance(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf distance)\r
-    {\r
-        __shared__ typename Dist::result_type sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];\r
-\r
-        typename Dist::result_type* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;\r
-        \r
-        const int queryIdx = blockIdx.x;\r
-        const T* queryDescs = query.ptr(queryIdx);\r
-\r
-        const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;\r
-\r
-        if (trainIdx < train.rows)\r
-        {\r
-            const T* trainDescs = train.ptr(trainIdx);\r
-\r
-            typename Dist::result_type myDist = numeric_limits<typename Dist::result_type>::max();\r
-\r
-            if (mask(queryIdx, trainIdx))\r
-            {\r
-                Dist dist;\r
-\r
-                calcVecDiffGlobal<BLOCK_DIM_X>(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x);\r
-\r
-                myDist = dist;\r
-            }\r
-            \r
-            if (threadIdx.x == 0)\r
-                distance.ptr(queryIdx)[trainIdx] = myDist;\r
-        }\r
-    }\r
-\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // Calc distance kernel caller\r
-\r
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>\r
-    void calcDistance_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& distance, cudaStream_t stream)\r
-    {\r
-        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
-        const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1);\r
-\r
-        calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(query, train, mask, distance);\r
-        cudaSafeCall( cudaGetLastError() );\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
-\r
-    template <typename Dist, typename T, typename Mask>\r
-    void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2D& allDist, cudaStream_t stream)\r
-    {\r
-        calcDistance_caller<16, 16, Dist>(query, train, mask, static_cast<DevMem2Df>(allDist), stream);\r
-    }\r
-\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // find knn match kernel\r
-\r
-    template <int BLOCK_SIZE> __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_)\r
-    {\r
-        const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;\r
-        __shared__ float sdist[SMEM_SIZE];\r
-        __shared__ int strainIdx[SMEM_SIZE];\r
-\r
-        const int queryIdx = blockIdx.x;\r
-\r
-        float* allDist = allDist_.ptr(queryIdx);\r
-        int* trainIdx = trainIdx_.ptr(queryIdx);\r
-        float* distance = distance_.ptr(queryIdx);\r
-\r
-        float dist = numeric_limits<float>::max();\r
-        int bestIdx = -1;\r
-        \r
-        for (int i = threadIdx.x; i < allDist_.cols; i += BLOCK_SIZE)\r
-        {\r
-            float reg = allDist[i];\r
-            if (reg < dist)\r
-            {\r
-                dist = reg;\r
-                bestIdx = i;\r
-            }\r
-        }\r
-\r
-        sdist[threadIdx.x] = dist;\r
-        strainIdx[threadIdx.x] = bestIdx;\r
-        __syncthreads();\r
-\r
-        reducePredVal<BLOCK_SIZE>(sdist, dist, strainIdx, bestIdx, threadIdx.x, less<volatile float>());\r
-\r
-        if (threadIdx.x == 0)\r
-        {\r
-            if (dist < numeric_limits<float>::max())\r
-            {\r
-                allDist[bestIdx] = numeric_limits<float>::max();\r
-                trainIdx[i] = bestIdx;\r
-                distance[i] = dist;\r
-            }\r
-        }\r
-    }\r
-    \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // find knn match kernel caller\r
-\r
-    template <int BLOCK_SIZE> void findKnnMatch_caller(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)\r
-    {\r
-        const dim3 threads(BLOCK_SIZE, 1, 1);\r
-        const dim3 grid(trainIdx.rows, 1, 1);\r
-\r
-        for (int i = 0; i < k; ++i)\r
-        {\r
-            findBestMatch<BLOCK_SIZE><<<grid, threads, 0, stream>>>(allDist, i, trainIdx, distance);\r
-            cudaSafeCall( cudaGetLastError() );\r
-        }\r
-\r
-        if (stream == 0)\r
-            cudaSafeCall( cudaDeviceSynchronize() );\r
-    }\r
-\r
-    void findKnnMatchDispatcher(int k, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, cudaStream_t stream)\r
-    {\r
-        findKnnMatch_caller<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), static_cast<DevMem2Df>(allDist), stream);\r
-    }\r
-    \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // knn match Dispatcher\r
-\r
-    template <typename Dist, typename T>\r
-    void knnMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const DevMem2D& mask, \r
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, \r
-        int cc, cudaStream_t stream)\r
-    {\r
-        if (mask.data)\r
-        {\r
-            if (k == 2)\r
-            {\r
-                knnMatch2Dispatcher<Dist>(query, train, SingleMask(mask), trainIdx, distance, cc, stream);\r
-                return;\r
-            }\r
-\r
-            calcDistanceDispatcher<Dist>(query, train, SingleMask(mask), allDist, stream);\r
-        }\r
-        else\r
-        {\r
-            if (k == 2)\r
-            {\r
-                knnMatch2Dispatcher<Dist>(query, train, WithOutMask(), trainIdx, distance, cc, stream);\r
-                return;\r
-            }\r
-\r
-            calcDistanceDispatcher<Dist>(query, train, WithOutMask(), allDist, stream);\r
-        }\r
-\r
-        findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);\r
-    }\r
-    \r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // knn match caller\r
-\r
-    template <typename T> void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, \r
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, \r
-        int cc, cudaStream_t stream)\r
-    {\r
-        knnMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);\r
-    }\r
-\r
-    template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);\r
-    //template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);\r
-    template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);\r
-    template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);\r
-    template void knnMatchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);\r
-    template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);\r
-\r
-    template <typename T> void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, \r
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,\r
-        int cc, cudaStream_t stream)\r
-    {\r
-        knnMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);\r
-    }\r
-\r
-    //template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);\r
-    //template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);\r
-    //template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);\r
-    //template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);\r
-    //template void knnMatchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);\r
-    template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);\r
-\r
-    template <typename T> void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,\r
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, \r
-        int cc, cudaStream_t stream)\r
-    {\r
-        knnMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);\r
-    }\r
-\r
-    template void knnMatchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);\r
-    //template void knnMatchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);\r
-    template void knnMatchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);\r
-    //template void knnMatchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);\r
-    template void knnMatchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);\r
-}}}\r
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or bpied warranties, including, but not limited to, the bpied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/utility.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
+#include "opencv2/gpu/device/limits.hpp"
+#include "opencv2/gpu/device/vec_distance.hpp"
+#include "opencv2/gpu/device/datamov_utils.hpp"
+#include "opencv2/gpu/device/warp_shuffle.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+    namespace bf_knnmatch
+    {
+        ///////////////////////////////////////////////////////////////////////////////
+        // Reduction
+
+        template <int BLOCK_SIZE>
+        __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
+                                      int& bestTrainIdx1, int& bestTrainIdx2,
+                                      float* s_distance, int* s_trainIdx)
+        {
+        #if __CUDA_ARCH__ >= 300
+            (void) s_distance;
+            (void) s_trainIdx;
+
+            float d1, d2;
+            int i1, i2;
+
+            #pragma unroll
+            for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
+            {
+                d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
+                d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
+                i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
+                i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
+
+                if (bestDistance1 < d1)
+                {
+                    if (d1 < bestDistance2)
+                    {
+                        bestDistance2 = d1;
+                        bestTrainIdx2 = i1;
+                    }
+                }
+                else
+                {
+                    bestDistance2 = bestDistance1;
+                    bestTrainIdx2 = bestTrainIdx1;
+
+                    bestDistance1 = d1;
+                    bestTrainIdx1 = i1;
+
+                    if (d2 < bestDistance2)
+                    {
+                        bestDistance2 = d2;
+                        bestTrainIdx2 = i2;
+                    }
+                }
+            }
+        #else
+            float myBestDistance1 = numeric_limits<float>::max();
+            float myBestDistance2 = numeric_limits<float>::max();
+            int myBestTrainIdx1 = -1;
+            int myBestTrainIdx2 = -1;
+
+            s_distance += threadIdx.y * BLOCK_SIZE;
+            s_trainIdx += threadIdx.y * BLOCK_SIZE;
+
+            s_distance[threadIdx.x] = bestDistance1;
+            s_trainIdx[threadIdx.x] = bestTrainIdx1;
+
+            __syncthreads();
+
+            if (threadIdx.x == 0)
+            {
+                #pragma unroll
+                for (int i = 0; i < BLOCK_SIZE; ++i)
+                {
+                    float val = s_distance[i];
+
+                    if (val < myBestDistance1)
+                    {
+                        myBestDistance2 = myBestDistance1;
+                        myBestTrainIdx2 = myBestTrainIdx1;
+
+                        myBestDistance1 = val;
+                        myBestTrainIdx1 = s_trainIdx[i];
+                    }
+                    else if (val < myBestDistance2)
+                    {
+                        myBestDistance2 = val;
+                        myBestTrainIdx2 = s_trainIdx[i];
+                    }
+                }
+            }
+
+            __syncthreads();
+
+            s_distance[threadIdx.x] = bestDistance2;
+            s_trainIdx[threadIdx.x] = bestTrainIdx2;
+
+            __syncthreads();
+
+            if (threadIdx.x == 0)
+            {
+                #pragma unroll
+                for (int i = 0; i < BLOCK_SIZE; ++i)
+                {
+                    float val = s_distance[i];
+
+                    if (val < myBestDistance2)
+                    {
+                        myBestDistance2 = val;
+                        myBestTrainIdx2 = s_trainIdx[i];
+                    }
+                }
+            }
+
+            bestDistance1 = myBestDistance1;
+            bestDistance2 = myBestDistance2;
+
+            bestTrainIdx1 = myBestTrainIdx1;
+            bestTrainIdx2 = myBestTrainIdx2;
+        #endif
+        }
+
+        template <int BLOCK_SIZE>
+        __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
+                                       int& bestTrainIdx1, int& bestTrainIdx2,
+                                       int& bestImgIdx1, int& bestImgIdx2,
+                                       float* s_distance, int* s_trainIdx, int* s_imgIdx)
+        {
+        #if __CUDA_ARCH__ >= 300
+            (void) s_distance;
+            (void) s_trainIdx;
+            (void) s_imgIdx;
+
+            float d1, d2;
+            int i1, i2;
+            int j1, j2;
+
+            #pragma unroll
+            for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
+            {
+                d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
+                d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
+                i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
+                i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
+                j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE);
+                j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE);
+
+                if (bestDistance1 < d1)
+                {
+                    if (d1 < bestDistance2)
+                    {
+                        bestDistance2 = d1;
+                        bestTrainIdx2 = i1;
+                        bestImgIdx2 = j1;
+                    }
+                }
+                else
+                {
+                    bestDistance2 = bestDistance1;
+                    bestTrainIdx2 = bestTrainIdx1;
+                    bestImgIdx2 = bestImgIdx1;
+
+                    bestDistance1 = d1;
+                    bestTrainIdx1 = i1;
+                    bestImgIdx1 = j1;
+
+                    if (d2 < bestDistance2)
+                    {
+                        bestDistance2 = d2;
+                        bestTrainIdx2 = i2;
+                        bestImgIdx2 = j2;
+                    }
+                }
+            }
+        #else
+            float myBestDistance1 = numeric_limits<float>::max();
+            float myBestDistance2 = numeric_limits<float>::max();
+            int myBestTrainIdx1 = -1;
+            int myBestTrainIdx2 = -1;
+            int myBestImgIdx1 = -1;
+            int myBestImgIdx2 = -1;
+
+            s_distance += threadIdx.y * BLOCK_SIZE;
+            s_trainIdx += threadIdx.y * BLOCK_SIZE;
+            s_imgIdx   += threadIdx.y * BLOCK_SIZE;
+
+            s_distance[threadIdx.x] = bestDistance1;
+            s_trainIdx[threadIdx.x] = bestTrainIdx1;
+            s_imgIdx[threadIdx.x]   = bestImgIdx1;
+
+            __syncthreads();
+
+            if (threadIdx.x == 0)
+            {
+                #pragma unroll
+                for (int i = 0; i < BLOCK_SIZE; ++i)
+                {
+                    float val = s_distance[i];
+
+                    if (val < myBestDistance1)
+                    {
+                        myBestDistance2 = myBestDistance1;
+                        myBestTrainIdx2 = myBestTrainIdx1;
+                        myBestImgIdx2   = myBestImgIdx1;
+
+                        myBestDistance1 = val;
+                        myBestTrainIdx1 = s_trainIdx[i];
+                        myBestImgIdx1   = s_imgIdx[i];
+                    }
+                    else if (val < myBestDistance2)
+                    {
+                        myBestDistance2 = val;
+                        myBestTrainIdx2 = s_trainIdx[i];
+                        myBestImgIdx2   = s_imgIdx[i];
+                    }
+                }
+            }
+
+            __syncthreads();
+
+            s_distance[threadIdx.x] = bestDistance2;
+            s_trainIdx[threadIdx.x] = bestTrainIdx2;
+            s_imgIdx[threadIdx.x]   = bestImgIdx2;
+
+            __syncthreads();
+
+            if (threadIdx.x == 0)
+            {
+                #pragma unroll
+                for (int i = 0; i < BLOCK_SIZE; ++i)
+                {
+                    float val = s_distance[i];
+
+                    if (val < myBestDistance2)
+                    {
+                        myBestDistance2 = val;
+                        myBestTrainIdx2 = s_trainIdx[i];
+                        myBestImgIdx2   = s_imgIdx[i];
+                    }
+                }
+            }
+
+            bestDistance1 = myBestDistance1;
+            bestDistance2 = myBestDistance2;
+
+            bestTrainIdx1 = myBestTrainIdx1;
+            bestTrainIdx2 = myBestTrainIdx2;
+
+            bestImgIdx1 = myBestImgIdx1;
+            bestImgIdx2 = myBestImgIdx2;
+        #endif
+        }
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // Match Unrolled Cached
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
+        __device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
+        {
+            #pragma unroll
+            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
+            {
+                const int loadX = threadIdx.x + i * BLOCK_SIZE;
+                s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
+            }
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
+                                           typename Dist::value_type* s_query, typename Dist::value_type* s_train,
+                                           float& bestDistance1, float& bestDistance2,
+                                           int& bestTrainIdx1, int& bestTrainIdx2,
+                                           int& bestImgIdx1, int& bestImgIdx2)
+        {
+            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
+            {
+                Dist dist;
+
+                #pragma unroll
+                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
+                {
+                    const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
+
+                    if (loadX < train.cols)
+                    {
+                        T val;
+
+                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
+                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
+                    }
+
+                    __syncthreads();
+
+                    #pragma unroll
+                    for (int j = 0; j < BLOCK_SIZE; ++j)
+                        dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+                    __syncthreads();
+                }
+
+                typename Dist::result_type distVal = dist;
+
+                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
+
+                if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
+                {
+                    if (distVal < bestDistance1)
+                    {
+                        bestImgIdx2   = bestImgIdx1;
+                        bestDistance2 = bestDistance1;
+                        bestTrainIdx2 = bestTrainIdx1;
+
+                        bestImgIdx1   = imgIdx;
+                        bestDistance1 = distVal;
+                        bestTrainIdx1 = trainIdx;
+                    }
+                    else if (distVal < bestDistance2)
+                    {
+                        bestImgIdx2   = imgIdx;
+                        bestDistance2 = distVal;
+                        bestTrainIdx2 = trainIdx;
+                    }
+                }
+            }
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
+        {
+            extern __shared__ int smem[];
+
+            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
+
+            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
+
+            float myBestDistance1 = numeric_limits<float>::max();
+            float myBestDistance2 = numeric_limits<float>::max();
+            int myBestTrainIdx1 = -1;
+            int myBestTrainIdx2 = -1;
+
+            loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
+
+            __syncthreads();
+
+            float* s_distance = (float*)(smem);
+            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
+
+            if (queryIdx < query.rows && threadIdx.x == 0)
+            {
+                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
+                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
+            }
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                                 const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
+                                 cudaStream_t stream)
+        {
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+            const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+            const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+            matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
+        {
+            extern __shared__ int smem[];
+
+            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
+
+            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
+
+            float myBestDistance1 = numeric_limits<float>::max();
+            float myBestDistance2 = numeric_limits<float>::max();
+            int myBestTrainIdx1 = -1;
+            int myBestTrainIdx2 = -1;
+            int myBestImgIdx1 = -1;
+            int myBestImgIdx2 = -1;
+
+            Mask m = mask;
+
+            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
+            {
+                const PtrStepSz<T> train = trains[imgIdx];
+                m.next();
+                loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
+            }
+
+            __syncthreads();
+
+            float* s_distance = (float*)(smem);
+            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
+
+            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
+
+            if (queryIdx < query.rows && threadIdx.x == 0)
+            {
+                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
+                bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
+                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
+            }
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
+                                 const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
+                                 cudaStream_t stream)
+        {
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+            const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+            const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+            matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // Match Unrolled
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
+                                     typename Dist::value_type* s_query, typename Dist::value_type* s_train,
+                                     float& bestDistance1, float& bestDistance2,
+                                     int& bestTrainIdx1, int& bestTrainIdx2,
+                                     int& bestImgIdx1, int& bestImgIdx2)
+        {
+            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
+            {
+                Dist dist;
+
+                #pragma unroll
+                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
+                {
+                    const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
+
+                    if (loadX < query.cols)
+                    {
+                        T val;
+
+                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
+                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
+
+                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
+                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
+                    }
+
+                    __syncthreads();
+
+                    #pragma unroll
+                    for (int j = 0; j < BLOCK_SIZE; ++j)
+                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+                    __syncthreads();
+                }
+
+                typename Dist::result_type distVal = dist;
+
+                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
+
+                if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
+                {
+                    if (distVal < bestDistance1)
+                    {
+                        bestImgIdx2   = bestImgIdx1;
+                        bestDistance2 = bestDistance1;
+                        bestTrainIdx2 = bestTrainIdx1;
+
+                        bestImgIdx1   = imgIdx;
+                        bestDistance1 = distVal;
+                        bestTrainIdx1 = trainIdx;
+                    }
+                    else if (distVal < bestDistance2)
+                    {
+                        bestImgIdx2   = imgIdx;
+                        bestDistance2 = distVal;
+                        bestTrainIdx2 = trainIdx;
+                    }
+                }
+            }
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
+        {
+            extern __shared__ int smem[];
+
+            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            float myBestDistance1 = numeric_limits<float>::max();
+            float myBestDistance2 = numeric_limits<float>::max();
+            int myBestTrainIdx1 = -1;
+            int myBestTrainIdx2 = -1;
+
+            loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
+
+            __syncthreads();
+
+            float* s_distance = (float*)(smem);
+            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
+
+            if (queryIdx < query.rows && threadIdx.x == 0)
+            {
+                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
+                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
+            }
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                           const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
+                           cudaStream_t stream)
+        {
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+            const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
+        {
+            extern __shared__ int smem[];
+
+            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            float myBestDistance1 = numeric_limits<float>::max();
+            float myBestDistance2 = numeric_limits<float>::max();
+            int myBestTrainIdx1 = -1;
+            int myBestTrainIdx2 = -1;
+            int myBestImgIdx1 = -1;
+            int myBestImgIdx2 = -1;
+
+            Mask m = mask;
+
+            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
+            {
+                const PtrStepSz<T> train = trains[imgIdx];
+                m.next();
+                loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
+            }
+
+            __syncthreads();
+
+            float* s_distance = (float*)(smem);
+            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
+
+            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
+
+            if (queryIdx < query.rows && threadIdx.x == 0)
+            {
+                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
+                bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
+                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
+            }
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
+                           const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
+                           cudaStream_t stream)
+        {
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+            const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+            const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // Match
+
+        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        __device__ void loop(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
+                             typename Dist::value_type* s_query, typename Dist::value_type* s_train,
+                             float& bestDistance1, float& bestDistance2,
+                             int& bestTrainIdx1, int& bestTrainIdx2,
+                             int& bestImgIdx1, int& bestImgIdx2)
+        {
+            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
+            {
+                Dist dist;
+
+                for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
+                {
+                    const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
+
+                    if (loadX < query.cols)
+                    {
+                        T val;
+
+                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
+                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
+
+                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
+                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
+                    }
+
+                    __syncthreads();
+
+                    #pragma unroll
+                    for (int j = 0; j < BLOCK_SIZE; ++j)
+                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+                    __syncthreads();
+                }
+
+                typename Dist::result_type distVal = dist;
+
+                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
+
+                if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
+                {
+                    if (distVal < bestDistance1)
+                    {
+                        bestImgIdx2   = bestImgIdx1;
+                        bestDistance2 = bestDistance1;
+                        bestTrainIdx2 = bestTrainIdx1;
+
+                        bestImgIdx1   = imgIdx;
+                        bestDistance1 = distVal;
+                        bestTrainIdx1 = trainIdx;
+                    }
+                    else if (distVal < bestDistance2)
+                    {
+                        bestImgIdx2   = imgIdx;
+                        bestDistance2 = distVal;
+                        bestTrainIdx2 = trainIdx;
+                    }
+                }
+            }
+        }
+
+        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
+        {
+            extern __shared__ int smem[];
+
+            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            float myBestDistance1 = numeric_limits<float>::max();
+            float myBestDistance2 = numeric_limits<float>::max();
+            int myBestTrainIdx1 = -1;
+            int myBestTrainIdx2 = -1;
+
+            loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
+
+            __syncthreads();
+
+            float* s_distance = (float*)(smem);
+            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
+
+            if (queryIdx < query.rows && threadIdx.x == 0)
+            {
+                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
+                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
+            }
+        }
+
+        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                   const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
+                   cudaStream_t stream)
+        {
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+            const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+            match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
+        {
+            extern __shared__ int smem[];
+
+            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            float myBestDistance1 = numeric_limits<float>::max();
+            float myBestDistance2 = numeric_limits<float>::max();
+            int myBestTrainIdx1 = -1;
+            int myBestTrainIdx2 = -1;
+            int myBestImgIdx1 = -1;
+            int myBestImgIdx2 = -1;
+
+            Mask m = mask;
+
+            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
+            {
+                const PtrStepSz<T> train = trains[imgIdx];
+                m.next();
+                loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
+            }
+
+            __syncthreads();
+
+            float* s_distance = (float*)(smem);
+            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
+
+            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
+
+            if (queryIdx < query.rows && threadIdx.x == 0)
+            {
+                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
+                bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
+                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
+            }
+        }
+
+        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
+                   const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
+                   cudaStream_t stream)
+        {
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+            const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+            const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+            match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // knnMatch 2 dispatcher
+
+        template <typename Dist, typename T, typename Mask>
+        void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                              const PtrStepSzb& trainIdx, const PtrStepSzb& distance,
+                              cudaStream_t stream)
+        {
+            if (query.cols <= 64)
+            {
+                matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+            else if (query.cols <= 128)
+            {
+                matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+            /*else if (query.cols <= 256)
+            {
+                matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+            else if (query.cols <= 512)
+            {
+                matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+            else if (query.cols <= 1024)
+            {
+                matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }*/
+            else
+            {
+                match<16, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+        }
+
+        template <typename Dist, typename T, typename Mask>
+        void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
+                              const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
+                              cudaStream_t stream)
+        {
+            if (query.cols <= 64)
+            {
+                matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+            else if (query.cols <= 128)
+            {
+                matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+            /*else if (query.cols <= 256)
+            {
+                matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+            else if (query.cols <= 512)
+            {
+                matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+            else if (query.cols <= 1024)
+            {
+                matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }*/
+            else
+            {
+                match<16, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
+            }
+        }
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // Calc distance kernel
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
+        {
+            extern __shared__ int smem[];
+
+            const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
+            const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+
+            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            Dist dist;
+
+            #pragma unroll
+            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
+            {
+                const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+                if (loadX < query.cols)
+                {
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
+                }
+                else
+                {
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
+                }
+
+                __syncthreads();
+
+                #pragma unroll
+                for (int j = 0; j < BLOCK_SIZE; ++j)
+                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+                __syncthreads();
+            }
+
+            if (queryIdx < query.rows && trainIdx < train.rows)
+            {
+                float distVal = numeric_limits<float>::max();
+
+                if (mask(queryIdx, trainIdx))
+                    distVal = (typename Dist::result_type)dist;
+
+                allDist.ptr(queryIdx)[trainIdx] = distVal;
+            }
+        }
+
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        void calcDistanceUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
+        {
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
+
+            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+            calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        __global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
+        {
+            extern __shared__ int smem[];
+
+            const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
+            const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+
+            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            Dist dist;
+
+            for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
+            {
+                const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+                if (loadX < query.cols)
+                {
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
+                }
+                else
+                {
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
+                }
+
+                __syncthreads();
+
+                #pragma unroll
+                for (int j = 0; j < BLOCK_SIZE; ++j)
+                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+                __syncthreads();
+            }
+
+            if (queryIdx < query.rows && trainIdx < train.rows)
+            {
+                float distVal = numeric_limits<float>::max();
+
+                if (mask(queryIdx, trainIdx))
+                    distVal = (typename Dist::result_type)dist;
+
+                allDist.ptr(queryIdx)[trainIdx] = distVal;
+            }
+        }
+
+        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        void calcDistance(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
+        {
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
+
+            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+            calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // Calc Distance dispatcher
+
+        template <typename Dist, typename T, typename Mask>
+        void calcDistanceDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
+                                    const PtrStepSzf& allDist,
+                                    cudaStream_t stream)
+        {
+            if (query.cols <= 64)
+            {
+                calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
+            }
+            else if (query.cols <= 128)
+            {
+                calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);
+            }
+            /*else if (query.cols <= 256)
+            {
+                calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);
+            }
+            else if (query.cols <= 512)
+            {
+                calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);
+            }
+            else if (query.cols <= 1024)
+            {
+                calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);
+            }*/
+            else
+            {
+                calcDistance<16, Dist>(query, train, mask, allDist, stream);
+            }
+        }
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // find knn match kernel
+
+        template <int BLOCK_SIZE>
+        __global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
+        {
+            const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
+            __shared__ float s_dist[SMEM_SIZE];
+            __shared__ int s_trainIdx[SMEM_SIZE];
+
+            const int queryIdx = blockIdx.x;
+
+            float* allDistRow = allDist.ptr(queryIdx);
+
+            float dist = numeric_limits<float>::max();
+            int bestIdx = -1;
+
+            for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE)
+            {
+                float reg = allDistRow[i];
+                if (reg < dist)
+                {
+                    dist = reg;
+                    bestIdx = i;
+                }
+            }
+
+            s_dist[threadIdx.x] = dist;
+            s_trainIdx[threadIdx.x] = bestIdx;
+            __syncthreads();
+
+            reduceKeyVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<float>());
+
+            if (threadIdx.x == 0)
+            {
+                if (dist < numeric_limits<float>::max())
+                {
+                    allDistRow[bestIdx] = numeric_limits<float>::max();
+                    trainIdx.ptr(queryIdx)[i] = bestIdx;
+                    distance.ptr(queryIdx)[i] = dist;
+                }
+            }
+        }
+
+        template <int BLOCK_SIZE>
+        void findKnnMatch(int k, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzf& allDist, cudaStream_t stream)
+        {
+            const dim3 block(BLOCK_SIZE, 1, 1);
+            const dim3 grid(trainIdx.rows, 1, 1);
+
+            for (int i = 0; i < k; ++i)
+            {
+                findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);
+                cudaSafeCall( cudaGetLastError() );
+            }
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream)
+        {
+            findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
+        }
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // knn match Dispatcher
+
+        template <typename Dist, typename T, typename Mask>
+        void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, int k, const Mask& mask,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
+            cudaStream_t stream)
+        {
+            if (k == 2)
+            {
+                match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, stream);
+            }
+            else
+            {
+                calcDistanceDispatcher<Dist>(query, train, mask, allDist, stream);
+                findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
+            }
+        }
+
+        ///////////////////////////////////////////////////////////////////////////////
+        // knn match caller
+
+        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
+            cudaStream_t stream)
+        {
+            if (mask.data)
+                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
+            else
+                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
+        }
+
+        template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+
+        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
+            cudaStream_t stream)
+        {
+            if (mask.data)
+                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
+            else
+                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
+        }
+
+        //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+
+        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
+            cudaStream_t stream)
+        {
+            if (mask.data)
+                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
+            else
+                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
+        }
+
+        template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+        template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
+
+        template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
+            cudaStream_t stream)
+        {
+            if (masks.data)
+                match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
+            else
+                match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance,  stream);
+        }
+
+        template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        //template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        template void match2L1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+
+        template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
+            cudaStream_t stream)
+        {
+            if (masks.data)
+                match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
+            else
+                match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
+        }
+
+        //template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        //template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        //template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        //template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        //template void match2L2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+
+        template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
+            const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
+            cudaStream_t stream)
+        {
+            if (masks.data)
+                match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
+            else
+                match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
+        }
+
+        template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        //template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        //template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+        template void match2Hamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
+    } // namespace bf_knnmatch
+}}} // namespace cv { namespace gpu { namespace device {
+
+
+#endif /* CUDA_DISABLER */