fixed gpu tests (BruteForceMatcher_GPU, divide, phase, cartToPolar, async)
author     Vladislav Vinogradov <no@email>
           Mon, 31 Jan 2011 13:20:52 +0000 (13:20 +0000)
committer  Vladislav Vinogradov <no@email>
           Mon, 31 Jan 2011 13:20:52 +0000 (13:20 +0000)
minor code refactoring

modules/gpu/include/opencv2/gpu/gpu.hpp
modules/gpu/src/brute_force_matcher.cpp
modules/gpu/src/cuda/brute_force_matcher.cu
modules/gpu/src/imgproc_gpu.cpp
tests/gpu/src/arithm.cpp
tests/gpu/src/brute_force_matcher.cpp
tests/gpu/src/gputest_main.cpp
tests/gpu/src/imgproc_gpu.cpp
tests/gpu/src/operator_async_call.cpp

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index b719417..2531d1e 100644 (file)
@@ -671,10 +671,12 @@ namespace cv
         //! output will have CV_32FC1 type\r
         CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect);\r
 \r
-        //! applies Canny edge detector and produces the edge map\r
-        //! supprots only CV_8UC1 source type\r
-        //! disabled until fix crash\r
-        CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize = 3);\r
+        // applies Canny edge detector and produces the edge map\r
+        // disabled until fix crash\r
+        //CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize = 3);\r
+        //CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, GpuMat& buffer, double threshold1, double threshold2, int apertureSize = 3);\r
+        //CV_EXPORTS void Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, double threshold1, double threshold2, int apertureSize = 3);\r
+        //CV_EXPORTS void Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, GpuMat& buffer, double threshold1, double threshold2, int apertureSize = 3);\r
 \r
         //! computes Harris cornerness criteria at each image pixel\r
         CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101);\r
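[Note] The hunk above replaces the single exported Canny declaration with four commented-out overloads (adding a reusable buffer variant and precomputed-derivative variants) until the NPP crash is resolved. In the meantime a caller can route through the CPU path; the wrapper below is an illustrative sketch using the standard OpenCV 2.x API, not part of this patch.

// Hypothetical interim fallback while gpu::Canny is disabled: download the
// image, run the CPU Canny, and upload the result back to the device.
#include <opencv2/gpu/gpu.hpp>
#include <opencv2/imgproc/imgproc.hpp>

void cannyFallback(const cv::gpu::GpuMat& image, cv::gpu::GpuMat& edges,
                   double threshold1, double threshold2, int apertureSize = 3)
{
    cv::Mat h_image, h_edges;
    image.download(h_image);                                        // GPU -> host
    cv::Canny(h_image, h_edges, threshold1, threshold2, apertureSize);
    edges.upload(h_edges);                                          // host -> GPU
}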
diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp
index 3447121..4806e67 100644 (file)
@@ -104,6 +104,18 @@ namespace cv { namespace gpu { namespace bfmatcher
         const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);\r
 }}}\r
 \r
+namespace\r
+{\r
+    class ImgIdxSetter\r
+    {\r
+    public:\r
+        ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {}\r
+        void operator()(DMatch& m) const {m.imgIdx = imgIdx;}\r
+    private:\r
+        int imgIdx;\r
+    };\r
+}\r
+\r
 cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_)\r
 {\r
 }\r
@@ -185,7 +197,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
         return;\r
 \r
     CV_Assert(trainIdx.type() == CV_32SC1 && trainIdx.isContinuous());\r
-    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous() && distance.size().area() == trainIdx.size().area());\r
+    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous() && distance.cols == trainIdx.cols);\r
 \r
     const int nQuery = trainIdx.cols;\r
 \r
@@ -309,8 +321,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
         return;\r
 \r
     CV_Assert(trainIdx.type() == CV_32SC1 && trainIdx.isContinuous());\r
-    CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.isContinuous());\r
-    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous());\r
+    CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.isContinuous() && imgIdx.cols == trainIdx.cols);\r
+    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous() && distance.cols == trainIdx.cols);\r
 \r
     const int nQuery = trainIdx.cols;\r
 \r
@@ -390,7 +402,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
     trainIdx.setTo(Scalar::all(-1));\r
     distance.create(nQuery, k, CV_32F);\r
 \r
-    allDist.create(nQuery, nTrain, CV_32F);\r
+    ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist);\r
 \r
     match_caller_t func = match_callers[distType][queryDescs.depth()];\r
     CV_Assert(func != 0);\r
@@ -451,18 +463,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
     knnMatchDownload(trainIdx, distance, matches, compactResult);\r
 }\r
 \r
-namespace\r
-{\r
-    class ImgIdxSetter\r
-    {\r
-    public:\r
-        ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {}\r
-        void operator()(DMatch& m) const {m.imgIdx = imgIdx;}\r
-    private:\r
-        int imgIdx;\r
-    };\r
-}\r
-\r
 void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs,\r
     vector< vector<DMatch> >& matches, int knn, const vector<GpuMat>& masks, bool compactResult)\r
 {\r
@@ -538,9 +538,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
 \r
     CV_Assert(queryDescs.channels() == 1 && queryDescs.depth() < CV_64F);\r
     CV_Assert(trainDescs.type() == queryDescs.type() && trainDescs.cols == queryDescs.cols);\r
-    CV_Assert(trainIdx.empty() || trainIdx.rows == nQuery);\r
+    CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size()));\r
 \r
-    nMatches.create(1, nQuery, CV_32SC1);\r
+    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);\r
     nMatches.setTo(Scalar::all(0));\r
     if (trainIdx.empty())\r
     {\r
@@ -561,7 +561,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai
         return;\r
 \r
     CV_Assert(trainIdx.type() == CV_32SC1);\r
-    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.isContinuous() && nMatches.size().area() == trainIdx.rows);\r
+    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.isContinuous() && nMatches.cols >= trainIdx.rows);\r
     CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size());\r
 \r
     const int nQuery = trainIdx.rows;\r
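[Note] Two themes in this file: the ImgIdxSetter functor moves above its first use, and create() calls for scratch buffers (allDist, nMatches) become ensureSizeIsEnough(), which reuses an existing allocation when it is already large enough. That reuse is also why the download-side asserts now compare cols rather than exact areas. A simplified sketch of the reuse semantics, assuming the helper's documented behavior (the real implementation lives in the gpu module):

// Sketch of the buffer-reuse idea behind ensureSizeIsEnough (simplified).
// A buffer that is already large enough is only reshaped to a top-left
// view, not reallocated.
void ensureSizeIsEnoughSketch(int rows, int cols, int type, cv::gpu::GpuMat& m)
{
    if (m.type() == type && m.rows >= rows && m.cols >= cols)
        m = m(cv::Rect(0, 0, cols, rows));  // reuse existing allocation
    else
        m.create(rows, cols, type);         // otherwise allocate fresh
}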
diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu
index b28aee1..44f823d 100644 (file)
@@ -64,6 +64,7 @@ namespace cv { namespace gpu { namespace bfmatcher
         {            \r
             return mask.ptr(queryIdx)[trainIdx] != 0;\r
         }\r
+\r
     private:\r
         PtrStep mask;\r
     };\r
@@ -82,6 +83,7 @@ namespace cv { namespace gpu { namespace bfmatcher
         {            \r
             return curMask.data == 0 || curMask.ptr(queryIdx)[trainIdx] != 0;\r
         }\r
+\r
     private:\r
         PtrStep* maskCollection;\r
         PtrStep curMask;\r
@@ -102,172 +104,55 @@ namespace cv { namespace gpu { namespace bfmatcher
     ///////////////////////////////////////////////////////////////////////////////\r
     // Reduce Sum\r
     \r
-    template <int BLOCK_DIM_X>\r
-    __device__ void reduceSum(float* sdiff, float mySum, int tid)\r
-    {\r
-        sdiff[tid] = mySum;\r
-        __syncthreads();\r
+    template <int BLOCK_DIM_X> __device__ void reduceSum(float* sdiff_row, float& mySum);\r
 \r
-        if (BLOCK_DIM_X == 512) \r
-        {\r
-            if (tid < 256) \r
-            { \r
-                sdiff[tid] = mySum += sdiff[tid + 256]; __syncthreads(); \r
-                sdiff[tid] = mySum += sdiff[tid + 128]; __syncthreads();\r
-                sdiff[tid] = mySum += sdiff[tid +  64]; __syncthreads();\r
-            }\r
-            volatile float* smem = sdiff;\r
-            smem[tid] = mySum += smem[tid + 32]; \r
-            smem[tid] = mySum += smem[tid + 16]; \r
-            smem[tid] = mySum += smem[tid +  8]; \r
-            smem[tid] = mySum += smem[tid +  4]; \r
-            smem[tid] = mySum += smem[tid +  2];\r
-            smem[tid] = mySum += smem[tid +  1]; \r
-        }\r
-        if (BLOCK_DIM_X == 256)\r
-        {\r
-            if (tid < 128) \r
-            { \r
-                sdiff[tid] = mySum += sdiff[tid + 128]; __syncthreads(); \r
-                sdiff[tid] = mySum += sdiff[tid +  64]; __syncthreads();\r
-            }\r
-            volatile float* smem = sdiff;\r
-            smem[tid] = mySum += smem[tid + 32]; \r
-            smem[tid] = mySum += smem[tid + 16]; \r
-            smem[tid] = mySum += smem[tid +  8]; \r
-            smem[tid] = mySum += smem[tid +  4]; \r
-            smem[tid] = mySum += smem[tid +  2];\r
-            smem[tid] = mySum += smem[tid +  1];\r
-        }\r
-        if (BLOCK_DIM_X == 128)\r
-        {\r
-            if (tid <  64) \r
-            { \r
-                sdiff[tid] = mySum += sdiff[tid +  64]; __syncthreads(); \r
-            }\r
-            volatile float* smem = sdiff;\r
-            smem[tid] = mySum += smem[tid + 32]; \r
-            smem[tid] = mySum += smem[tid + 16]; \r
-            smem[tid] = mySum += smem[tid +  8]; \r
-            smem[tid] = mySum += smem[tid +  4]; \r
-            smem[tid] = mySum += smem[tid +  2];\r
-            smem[tid] = mySum += smem[tid +  1];\r
-        }\r
-        \r
-        volatile float* smem = sdiff;\r
-        if (BLOCK_DIM_X == 64) \r
-        {\r
-            if (tid < 32) \r
-            {\r
-                smem[tid] = mySum += smem[tid + 32]; \r
-                smem[tid] = mySum += smem[tid + 16]; \r
-                smem[tid] = mySum += smem[tid +  8]; \r
-                smem[tid] = mySum += smem[tid +  4]; \r
-                smem[tid] = mySum += smem[tid +  2];\r
-                smem[tid] = mySum += smem[tid +  1];  \r
-            }\r
-        }\r
-        if (BLOCK_DIM_X == 32) \r
-        {\r
-            if (tid < 16) \r
-            {\r
-                smem[tid] = mySum += smem[tid + 16]; \r
-                smem[tid] = mySum += smem[tid +  8]; \r
-                smem[tid] = mySum += smem[tid +  4]; \r
-                smem[tid] = mySum += smem[tid +  2];\r
-                smem[tid] = mySum += smem[tid +  1];  \r
-            }\r
-        }\r
-        if (BLOCK_DIM_X == 16) \r
-        {\r
-            if (tid < 8) \r
-            {\r
-                smem[tid] = mySum += smem[tid +  8]; \r
-                smem[tid] = mySum += smem[tid +  4]; \r
-                smem[tid] = mySum += smem[tid +  2];\r
-                smem[tid] = mySum += smem[tid +  1];  \r
-            }\r
-        }\r
-        if (BLOCK_DIM_X == 8) \r
-        {\r
-            if (tid < 4) \r
-            {\r
-                smem[tid] = mySum += smem[tid +  4]; \r
-                smem[tid] = mySum += smem[tid +  2];\r
-                smem[tid] = mySum += smem[tid +  1];  \r
-            }\r
-        }\r
-        if (BLOCK_DIM_X == 4) \r
-        {\r
-            if (tid < 2) \r
-            {\r
-                smem[tid] = mySum += smem[tid +  2];\r
-                smem[tid] = mySum += smem[tid +  1];  \r
-            }\r
-        }\r
-        if (BLOCK_DIM_X == 2) \r
-        {\r
-            if (tid < 1) \r
-            {\r
-                smem[tid] = mySum += smem[tid +  1];  \r
-            }\r
-        }\r
-    }\r
-\r
-    ///////////////////////////////////////////////////////////////////////////////\r
-    // loadDescsVals\r
-\r
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, typename T> \r
-    __device__ void loadDescsVals(const T* descs, int desc_len, float* smem, float* queryVals)\r
+    template <> __device__ void reduceSum<16>(float* sdiff_row, float& mySum)\r
     {\r
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+        volatile float* smem = sdiff_row;\r
 \r
-        if (tid < desc_len)\r
+        smem[threadIdx.x] = mySum;\r
+        \r
+        if (threadIdx.x < 8) \r
         {\r
-            smem[tid] = (float)descs[tid];\r
-        }\r
-        __syncthreads();\r
-\r
-        #pragma unroll\r
-        for (int i = threadIdx.x; i < MAX_DESCRIPTORS_LEN; i += BLOCK_DIM_X)\r
-        {\r
-            *queryVals = smem[i];\r
-            ++queryVals;\r
+            smem[threadIdx.x] = mySum += smem[threadIdx.x + 8]; \r
+            smem[threadIdx.x] = mySum += smem[threadIdx.x + 4]; \r
+            smem[threadIdx.x] = mySum += smem[threadIdx.x + 2];\r
+            smem[threadIdx.x] = mySum += smem[threadIdx.x + 1];  \r
         }\r
     }\r
 \r
     ///////////////////////////////////////////////////////////////////////////////\r
     // Distance\r
 \r
-    template <int BLOCK_DIM_X>\r
     class L1Dist\r
     {\r
     public:\r
-        __device__ L1Dist() : mySum(0) {}\r
+        __device__ L1Dist() : mySum(0.0f) {}\r
 \r
         __device__ void reduceIter(float val1, float val2)\r
         {\r
             mySum += fabs(val1 - val2);\r
         }\r
 \r
-        __device__ void reduceAll(float* sdiff, int tid)\r
+        template <int BLOCK_DIM_X>\r
+        __device__ void reduceAll(float* sdiff_row)\r
         {\r
-            reduceSum<BLOCK_DIM_X>(sdiff, mySum, tid);\r
+            reduceSum<BLOCK_DIM_X>(sdiff_row, mySum);\r
         }\r
 \r
-        static __device__ float finalResult(float res)\r
+        __device__ operator float() const\r
         {\r
-            return res;\r
+            return mySum;\r
         }\r
+\r
     private:\r
         float mySum;\r
     };\r
 \r
-    template <int BLOCK_DIM_X>\r
     class L2Dist\r
     {\r
     public:\r
-        __device__ L2Dist() : mySum(0) {}\r
+        __device__ L2Dist() : mySum(0.0f) {}\r
 \r
         __device__ void reduceIter(float val1, float val2)\r
         {\r
@@ -275,15 +160,17 @@ namespace cv { namespace gpu { namespace bfmatcher
             mySum += reg * reg;\r
         }\r
 \r
-        __device__ void reduceAll(float* sdiff, int tid)\r
+        template <int BLOCK_DIM_X>\r
+        __device__ void reduceAll(float* sdiff_row)\r
         {\r
-            reduceSum<BLOCK_DIM_X>(sdiff, mySum, tid);\r
+            reduceSum<BLOCK_DIM_X>(sdiff_row, mySum);\r
         }\r
 \r
-        static __device__ float finalResult(float res)\r
+        __device__ operator float() const\r
         {\r
-            return sqrtf(res);\r
+            return sqrtf(mySum);\r
         }\r
+\r
     private:\r
         float mySum;\r
     };\r
@@ -292,56 +179,81 @@ namespace cv { namespace gpu { namespace bfmatcher
     // reduceDescDiff\r
 \r
     template <int BLOCK_DIM_X, typename Dist, typename T> \r
-    __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, float* sdiff)\r
+    __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, Dist& dist, \r
+        float* sdiff_row)\r
     {\r
-        const int tid = threadIdx.x;\r
+        for (int i = threadIdx.x; i < desc_len; i += BLOCK_DIM_X)\r
+            dist.reduceIter(queryDescs[i], trainDescs[i]);\r
 \r
-        Dist dist;\r
+        dist.reduceAll<BLOCK_DIM_X>(sdiff_row);\r
+    }\r
 \r
-        for (int i = tid; i < desc_len; i += BLOCK_DIM_X)\r
-            dist.reduceIter(queryDescs[i], trainDescs[i]);\r
+///////////////////////////////////////////////////////////////////////////////////\r
+////////////////////////////////////// Match //////////////////////////////////////\r
+///////////////////////////////////////////////////////////////////////////////////\r
+    \r
+    ///////////////////////////////////////////////////////////////////////////////\r
+    // loadDescsVals\r
+\r
+    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, typename T> \r
+    __device__ void loadDescsVals(const T* descs, int desc_len, float* queryVals, float* smem)\r
+    {\r
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+\r
+        if (tid < desc_len)\r
+        {\r
+            smem[tid] = (float)descs[tid];\r
+        }\r
+        __syncthreads();\r
 \r
-        dist.reduceAll(sdiff, tid);\r
+        #pragma unroll\r
+        for (int i = threadIdx.x; i < MAX_DESCRIPTORS_LEN; i += BLOCK_DIM_X)\r
+        {\r
+            *queryVals = smem[i];\r
+            ++queryVals;\r
+        }\r
     }\r
 \r
     ///////////////////////////////////////////////////////////////////////////////\r
-    // reduceDescDiff_smem\r
+    // reduceDescDiffCached\r
 \r
     template <int N> struct UnrollDescDiff\r
     {\r
         template <typename Dist, typename T>\r
-        static __device__ void calcCheck(Dist& dist, const float* queryVals, const T* trainDescs,\r
-            int ind, int desc_len)\r
+        static __device__ void calcCheck(const float* queryVals, const T* trainDescs, int desc_len,\r
+            Dist& dist, int ind)\r
         {\r
             if (ind < desc_len)\r
+            {\r
                 dist.reduceIter(*queryVals, trainDescs[ind]);\r
 \r
-            ++queryVals;\r
+                ++queryVals;\r
 \r
-            UnrollDescDiff<N - 1>::calcCheck(dist, queryVals, trainDescs, ind + blockDim.x, desc_len);\r
+                UnrollDescDiff<N - 1>::calcCheck(queryVals, trainDescs, desc_len, dist, ind + blockDim.x);\r
+            }\r
         }\r
 \r
         template <typename Dist, typename T>\r
-        static __device__ void calcWithoutCheck(Dist& dist, const float* queryVals, const T* trainDescs)\r
+        static __device__ void calcWithoutCheck(const float* queryVals, const T* trainDescs, Dist& dist)\r
         {\r
             dist.reduceIter(*queryVals, *trainDescs);\r
 \r
             ++queryVals;\r
             trainDescs += blockDim.x;\r
 \r
-            UnrollDescDiff<N - 1>::calcWithoutCheck(dist, queryVals, trainDescs);\r
+            UnrollDescDiff<N - 1>::calcWithoutCheck(queryVals, trainDescs, dist);\r
         }\r
     };\r
     template <> struct UnrollDescDiff<0>\r
     {\r
         template <typename Dist, typename T>\r
-        static __device__ void calcCheck(Dist& dist, const float* queryVals, const T* trainDescs,\r
-            int ind, int desc_len)\r
+        static __device__ void calcCheck(const float* queryVals, const T* trainDescs, int desc_len,\r
+            Dist& dist, int ind)\r
         {\r
         }\r
 \r
         template <typename Dist, typename T>\r
-        static __device__ void calcWithoutCheck(Dist& dist, const float* queryVals, const T* trainDescs)\r
+        static __device__ void calcWithoutCheck(const float* queryVals, const T* trainDescs, Dist& dist)\r
         {\r
         }\r
     };\r
@@ -351,106 +263,82 @@ namespace cv { namespace gpu { namespace bfmatcher
     struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, false>\r
     {\r
         template <typename Dist, typename T>\r
-        static __device__ void calc(Dist& dist, const float* queryVals, const T* trainDescs, int desc_len)\r
+        static __device__ void calc(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist)\r
         {\r
-            UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcCheck(dist, queryVals, trainDescs,\r
-                threadIdx.x, desc_len);\r
+            UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcCheck(queryVals, trainDescs, desc_len,\r
+                dist, threadIdx.x);\r
         }\r
     };\r
     template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN> \r
     struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, true>\r
     {\r
         template <typename Dist, typename T>\r
-        static __device__ void calc(Dist& dist, const float* queryVals, const T* trainDescs, int desc_len)\r
+        static __device__ void calc(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist)\r
         {\r
-            UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcWithoutCheck(dist, queryVals, \r
-                trainDescs + threadIdx.x);\r
+            UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcWithoutCheck(queryVals, \r
+                trainDescs + threadIdx.x, dist);\r
         }\r
     };\r
 \r
     template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T>\r
-    __device__ void reduceDescDiff_smem(const float* queryVals, const T* trainDescs, int desc_len, float* sdiff)\r
-    {\r
-        const int tid = threadIdx.x;\r
+    __device__ void reduceDescDiffCached(const float* queryVals, const T* trainDescs, int desc_len, Dist& dist, \r
+        float* sdiff_row)\r
+    {        \r
+        DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>::calc(queryVals, \r
+            trainDescs, desc_len, dist);\r
         \r
-        Dist dist;\r
-\r
-        DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>::calc(dist, queryVals, \r
-            trainDescs, desc_len);\r
-        \r
-        dist.reduceAll(sdiff, tid);\r
+        dist.reduceAll<BLOCK_DIM_X>(sdiff_row);\r
     }\r
 \r
-///////////////////////////////////////////////////////////////////////////////////\r
-////////////////////////////////////// Match //////////////////////////////////////\r
-///////////////////////////////////////////////////////////////////////////////////\r
-\r
     ///////////////////////////////////////////////////////////////////////////////\r
-    // warpReduceMin\r
+    // warpReduceMinIdxIdx\r
 \r
     template <int BLOCK_DIM_Y> \r
-    __device__ void warpReduceMin(int tid, volatile float* sdata, volatile int* strainIdx, volatile int* simgIdx)\r
+    __device__ void warpReduceMinIdxIdx(float& myMin, int& myBestTrainIdx, int& myBestImgIdx, \r
+        volatile float* sdata, volatile int* strainIdx, volatile int* simgIdx);\r
+\r
+    template <> \r
+    __device__ void warpReduceMinIdxIdx<16>(float& myMin, int& myBestTrainIdx, int& myBestImgIdx, \r
+        volatile float* smin, volatile int* strainIdx, volatile int* simgIdx)\r
     {\r
-        float minSum = sdata[tid];\r
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
 \r
-        if (BLOCK_DIM_Y >= 64) \r
-        {\r
-            float reg = sdata[tid + 32];\r
-            if (reg < minSum)\r
-            {\r
-                sdata[tid] = minSum = reg;\r
-                strainIdx[tid] = strainIdx[tid + 32];\r
-                simgIdx[tid] = simgIdx[tid + 32];\r
-            }\r
-        }\r
-        if (BLOCK_DIM_Y >= 32) \r
+        if (tid < 8)\r
         {\r
-            float reg = sdata[tid + 16];\r
-            if (reg < minSum)\r
-            {\r
-                sdata[tid] = minSum = reg;\r
-                strainIdx[tid] = strainIdx[tid + 16];\r
-                simgIdx[tid] = simgIdx[tid + 16];\r
-            }\r
-        }\r
-        if (BLOCK_DIM_Y >= 16) \r
-        {\r
-            float reg = sdata[tid + 8];\r
-            if (reg < minSum)\r
+            myMin = smin[tid];\r
+            myBestTrainIdx = strainIdx[tid];\r
+            myBestImgIdx = simgIdx[tid];\r
+\r
+            float reg = smin[tid + 8];\r
+            if (reg < myMin)\r
             {\r
-                sdata[tid] = minSum = reg;\r
-                strainIdx[tid] = strainIdx[tid + 8];\r
-                simgIdx[tid] = simgIdx[tid + 8];\r
+                smin[tid] = myMin = reg;\r
+                strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 8];\r
+                simgIdx[tid] = myBestImgIdx = simgIdx[tid + 8];\r
             }\r
-        }\r
-        if (BLOCK_DIM_Y >= 8) \r
-        { \r
-            float reg = sdata[tid + 4];\r
-            if (reg < minSum)\r
+\r
+            reg = smin[tid + 4];\r
+            if (reg < myMin)\r
             {\r
-                sdata[tid] = minSum = reg;\r
-                strainIdx[tid] = strainIdx[tid + 4];\r
-                simgIdx[tid] = simgIdx[tid + 4];\r
+                smin[tid] = myMin = reg;\r
+                strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 4];\r
+                simgIdx[tid] = myBestImgIdx = simgIdx[tid + 4];\r
             }\r
-        }\r
-        if (BLOCK_DIM_Y >= 4) \r
-        { \r
-            float reg = sdata[tid + 2];\r
-            if (reg < minSum)\r
+        \r
+            reg = smin[tid + 2];\r
+            if (reg < myMin)\r
             {\r
-                sdata[tid] = minSum = reg;\r
-                strainIdx[tid] = strainIdx[tid + 2];\r
-                simgIdx[tid] = simgIdx[tid + 2];\r
+                smin[tid] = myMin = reg;\r
+                strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 2];\r
+                simgIdx[tid] = myBestImgIdx = simgIdx[tid + 2];\r
             }\r
-        }\r
-        if (BLOCK_DIM_Y >= 2) \r
-        {\r
-            float reg = sdata[tid + 1];\r
-            if (reg < minSum)\r
+        \r
+            reg = smin[tid + 1];\r
+            if (reg < myMin)\r
             {\r
-                sdata[tid] = minSum = reg;\r
-                strainIdx[tid] = strainIdx[tid + 1];\r
-                simgIdx[tid] = simgIdx[tid + 1];\r
+                smin[tid] = myMin = reg;\r
+                strainIdx[tid] = myBestTrainIdx = strainIdx[tid + 1];\r
+                simgIdx[tid] = myBestImgIdx = simgIdx[tid + 1];\r
             }\r
         }\r
     }\r
@@ -458,9 +346,9 @@ namespace cv { namespace gpu { namespace bfmatcher
     ///////////////////////////////////////////////////////////////////////////////\r
     // findBestMatch\r
 \r
-    template <int BLOCK_DIM_Y, typename Dist>\r
-    __device__ void findBestMatch(int queryIdx, float myMin, int myBestTrainIdx, int myBestImgIdx, \r
-        float* smin, int* strainIdx, int* simgIdx, int* trainIdx, int* imgIdx, float* distance)\r
+    template <int BLOCK_DIM_Y>\r
+    __device__ void findBestMatch(float& myMin, int& myBestTrainIdx, int& myBestImgIdx, \r
+        float* smin, int* strainIdx, int* simgIdx)\r
     {\r
         if (threadIdx.x == 0)\r
         {\r
@@ -470,27 +358,13 @@ namespace cv { namespace gpu { namespace bfmatcher
         }\r
         __syncthreads();\r
 \r
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
-\r
-        if (tid < 32)\r
-            warpReduceMin<BLOCK_DIM_Y>(tid, smin, strainIdx, simgIdx);\r
-\r
-        if (threadIdx.x == 0 && threadIdx.y == 0)\r
-        {\r
-            float minSum = smin[0];\r
-            int bestTrainIdx = strainIdx[0];\r
-            int bestImgIdx = simgIdx[0];\r
-\r
-            imgIdx[queryIdx] = bestImgIdx;\r
-            trainIdx[queryIdx] = bestTrainIdx;\r
-            distance[queryIdx] = Dist::finalResult(minSum);\r
-        }\r
+        warpReduceMinIdxIdx<BLOCK_DIM_Y>(myMin, myBestTrainIdx, myBestImgIdx, smin, strainIdx, simgIdx);\r
     }\r
     \r
     ///////////////////////////////////////////////////////////////////////////////\r
     // ReduceDescCalculator\r
 \r
-    template <int BLOCK_DIM_X, typename Dist, typename T>\r
+    template <int BLOCK_DIM_X, typename T>\r
     class ReduceDescCalculatorSimple\r
     {\r
     public:\r
@@ -499,29 +373,30 @@ namespace cv { namespace gpu { namespace bfmatcher
             queryDescs = queryDescs_;\r
         }\r
 \r
-        __device__ void calc(const T* trainDescs, int desc_len, float* sdiff_row) const\r
+        template <typename Dist>\r
+        __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, float* sdiff_row) const\r
         {\r
-            reduceDescDiff<BLOCK_DIM_X, Dist>(queryDescs, trainDescs, desc_len, sdiff_row);\r
+            reduceDescDiff<BLOCK_DIM_X>(queryDescs, trainDescs, desc_len, dist, sdiff_row);\r
         }\r
 \r
     private:\r
         const T* queryDescs;\r
     };\r
 \r
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, \r
-        typename Dist, typename T>\r
-    class ReduceDescCalculatorSmem\r
+    template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename T>\r
+    class ReduceDescCalculatorCached\r
     {\r
     public:\r
         __device__ void prepare(const T* queryDescs, int desc_len, float* smem)\r
         {\r
-            loadDescsVals<BLOCK_DIM_X, BLOCK_DIM_Y, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, smem, queryVals);\r
+            loadDescsVals<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, queryVals, smem);\r
         }\r
 \r
-        __device__ void calc(const T* trainDescs, int desc_len, float* sdiff_row) const\r
+        template <typename Dist>\r
+        __device__ void calc(const T* trainDescs, int desc_len, Dist& dist, float* sdiff_row) const\r
         {\r
-            reduceDescDiff_smem<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, Dist>(queryVals, trainDescs, \r
-                desc_len, sdiff_row);\r
+            reduceDescDiffCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>(queryVals, trainDescs, \r
+                desc_len, dist, sdiff_row);\r
         }\r
 \r
     private:\r
@@ -531,26 +406,26 @@ namespace cv { namespace gpu { namespace bfmatcher
     ///////////////////////////////////////////////////////////////////////////////\r
     // matchDescs loop\r
 \r
-    template <typename ReduceDescCalculator, typename T, typename Mask>\r
-    __device__ void matchDescs(int queryIdx, const int imgIdx, const DevMem2D_<T>& trainDescs_,  \r
+    template <typename Dist, typename ReduceDescCalculator, typename T, typename Mask>\r
+    __device__ void matchDescs(int queryIdx, int imgIdx, const DevMem2D_<T>& trainDescs_,  \r
         const Mask& m, const ReduceDescCalculator& reduceDescCalc,\r
-        float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx)\r
+        float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row)\r
     {\r
-        const T* trainDescs = trainDescs_.ptr(threadIdx.y);\r
-        const int trainDescsStep = blockDim.y * trainDescs_.step / sizeof(T);\r
-        for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows; \r
-             trainIdx += blockDim.y, trainDescs += trainDescsStep)\r
+        for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows; trainIdx += blockDim.y)\r
         {\r
             if (m(queryIdx, trainIdx))\r
             {\r
-                reduceDescCalc.calc(trainDescs, trainDescs_.cols, sdiff_row);\r
+                const T* trainDescs = trainDescs_.ptr(trainIdx);\r
+\r
+                Dist dist;\r
+\r
+                reduceDescCalc.calc(trainDescs, trainDescs_.cols, dist, sdiff_row);\r
 \r
                 if (threadIdx.x == 0)\r
                 {\r
-                    float reg = sdiff_row[0];\r
-                    if (reg < myMin)\r
+                    if (dist < myMin)\r
                     {\r
-                        myMin = reg;\r
+                        myMin = dist;\r
                         myBestTrainIdx = trainIdx;\r
                         myBestImgIdx = imgIdx;\r
                     }\r
@@ -570,18 +445,19 @@ namespace cv { namespace gpu { namespace bfmatcher
         {\r
         }\r
 \r
-        template <typename ReduceDescCalculator, typename Mask>\r
+        template <typename Dist, typename ReduceDescCalculator, typename Mask>\r
         __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, \r
-            float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) const\r
+            float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row) const\r
         {\r
-            matchDescs(queryIdx, 0, trainDescs, m, reduceDescCalc, \r
-                sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);\r
+            matchDescs<Dist>(queryIdx, 0, trainDescs, m, reduceDescCalc, \r
+                myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);\r
         }\r
 \r
         __device__ int desc_len() const\r
         {\r
             return trainDescs.cols;\r
         }\r
+\r
     private:\r
         DevMem2D_<T> trainDescs;\r
     };\r
@@ -595,16 +471,16 @@ namespace cv { namespace gpu { namespace bfmatcher
         {\r
         }\r
 \r
-        template <typename ReduceDescCalculator, typename Mask>\r
+        template <typename Dist, typename ReduceDescCalculator, typename Mask>\r
         __device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc, \r
-            float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) const\r
+            float& myMin, int& myBestTrainIdx, int& myBestImgIdx, float* sdiff_row) const\r
         {\r
             for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)\r
             {\r
                 DevMem2D_<T> trainDescs = trainCollection[imgIdx];\r
                 m.nextMask();\r
-                matchDescs(queryIdx, imgIdx, trainDescs, m, reduceDescCalc, \r
-                    sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);\r
+                matchDescs<Dist>(queryIdx, imgIdx, trainDescs, m, reduceDescCalc, \r
+                    myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);\r
             }\r
         }\r
 \r
@@ -612,6 +488,7 @@ namespace cv { namespace gpu { namespace bfmatcher
         {\r
             return desclen;\r
         }\r
+\r
     private:\r
         const DevMem2D_<T>* trainCollection;\r
         int nImg;\r
@@ -623,12 +500,10 @@ namespace cv { namespace gpu { namespace bfmatcher
 \r
     template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename ReduceDescCalculator, typename Dist, typename T, \r
         typename Train, typename Mask>\r
-    __global__ void match(PtrStep_<T> queryDescs_, Train train, Mask mask, int* trainIdx, int* imgIdx, float* distance)\r
+    __global__ void match(const PtrStep_<T> queryDescs_, const Train train, const Mask mask, \r
+        int* trainIdx, int* imgIdx, float* distance)\r
     {\r
-        __shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];\r
-        __shared__ float smin[64];\r
-        __shared__ int strainIdx[64];\r
-        __shared__ int simgIdx[64];\r
+        __shared__ float smem[BLOCK_DIM_X * BLOCK_DIM_Y];        \r
         \r
         const int queryIdx = blockIdx.x;\r
         \r
@@ -637,24 +512,39 @@ namespace cv { namespace gpu { namespace bfmatcher
         float myMin = numeric_limits_gpu<float>::max();\r
 \r
         {\r
-            float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;\r
+            float* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;\r
+\r
             Mask m = mask;\r
+\r
             ReduceDescCalculator reduceDescCalc;\r
-            reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), sdiff);\r
+\r
+            reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), smem);\r
         \r
-            train.loop(queryIdx, m, reduceDescCalc, sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);\r
+            train.template loop<Dist>(queryIdx, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);\r
         }\r
+        __syncthreads();\r
+\r
+        float* smin = smem;\r
+        int* strainIdx = (int*)(smin + BLOCK_DIM_Y);\r
+        int* simgIdx = strainIdx + BLOCK_DIM_Y;\r
 \r
-        findBestMatch<BLOCK_DIM_Y, Dist>(queryIdx, myMin, myBestTrainIdx, myBestImgIdx, \r
-            smin, strainIdx, simgIdx, trainIdx, imgIdx, distance);\r
+        findBestMatch<BLOCK_DIM_Y>(myMin, myBestTrainIdx, myBestImgIdx, \r
+            smin, strainIdx, simgIdx);\r
+\r
+        if (threadIdx.x == 0 && threadIdx.y == 0)\r
+        {\r
+            imgIdx[queryIdx] = myBestImgIdx;\r
+            trainIdx[queryIdx] = myBestTrainIdx;\r
+            distance[queryIdx] = myMin;\r
+        }\r
     }\r
     \r
     ///////////////////////////////////////////////////////////////////////////////\r
     // Match kernel callers\r
 \r
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, template <int> class Dist, typename T, \r
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, \r
         typename Train, typename Mask>\r
-    void match_caller(const DevMem2D_<T>& queryDescs, const Train& train, \r
+    void matchSimple_caller(const DevMem2D_<T>& queryDescs, const Train& train, \r
         const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
     {\r
         StaticAssert<BLOCK_DIM_Y <= 64>::check(); // blockDimY vals must reduce by warp\r
@@ -662,15 +552,15 @@ namespace cv { namespace gpu { namespace bfmatcher
         dim3 grid(queryDescs.rows, 1, 1);\r
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
 \r
-        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, Dist<BLOCK_DIM_X>, T>, \r
-            Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, \r
+        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, T>, Dist, T>\r
+            <<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, \r
             imgIdx.data, distance.data);\r
 \r
         cudaSafeCall( cudaThreadSynchronize() );\r
     }\r
     template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, \r
-        template <int> class Dist, typename T, typename Train, typename Mask>\r
-    void match_smem_caller(const DevMem2D_<T>& queryDescs, const Train& train, \r
+        typename Dist, typename T, typename Train, typename Mask>\r
+    void matchCached_caller(const DevMem2D_<T>& queryDescs, const Train& train, \r
         const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
     {\r
         StaticAssert<BLOCK_DIM_Y <= 64>::check();                                // blockDimY vals must reduce by warp\r
@@ -680,9 +570,10 @@ namespace cv { namespace gpu { namespace bfmatcher
         dim3 grid(queryDescs.rows, 1, 1);\r
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
 \r
-        match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSmem<BLOCK_DIM_X, BLOCK_DIM_Y, \r
-              MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, Dist<BLOCK_DIM_X>, T>, \r
-              Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, \r
+        match<BLOCK_DIM_X, BLOCK_DIM_Y, \r
+              ReduceDescCalculatorCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, T>, \r
+              Dist, T>\r
+              <<<grid, threads>>>(queryDescs, train, mask, trainIdx.data, \r
               imgIdx.data, distance.data);\r
 \r
         cudaSafeCall( cudaThreadSynchronize() );\r
@@ -691,24 +582,24 @@ namespace cv { namespace gpu { namespace bfmatcher
     ///////////////////////////////////////////////////////////////////////////////\r
     // Match kernel chooser\r
 \r
-    template <template <int> class Dist, typename T, typename Train, typename Mask>\r
+    template <typename Dist, typename T, typename Train, typename Mask>\r
     void match_chooser(const DevMem2D_<T>& queryDescs, const Train& train, \r
         const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)\r
     {\r
         if (queryDescs.cols < 64)\r
-            match_smem_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+            matchCached_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
         else if (queryDescs.cols == 64)\r
-            match_smem_caller<16, 16, 64, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+            matchCached_caller<16, 16, 64, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
         else if (queryDescs.cols < 128)\r
-            match_smem_caller<16, 16, 128, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+            matchCached_caller<16, 16, 128, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
         else if (queryDescs.cols == 128)\r
-            match_smem_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+            matchCached_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
         else if (queryDescs.cols < 256)\r
-            match_smem_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+            matchCached_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
         else if (queryDescs.cols == 256)\r
-            match_smem_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+            matchCached_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
         else\r
-            match_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
+            matchSimple_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);\r
 \r
         cudaSafeCall( cudaThreadSynchronize() );\r
     }\r
@@ -828,41 +719,41 @@ namespace cv { namespace gpu { namespace bfmatcher
         {\r
             const T* trainDescs = trainDescs_.ptr(trainIdx);\r
 \r
-            float dist = numeric_limits_gpu<float>::max();\r
+            float myDist = numeric_limits_gpu<float>::max();\r
 \r
             if (mask(queryIdx, trainIdx))\r
             {\r
-                reduceDescDiff<BLOCK_DIM_X, Dist>(queryDescs, trainDescs, trainDescs_.cols, sdiff_row);\r
+                Dist dist;\r
+\r
+                reduceDescDiff<BLOCK_DIM_X>(queryDescs, trainDescs, trainDescs_.cols, dist, sdiff_row);\r
 \r
                 if (threadIdx.x == 0)\r
-                {\r
-                    dist = Dist::finalResult(sdiff_row[0]);\r
-                }\r
+                    myDist = dist;\r
             }\r
             \r
             if (threadIdx.x == 0)\r
-                distance.ptr(queryIdx)[trainIdx] = dist;\r
+                distance.ptr(queryIdx)[trainIdx] = myDist;\r
         }\r
     }\r
 \r
     ///////////////////////////////////////////////////////////////////////////////\r
     // Calc distance kernel caller\r
 \r
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, template <int> class Dist, typename T, typename Mask>\r
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>\r
     void calcDistance_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, \r
         const Mask& mask, const DevMem2Df& distance)\r
     {\r
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
         dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1);\r
 \r
-        calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(\r
+        calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads>>>(\r
             queryDescs, trainDescs, mask, distance);\r
 \r
         cudaSafeCall( cudaThreadSynchronize() );\r
     }\r
         \r
     ///////////////////////////////////////////////////////////////////////////////\r
-    // reduceMin\r
+    // warpReduceMinIdx\r
 \r
     template <int BLOCK_SIZE> \r
     __device__ void warpReduceMinIdx(volatile float* sdist, volatile int* strainIdx, float& myMin, int tid)\r
@@ -1103,25 +994,27 @@ namespace cv { namespace gpu { namespace bfmatcher
     {\r
         #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
 \r
-        __shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];\r
+        __shared__ float smem[BLOCK_DIM_X * BLOCK_DIM_Y];\r
 \r
-        float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;\r
+        float* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;\r
         \r
         const int queryIdx = blockIdx.x;\r
         const T* queryDescs = queryDescs_.ptr(queryIdx);\r
 \r
         const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;\r
+\r
         if (trainIdx < trainDescs_.rows)\r
         {\r
             const T* trainDescs = trainDescs_.ptr(trainIdx);\r
 \r
             if (mask(queryIdx, trainIdx))\r
             {\r
-                reduceDescDiff<BLOCK_DIM_X, Dist>(queryDescs, trainDescs, trainDescs_.cols, sdiff_row);\r
+                Dist dist;\r
+\r
+                reduceDescDiff<BLOCK_DIM_X>(queryDescs, trainDescs, trainDescs_.cols, dist, sdiff_row);\r
 \r
                 if (threadIdx.x == 0)\r
                 {\r
-                    float dist = Dist::finalResult(sdiff_row[0]);\r
                     if (dist < maxDistance)\r
                     {\r
                         unsigned int i = atomicInc(nMatches + queryIdx, (unsigned int) -1);\r
@@ -1141,7 +1034,7 @@ namespace cv { namespace gpu { namespace bfmatcher
     ///////////////////////////////////////////////////////////////////////////////\r
     // Radius Match kernel caller\r
 \r
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, template <int> class Dist, typename T, typename Mask>\r
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>\r
     void radiusMatch_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs, \r
         float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, \r
         const DevMem2Df& distance)\r
@@ -1149,7 +1042,7 @@ namespace cv { namespace gpu { namespace bfmatcher
         dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
         dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1);\r
 \r
-        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(\r
+        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads>>>(\r
             queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches, distance);\r
 \r
         cudaSafeCall( cudaThreadSynchronize() );\r
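[Note] The largest change above collapses the size-generic reduceSum into a single 16-wide specialization: with BLOCK_DIM_X == 16 all participating threads sit in one warp, so the shared-memory folding needs no __syncthreads(), only a volatile pointer to keep the intermediate stores visible. A standalone CUDA sketch of the same warp-synchronous pattern follows; the kernel and launch are illustrative, not part of the patch.

// Standalone illustration of the warp-synchronous reduction used by
// reduceSum<16>: 16 threads of one warp fold a row of shared memory
// without __syncthreads(), relying on volatile to order the stores.
__global__ void rowSum16(const float* in, float* out)
{
    __shared__ float srow[16];
    float mySum = in[blockIdx.x * 16 + threadIdx.x];

    volatile float* smem = srow;
    smem[threadIdx.x] = mySum;

    if (threadIdx.x < 8)
    {
        smem[threadIdx.x] = mySum += smem[threadIdx.x + 8];
        smem[threadIdx.x] = mySum += smem[threadIdx.x + 4];
        smem[threadIdx.x] = mySum += smem[threadIdx.x + 2];
        smem[threadIdx.x] = mySum += smem[threadIdx.x + 1];
    }

    if (threadIdx.x == 0)
        out[blockIdx.x] = mySum;   // thread 0 holds the row total
}

Launched as rowSum16<<<numRows, 16>>>(d_in, d_out), each 16-thread block reduces one 16-element row.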
diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp
index 35aa3d5..16d8e6b 100644 (file)
@@ -66,7 +66,10 @@ void cv::gpu::integral(const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&) { throw_nogpu(); }\r
 void cv::gpu::columnSum(const GpuMat&, GpuMat&) { throw_nogpu(); }\r
 void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const Rect&) { throw_nogpu(); }\r
-void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); }\r
+//void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); }\r
+//void cv::gpu::Canny(const GpuMat&, GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); }\r
+//void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); }\r
+//void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); }\r
 void cv::gpu::evenLevels(GpuMat&, int, int, int) { throw_nogpu(); }\r
 void cv::gpu::histEven(const GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); }\r
 void cv::gpu::histEven(const GpuMat&, GpuMat*, int*, int*, int*) { throw_nogpu(); }\r
@@ -655,34 +658,60 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons
 ////////////////////////////////////////////////////////////////////////\r
 // Canny\r
 \r
-void cv::gpu::Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize)\r
-{\r
-    CV_Assert(!"disabled until fix crash");\r
-    CV_Assert(image.type() == CV_8UC1);\r
-\r
-    GpuMat srcDx, srcDy;\r
-\r
-    Sobel(image, srcDx, -1, 1, 0, apertureSize);\r
-    Sobel(image, srcDy, -1, 0, 1, apertureSize);\r
-\r
-    srcDx.convertTo(srcDx, CV_32F);\r
-    srcDy.convertTo(srcDy, CV_32F);\r
-\r
-    edges.create(image.size(), CV_8UC1);\r
-\r
-    NppiSize sz;\r
-    sz.height = image.rows;\r
-    sz.width = image.cols;\r
-\r
-    int bufsz;\r
-    nppSafeCall( nppiCannyGetBufferSize(sz, &bufsz) );\r
-    GpuMat buf(1, bufsz, CV_8UC1);\r
-\r
-    nppSafeCall( nppiCanny_32f8u_C1R(srcDx.ptr<Npp32f>(), srcDx.step, srcDy.ptr<Npp32f>(), srcDy.step,\r
-        edges.ptr<Npp8u>(), edges.step, sz, (Npp32f)threshold1, (Npp32f)threshold2, buf.ptr<Npp8u>()) );\r
-\r
-    cudaSafeCall( cudaThreadSynchronize() );\r
-}\r
+//void cv::gpu::Canny(const GpuMat& image, GpuMat& edges, double threshold1, double threshold2, int apertureSize)\r
+//{\r
+//    CV_Assert(!"disabled until fix crash");\r
+//\r
+//    GpuMat srcDx, srcDy;\r
+//\r
+//    Sobel(image, srcDx, CV_32F, 1, 0, apertureSize);\r
+//    Sobel(image, srcDy, CV_32F, 0, 1, apertureSize);\r
+//\r
+//    GpuMat buf;\r
+//\r
+//    Canny(srcDx, srcDy, edges, buf, threshold1, threshold2, apertureSize);\r
+//}\r
+//\r
+//void cv::gpu::Canny(const GpuMat& image, GpuMat& edges, GpuMat& buf, double threshold1, double threshold2, int apertureSize)\r
+//{\r
+//    CV_Assert(!"disabled until fix crash");\r
+//\r
+//    GpuMat srcDx, srcDy;\r
+//\r
+//    Sobel(image, srcDx, CV_32F, 1, 0, apertureSize);\r
+//    Sobel(image, srcDy, CV_32F, 0, 1, apertureSize);\r
+//\r
+//    Canny(srcDx, srcDy, edges, buf, threshold1, threshold2, apertureSize);\r
+//}\r
+//\r
+//void cv::gpu::Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, double threshold1, double threshold2, int apertureSize)\r
+//{\r
+//    CV_Assert(!"disabled until fix crash");\r
+//\r
+//    GpuMat buf;\r
+//    Canny(srcDx, srcDy, edges, buf, threshold1, threshold2, apertureSize);\r
+//}\r
+//\r
+//void cv::gpu::Canny(const GpuMat& srcDx, const GpuMat& srcDy, GpuMat& edges, GpuMat& buf, double threshold1, double threshold2, int apertureSize)\r
+//{\r
+//    CV_Assert(!"disabled until fix crash");\r
+//    CV_Assert(srcDx.type() == CV_32FC1 && srcDy.type() == CV_32FC1 && srcDx.size() == srcDy.size());\r
+//\r
+//    edges.create(srcDx.size(), CV_8UC1);\r
+//\r
+//    NppiSize sz;\r
+//    sz.height = srcDx.rows;\r
+//    sz.width = srcDx.cols;\r
+//\r
+//    int bufsz;\r
+//    nppSafeCall( nppiCannyGetBufferSize(sz, &bufsz) );\r
+//    ensureSizeIsEnough(1, bufsz, CV_8UC1, buf);\r
+//\r
+//    nppSafeCall( nppiCanny_32f8u_C1R(srcDx.ptr<Npp32f>(), srcDx.step, srcDy.ptr<Npp32f>(), srcDy.step,\r
+//        edges.ptr<Npp8u>(), edges.step, sz, (Npp32f)threshold1, (Npp32f)threshold2, buf.ptr<Npp8u>()) );\r
+//\r
+//    cudaSafeCall( cudaThreadSynchronize() );\r
+//}\r
 \r
 ////////////////////////////////////////////////////////////////////////\r
 // Histogram\r
diff --git a/tests/gpu/src/arithm.cpp b/tests/gpu/src/arithm.cpp
index fe2385a..d03642a 100644 (file)
@@ -66,45 +66,58 @@ protected:
 \r
     virtual int test(const Mat& mat1, const Mat& mat2) = 0;\r
 \r
-    int CheckNorm(const Mat& m1, const Mat& m2);\r
-    int CheckNorm(const Scalar& s1, const Scalar& s2);\r
-    int CheckNorm(double d1, double d2);\r
+    int CheckNorm(const Mat& m1, const Mat& m2, double eps = 1e-5);\r
+    int CheckNorm(const Scalar& s1, const Scalar& s2, double eps = 1e-5);\r
+    int CheckNorm(double d1, double d2, double eps = 1e-5);\r
 };\r
 \r
 int CV_GpuArithmTest::test(int type)\r
 {\r
     cv::Size sz(200, 200);\r
     cv::Mat mat1(sz, type), mat2(sz, type);\r
+    \r
     cv::RNG rng(*ts->get_rng());\r
-    rng.fill(mat1, cv::RNG::UNIFORM, cv::Scalar::all(1), cv::Scalar::all(20));\r
-    rng.fill(mat2, cv::RNG::UNIFORM, cv::Scalar::all(1), cv::Scalar::all(20));\r
+\r
+    if (type != CV_32FC1)\r
+    {\r
+        rng.fill(mat1, cv::RNG::UNIFORM, cv::Scalar::all(1), cv::Scalar::all(20));\r
+        rng.fill(mat2, cv::RNG::UNIFORM, cv::Scalar::all(1), cv::Scalar::all(20));\r
+    }\r
+    else\r
+    {\r
+        rng.fill(mat1, cv::RNG::UNIFORM, cv::Scalar::all(0.1), cv::Scalar::all(1.0));\r
+        rng.fill(mat2, cv::RNG::UNIFORM, cv::Scalar::all(0.1), cv::Scalar::all(1.0));\r
+    }\r
 \r
     return test(mat1, mat2);\r
 }\r
 \r
-int CV_GpuArithmTest::CheckNorm(const Mat& m1, const Mat& m2)\r
+int CV_GpuArithmTest::CheckNorm(const Mat& m1, const Mat& m2, double eps)\r
 {\r
     double ret = norm(m1, m2, NORM_INF);\r
 \r
-    if (ret < 1e-5)\r
+    if (ret < eps)\r
         return CvTS::OK;\r
 \r
     ts->printf(CvTS::LOG, "\nNorm: %f\n", ret);\r
     return CvTS::FAIL_GENERIC;\r
 }\r
 \r
-int CV_GpuArithmTest::CheckNorm(const Scalar& s1, const Scalar& s2)\r
+int CV_GpuArithmTest::CheckNorm(const Scalar& s1, const Scalar& s2, double eps)\r
 {\r
-    double ret0 = CheckNorm(s1[0], s2[0]), ret1 = CheckNorm(s1[1], s2[1]), ret2 = CheckNorm(s1[2], s2[2]), ret3 = CheckNorm(s1[3], s2[3]);\r
+    int ret0 = CheckNorm(s1[0], s2[0], eps), \r
+        ret1 = CheckNorm(s1[1], s2[1], eps), \r
+        ret2 = CheckNorm(s1[2], s2[2], eps), \r
+        ret3 = CheckNorm(s1[3], s2[3], eps);\r
 \r
     return (ret0 == CvTS::OK && ret1 == CvTS::OK && ret2 == CvTS::OK && ret3 == CvTS::OK) ? CvTS::OK : CvTS::FAIL_GENERIC;\r
 }\r
 \r
-int CV_GpuArithmTest::CheckNorm(double d1, double d2)\r
+int CV_GpuArithmTest::CheckNorm(double d1, double d2, double eps)\r
 {\r
     double ret = ::fabs(d1 - d2);\r
 \r
-    if (ret < 1e-5)\r
+    if (ret < eps)\r
         return CvTS::OK;\r
 \r
     ts->printf(CvTS::LOG, "\nNorm: %f\n", ret);\r
@@ -245,7 +258,7 @@ struct CV_GpuNppImageDivideTest : public CV_GpuArithmTest
            GpuMat gpuRes;\r
            cv::gpu::divide(gpu1, gpu2, gpuRes);\r
 \r
-            return CheckNorm(cpuRes, gpuRes);\r
+        return CheckNorm(cpuRes, gpuRes, 1.01f);\r
     }\r
 };\r
 \r
@@ -584,7 +597,7 @@ struct CV_GpuNppImagePhaseTest : public CV_GpuArithmTest
         GpuMat gpuRes;\r
         cv::gpu::phase(gpu1, gpu2, gpuRes, true);\r
 \r
-        return CheckNorm(cpuRes, gpuRes);\r
+        return CheckNorm(cpuRes, gpuRes, 0.3f);\r
     }\r
 };\r
 \r
@@ -611,7 +624,7 @@ struct CV_GpuNppImageCartToPolarTest : public CV_GpuArithmTest
         cv::gpu::cartToPolar(gpu1, gpu2, gpuMag, gpuAngle);\r
 \r
         int magRes = CheckNorm(cpuMag, gpuMag);\r
-        int angleRes = CheckNorm(cpuAngle, gpuAngle);\r
+        int angleRes = CheckNorm(cpuAngle, gpuAngle, 0.005f);\r
 \r
         return magRes == CvTS::OK && angleRes == CvTS::OK ? CvTS::OK : CvTS::FAIL_GENERIC;\r
     }\r
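[Note] The divide, phase, and cartToPolar tests are fixed by relaxing the hard-coded 1e-5 threshold: CheckNorm gains an eps parameter and each test supplies a tolerance matched to its numerics. Integer divide can round differently on CPU and GPU (eps = 1.01 tolerates an off-by-one element), phase in degrees tolerates 0.3, and cartToPolar angles 0.005. The comparison idea as a free-function sketch (the test's member version additionally logs the norm):

// Tolerance-based comparison in the spirit of the updated CheckNorm:
// pass when the largest absolute element difference stays below eps.
#include <opencv2/core/core.hpp>

bool closeEnough(const cv::Mat& a, const cv::Mat& b, double eps = 1e-5)
{
    return cv::norm(a, b, cv::NORM_INF) < eps;
}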
diff --git a/tests/gpu/src/brute_force_matcher.cpp b/tests/gpu/src/brute_force_matcher.cpp
index afe5db2..f4c0acc 100644 (file)
@@ -51,24 +51,27 @@ class CV_GpuBruteForceMatcherTest : public CvTest
 {\r
 public:\r
     CV_GpuBruteForceMatcherTest() :\r
-        CvTest( "GPU-BruteForceMatcher", "BruteForceMatcher" ), badPart(0.01f)\r
+        CvTest( "GPU-BruteForceMatcher", "BruteForceMatcher" )\r
     {\r
     }\r
-protected:\r
-    static const int dim = 500;\r
-    static const int queryDescCount = 300; // must be even number because we split train data in some cases in two\r
-    static const int countFactor = 4; // do not change it\r
-    const float badPart;\r
 \r
+protected:\r
     virtual void run(int);\r
-    void generateData(GpuMat& query, GpuMat& train);\r
-\r
+    \r
     void emptyDataTest();\r
+    void dataTest(int dim);\r
+    \r
+    void generateData(GpuMat& query, GpuMat& train, int dim);\r
+\r
     void matchTest(const GpuMat& query, const GpuMat& train);\r
     void knnMatchTest(const GpuMat& query, const GpuMat& train);\r
     void radiusMatchTest(const GpuMat& query, const GpuMat& train);\r
 \r
+private:\r
     BruteForceMatcher_GPU< L2<float> > dmatcher;\r
+\r
+    static const int queryDescCount = 300; // must be even number because we split train data in some cases in two\r
+    static const int countFactor = 4; // do not change it\r
 };\r
 \r
 void CV_GpuBruteForceMatcherTest::emptyDataTest()\r
@@ -150,7 +153,7 @@ void CV_GpuBruteForceMatcherTest::emptyDataTest()
 \r
 }\r
 \r
-void CV_GpuBruteForceMatcherTest::generateData( GpuMat& queryGPU, GpuMat& trainGPU )\r
+void CV_GpuBruteForceMatcherTest::generateData( GpuMat& queryGPU, GpuMat& trainGPU, int dim )\r
 {\r
     Mat query, train;\r
     RNG rng(*ts->get_rng());\r
@@ -209,7 +212,7 @@ void CV_GpuBruteForceMatcherTest::matchTest( const GpuMat& query, const GpuMat&
                 if( (match.queryIdx != (int)i) || (match.trainIdx != (int)i*countFactor) || (match.imgIdx != 0) )\r
                     badCount++;\r
             }\r
-            if( (float)badCount > (float)queryDescCount*badPart )\r
+            if (badCount > 0)\r
             {\r
                 ts->printf( CvTS::LOG, "%f - too large bad matches part while test match() function (1).\n",\r
                             (float)badCount/(float)queryDescCount );\r
@@ -260,7 +263,7 @@ void CV_GpuBruteForceMatcherTest::matchTest( const GpuMat& query, const GpuMat&
                     }\r
                 }\r
             }\r
-            if( (float)badCount > (float)queryDescCount*badPart )\r
+            if (badCount > 0)\r
             {\r
                 ts->printf( CvTS::LOG, "%f - too large bad matches part while test match() function (2).\n",\r
                             (float)badCount/(float)queryDescCount );\r
@@ -305,7 +308,7 @@ void CV_GpuBruteForceMatcherTest::knnMatchTest( const GpuMat& query, const GpuMa
                     badCount += localBadCount > 0 ? 1 : 0;\r
                 }\r
             }\r
-            if( (float)badCount > (float)queryDescCount*badPart )\r
+            if (badCount > 0)\r
             {\r
                 ts->printf( CvTS::LOG, "%f - too large bad matches part while test knnMatch() function (1).\n",\r
                             (float)badCount/(float)queryDescCount );\r
@@ -369,7 +372,7 @@ void CV_GpuBruteForceMatcherTest::knnMatchTest( const GpuMat& query, const GpuMa
                     badCount += localBadCount > 0 ? 1 : 0;\r
                 }\r
             }\r
-            if( (float)badCount > (float)queryDescCount*badPart )\r
+            if (badCount > 0)\r
             {\r
                 ts->printf( CvTS::LOG, "%f - too large bad matches part while test knnMatch() function (2).\n",\r
                             (float)badCount/(float)queryDescCount );\r
@@ -407,7 +410,7 @@ void CV_GpuBruteForceMatcherTest::radiusMatchTest( const GpuMat& query, const Gp
                         badCount++;\r
                 }\r
             }\r
-            if( (float)badCount > (float)queryDescCount*badPart )\r
+            if (badCount > 0)\r
             {\r
                 ts->printf( CvTS::LOG, "%f - too large bad matches part while test radiusMatch() function (1).\n",\r
                             (float)badCount/(float)queryDescCount );\r
@@ -473,7 +476,8 @@ void CV_GpuBruteForceMatcherTest::radiusMatchTest( const GpuMat& query, const Gp
                 badCount += localBadCount > 0 ? 1 : 0;\r
             }\r
         }\r
-        if( (float)badCount > (float)queryDescCount*badPart )\r
+\r
+        if (badCount > 0)\r
         {\r
             curRes = CvTS::FAIL_INVALID_OUTPUT;\r
             ts->printf( CvTS::LOG, "%f - too large bad matches part while test radiusMatch() function (2).\n",\r
@@ -483,20 +487,29 @@ void CV_GpuBruteForceMatcherTest::radiusMatchTest( const GpuMat& query, const Gp
     }\r
 }\r
 \r
-void CV_GpuBruteForceMatcherTest::run( int )\r
+void CV_GpuBruteForceMatcherTest::dataTest(int dim)\r
 {\r
-    emptyDataTest();\r
-\r
     GpuMat query, train;\r
-    generateData( query, train );\r
+    generateData(query, train, dim);\r
 \r
-    matchTest( query, train );\r
+    matchTest(query, train);\r
+    knnMatchTest(query, train);\r
+    radiusMatchTest(query, train);\r
 \r
-    knnMatchTest( query, train );\r
+    dmatcher.clear();\r
+}\r
 \r
-    radiusMatchTest( query, train );\r
+void CV_GpuBruteForceMatcherTest::run(int)\r
+{\r
+    emptyDataTest();\r
 \r
-    dmatcher.clear();\r
+    dataTest(50);\r
+    dataTest(64);\r
+    dataTest(100);\r
+    dataTest(128);\r
+    dataTest(200);\r
+    dataTest(256);\r
+    dataTest(300);\r
 }\r
 \r
 CV_GpuBruteForceMatcherTest CV_GpuBruteForceMatcher_test;\r
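
run() now sweeps descriptor dimensions on both sides of the 64/128/256 boundaries, plus odd sizes such as 50, 100, 200 and 300, which presumably exercises both the specialized CUDA kernels for short descriptors and the generic fallback path in brute_force_matcher.cu. Moving dmatcher.clear() into dataTest() also resets matcher state between dimensions. For each dim, all three matcher entry points are exercised; a usage sketch, where the k and maxDistance values are illustrative only:

    BruteForceMatcher_GPU< L2<float> > matcher;
    std::vector<DMatch> matches;
    matcher.match(query, train, matches);                    // best match per query row

    std::vector< std::vector<DMatch> > knnMatches;
    matcher.knnMatch(query, train, knnMatches, 2);           // two best matches per query row

    std::vector< std::vector<DMatch> > radiusMatches;
    matcher.radiusMatch(query, train, radiusMatches, 0.5f);  // all matches within maxDistance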
index cbeb0d0..5d07d34 100644 (file)
@@ -45,7 +45,6 @@ CvTS test_system("gpu");
 
 const char* blacklist[] =
 {
-    "GPU-AsyncGpuMatOperator",     // crash
     "GPU-NppImageCanny",            // NPP_TEXTURE_BIND_ERROR
     0
 };
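
"GPU-AsyncGpuMatOperator" leaves the blacklist because the rewritten test below no longer crashes; "GPU-NppImageCanny" stays listed even though its registration is now commented out, which is harmless but redundant. The blacklist is a null-terminated array of test names; a sketch of how such a filter is typically applied (the way CvTS consumes the list is not shown in this diff, so this helper is illustrative only):

    #include <cstring>

    static bool is_blacklisted(const char* name)
    {
        for (const char** p = blacklist; *p; ++p)       // the 0 sentinel terminates the array
            if (std::strcmp(*p, name) == 0)
                return true;
        return false;
    }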
index d3affcf..d941a00 100644 (file)
@@ -408,30 +408,30 @@ struct CV_GpuNppImageIntegralTest : public CV_GpuImageProcTest
 \r
 ////////////////////////////////////////////////////////////////////////////////\r
 // Canny\r
-struct CV_GpuNppImageCannyTest : public CV_GpuImageProcTest\r
-{\r
-    CV_GpuNppImageCannyTest() : CV_GpuImageProcTest( "GPU-NppImageCanny", "Canny" ) {}\r
-\r
-    int test(const Mat& img)\r
-    {\r
-        if (img.type() != CV_8UC1)\r
-        {\r
-            ts->printf(CvTS::LOG, "\nUnsupported type\n");\r
-            return CvTS::OK;\r
-        }\r
-\r
-        const double threshold1 = 1.0, threshold2 = 10.0;\r
-\r
-        Mat cpudst;\r
-        cv::Canny(img, cpudst, threshold1, threshold2);\r
-\r
-        GpuMat gpu1(img);\r
-        GpuMat gpudst;\r
-        cv::gpu::Canny(gpu1, gpudst, threshold1, threshold2);\r
-\r
-        return CheckNorm(cpudst, gpudst);\r
-    }\r
-};\r
+//struct CV_GpuNppImageCannyTest : public CV_GpuImageProcTest\r
+//{\r
+//    CV_GpuNppImageCannyTest() : CV_GpuImageProcTest( "GPU-NppImageCanny", "Canny" ) {}\r
+//\r
+//    int test(const Mat& img)\r
+//    {\r
+//        if (img.type() != CV_8UC1)\r
+//        {\r
+//            ts->printf(CvTS::LOG, "\nUnsupported type\n");\r
+//            return CvTS::OK;\r
+//        }\r
+//\r
+//        const double threshold1 = 1.0, threshold2 = 10.0;\r
+//\r
+//        Mat cpudst;\r
+//        cv::Canny(img, cpudst, threshold1, threshold2);\r
+//\r
+//        GpuMat gpu1(img);\r
+//        GpuMat gpudst;\r
+//        cv::gpu::Canny(gpu1, gpudst, threshold1, threshold2);\r
+//\r
+//        return CheckNorm(cpudst, gpudst);\r
+//    }\r
+//};\r
 \r
 ////////////////////////////////////////////////////////////////////////////////\r
 // cvtColor\r
@@ -839,7 +839,7 @@ CV_GpuNppImageCopyMakeBorderTest CV_GpuNppImageCopyMakeBorder_test;
 CV_GpuNppImageWarpAffineTest CV_GpuNppImageWarpAffine_test;\r
 CV_GpuNppImageWarpPerspectiveTest CV_GpuNppImageWarpPerspective_test;\r
 CV_GpuNppImageIntegralTest CV_GpuNppImageIntegral_test;\r
-CV_GpuNppImageCannyTest CV_GpuNppImageCanny_test;\r
+//CV_GpuNppImageCannyTest CV_GpuNppImageCanny_test;\r
 CV_GpuCvtColorTest CV_GpuCvtColor_test;\r
 CV_GpuHistogramsTest CV_GpuHistograms_test;\r
 CV_GpuCornerHarrisTest CV_GpuCornerHarris_test;\r
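
The Canny test struct and its global registration object have to be disabled together; leaving the instantiation alive while the type is commented out would not compile. A lighter-weight way to park a disabled test is a preprocessor guard, so it can be re-enabled by flipping one flag; an editorial sketch, not part of this commit:

    #if 0 // disabled until the gpu::Canny crash is fixed
    struct CV_GpuNppImageCannyTest : public CV_GpuImageProcTest
    {
        // ... body unchanged ...
    };
    CV_GpuNppImageCannyTest CV_GpuNppImageCanny_test;
    #endif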
index 4681327..62fa120 100644 (file)
 //M*/
 
 #include "gputest.hpp"
-#include <string>
-#include <iostream>
-#include <fstream>
-#include <iterator>
-#include <limits>
-#include <numeric>
-#include <iomanip> // for  cout << setw()
 
-using namespace cv;
 using namespace std;
-using namespace gpu;
-
-class CV_AsyncGpuMatTest : public CvTest
-{
-    public:
-        CV_AsyncGpuMatTest() : CvTest( "GPU-AsyncGpuMatOperator", "async" )
-        {
-             rows = 234;
-            cols = 123;
-
-        }
-        ~CV_AsyncGpuMatTest() {}
-
-    protected:
-        void run(int);
-        template <typename T>
-        void print_mat(const T & mat, const std::string & name) const;
-        bool compare_matrix(cv::Mat & cpumat);
-
-    private:
-        int rows;
-        int cols;
-};
-
-template<typename T>
-void CV_AsyncGpuMatTest::print_mat(const T & mat, const std::string & name) const { cv::imshow(name, mat); }
+using namespace cv;
+using namespace cv::gpu;
 
-bool CV_AsyncGpuMatTest::compare_matrix(cv::Mat & cpumat)
+struct CV_AsyncGpuMatTest : public CvTest
 {
-    Mat cmat(cpumat.size(), cpumat.type(), Scalar::all(0));
-    GpuMat gmat0(cmat);
-    GpuMat gmat1;
-    GpuMat gmat2;
-    GpuMat gmat3;
-
-    //int64 time = getTickCount();
-
-    Stream stream;
-       stream.enqueueMemSet(gmat0, cv::Scalar::all(1), gmat1);
-       stream.enqueueMemSet(gmat0, cv::Scalar::all(1), gmat2);
-       stream.enqueueMemSet(gmat0, cv::Scalar::all(1), gmat3);
-    stream.waitForCompletion();
-
-    //int64 time1 = getTickCount();
-
-    gmat1.copyTo(gmat0);
-    gmat2.copyTo(gmat0);
-    gmat3.copyTo(gmat0);
-
-    //int64 time2 = getTickCount();
-
-    //std::cout << "\ntime async: " << std::fixed << std::setprecision(12) << double((time1 - time)  / (double)getTickFrequency());
-    //std::cout << "\ntime  sync: " << std::fixed << std::setprecision(12) << double((time2 - time1) / (double)getTickFrequency());
-    //std::cout << "\n";
-
-#ifdef PRINT_MATRIX
-    print_mat(cmat, "cpu mat");
-    print_mat(gmat0, "gpu mat 0");
-    print_mat(gmat1, "gpu mat 1");
-    print_mat(gmat2, "gpu mat 2");
-    print_mat(gmat3, "gpu mat 3");
-    cv::waitKey(0);
-#endif
-
-    double ret = norm(cmat, gmat0) + norm(cmat, gmat1) + norm(cmat, gmat2) + norm(cmat, gmat3);
-
-    if (ret < 1.0)
-        return true;
-    else
+    CV_AsyncGpuMatTest() : CvTest( "GPU-AsyncGpuMatOperator", "async" )
     {
-        ts->printf(CvTS::LOG, "\nNorm: %f\n", ret);
-        return false;
     }
-}
 
-void CV_AsyncGpuMatTest::run( int /* start_from */)
-{
-    bool is_test_good = true;
+    void run(int)
+    {
+        try
+        {
+            CudaMem src(Mat::zeros(100, 100, CV_8UC1));
 
-    Mat cpumat(rows, cols, CV_8U);
-    cpumat.setTo(Scalar::all(127));
+            GpuMat gpusrc;
+            GpuMat gpudst0, gpudst1(100, 100, CV_8UC1);
 
-    try
-    {
-        is_test_good &= compare_matrix(cpumat);
-    }
-    catch(cv::Exception& e)
-    {
-        if (!check_and_treat_gpu_exception(e, ts))
-            throw; 
-        return;
-    }
+            CudaMem cpudst0;
+            CudaMem cpudst1;
 
-    if (is_test_good == true)
-        ts->set_failed_test_info(CvTS::OK);
-    else
-        ts->set_failed_test_info(CvTS::FAIL_GENERIC);
-}
+            Stream stream0, stream1;
 
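+            // stream0 pipeline: async upload, invert on the GPU, async download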
+            stream0.enqueueUpload(src, gpusrc);
+            bitwise_not(gpusrc, gpudst0, GpuMat(), stream0);
+            stream0.enqueueDownload(gpudst0, cpudst0);
 
-/////////////////////////////////////////////////////////////////////////////
-/////////////////// tests registration  /////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////
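+            // stream1 pipeline: memset and download, overlapping stream0's work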
+            stream1.enqueueMemSet(gpudst1, Scalar::all(128));
+            stream1.enqueueDownload(gpudst1, cpudst1);
+
+            stream0.waitForCompletion();
+            stream1.waitForCompletion();
 
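+            // bitwise_not of zeros yields 255; the memset pattern is 128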
+            Mat cpu_gold0(100, 100, CV_8UC1, Scalar::all(255));
+            Mat cpu_gold1(100, 100, CV_8UC1, Scalar::all(128));
 
-CV_AsyncGpuMatTest CV_AsyncGpuMatTest_test;
+            if (norm(cpudst0, cpu_gold0, NORM_INF) > 0 || norm(cpudst1, cpu_gold1, NORM_INF) > 0)
+                ts->set_failed_test_info(CvTS::FAIL_GENERIC);
+            else
+                ts->set_failed_test_info(CvTS::OK);
+        }
+        catch(cv::Exception& e)
+        {
+            if (!check_and_treat_gpu_exception(e, ts))
+                throw; 
+            return;
+        }
+    }
+} CV_AsyncGpuMatTest_test;
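
A note on the rewrite's design: using CudaMem for the host buffers matters, since enqueueUpload and enqueueDownload are only truly asynchronous with page-locked host memory, which CudaMem allocates by default. Running the invert and the memset on two independent streams lets transfers and kernels overlap, and is presumably the asynchronous path whose crash this commit fixes.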