#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_HPP__
-#include "utility.hpp"
+#include "reduce.hpp"
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"
// Sum-reduce each thread's partial accumulator (the enclosing functor's
// mySum member) across THREAD_DIM threads, using smem as scratch
// (presumably needs >= THREAD_DIM ints — confirm against reduce.hpp).
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{
    reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
}
__device__ __forceinline__ operator int() const
// Sum-reduce each thread's partial accumulator (mySum) across THREAD_DIM
// threads, using smem as scratch space.
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{
    reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
}
__device__ __forceinline__ operator float() const
// Sum-reduce each thread's partial accumulator (mySum) across THREAD_DIM
// threads, using smem as scratch space.
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{
    reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
}
__device__ __forceinline__ operator float() const
// Sum-reduce each thread's partial accumulator (mySum) across THREAD_DIM
// threads, using smem as scratch space.
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{
    reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
}
__device__ __forceinline__ operator int() const
#if !defined CUDA_DISABLER
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/utility.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
+#include "opencv2/gpu/device/warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device
{
int& bestTrainIdx1, int& bestTrainIdx2,
float* s_distance, int* s_trainIdx)
{
+ #if __CUDA_ARCH__ >= 300
+ (void) s_distance;
+ (void) s_trainIdx;
+
+ float d1, d2;
+ int i1, i2;
+
+ #pragma unroll
+ for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
+ {
+ d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
+ d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
+ i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
+ i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
+
+ if (bestDistance1 < d1)
+ {
+ if (d1 < bestDistance2)
+ {
+ bestDistance2 = d1;
+ bestTrainIdx2 = i1;
+ }
+ }
+ else
+ {
+ bestDistance2 = bestDistance1;
+ bestTrainIdx2 = bestTrainIdx1;
+
+ bestDistance1 = d1;
+ bestTrainIdx1 = i1;
+
+ if (d2 < bestDistance2)
+ {
+ bestDistance2 = d2;
+ bestTrainIdx2 = i2;
+ }
+ }
+ }
+ #else
float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1;
bestTrainIdx1 = myBestTrainIdx1;
bestTrainIdx2 = myBestTrainIdx2;
+ #endif
}
template <int BLOCK_SIZE>
int& bestImgIdx1, int& bestImgIdx2,
float* s_distance, int* s_trainIdx, int* s_imgIdx)
{
+ #if __CUDA_ARCH__ >= 300
+ (void) s_distance;
+ (void) s_trainIdx;
+ (void) s_imgIdx;
+
+ float d1, d2;
+ int i1, i2;
+ int j1, j2;
+
+ #pragma unroll
+ for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
+ {
+ d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
+ d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
+ i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
+ i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
+ j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE);
+ j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE);
+
+ if (bestDistance1 < d1)
+ {
+ if (d1 < bestDistance2)
+ {
+ bestDistance2 = d1;
+ bestTrainIdx2 = i1;
+ bestImgIdx2 = j1;
+ }
+ }
+ else
+ {
+ bestDistance2 = bestDistance1;
+ bestTrainIdx2 = bestTrainIdx1;
+ bestImgIdx2 = bestImgIdx1;
+
+ bestDistance1 = d1;
+ bestTrainIdx1 = i1;
+ bestImgIdx1 = j1;
+
+ if (d2 < bestDistance2)
+ {
+ bestDistance2 = d2;
+ bestTrainIdx2 = i2;
+ bestImgIdx2 = j2;
+ }
+ }
+ }
+ #else
float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1;
bestImgIdx1 = myBestImgIdx1;
bestImgIdx2 = myBestImgIdx2;
+ #endif
}
///////////////////////////////////////////////////////////////////////////////
s_trainIdx[threadIdx.x] = bestIdx;
__syncthreads();
- reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());
+ reduceKeyVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<float>());
if (threadIdx.x == 0)
{
}}} // namespace cv { namespace gpu { namespace device {
#endif /* CUDA_DISABLER */
#if !defined CUDA_DISABLER
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/utility.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
s_distance += threadIdx.y * BLOCK_SIZE;
s_trainIdx += threadIdx.y * BLOCK_SIZE;
- s_distance[threadIdx.x] = bestDistance;
- s_trainIdx[threadIdx.x] = bestTrainIdx;
-
- __syncthreads();
-
- reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float>());
+ reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<float>());
}
template <int BLOCK_SIZE>
s_trainIdx += threadIdx.y * BLOCK_SIZE;
s_imgIdx += threadIdx.y * BLOCK_SIZE;
- s_distance[threadIdx.x] = bestDistance;
- s_trainIdx[threadIdx.x] = bestTrainIdx;
- s_imgIdx [threadIdx.x] = bestImgIdx;
-
- __syncthreads();
-
- reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float>());
+ reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less<float>());
}
///////////////////////////////////////////////////////////////////////////////
}}} // namespace cv { namespace gpu { namespace device {
#endif /* CUDA_DISABLER */
#if !defined CUDA_DISABLER
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
+#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
- #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
-
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
-
- #endif
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
- #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
-
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
-
- #endif
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */