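Fixed thread-block size calculation in SURF_GPU (fasthessian_gpu and nonmaxonly_gpu kernels): the block shape is now derived from nIntervals within a 512-thread budget instead of a fixed 16 x 8 x nIntervals, and nIntervals is asserted to be less than 22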

author Vladislav Vinogradov <no@email>

Wed, 9 Feb 2011 09:11:11 +0000 (09:11 +0000)

committer Vladislav Vinogradov <no@email>

Wed, 9 Feb 2011 09:11:11 +0000 (09:11 +0000)
author Vladislav Vinogradov <no@email>
Wed, 9 Feb 2011 09:11:11 +0000 (09:11 +0000)
committer Vladislav Vinogradov <no@email>
Wed, 9 Feb 2011 09:11:11 +0000 (09:11 +0000)
diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu

index 00f62d4..cb47f28 100644 (file)
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@@ -237,20 +237,31 @@ namespace cv { namespace gpu { namespace surf
  \r
                 hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;\r
          }\r
-    }   \r
-\r
-    void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size)\r
+    }\r
+    \r
+    dim3 calcBlockSize(int nIntervals)\r
      {\r
-        dim3 threads;\r
-        threads.x = 16;\r
-        threads.y = 8;\r
+        int threadsPerBlock = 512;\r
+        \r
+        dim3 threads;        \r
          threads.z = nIntervals;\r
+        threadsPerBlock /= nIntervals;\r
+        if (threadsPerBlock >= 48)\r
+            threads.x = 16;\r
+        else\r
+            threads.x = 8;\r
+        threadsPerBlock /= threads.x;\r
+        threads.y = threadsPerBlock;\r
+        \r
+        return threads;\r
+    }\r
  \r
+    void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads)\r
+    {\r
          dim3 grid;\r
          grid.x = divUp(x_size, threads.x);\r
          grid.y = divUp(y_size, threads.y);\r
-        grid.z = 1;\r
-\r
+        \r
             fasthessian<<<grid, threads>>>(hessianBuffer);\r
  \r
          cudaSafeCall( cudaThreadSynchronize() );\r
@@ -370,17 +381,11 @@ namespace cv { namespace gpu { namespace surf
      }\r
  \r
      void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, \r
-        int nIntervals, int x_size, int y_size, bool use_mask)\r
+        int x_size, int y_size, bool use_mask, const dim3& threads)\r
      {\r
-        dim3 threads;\r
-        threads.x = 16;\r
-        threads.y = 8;\r
-        threads.z = nIntervals;\r
-\r
          dim3 grid;\r
          grid.x = divUp(x_size, threads.x - 2);\r
          grid.y = divUp(y_size, threads.y - 2);\r
-        grid.z = 1;\r
  \r
          const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float);\r
  \r
@@ -565,8 +570,6 @@ namespace cv { namespace gpu { namespace surf
      \r
          dim3 grid;\r
          grid.x = maxCounter;\r
-        grid.y = 1; \r
-        grid.z = 1;\r
  \r
          DeviceReference<unsigned int> featureCounterWrapper(featureCounter);\r
      \r
@@ -624,6 +627,7 @@ namespace cv { namespace gpu { namespace surf
             // - SURF says to only use a circle, but the branching logic would slow it down\r
             // - Gaussian weighting should reduce the effects of the outer points anyway\r
          if (tid2 < 169)\r
+\r
          {\r
                 dx -=     texLookups[threadIdx.x    ][threadIdx.y    ];\r
                 dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y    ];\r
@@ -709,8 +713,6 @@ namespace cv { namespace gpu { namespace surf
  \r
          dim3 grid;\r
          grid.x = nFeatures;\r
-        grid.y = 1;\r
-        grid.z = 1;\r
  \r
          find_orientation<<<grid, threads>>>(features);\r
          cudaSafeCall( cudaThreadSynchronize() );\r
diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp

index b5ab0d9..145ca49 100644 (file)
--- a/modules/gpu/src/surf.cpp
+++ b/modules/gpu/src/surf.cpp
@@ -61,11 +61,13 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint
  #else /* !defined (HAVE_CUDA) */\r
  \r
  namespace cv { namespace gpu { namespace surf\r
-{    \r
-    void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size);\r
+{\r
+    dim3 calcBlockSize(int nIntervals);\r
+    \r
+    void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);\r
      \r
      void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, \r
-        int nIntervals, int x_size, int y_size, bool use_mask);\r
+        int x_size, int y_size, bool use_mask, const dim3& threads);\r
      \r
      void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter, \r
          KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter);\r
@@ -103,7 +105,7 @@ namespace
          {\r
              CV_Assert(!img.empty() && img.type() == CV_8UC1);\r
              CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));\r
-            CV_Assert(nOctaves > 0 && nIntervals > 2);\r
+            CV_Assert(nOctaves > 0 && nIntervals > 2 && nIntervals < 22);\r
              CV_Assert(DeviceInfo().has(ATOMICS));\r
  \r
              max_features = static_cast<int>(img.size().area() * featuresRatio);\r
@@ -168,6 +170,7 @@ namespace
  \r
          void detectKeypoints(GpuMat& keypoints)\r
          {\r
+            dim3 threads = calcBlockSize(nIntervals);\r
              for(int octave = 0; octave < nOctaves; ++octave)\r
              {\r
                  int step = initialStep * (1 << octave);\r
@@ -189,12 +192,12 @@ namespace
                  uploadConstant("cv::gpu::surf::c_border", border);\r
                  uploadConstant("cv::gpu::surf::c_step",   step);\r
  \r
-                fasthessian_gpu(hessianBuffer, nIntervals, x_size, y_size);\r
+                fasthessian_gpu(hessianBuffer, x_size, y_size, threads);\r
  \r
                  // Reset the candidate count.\r
                  maxCounter = 0;\r
  \r
-                nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, nIntervals, x_size, y_size, use_mask); \r
+                nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, x_size, y_size, use_mask, threads); \r
                  \r
                  maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));\r
  \r
diff --git a/samples/gpu/surf_keypoint_matcher.cpp b/samples/gpu/surf_keypoint_matcher.cpp

index c109ee7..b2c9385 100644 (file)
--- a/samples/gpu/surf_keypoint_matcher.cpp
+++ b/samples/gpu/surf_keypoint_matcher.cpp
@@ -38,6 +38,9 @@ int main(int argc, char* argv[])
      GpuMat descriptors1GPU, descriptors2GPU;
      surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU);
      surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU);
+    
+    cout << "FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl;
+    cout << "FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;
  
      // matching descriptors
      BruteForceMatcher_GPU< L2<float> > matcher;
@@ -57,6 +60,8 @@ int main(int argc, char* argv[])
      // drawing the results
      Mat img_matches;
      drawMatches(img1, keypoints1, img2, keypoints2, matches, img_matches);
+    
+    namedWindow("matches", 0);
      imshow("matches", img_matches);
      waitKey(0);
  
diff --git a/tests/gpu/src/features2d.cpp b/tests/gpu/src/features2d.cpp

index 8a75447..99fb28d 100644 (file)
--- a/tests/gpu/src/features2d.cpp
+++ b/tests/gpu/src/features2d.cpp
@@ -149,12 +149,14 @@ void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints
          assert(minDist >= 0);\r
          if (!isSimilarKeypoints(validKeypoints[v], calcKeypoints[nearestIdx]))\r
          {\r
+            ts->printf(CvTS::LOG, "Bad keypoints accuracy.\n");\r
              ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );\r
              return;\r
          }\r
  \r
          if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.0f)\r
          {\r
+            ts->printf(CvTS::LOG, "Bad descriptors accuracy.\n");\r
              ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );\r
              return;\r
          }\r
author	Vladislav Vinogradov <no@email>
	Wed, 9 Feb 2011 09:11:11 +0000 (09:11 +0000)
committer	Vladislav Vinogradov <no@email>
	Wed, 9 Feb 2011 09:11:11 +0000 (09:11 +0000)
modules/gpu/src/cuda/surf.cu		patch | blob | history
modules/gpu/src/surf.cpp		patch | blob | history
samples/gpu/surf_keypoint_matcher.cpp		patch | blob | history
tests/gpu/src/features2d.cpp		patch | blob | history