fixed block size calculation in SURF_GPU (fasthessian_gpu and nonmaxonly_gpu kernels)
author Vladislav Vinogradov <no@email>
Wed, 9 Feb 2011 09:11:11 +0000 (09:11 +0000)
committer Vladislav Vinogradov <no@email>
Wed, 9 Feb 2011 09:11:11 +0000 (09:11 +0000)
modules/gpu/src/cuda/surf.cu
modules/gpu/src/surf.cpp
samples/gpu/surf_keypoint_matcher.cpp
tests/gpu/src/features2d.cpp

index 00f62d4..cb47f28 100644 (file)
@@ -237,20 +237,31 @@ namespace cv { namespace gpu { namespace surf
 \r
                hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;\r
         }\r
-    }   \r
-\r
-    void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size)\r
+    }\r
+    \r
+    dim3 calcBlockSize(int nIntervals)\r
     {\r
-        dim3 threads;\r
-        threads.x = 16;\r
-        threads.y = 8;\r
+        int threadsPerBlock = 512;\r
+        \r
+        dim3 threads;        \r
         threads.z = nIntervals;\r
+        threadsPerBlock /= nIntervals;\r
+        if (threadsPerBlock >= 48)\r
+            threads.x = 16;\r
+        else\r
+            threads.x = 8;\r
+        threadsPerBlock /= threads.x;\r
+        threads.y = threadsPerBlock;\r
+        \r
+        return threads;\r
+    }\r
 \r
+    void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads)\r
+    {\r
         dim3 grid;\r
         grid.x = divUp(x_size, threads.x);\r
         grid.y = divUp(y_size, threads.y);\r
-        grid.z = 1;\r
-\r
+        \r
            fasthessian<<<grid, threads>>>(hessianBuffer);\r
 \r
         cudaSafeCall( cudaThreadSynchronize() );\r
@@ -370,17 +381,11 @@ namespace cv { namespace gpu { namespace surf
     }\r
 \r
     void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, \r
-        int nIntervals, int x_size, int y_size, bool use_mask)\r
+        int x_size, int y_size, bool use_mask, const dim3& threads)\r
     {\r
-        dim3 threads;\r
-        threads.x = 16;\r
-        threads.y = 8;\r
-        threads.z = nIntervals;\r
-\r
         dim3 grid;\r
         grid.x = divUp(x_size, threads.x - 2);\r
         grid.y = divUp(y_size, threads.y - 2);\r
-        grid.z = 1;\r
 \r
         const size_t smem_size = threads.x * threads.y * threads.z * sizeof(float);\r
 \r
@@ -565,8 +570,6 @@ namespace cv { namespace gpu { namespace surf
     \r
         dim3 grid;\r
         grid.x = maxCounter;\r
-        grid.y = 1; \r
-        grid.z = 1;\r
 \r
         DeviceReference<unsigned int> featureCounterWrapper(featureCounter);\r
     \r
@@ -624,6 +627,7 @@ namespace cv { namespace gpu { namespace surf
            // - SURF says to only use a circle, but the branching logic would slow it down\r
            // - Gaussian weighting should reduce the effects of the outer points anyway\r
         if (tid2 < 169)\r
+\r
         {\r
                dx -=     texLookups[threadIdx.x    ][threadIdx.y    ];\r
                dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y    ];\r
@@ -709,8 +713,6 @@ namespace cv { namespace gpu { namespace surf
 \r
         dim3 grid;\r
         grid.x = nFeatures;\r
-        grid.y = 1;\r
-        grid.z = 1;\r
 \r
         find_orientation<<<grid, threads>>>(features);\r
         cudaSafeCall( cudaThreadSynchronize() );\r
index b5ab0d9..145ca49 100644 (file)
@@ -61,11 +61,13 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint
 #else /* !defined (HAVE_CUDA) */\r
 \r
 namespace cv { namespace gpu { namespace surf\r
-{    \r
-    void fasthessian_gpu(PtrStepf hessianBuffer, int nIntervals, int x_size, int y_size);\r
+{\r
+    dim3 calcBlockSize(int nIntervals);\r
+    \r
+    void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);\r
     \r
     void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, \r
-        int nIntervals, int x_size, int y_size, bool use_mask);\r
+        int x_size, int y_size, bool use_mask, const dim3& threads);\r
     \r
     void fh_interp_extremum_gpu(PtrStepf hessianBuffer, const int4* maxPosBuffer, unsigned int maxCounter, \r
         KeyPoint_GPU* featuresBuffer, unsigned int& featureCounter);\r
@@ -103,7 +105,7 @@ namespace
         {\r
             CV_Assert(!img.empty() && img.type() == CV_8UC1);\r
             CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));\r
-            CV_Assert(nOctaves > 0 && nIntervals > 2);\r
+            CV_Assert(nOctaves > 0 && nIntervals > 2 && nIntervals < 22);\r
             CV_Assert(DeviceInfo().has(ATOMICS));\r
 \r
             max_features = static_cast<int>(img.size().area() * featuresRatio);\r
@@ -168,6 +170,7 @@ namespace
 \r
         void detectKeypoints(GpuMat& keypoints)\r
         {\r
+            dim3 threads = calcBlockSize(nIntervals);\r
             for(int octave = 0; octave < nOctaves; ++octave)\r
             {\r
                 int step = initialStep * (1 << octave);\r
@@ -189,12 +192,12 @@ namespace
                 uploadConstant("cv::gpu::surf::c_border", border);\r
                 uploadConstant("cv::gpu::surf::c_step",   step);\r
 \r
-                fasthessian_gpu(hessianBuffer, nIntervals, x_size, y_size);\r
+                fasthessian_gpu(hessianBuffer, x_size, y_size, threads);\r
 \r
                 // Reset the candidate count.\r
                 maxCounter = 0;\r
 \r
-                nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, nIntervals, x_size, y_size, use_mask); \r
+                nonmaxonly_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, x_size, y_size, use_mask, threads); \r
                 \r
                 maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));\r
 \r
index c109ee7..b2c9385 100644 (file)
@@ -38,6 +38,9 @@ int main(int argc, char* argv[])
     GpuMat descriptors1GPU, descriptors2GPU;
     surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU);
     surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU);
+    
+    cout << "FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl;
+    cout << "FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;
 
     // matching descriptors
     BruteForceMatcher_GPU< L2<float> > matcher;
@@ -57,6 +60,8 @@ int main(int argc, char* argv[])
     // drawing the results
     Mat img_matches;
     drawMatches(img1, keypoints1, img2, keypoints2, matches, img_matches);
+    
+    namedWindow("matches", 0);
     imshow("matches", img_matches);
     waitKey(0);
 
index 8a75447..99fb28d 100644 (file)
@@ -149,12 +149,14 @@ void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints
         assert(minDist >= 0);\r
         if (!isSimilarKeypoints(validKeypoints[v], calcKeypoints[nearestIdx]))\r
         {\r
+            ts->printf(CvTS::LOG, "Bad keypoints accuracy.\n");\r
             ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );\r
             return;\r
         }\r
 \r
         if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.0f)\r
         {\r
+            ts->printf(CvTS::LOG, "Bad descriptors accuracy.\n");\r
             ts->set_failed_test_info( CvTS::FAIL_BAD_ACCURACY );\r
             return;\r
         }\r