optimized memory usage in BruteForceMatcher_GPU_base::knnMatch when k==2
author Vladislav Vinogradov <no@email>
Tue, 30 Aug 2011 08:49:11 +0000 (08:49 +0000)
committer Vladislav Vinogradov <no@email>
Tue, 30 Aug 2011 08:49:11 +0000 (08:49 +0000)
modules/gpu/src/brute_force_matcher.cpp
modules/gpu/src/cuda/brute_force_matcher.cu

index f4f53fc..86b74e4 100644 (file)
@@ -446,10 +446,17 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
     const int nQuery = queryDescs.rows;\r
     const int nTrain = trainDescs.rows;\r
 \r
-    ensureSizeIsEnough(nQuery, k, CV_32S, trainIdx);\r
-    ensureSizeIsEnough(nQuery, k, CV_32F, distance);\r
-    if (k != 2)\r
+    if (k == 2)\r
+    {\r
+        ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);\r
+        ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);\r
+    }\r
+    else\r
+    {\r
+        ensureSizeIsEnough(nQuery, k, CV_32S, trainIdx);\r
+        ensureSizeIsEnough(nQuery, k, CV_32F, distance);\r
         ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist);\r
+    }\r
 \r
     if (stream)\r
     {\r
@@ -491,14 +498,19 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat& trainIdx, c
     if (trainIdx.empty() || distance.empty())\r
         return;\r
 \r
-    CV_Assert(trainIdx.type() == CV_32SC1);\r
-    CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size());\r
+    CV_Assert(trainIdx.type() == CV_32SC2 || trainIdx.type() == CV_32SC1);\r
+    CV_Assert(distance.type() == CV_32FC2 || distance.type() == CV_32FC1);\r
+    CV_Assert(distance.size() == trainIdx.size());\r
+    CV_Assert(trainIdx.isContinuous() && distance.isContinuous());\r
 \r
-    const int nQuery = distance.rows;\r
-    const int k = trainIdx.cols;\r
+    const int nQuery = trainIdx.type() == CV_32SC2 ? trainIdx.cols : trainIdx.rows;\r
+    const int k = trainIdx.type() == CV_32SC2 ? 2 :trainIdx.cols;\r
 \r
     matches.clear();\r
     matches.reserve(nQuery);\r
+    \r
+    const int* trainIdx_ptr = trainIdx.ptr<int>();\r
+    const float* distance_ptr = distance.ptr<float>();\r
 \r
     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)\r
     {\r
@@ -506,8 +518,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat& trainIdx, c
         vector<DMatch>& curMatches = matches.back();\r
         curMatches.reserve(k);\r
 \r
-        const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);\r
-        const float* distance_ptr = distance.ptr<float>(queryIdx);\r
         for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr)\r
         {\r
             int trainIdx = *trainIdx_ptr;\r
index c2c7317..efe3510 100644 (file)
@@ -452,7 +452,7 @@ namespace cv { namespace gpu { namespace bfmatcher
     }\r
 \r
     template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename VecDiff, typename Dist, typename T, typename Mask>\r
-    __global__ void knnMatch2(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask m, PtrStep_<int2> trainIdx, PtrStep_<float2> distance)\r
+    __global__ void knnMatch2(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask m, int2* trainIdx, float2* distance)\r
     {\r
         typedef typename Dist::result_type result_type;\r
         typedef typename Dist::value_type value_type;\r
@@ -520,8 +520,8 @@ namespace cv { namespace gpu { namespace bfmatcher
                 }\r
             }\r
 \r
-            trainIdx.ptr(queryIdx)[0] = make_int2(bestTrainIdx1, bestTrainIdx2);\r
-            distance.ptr(queryIdx)[0] = make_float2(distMin1, distMin2);\r
+            trainIdx[queryIdx] = make_int2(bestTrainIdx1, bestTrainIdx2);\r
+            distance[queryIdx] = make_float2(distMin1, distMin2);\r
         }\r
     }\r
 \r
@@ -556,7 +556,7 @@ namespace cv { namespace gpu { namespace bfmatcher
         const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);\r
 \r
         knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T>\r
-              <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx, distance);\r
+              <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, distance.data);\r
         cudaSafeCall( cudaGetLastError() );\r
 \r
         if (stream == 0)\r