minor SURF_GPU optimization (descriptor calculation, change block size to 6x6)
author Vladislav Vinogradov <no@email>
Mon, 23 May 2011 18:40:09 +0000 (18:40 +0000)
committer Vladislav Vinogradov <no@email>
Mon, 23 May 2011 18:40:09 +0000 (18:40 +0000)
modules/gpu/src/cuda/surf.cu

index 834bf43..4d117c0 100644 (file)
@@ -770,25 +770,24 @@ namespace cv { namespace gpu { namespace surf
         // since grids are 2D, need to compute xBlock and yBlock indices\r
         const int xBlock = (blockIdx.y & 3);  // blockIdx.y % 4\r
         const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)\r
-        const int xIndex = xBlock * blockDim.x + threadIdx.x;\r
-        const int yIndex = yBlock * blockDim.y + threadIdx.y;\r
+        const int xIndex = xBlock * 5 + threadIdx.x;\r
+        const int yIndex = yBlock * 5 + threadIdx.y;\r
 \r
         s_PATCH[threadIdx.y][threadIdx.x] = calcPATCH(yIndex, xIndex, centerX, centerY, win_offset, cos_dir, sin_dir, win_size);\r
-        if (threadIdx.x == 0)\r
-            s_PATCH[threadIdx.y][5] = calcPATCH(yIndex, xBlock * blockDim.x + 5, centerX, centerY, win_offset, cos_dir, sin_dir, win_size);\r
-        if (threadIdx.y == 0)\r
-            s_PATCH[5][threadIdx.x] = calcPATCH(yBlock * blockDim.y + 5, xIndex, centerX, centerY, win_offset, cos_dir, sin_dir, win_size);\r
-        if (threadIdx.x == 0 && threadIdx.y == 0)\r
-            s_PATCH[5][5] = calcPATCH(yBlock * blockDim.y + 5, xBlock * blockDim.x + 5, centerX, centerY, win_offset, cos_dir, sin_dir, win_size);\r
         __syncthreads();\r
 \r
-        const float dw = c_DW[yIndex * PATCH_SZ + xIndex];\r
+        if (threadIdx.x < 5 && threadIdx.y < 5)\r
+        {\r
+            tid = threadIdx.y * 5 + threadIdx.x;\r
+\r
+            const float dw = c_DW[yIndex * PATCH_SZ + xIndex];\r
 \r
-        const float vx = (s_PATCH[threadIdx.y    ][threadIdx.x + 1] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y + 1][threadIdx.x    ]) * dw;\r
-        const float vy = (s_PATCH[threadIdx.y + 1][threadIdx.x    ] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y    ][threadIdx.x + 1]) * dw;\r
+            const float vx = (s_PATCH[threadIdx.y    ][threadIdx.x + 1] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y + 1][threadIdx.x    ]) * dw;\r
+            const float vy = (s_PATCH[threadIdx.y + 1][threadIdx.x    ] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y    ][threadIdx.x + 1]) * dw;\r
 \r
-        s_dx_bin[tid] = vx;\r
-        s_dy_bin[tid] = vy;\r
+            s_dx_bin[tid] = vx;\r
+            s_dy_bin[tid] = vy;\r
+        }\r
     }\r
 \r
     __device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, volatile float* sdata3, volatile float* sdata4, int tid)\r
@@ -986,7 +985,7 @@ namespace cv { namespace gpu { namespace surf
         \r
         if (descriptors.cols == 64)\r
         {\r
-            compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);\r
+            compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);\r
             cudaSafeCall( cudaGetLastError() );\r
 \r
             cudaSafeCall( cudaThreadSynchronize() );\r
@@ -998,7 +997,7 @@ namespace cv { namespace gpu { namespace surf
         }\r
         else\r
         {\r
-            compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);            \r
+            compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);            \r
             cudaSafeCall( cudaGetLastError() );\r
 \r
             cudaSafeCall( cudaThreadSynchronize() );\r