minor optimization of SURF_GPU (orientation calculation, changed block size to 32x4)
authorVladislav Vinogradov <no@email>
Tue, 24 May 2011 08:02:39 +0000 (08:02 +0000)
committerVladislav Vinogradov <no@email>
Tue, 24 May 2011 08:02:39 +0000 (08:02 +0000)
modules/gpu/src/cuda/surf.cu

index 4d117c0..a7f49e8 100644 (file)
@@ -500,6 +500,20 @@ namespace cv { namespace gpu { namespace surf
     __constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};\r
     __constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};\r
 \r
+    __device__ void reduceSum32(volatile float* v_sum, float& sum)\r
+    {\r
+        v_sum[threadIdx.x] = sum;\r
+\r
+        if (threadIdx.x < 16)\r
+        {\r
+            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 16];\r
+            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 8];\r
+            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 4];\r
+            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 2];\r
+            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 1];\r
+        }\r
+    }\r
+\r
     __global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)\r
     {        \r
         #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
@@ -508,8 +522,7 @@ namespace cv { namespace gpu { namespace surf
         __shared__ float s_Y[128];\r
         __shared__ float s_angle[128];\r
 \r
-        __shared__ float s_sumx[64 * 4];\r
-        __shared__ float s_sumy[64 * 4];\r
+        __shared__ float s_sum[32 * 4];\r
 \r
         /* The sampling intervals and wavelet sized for selecting an orientation\r
          and building the keypoint descriptor are defined relative to 's' */\r
@@ -525,35 +538,30 @@ namespace cv { namespace gpu { namespace surf
         if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)\r
         {\r
             // Calc X, Y, angle and store it to shared memory\r
-            {\r
-                const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+            const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
 \r
-                float X = 0.0f, Y = 0.0f, angle = 0.0f;\r
+            float X = 0.0f, Y = 0.0f, angle = 0.0f;\r
 \r
-                if (tid < ORI_SAMPLES)\r
-                {\r
-                    const float margin = (float)(grad_wav_size - 1) / 2.0f;\r
-                    const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);\r
-                    const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);\r
+            if (tid < ORI_SAMPLES)\r
+            {\r
+                const float margin = (float)(grad_wav_size - 1) / 2.0f;\r
+                const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);\r
+                const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);\r
 \r
-                    if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))\r
-                    {\r
-                        X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);\r
-                        Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);\r
-                    \r
-                        angle = atan2f(Y, X);\r
-                        if (angle < 0)\r
-                            angle += 2.0f * CV_PI;\r
-                        angle *= 180.0f / CV_PI;\r
-                    }\r
-                }\r
-                if (tid < 128)\r
+                if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))\r
                 {\r
-                    s_X[tid] = X;\r
-                    s_Y[tid] = Y;\r
-                    s_angle[tid] = angle;\r
+                    X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);\r
+                    Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);\r
+                \r
+                    angle = atan2f(Y, X);\r
+                    if (angle < 0)\r
+                        angle += 2.0f * CV_PI;\r
+                    angle *= 180.0f / CV_PI;\r
                 }\r
             }\r
+            s_X[tid] = X;\r
+            s_Y[tid] = Y;\r
+            s_angle[tid] = angle;\r
             __syncthreads();\r
 \r
             float bestx = 0, besty = 0, best_mod = 0;\r
@@ -570,43 +578,29 @@ namespace cv { namespace gpu { namespace surf
                     sumx = s_X[threadIdx.x];\r
                     sumy = s_Y[threadIdx.x];\r
                 }\r
+                d = abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);\r
+                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
+                {\r
+                    sumx += s_X[threadIdx.x + 32];\r
+                    sumy += s_Y[threadIdx.x + 32];\r
+                }\r
                 d = abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);\r
                 if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
                 {\r
                     sumx += s_X[threadIdx.x + 64];\r
                     sumy += s_Y[threadIdx.x + 64];\r
                 }\r
-\r
-                float* s_sumx_row = s_sumx + threadIdx.y * 64;\r
-                float* s_sumy_row = s_sumy + threadIdx.y * 64;\r
-\r
-                s_sumx_row[threadIdx.x] = sumx;\r
-                s_sumy_row[threadIdx.x] = sumy;\r
-                __syncthreads();\r
-\r
-                if (threadIdx.x < 32)\r
+                d = abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);\r
+                if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
                 {\r
-                    volatile float* v_sumx_row = s_sumx_row;\r
-                    volatile float* v_sumy_row = s_sumy_row;\r
-\r
-                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 32];\r
-                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 32];\r
-\r
-                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 16];\r
-                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 16];\r
-\r
-                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 8];\r
-                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 8];\r
-\r
-                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 4];\r
-                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 4];\r
+                    sumx += s_X[threadIdx.x + 96];\r
+                    sumy += s_Y[threadIdx.x + 96];\r
+                }\r
 \r
-                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 2];\r
-                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 2];\r
+                float* s_sum_row = s_sum + threadIdx.y * 32;\r
 \r
-                    v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 1];\r
-                    v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 1];\r
-                }\r
+                reduceSum32(s_sum_row, sumx);\r
+                reduceSum32(s_sum_row, sumy);\r
 \r
                 const float temp_mod = sumx * sumx + sumy * sumy;\r
                 if (temp_mod > best_mod)\r
@@ -615,7 +609,6 @@ namespace cv { namespace gpu { namespace surf
                     bestx = sumx;\r
                     besty = sumy;\r
                 }\r
-                __syncthreads();\r
             }\r
 \r
             if (threadIdx.x == 0)\r
@@ -672,7 +665,7 @@ namespace cv { namespace gpu { namespace surf
     void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures) \r
     {\r
         dim3 threads;\r
-        threads.x = 64;\r
+        threads.x = 32;\r
         threads.y = 4;\r
 \r
         dim3 grid;\r
@@ -742,8 +735,7 @@ namespace cv { namespace gpu { namespace surf
     }  \r
 \r
     __device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], \r
-        const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, \r
-        int tid)\r
+        const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)\r
     {\r
         __shared__ float s_PATCH[6][6];\r
 \r
@@ -778,7 +770,7 @@ namespace cv { namespace gpu { namespace surf
 \r
         if (threadIdx.x < 5 && threadIdx.y < 5)\r
         {\r
-            tid = threadIdx.y * 5 + threadIdx.x;\r
+            const int tid = threadIdx.y * 5 + threadIdx.x;\r
 \r
             const float dw = c_DW[yIndex * PATCH_SZ + xIndex];\r
 \r
@@ -834,11 +826,11 @@ namespace cv { namespace gpu { namespace surf
         __shared__ float sdxabs[25];\r
         __shared__ float sdyabs[25];\r
 \r
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
-\r
-        calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir, tid);\r
+        calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);\r
         __syncthreads();\r
 \r
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+\r
         sdxabs[tid] = fabs(sdx[tid]); // |dx| array\r
         sdyabs[tid] = fabs(sdy[tid]); // |dy| array\r
         __syncthreads();\r
@@ -870,11 +862,11 @@ namespace cv { namespace gpu { namespace surf
         __shared__ float sdabs1[25];\r
         __shared__ float sdabs2[25];\r
 \r
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
-\r
-        calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir, tid);\r
+        calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);\r
         __syncthreads();\r
 \r
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+\r
         if (sdy[tid] >= 0)\r
         {\r
             sd1[tid] = sdx[tid];\r