Fixed bug in SURF_GPU (compute descriptors: guard the per-sample work with tid < 25)
author Vladislav Vinogradov <no@email>
Wed, 25 May 2011 08:37:46 +0000 (08:37 +0000)
committer Vladislav Vinogradov <no@email>
Wed, 25 May 2011 08:37:46 +0000 (08:37 +0000)
modules/gpu/src/cuda/surf.cu

index 695b0c0..d6e825e 100644 (file)
@@ -831,22 +831,25 @@ namespace cv { namespace gpu { namespace surf
 \r
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
 \r
-        sdxabs[tid] = fabs(sdx[tid]); // |dx| array\r
-        sdyabs[tid] = fabs(sdy[tid]); // |dy| array\r
-        __syncthreads();\r
+        if (tid < 25)\r
+        {\r
+            sdxabs[tid] = fabs(sdx[tid]); // |dx| array\r
+            sdyabs[tid] = fabs(sdy[tid]); // |dy| array\r
+            __syncthreads();\r
 \r
-        reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);\r
-        __syncthreads();\r
+            reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);\r
+            __syncthreads();\r
 \r
-        float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);\r
+            float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);\r
 \r
-        // write dx, dy, |dx|, |dy|\r
-        if (tid == 0)\r
-        {\r
-            descriptors_block[0] = sdx[0];\r
-            descriptors_block[1] = sdy[0];\r
-            descriptors_block[2] = sdxabs[0];\r
-            descriptors_block[3] = sdyabs[0];\r
+            // write dx, dy, |dx|, |dy|\r
+            if (tid == 0)\r
+            {\r
+                descriptors_block[0] = sdx[0];\r
+                descriptors_block[1] = sdy[0];\r
+                descriptors_block[2] = sdxabs[0];\r
+                descriptors_block[3] = sdyabs[0];\r
+            }\r
         }\r
     }\r
 \r
@@ -867,63 +870,66 @@ namespace cv { namespace gpu { namespace surf
 \r
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
 \r
-        if (sdy[tid] >= 0)\r
-        {\r
-            sd1[tid] = sdx[tid];\r
-            sdabs1[tid] = fabs(sdx[tid]);\r
-            sd2[tid] = 0;\r
-            sdabs2[tid] = 0;\r
-        }\r
-        else\r
+        if (tid < 25)\r
         {\r
-            sd1[tid] = 0;\r
-            sdabs1[tid] = 0;\r
-            sd2[tid] = sdx[tid];\r
-            sdabs2[tid] = fabs(sdx[tid]);\r
-        }\r
-        __syncthreads();\r
+            if (sdy[tid] >= 0)\r
+            {\r
+                sd1[tid] = sdx[tid];\r
+                sdabs1[tid] = fabs(sdx[tid]);\r
+                sd2[tid] = 0;\r
+                sdabs2[tid] = 0;\r
+            }\r
+            else\r
+            {\r
+                sd1[tid] = 0;\r
+                sdabs1[tid] = 0;\r
+                sd2[tid] = sdx[tid];\r
+                sdabs2[tid] = fabs(sdx[tid]);\r
+            }\r
+            __syncthreads();\r
 \r
-        reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);\r
-        __syncthreads();\r
+            reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);\r
+            __syncthreads();\r
 \r
-        float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);\r
+            float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);\r
 \r
-        // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)\r
-        if (tid == 0)\r
-        {\r
-            descriptors_block[0] = sd1[0];\r
-            descriptors_block[1] = sdabs1[0];\r
-            descriptors_block[2] = sd2[0];\r
-            descriptors_block[3] = sdabs2[0];\r
-        }\r
-        __syncthreads();\r
+            // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)\r
+            if (tid == 0)\r
+            {\r
+                descriptors_block[0] = sd1[0];\r
+                descriptors_block[1] = sdabs1[0];\r
+                descriptors_block[2] = sd2[0];\r
+                descriptors_block[3] = sdabs2[0];\r
+            }\r
+            __syncthreads();\r
 \r
-        if (sdx[tid] >= 0)\r
-        {\r
-            sd1[tid] = sdy[tid];\r
-            sdabs1[tid] = fabs(sdy[tid]);\r
-            sd2[tid] = 0;\r
-            sdabs2[tid] = 0;\r
-        }\r
-        else\r
-        {\r
-            sd1[tid] = 0;\r
-            sdabs1[tid] = 0;\r
-            sd2[tid] = sdy[tid];\r
-            sdabs2[tid] = fabs(sdy[tid]);\r
-        }\r
-        __syncthreads();\r
+            if (sdx[tid] >= 0)\r
+            {\r
+                sd1[tid] = sdy[tid];\r
+                sdabs1[tid] = fabs(sdy[tid]);\r
+                sd2[tid] = 0;\r
+                sdabs2[tid] = 0;\r
+            }\r
+            else\r
+            {\r
+                sd1[tid] = 0;\r
+                sdabs1[tid] = 0;\r
+                sd2[tid] = sdy[tid];\r
+                sdabs2[tid] = fabs(sdy[tid]);\r
+            }\r
+            __syncthreads();\r
 \r
-        reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);\r
-        __syncthreads();\r
+            reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);\r
+            __syncthreads();\r
 \r
-        // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)\r
-        if (tid == 0)\r
-        {\r
-            descriptors_block[4] = sd1[0];\r
-            descriptors_block[5] = sdabs1[0];\r
-            descriptors_block[6] = sd2[0];\r
-            descriptors_block[7] = sdabs2[0];\r
+            // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)\r
+            if (tid == 0)\r
+            {\r
+                descriptors_block[4] = sd1[0];\r
+                descriptors_block[5] = sdabs1[0];\r
+                descriptors_block[6] = sd2[0];\r
+                descriptors_block[7] = sdabs2[0];\r
+            }\r
         }\r
     }\r
 \r