fixed some warnings in surf.cu

author Vladislav Vinogradov <no@email>

Wed, 16 Mar 2011 07:01:27 +0000 (07:01 +0000)

committer Vladislav Vinogradov <no@email>

Wed, 16 Mar 2011 07:01:27 +0000 (07:01 +0000)
author Vladislav Vinogradov <no@email>
Wed, 16 Mar 2011 07:01:27 +0000 (07:01 +0000)
committer Vladislav Vinogradov <no@email>
Wed, 16 Mar 2011 07:01:27 +0000 (07:01 +0000)
diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu

index b9f6da9..cd61175 100644 (file)
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@@ -47,6 +47,7 @@
  \r
  #include "internal_shared.hpp"\r
  #include "opencv2/gpu/device/limits_gpu.hpp"\r
+#include "opencv2/gpu/device/saturate_cast.hpp"\r
  \r
  using namespace cv::gpu;\r
  using namespace cv::gpu::device;\r
@@ -85,14 +86,14 @@ namespace cv { namespace gpu { namespace surf
      texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
  \r
      template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)\r
-       {\r
-               #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200\r
+    {\r
+        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200\r
          typedef double real_t;        \r
          #else\r
          typedef float  real_t;\r
          #endif\r
  \r
-               float ratio = (float)newSize / oldSize;\r
+        float ratio = (float)newSize / oldSize;\r
          \r
          real_t d = 0;\r
  \r
@@ -202,30 +203,24 @@ namespace cv { namespace gpu { namespace surf
      {\r
          static __device__ bool check(int sum_i, int sum_j, int size)\r
          {\r
-                       #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200\r
-                       typedef double real_t;        \r
-                       #else\r
-                       typedef float  real_t;\r
-                       #endif\r
-\r
-                       float ratio = (float)size / 9.0f;\r
-               \r
-                       real_t d = 0;\r
-\r
-                       int dx1 = __float2int_rn(ratio * c_DM[0]);\r
-                       int dy1 = __float2int_rn(ratio * c_DM[1]);\r
-                       int dx2 = __float2int_rn(ratio * c_DM[2]);\r
-                       int dy2 = __float2int_rn(ratio * c_DM[3]);\r
-\r
-                       real_t t = 0;\r
-                       t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);\r
-                       t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);\r
-                       t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);\r
-                       t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);\r
-\r
-                       d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));\r
-\r
-            return (d >= 0.5);\r
+            float ratio = (float)size / 9.0f;\r
+            \r
+            float d = 0;\r
+\r
+            int dx1 = __float2int_rn(ratio * c_DM[0]);\r
+            int dy1 = __float2int_rn(ratio * c_DM[1]);\r
+            int dx2 = __float2int_rn(ratio * c_DM[2]);\r
+            int dy2 = __float2int_rn(ratio * c_DM[3]);\r
+\r
+            float t = 0;\r
+            t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);\r
+            t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);\r
+            t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);\r
+            t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);\r
+\r
+            d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));\r
+\r
+            return (d >= 0.5f);\r
          }\r
      };\r
  \r
@@ -727,27 +722,16 @@ namespace cv { namespace gpu { namespace surf
          3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f\r
      };\r
  \r
-    __device__ void calcPATCH(float s_PATCH[6][6], float s_pt[5], int i1, int j1, int i2, int j2)\r
+    __device__ unsigned char calcWin(int i, int j, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir)\r
      {\r
-        const float centerX = s_pt[SF_X];\r
-        const float centerY = s_pt[SF_Y];\r
-        const float size = s_pt[SF_SIZE];\r
-        const float descriptor_dir = s_pt[SF_DIR] * (float)(CV_PI / 180);\r
-\r
-        /* The sampling intervals and wavelet sized for selecting an orientation\r
-         and building the keypoint descriptor are defined relative to 's' */\r
-        const float s = size * 1.2f / 9.0f;\r
-\r
-        /* Extract a window of pixels around the keypoint of size 20s */\r
-        const int win_size = (int)((PATCH_SZ + 1) * s);\r
-\r
-        float sin_dir;\r
-        float cos_dir;\r
-        sincosf(descriptor_dir, &sin_dir, &cos_dir);\r
+        float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;\r
+        float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;\r
  \r
-        /* Nearest neighbour version (faster) */\r
-        const float win_offset = -(float)(win_size - 1) / 2; \r
+        return tex2D(imgTex, pixel_x, pixel_y);\r
+    }\r
  \r
+    __device__ unsigned char calcPATCH(int i1, int j1, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir, int win_size)\r
+    {\r
          /* Scale the window to size PATCH_SZ so each pixel's size is s. This\r
             makes calculating the gradients with wavelets of size 2s easy */\r
          const float icoo = ((float)i1 / (PATCH_SZ + 1)) * win_size;\r
@@ -756,38 +740,42 @@ namespace cv { namespace gpu { namespace surf
          const int i = __float2int_rd(icoo);\r
          const int j = __float2int_rd(jcoo);\r
  \r
-        float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;\r
-        float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;\r
+        float res = calcWin(i, j, centerX, centerY, win_offset, cos_dir, sin_dir) * (i + 1 - icoo) * (j + 1 - jcoo);\r
+        res += calcWin(i + 1, j, centerX, centerY, win_offset, cos_dir, sin_dir) * (icoo - i) * (j + 1 - jcoo);\r
+        res += calcWin(i + 1, j + 1, centerX, centerY, win_offset, cos_dir, sin_dir) * (icoo - i) * (jcoo - j);\r
+        res += calcWin(i, j + 1, centerX, centerY, win_offset, cos_dir, sin_dir) * (i + 1 - icoo) * (jcoo - j);\r
  \r
-        float res = tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (j + 1 - jcoo);\r
+        return saturate_cast<unsigned char>(res);\r
+    }  \r
  \r
-        pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i + 1) * sin_dir;\r
-        pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i + 1) * cos_dir;\r
+    __device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], const KeyPoint_GPU* keypoints, int tid)\r
+    {\r
+        __shared__ float s_PATCH[6][6];\r
  \r
-        res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (j + 1 - jcoo);\r
+        // get the interest point parameters (x, y, size, response, angle)\r
+        __shared__ float s_pt[5];\r
+        if (threadIdx.y == 0)\r
+            s_pt[threadIdx.x] = ((float*)(&keypoints[blockIdx.x]))[threadIdx.x];\r
+        __syncthreads();\r
  \r
-        pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i + 1) * sin_dir;\r
-        pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i + 1) * cos_dir;\r
+        const float centerX = s_pt[SF_X];\r
+        const float centerY = s_pt[SF_Y];\r
+        const float size = s_pt[SF_SIZE];\r
+        const float descriptor_dir = s_pt[SF_DIR] * (float)(CV_PI / 180);\r
  \r
-        res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (jcoo - j);\r
-        \r
-        pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i) * sin_dir;\r
-        pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i) * cos_dir;\r
+        /* The sampling intervals and wavelet sizes for selecting an orientation\r
+         and building the keypoint descriptor are defined relative to 's' */\r
+        const float s = size * 1.2f / 9.0f;\r
  \r
-        res += tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (jcoo - j);\r
+        /* Extract a window of pixels around the keypoint of size 20s */\r
+        const int win_size = (int)((PATCH_SZ + 1) * s);\r
  \r
-        s_PATCH[i2][j2] = (unsigned char)res;\r
-    }\r
+        float sin_dir;\r
+        float cos_dir;\r
+        sincosf(descriptor_dir, &sin_dir, &cos_dir);\r
  \r
-    __device__ void calc_dx_dy(float s_PATCH[6][6], float s_dx_bin[25], float s_dy_bin[25], const KeyPoint_GPU* keypoints, int tid)\r
-    {\r
-        // get the interest point parameters (x, y, size, response, angle)\r
-        __shared__ float s_pt[5];\r
-        if (tid < 5)\r
-        {\r
-            s_pt[tid] = ((float*)(&keypoints[blockIdx.x]))[tid];\r
-        }\r
-        __syncthreads();\r
+        /* Nearest neighbour version (faster) */\r
+        const float win_offset = -(float)(win_size - 1) / 2; \r
  \r
          // Compute sampling points\r
          // since grids are 2D, need to compute xBlock and yBlock indices\r
@@ -796,13 +784,13 @@ namespace cv { namespace gpu { namespace surf
          const int xIndex = xBlock * blockDim.x + threadIdx.x;\r
          const int yIndex = yBlock * blockDim.y + threadIdx.y;\r
  \r
-        calcPATCH(s_PATCH, s_pt, yIndex, xIndex, threadIdx.y, threadIdx.x);\r
+        s_PATCH[threadIdx.y][threadIdx.x] = calcPATCH(yIndex, xIndex, centerX, centerY, win_offset, cos_dir, sin_dir, win_size);\r
          if (threadIdx.x == 0)\r
-            calcPATCH(s_PATCH, s_pt, yIndex, xBlock * blockDim.x + 5, threadIdx.y, 5);\r
+            s_PATCH[threadIdx.y][5] = calcPATCH(yIndex, xBlock * blockDim.x + 5, centerX, centerY, win_offset, cos_dir, sin_dir, win_size);\r
          if (threadIdx.y == 0)\r
-            calcPATCH(s_PATCH, s_pt, yBlock * blockDim.y + 5, xIndex, 5, threadIdx.x);\r
+            s_PATCH[5][threadIdx.x] = calcPATCH(yBlock * blockDim.y + 5, xIndex, centerX, centerY, win_offset, cos_dir, sin_dir, win_size);\r
          if (threadIdx.x == 0 && threadIdx.y == 0)\r
-            calcPATCH(s_PATCH, s_pt, xBlock * blockDim.x + 5, yBlock * blockDim.y + 5, 5, 5);\r
+            s_PATCH[5][5] = calcPATCH(yBlock * blockDim.y + 5, xBlock * blockDim.x + 5, centerX, centerY, win_offset, cos_dir, sin_dir, win_size);\r
          __syncthreads();\r
  \r
          const float dw = c_DW[yIndex * PATCH_SZ + xIndex];\r
@@ -814,8 +802,7 @@ namespace cv { namespace gpu { namespace surf
          s_dy_bin[tid] = vy;\r
      }\r
  \r
-    __device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2,\r
-                                 volatile float* sdata3, volatile float* sdata4, int tid)\r
+    __device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, volatile float* sdata3, volatile float* sdata4, int tid)\r
      {\r
          // first step is to reduce from 25 to 16\r
          if (tid < 9) // use 9 threads\r
@@ -851,12 +838,9 @@ namespace cv { namespace gpu { namespace surf
          }\r
      }\r
  \r
-    // Spawn 16 blocks per interest point\r
-    // - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location\r
-    __global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features)\r
+       __global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features)\r
      {\r
          // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)\r
-        __shared__ float s_PATCH[6][6];\r
          __shared__ float sdx[25];\r
          __shared__ float sdy[25];\r
          __shared__ float sdxabs[25];\r
@@ -864,7 +848,7 @@ namespace cv { namespace gpu { namespace surf
  \r
          const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
  \r
-        calc_dx_dy(s_PATCH, sdx, sdy, features, tid);\r
+        calc_dx_dy(sdx, sdy, features, tid);\r
          __syncthreads();\r
  \r
          sdxabs[tid] = fabs(sdx[tid]); // |dx| array\r
@@ -886,12 +870,9 @@ namespace cv { namespace gpu { namespace surf
          }\r
      }\r
  \r
-    // Spawn 16 blocks per interest point\r
-    // - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location\r
-    __global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)\r
+       __global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)\r
      {\r
          // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)\r
-        __shared__ float s_PATCH[6][6];\r
          __shared__ float sdx[25];\r
          __shared__ float sdy[25];\r
  \r
@@ -903,7 +884,7 @@ namespace cv { namespace gpu { namespace surf
  \r
          const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
  \r
-        calc_dx_dy(s_PATCH, sdx, sdy, features, tid);\r
+        calc_dx_dy(sdx, sdy, features, tid);\r
          __syncthreads();\r
  \r
          if (sdy[tid] >= 0)\r
author	Vladislav Vinogradov <no@email>
	Wed, 16 Mar 2011 07:01:27 +0000 (07:01 +0000)
committer	Vladislav Vinogradov <no@email>
	Wed, 16 Mar 2011 07:01:27 +0000 (07:01 +0000)