From fcff126086f3c22231f819358bb5b8698d6ab2a1 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 16 Mar 2011 07:01:27 +0000 Subject: [PATCH] fixed some warnings in surf.cu --- modules/gpu/src/cuda/surf.cu | 151 +++++++++++++++++++------------------------ 1 file changed, 66 insertions(+), 85 deletions(-) diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu index b9f6da9..cd61175 100644 --- a/modules/gpu/src/cuda/surf.cu +++ b/modules/gpu/src/cuda/surf.cu @@ -47,6 +47,7 @@ #include "internal_shared.hpp" #include "opencv2/gpu/device/limits_gpu.hpp" +#include "opencv2/gpu/device/saturate_cast.hpp" using namespace cv::gpu; using namespace cv::gpu::device; @@ -85,14 +86,14 @@ namespace cv { namespace gpu { namespace surf texture maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp); template __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x) - { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200 + { + #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200 typedef double real_t; #else typedef float real_t; #endif - float ratio = (float)newSize / oldSize; + float ratio = (float)newSize / oldSize; real_t d = 0; @@ -202,30 +203,24 @@ namespace cv { namespace gpu { namespace surf { static __device__ bool check(int sum_i, int sum_j, int size) { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200 - typedef double real_t; - #else - typedef float real_t; - #endif - - float ratio = (float)size / 9.0f; - - real_t d = 0; - - int dx1 = __float2int_rn(ratio * c_DM[0]); - int dy1 = __float2int_rn(ratio * c_DM[1]); - int dx2 = __float2int_rn(ratio * c_DM[2]); - int dy2 = __float2int_rn(ratio * c_DM[3]); - - real_t t = 0; - t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1); - t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2); - t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1); - t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2); - - d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1)); - - return (d >= 0.5); + float ratio = (float)size / 9.0f; + + float d = 0; + + int dx1 = __float2int_rn(ratio * c_DM[0]); + int dy1 = __float2int_rn(ratio * c_DM[1]); + int dx2 = __float2int_rn(ratio * c_DM[2]); + int dy2 = __float2int_rn(ratio * c_DM[3]); + + float t = 0; + t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1); + t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2); + t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1); + t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2); + + d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1)); + + return (d >= 0.5f); } }; @@ -727,27 +722,16 @@ namespace cv { namespace gpu { namespace surf 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f }; - __device__ void calcPATCH(float s_PATCH[6][6], float s_pt[5], int i1, int j1, int i2, int j2) + __device__ unsigned char calcWin(int i, int j, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir) { - const float centerX = s_pt[SF_X]; - const float centerY = s_pt[SF_Y]; - const float size = s_pt[SF_SIZE]; - const float descriptor_dir = s_pt[SF_DIR] * (float)(CV_PI / 180); - - /* The sampling intervals and wavelet sized for selecting an orientation - and building the keypoint descriptor are defined relative to 's' */ - const float s = size * 1.2f / 9.0f; - - /* Extract a window of pixels around the keypoint of size 20s */ - const int win_size = (int)((PATCH_SZ + 1) * s); - - float sin_dir; - float cos_dir; - sincosf(descriptor_dir, &sin_dir, &cos_dir); + float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir; + float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir; - /* Nearest neighbour version (faster) */ - const float win_offset = -(float)(win_size - 1) / 2; + return tex2D(imgTex, pixel_x, pixel_y); + } + __device__ unsigned char calcPATCH(int i1, int j1, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir, int win_size) + { /* Scale the window to size PATCH_SZ so each pixel's size is s. This makes calculating the gradients with wavelets of size 2s easy */ const float icoo = ((float)i1 / (PATCH_SZ + 1)) * win_size; @@ -756,38 +740,42 @@ namespace cv { namespace gpu { namespace surf const int i = __float2int_rd(icoo); const int j = __float2int_rd(jcoo); - float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir; - float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir; + float res = calcWin(i, j, centerX, centerY, win_offset, cos_dir, sin_dir) * (i + 1 - icoo) * (j + 1 - jcoo); + res += calcWin(i + 1, j, centerX, centerY, win_offset, cos_dir, sin_dir) * (icoo - i) * (j + 1 - jcoo); + res += calcWin(i + 1, j + 1, centerX, centerY, win_offset, cos_dir, sin_dir) * (icoo - i) * (jcoo - j); + res += calcWin(i, j + 1, centerX, centerY, win_offset, cos_dir, sin_dir) * (i + 1 - icoo) * (jcoo - j); - float res = tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (j + 1 - jcoo); + return saturate_cast(res); + } - pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i + 1) * sin_dir; - pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i + 1) * cos_dir; + __device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], const KeyPoint_GPU* keypoints, int tid) + { + __shared__ float s_PATCH[6][6]; - res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (j + 1 - jcoo); + // get the interest point parameters (x, y, size, response, angle) + __shared__ float s_pt[5]; + if (threadIdx.y == 0) + s_pt[threadIdx.x] = ((float*)(&keypoints[blockIdx.x]))[threadIdx.x]; + __syncthreads(); - pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i + 1) * sin_dir; - pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i + 1) * cos_dir; + const float centerX = s_pt[SF_X]; + const float centerY = s_pt[SF_Y]; + const float size = s_pt[SF_SIZE]; + const float descriptor_dir = s_pt[SF_DIR] * (float)(CV_PI / 180); - res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (jcoo - j); - - pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i) * sin_dir; - pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i) * cos_dir; + /* The sampling intervals and wavelet sized for selecting an orientation + and building the keypoint descriptor are defined relative to 's' */ + const float s = size * 1.2f / 9.0f; - res += tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (jcoo - j); + /* Extract a window of pixels around the keypoint of size 20s */ + const int win_size = (int)((PATCH_SZ + 1) * s); - s_PATCH[i2][j2] = (unsigned char)res; - } + float sin_dir; + float cos_dir; + sincosf(descriptor_dir, &sin_dir, &cos_dir); - __device__ void calc_dx_dy(float s_PATCH[6][6], float s_dx_bin[25], float s_dy_bin[25], const KeyPoint_GPU* keypoints, int tid) - { - // get the interest point parameters (x, y, size, response, angle) - __shared__ float s_pt[5]; - if (tid < 5) - { - s_pt[tid] = ((float*)(&keypoints[blockIdx.x]))[tid]; - } - __syncthreads(); + /* Nearest neighbour version (faster) */ + const float win_offset = -(float)(win_size - 1) / 2; // Compute sampling points // since grids are 2D, need to compute xBlock and yBlock indices @@ -796,13 +784,13 @@ namespace cv { namespace gpu { namespace surf const int xIndex = xBlock * blockDim.x + threadIdx.x; const int yIndex = yBlock * blockDim.y + threadIdx.y; - calcPATCH(s_PATCH, s_pt, yIndex, xIndex, threadIdx.y, threadIdx.x); + s_PATCH[threadIdx.y][threadIdx.x] = calcPATCH(yIndex, xIndex, centerX, centerY, win_offset, cos_dir, sin_dir, win_size); if (threadIdx.x == 0) - calcPATCH(s_PATCH, s_pt, yIndex, xBlock * blockDim.x + 5, threadIdx.y, 5); + s_PATCH[threadIdx.y][5] = calcPATCH(yIndex, xBlock * blockDim.x + 5, centerX, centerY, win_offset, cos_dir, sin_dir, win_size); if (threadIdx.y == 0) - calcPATCH(s_PATCH, s_pt, yBlock * blockDim.y + 5, xIndex, 5, threadIdx.x); + s_PATCH[5][threadIdx.x] = calcPATCH(yBlock * blockDim.y + 5, xIndex, centerX, centerY, win_offset, cos_dir, sin_dir, win_size); if (threadIdx.x == 0 && threadIdx.y == 0) - calcPATCH(s_PATCH, s_pt, xBlock * blockDim.x + 5, yBlock * blockDim.y + 5, 5, 5); + s_PATCH[5][5] = calcPATCH(yBlock * blockDim.y + 5, xBlock * blockDim.x + 5, centerX, centerY, win_offset, cos_dir, sin_dir, win_size); __syncthreads(); const float dw = c_DW[yIndex * PATCH_SZ + xIndex]; @@ -814,8 +802,7 @@ namespace cv { namespace gpu { namespace surf s_dy_bin[tid] = vy; } - __device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, - volatile float* sdata3, volatile float* sdata4, int tid) + __device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, volatile float* sdata3, volatile float* sdata4, int tid) { // first step is to reduce from 25 to 16 if (tid < 9) // use 9 threads @@ -851,12 +838,9 @@ namespace cv { namespace gpu { namespace surf } } - // Spawn 16 blocks per interest point - // - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location - __global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features) + __global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features) { // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region) - __shared__ float s_PATCH[6][6]; __shared__ float sdx[25]; __shared__ float sdy[25]; __shared__ float sdxabs[25]; @@ -864,7 +848,7 @@ namespace cv { namespace gpu { namespace surf const int tid = threadIdx.y * blockDim.x + threadIdx.x; - calc_dx_dy(s_PATCH, sdx, sdy, features, tid); + calc_dx_dy(sdx, sdy, features, tid); __syncthreads(); sdxabs[tid] = fabs(sdx[tid]); // |dx| array @@ -886,12 +870,9 @@ namespace cv { namespace gpu { namespace surf } } - // Spawn 16 blocks per interest point - // - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location - __global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features) + __global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features) { // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region) - __shared__ float s_PATCH[6][6]; __shared__ float sdx[25]; __shared__ float sdy[25]; @@ -903,7 +884,7 @@ namespace cv { namespace gpu { namespace surf const int tid = threadIdx.y * blockDim.x + threadIdx.x; - calc_dx_dy(s_PATCH, sdx, sdy, features, tid); + calc_dx_dy(sdx, sdy, features, tid); __syncthreads(); if (sdy[tid] >= 0) -- 2.7.4