\r
#include "internal_shared.hpp"\r
#include "opencv2/gpu/device/limits_gpu.hpp"\r
+#include "opencv2/gpu/device/saturate_cast.hpp"\r
\r
using namespace cv::gpu;\r
using namespace cv::gpu::device;\r
texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);\r
\r
template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)\r
- {\r
- #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200\r
+ {\r
+ #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200\r
typedef double real_t; \r
#else\r
typedef float real_t;\r
#endif\r
\r
- float ratio = (float)newSize / oldSize;\r
+ float ratio = (float)newSize / oldSize;\r
\r
real_t d = 0;\r
\r
{\r
// Diff hunk for check(): lines marked '-' are the removed version, which\r
// accumulated in real_t (double when __CUDA_ARCH__ >= 200, float otherwise);\r
// lines marked '+' are the replacement, which uses float unconditionally.\r
//\r
// Returns true when the weighted, area-normalised box sum read from maskSumTex\r
// around (sum_j, sum_i) is at least 0.5. The box corners are c_DM[0..3] scaled\r
// by size / 9 and rounded to nearest; the four corner reads are combined\r
// + - - + and the result is multiplied by c_DM[4] and divided by the box area\r
// (dx2 - dx1) * (dy2 - dy1).\r
// NOTE(review): c_DM is declared outside this view; its layout is inferred\r
// from the indexing below -- confirm. If the rounded corners coincide the\r
// divisor becomes zero.\r
static __device__ bool check(int sum_i, int sum_j, int size)\r
{\r
- #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200\r
- typedef double real_t; \r
- #else\r
- typedef float real_t;\r
- #endif\r
-\r
- float ratio = (float)size / 9.0f;\r
- \r
- real_t d = 0;\r
-\r
- int dx1 = __float2int_rn(ratio * c_DM[0]);\r
- int dy1 = __float2int_rn(ratio * c_DM[1]);\r
- int dx2 = __float2int_rn(ratio * c_DM[2]);\r
- int dy2 = __float2int_rn(ratio * c_DM[3]);\r
-\r
- real_t t = 0;\r
- t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);\r
- t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);\r
- t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);\r
- t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);\r
-\r
- d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));\r
-\r
- return (d >= 0.5);\r
+ float ratio = (float)size / 9.0f;\r
+ \r
+ float d = 0;\r
+\r
+ int dx1 = __float2int_rn(ratio * c_DM[0]);\r
+ int dy1 = __float2int_rn(ratio * c_DM[1]);\r
+ int dx2 = __float2int_rn(ratio * c_DM[2]);\r
+ int dy2 = __float2int_rn(ratio * c_DM[3]);\r
+\r
+ float t = 0;\r
+ t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);\r
+ t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);\r
+ t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);\r
+ t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);\r
+\r
+ d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));\r
+\r
+ return (d >= 0.5f);\r
}\r
};\r
\r
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f\r
};\r
\r
// Diff hunk: the old calcPATCH signature and prologue ('-' lines) are removed\r
// and a new helper, calcWin ('+' lines), is introduced in their place.\r
//\r
// calcWin maps a window-grid index (i = row, j = column) to image coordinates:\r
// both indices are shifted by win_offset, the shifted pair is rotated using\r
// (cos_dir, sin_dir) and translated by (centerX, centerY). It returns the\r
// imgTex sample at that position, converted to unsigned char by the return type.\r
// NOTE(review): imgTex is declared outside this view, so its element type and\r
// filtering mode cannot be confirmed from here.\r
- __device__ void calcPATCH(float s_PATCH[6][6], float s_pt[5], int i1, int j1, int i2, int j2)\r
+ __device__ unsigned char calcWin(int i, int j, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir)\r
{\r
- const float centerX = s_pt[SF_X];\r
- const float centerY = s_pt[SF_Y];\r
- const float size = s_pt[SF_SIZE];\r
- const float descriptor_dir = s_pt[SF_DIR] * (float)(CV_PI / 180);\r
-\r
- /* The sampling intervals and wavelet sized for selecting an orientation\r
- and building the keypoint descriptor are defined relative to 's' */\r
- const float s = size * 1.2f / 9.0f;\r
-\r
- /* Extract a window of pixels around the keypoint of size 20s */\r
- const int win_size = (int)((PATCH_SZ + 1) * s);\r
-\r
- float sin_dir;\r
- float cos_dir;\r
- sincosf(descriptor_dir, &sin_dir, &cos_dir);\r
+ float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;\r
+ float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;\r
\r
- /* Nearest neighbour version (faster) */\r
- const float win_offset = -(float)(win_size - 1) / 2; \r
+ return tex2D(imgTex, pixel_x, pixel_y);\r
+ }\r
\r
// New calcPATCH ('+' lines): returns one patch sample instead of writing into a\r
// shared array as the removed version did ('-' lines).\r
//\r
// The patch row index i1 is rescaled to a fractional window coordinate\r
// icoo = i1 / (PATCH_SZ + 1) * win_size; i and j are the floors of icoo and jcoo.\r
// The result is the bilinear blend of the four calcWin samples at (i, j),\r
// (i + 1, j), (i + 1, j + 1) and (i, j + 1), saturated to unsigned char.\r
// NOTE(review): the definition of jcoo is not among the visible lines;\r
// presumably it mirrors icoo using j1 -- confirm against the full file.\r
+ __device__ unsigned char calcPATCH(int i1, int j1, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir, int win_size)\r
+ {\r
/* Scale the window to size PATCH_SZ so each pixel's size is s. This\r
makes calculating the gradients with wavelets of size 2s easy */\r
const float icoo = ((float)i1 / (PATCH_SZ + 1)) * win_size;\r
const int i = __float2int_rd(icoo);\r
const int j = __float2int_rd(jcoo);\r
\r
- float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;\r
- float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;\r
+ float res = calcWin(i, j, centerX, centerY, win_offset, cos_dir, sin_dir) * (i + 1 - icoo) * (j + 1 - jcoo);\r
+ res += calcWin(i + 1, j, centerX, centerY, win_offset, cos_dir, sin_dir) * (icoo - i) * (j + 1 - jcoo);\r
+ res += calcWin(i + 1, j + 1, centerX, centerY, win_offset, cos_dir, sin_dir) * (icoo - i) * (jcoo - j);\r
+ res += calcWin(i, j + 1, centerX, centerY, win_offset, cos_dir, sin_dir) * (i + 1 - icoo) * (jcoo - j);\r
\r
- float res = tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (j + 1 - jcoo);\r
+ return saturate_cast<unsigned char>(res);\r
+ } \r
\r
- pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i + 1) * sin_dir;\r
- pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i + 1) * cos_dir;\r
// Diff hunk: the tail of the old calcPATCH and the old calc_dx_dy prologue\r
// ('-' lines) are replaced by a calc_dx_dy ('+' lines) that declares s_PATCH\r
// and s_pt itself and derives the window parameters once per call.\r
//\r
// Visible behaviour of the new version:\r
//   1. threads with threadIdx.y == 0 copy floats of keypoints[blockIdx.x] into\r
//      shared s_pt, then the block synchronises;\r
//   2. centre, size and direction (degrees -> radians) are read from s_pt and\r
//      turned into s, win_size, sin/cos of the direction and win_offset;\r
//   3. each thread stores one calcPATCH sample in s_PATCH[threadIdx.y][threadIdx.x];\r
//      threads on the first column / first row also fill index 5 of the other\r
//      axis, and thread (0, 0) fills s_PATCH[5][5]; then the block synchronises;\r
//   4. s_dy_bin[tid] receives vy.\r
// NOTE(review): xBlock, yBlock and vy are defined in lines elided from this\r
// view, as is any write to s_dx_bin.\r
+ __device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], const KeyPoint_GPU* keypoints, int tid)\r
+ {\r
+ __shared__ float s_PATCH[6][6];\r
\r
- res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (j + 1 - jcoo);\r
+ // get the interest point parameters (x, y, size, response, angle)\r
+ __shared__ float s_pt[5];\r
// NOTE(review): s_pt has 5 elements and is indexed by threadIdx.x, so this\r
// relies on blockDim.x <= 5 -- confirm against the launch configuration.\r
+ if (threadIdx.y == 0)\r
+ s_pt[threadIdx.x] = ((float*)(&keypoints[blockIdx.x]))[threadIdx.x];\r
+ __syncthreads();\r
\r
- pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i + 1) * sin_dir;\r
- pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i + 1) * cos_dir;\r
+ const float centerX = s_pt[SF_X];\r
+ const float centerY = s_pt[SF_Y];\r
+ const float size = s_pt[SF_SIZE];\r
+ const float descriptor_dir = s_pt[SF_DIR] * (float)(CV_PI / 180);\r
\r
- res += tex2D(imgTex, pixel_x, pixel_y) * (icoo - i) * (jcoo - j);\r
- \r
- pixel_x = centerX + (win_offset + j + 1) * cos_dir + (win_offset + i) * sin_dir;\r
- pixel_y = centerY - (win_offset + j + 1) * sin_dir + (win_offset + i) * cos_dir;\r
+ /* The sampling intervals and wavelet sized for selecting an orientation\r
+ and building the keypoint descriptor are defined relative to 's' */\r
+ const float s = size * 1.2f / 9.0f;\r
\r
- res += tex2D(imgTex, pixel_x, pixel_y) * (i + 1 - icoo) * (jcoo - j);\r
+ /* Extract a window of pixels around the keypoint of size 20s */\r
+ const int win_size = (int)((PATCH_SZ + 1) * s);\r
\r
- s_PATCH[i2][j2] = (unsigned char)res;\r
- }\r
+ float sin_dir;\r
+ float cos_dir;\r
+ sincosf(descriptor_dir, &sin_dir, &cos_dir);\r
\r
- __device__ void calc_dx_dy(float s_PATCH[6][6], float s_dx_bin[25], float s_dy_bin[25], const KeyPoint_GPU* keypoints, int tid)\r
- {\r
- // get the interest point parameters (x, y, size, response, angle)\r
- __shared__ float s_pt[5];\r
- if (tid < 5)\r
- {\r
- s_pt[tid] = ((float*)(&keypoints[blockIdx.x]))[tid];\r
- }\r
- __syncthreads();\r
// NOTE(review): the comment below says "nearest neighbour", but calcPATCH\r
// blends four calcWin samples; the wording looks stale -- confirm.\r
+ /* Nearest neighbour version (faster) */\r
+ const float win_offset = -(float)(win_size - 1) / 2; \r
\r
// Compute sampling points\r
// since grids are 2D, need to compute xBlock and yBlock indices\r
const int xIndex = xBlock * blockDim.x + threadIdx.x;\r
const int yIndex = yBlock * blockDim.y + threadIdx.y;\r
\r
// calcPATCH takes (row index, column index, ...); the sixth row/column of\r
// s_PATCH is filled by the threads on the first column/row of the block.\r
- calcPATCH(s_PATCH, s_pt, yIndex, xIndex, threadIdx.y, threadIdx.x);\r
+ s_PATCH[threadIdx.y][threadIdx.x] = calcPATCH(yIndex, xIndex, centerX, centerY, win_offset, cos_dir, sin_dir, win_size);\r
if (threadIdx.x == 0)\r
- calcPATCH(s_PATCH, s_pt, yIndex, xBlock * blockDim.x + 5, threadIdx.y, 5);\r
+ s_PATCH[threadIdx.y][5] = calcPATCH(yIndex, xBlock * blockDim.x + 5, centerX, centerY, win_offset, cos_dir, sin_dir, win_size);\r
if (threadIdx.y == 0)\r
- calcPATCH(s_PATCH, s_pt, yBlock * blockDim.y + 5, xIndex, 5, threadIdx.x);\r
+ s_PATCH[5][threadIdx.x] = calcPATCH(yBlock * blockDim.y + 5, xIndex, centerX, centerY, win_offset, cos_dir, sin_dir, win_size);\r
if (threadIdx.x == 0 && threadIdx.y == 0)\r
// The removed call passed the x-based index first and the y-based index\r
// second; the replacement passes them in the same (y, x) order as the\r
// other three calls.\r
- calcPATCH(s_PATCH, s_pt, xBlock * blockDim.x + 5, yBlock * blockDim.y + 5, 5, 5);\r
+ s_PATCH[5][5] = calcPATCH(yBlock * blockDim.y + 5, xBlock * blockDim.x + 5, centerX, centerY, win_offset, cos_dir, sin_dir, win_size);\r
__syncthreads();\r
\r
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];\r
s_dy_bin[tid] = vy;\r
}\r
\r
// Diff hunk: the reduce_sum25 signature is only reflowed onto a single line;\r
// its tokens are unchanged.\r
// The function takes four volatile float arrays plus the calling thread's\r
// index. Per the visible comment, its first step folds 25 values down to 16\r
// using threads 0..8; the remaining steps are elided from this view.\r
- __device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2,\r
- volatile float* sdata3, volatile float* sdata4, int tid)\r
+ __device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, volatile float* sdata3, volatile float* sdata4, int tid)\r
{\r
// first step is to reduce from 25 to 16\r
if (tid < 9) // use 9 threads\r
}\r
}\r
\r
// Diff hunk: the old header comment and the kernel-local s_PATCH buffer are\r
// removed ('-' lines); calc_dx_dy is now called without an s_PATCH argument.\r
//\r
// Kernel entry point. Each thread computes a flat index tid from its 2D thread\r
// coordinates, calc_dx_dy fills the shared sdx / sdy arrays, and after a block\r
// barrier sdxabs[tid] is set to |sdx[tid]|. The rest of the body is elided from\r
// this view.\r
// NOTE(review): the shared arrays hold 25 floats and are indexed by tid, so\r
// this presumably expects at most 25 threads per block -- confirm at the launch\r
// site. Per the comment removed below, 16 blocks are launched per interest\r
// point and the output is the unnormalised 64-dimensional descriptor.\r
- // Spawn 16 blocks per interest point\r
- // - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location\r
- __global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features)\r
+ __global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features)\r
{\r
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)\r
- __shared__ float s_PATCH[6][6];\r
__shared__ float sdx[25];\r
__shared__ float sdy[25];\r
__shared__ float sdxabs[25];\r
\r
const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
\r
- calc_dx_dy(s_PATCH, sdx, sdy, features, tid);\r
+ calc_dx_dy(sdx, sdy, features, tid);\r
__syncthreads();\r
\r
sdxabs[tid] = fabs(sdx[tid]); // |dx| array\r
}\r
}\r
\r
- // Spawn 16 blocks per interest point\r
- // - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location\r
- __global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)\r
+ __global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)\r
{\r
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)\r
- __shared__ float s_PATCH[6][6];\r
__shared__ float sdx[25];\r
__shared__ float sdy[25];\r
\r
\r
const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
\r
- calc_dx_dy(s_PATCH, sdx, sdy, features, tid);\r
+ calc_dx_dy(sdx, sdy, features, tid);\r
__syncthreads();\r
\r
if (sdy[tid] >= 0)\r