__constant__ float c_dxy_scale;\r
// The scale associated with the first interval of the first octave\r
__constant__ float c_initialScale;\r
// The interest operator threshold
__constant__ float c_threshold;\r
\r
// The octave
\r
// Evaluates the Dxx (second x-derivative) box-filter response at (x, y)
// using integral-image area lookups, normalized by the filter area.
// - t scales the inner (negative) lobe relative to the outer box
// - fscale is the current filter scale; mask_width/mask_height are the
//   pre-scaled mask dimensions
// NOTE(review): unresolved diff markers were removed from this body;
// the resolved (new) version is kept.
__device__ float evalDxx(float x, float y, float t, float mask_width, float mask_height, float fscale)
{
    float Dxx = 0.f;

    Dxx += iiAreaLookupCDHalfWH(x, y, mask_height, mask_width);
    Dxx -= t * iiAreaLookupCDHalfWH(x, y, fscale, mask_width);

    // Normalize the response by the filter area
    Dxx *= 1.0f / (fscale * fscale);

    return Dxx;
}
- \r
+\r
// Evaluates the Dxy (mixed second-derivative) box-filter response at (x, y):
// four quadrant lookups combined with alternating signs, normalized by the
// filter area. c_dxy_center_offset / c_dxy_half_width are constant-memory
// parameters scaled by the current filter scale.
// NOTE(review): unresolved diff markers were removed from this body;
// the resolved (new) version is kept.
__device__ float evalDxy(float x, float y, float fscale)
{
    float center_offset = c_dxy_center_offset * fscale;
    float half_width = c_dxy_half_width * fscale;

    float Dxy = 0.f;

    // Quadrant lookups: (+) top-left, (-) bottom-left, (+) bottom-right, (-) top-right
    Dxy += iiAreaLookupCDHalfWH(x - center_offset, y - center_offset, half_width, half_width);
    Dxy -= iiAreaLookupCDHalfWH(x - center_offset, y + center_offset, half_width, half_width);
    Dxy += iiAreaLookupCDHalfWH(x + center_offset, y + center_offset, half_width, half_width);
    Dxy -= iiAreaLookupCDHalfWH(x + center_offset, y - center_offset, half_width, half_width);

    // Normalize the response by the filter area
    Dxy *= 1.0f / (fscale * fscale);

    return Dxy;
}
\r
__device__ float calcScale(int hidx_z)\r
\r
float fscale = calcScale(hidx_z);\r
\r
- // Compute the lookup location of the mask center\r
+ // Compute the lookup location of the mask center\r
float x = hidx_x * c_step + c_border;\r
float y = hidx_y * c_step + c_border;\r
\r
- // Scale the mask dimensions according to the scale\r
+ // Scale the mask dimensions according to the scale\r
if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)\r
{\r
- float mask_width = c_mask_width * fscale;\r
- float mask_height = c_mask_height * fscale;\r
-\r
- // Compute the filter responses\r
- float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);\r
- float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);\r
- float Dxy = evalDxy(x, y, fscale);\r
- \r
- // Combine the responses and store the Laplacian sign\r
- float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy);\r
-\r
- if (Dxx + Dyy > 0.f)\r
- setLastBit(result);\r
- else\r
- clearLastBit(result);\r
-\r
- hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;\r
+ float mask_width = c_mask_width * fscale;\r
+ float mask_height = c_mask_height * fscale;\r
+\r
+ // Compute the filter responses\r
+ float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);\r
+ float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);\r
+ float Dxy = evalDxy(x, y, fscale);\r
+\r
+ // Combine the responses and store the Laplacian sign\r
+ float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy);\r
+\r
+ if (Dxx + Dyy > 0.f)\r
+ setLastBit(result);\r
+ else\r
+ clearLastBit(result);\r
+\r
+ hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;\r
}\r
}\r
\r
\r
float fscale = calcScale(hidx_z);\r
\r
- // Compute the lookup location of the mask center\r
+ // Compute the lookup location of the mask center\r
float x = hidx_x * c_step + c_border;\r
float y = hidx_y * c_step + c_border;\r
\r
- // Scale the mask dimensions according to the scale\r
+ // Scale the mask dimensions according to the scale\r
if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)\r
{\r
- float mask_width = c_mask_width * fscale;\r
- float mask_height = c_mask_height * fscale;\r
-\r
- // Compute the filter responses\r
- float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);\r
- float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);\r
- float Dxy = evalDxy(x, y, fscale);\r
- \r
- // Combine the responses and store the Laplacian sign\r
- float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy);\r
-\r
- if (Dxx + Dyy > 0.f)\r
- setLastBit(result);\r
- else\r
- clearLastBit(result);\r
-\r
- hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;\r
+ float mask_width = c_mask_width * fscale;\r
+ float mask_height = c_mask_height * fscale;\r
+\r
+ // Compute the filter responses\r
+ float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);\r
+ float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);\r
+ float Dxy = evalDxy(x, y, fscale);\r
+\r
+ // Combine the responses and store the Laplacian sign\r
+ float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy);\r
+\r
+ if (Dxx + Dyy > 0.f)\r
+ setLastBit(result);\r
+ else\r
+ clearLastBit(result);\r
+\r
+ hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;\r
}\r
}\r
- \r
+\r
dim3 calcBlockSize(int nIntervals)\r
{\r
int threadsPerBlock = 512;\r
grid.x = divUp(x_size, threads.x);\r
grid.y = divUp(y_size, threads.y);\r
\r
- fasthessian<<<grid, threads>>>(hessianBuffer);\r
+ fasthessian<<<grid, threads>>>(hessianBuffer);\r
cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaThreadSynchronize() );\r
- }\r
+ }\r
\r
void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld)\r
{\r
grid.x = divUp(x_size, threads.x);\r
grid.y = divUp(y_size, threads.y) * threadsOld.z;\r
\r
- fasthessian_old<<<grid, threads>>>(hessianBuffer);\r
+ fasthessian_old<<<grid, threads>>>(hessianBuffer);\r
cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaThreadSynchronize() );\r
- }\r
+ }\r
\r
////////////////////////////////////////////////////////////////////////\r
// NONMAX\r
{\r
// Mask test: computes the mean of the mask-sum integral image over an
// fscale x fscale window centered at (x, y) via the standard 4-corner
// integral-image lookup, and accepts the location when at least half of
// the window is covered by the mask (mean >= 0.5).
// NOTE(review): unresolved diff markers were removed from this body;
// the resolved (new) version is kept.
static __device__ bool check(float x, float y, float fscale)
{
    float half_width = fscale / 2;

    float result = 0.f;

    // 4-corner integral-image area sum
    result += tex2D(maskSumTex, x - half_width, y - half_width);
    result -= tex2D(maskSumTex, x + half_width, y - half_width);
    result -= tex2D(maskSumTex, x - half_width, y + half_width);
    result += tex2D(maskSumTex, x + half_width, y + half_width);

    // Mean coverage over the window
    result /= (fscale * fscale);

    return (result >= 0.5f);
}
\r
template <typename Mask>\r
__global__ void nonmaxonly(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int* maxCounter)\r
- { \r
+ {\r
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
\r
extern __shared__ float fh_vals[];\r
fh_vals[localLin] = hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x];\r
}\r
__syncthreads();\r
- \r
+\r
// Is this location one of the ones being processed for nonmax suppression.\r
// Blocks overlap by one so we don't process the border threads.\r
bool inBounds2 = threadIdx.x > 0 && threadIdx.x < blockDim.x-1 && hidx_x < c_x_size - 1 \r
\r
float val = fh_vals[localLin];\r
\r
- // Compute the lookup location of the mask center\r
+ // Compute the lookup location of the mask center\r
float x = hidx_x * c_step + c_border;\r
float y = hidx_y * c_step + c_border;\r
float fscale = calcScale(hidx_z);\r
&& val > fh_vals[localLin + blockDim.x + 1]\r
&& val > fh_vals[localLin + blockDim.x ]\r
&& val > fh_vals[localLin + blockDim.x - 1]\r
- \r
+\r
&& val > fh_vals[localLin - zoff + 1]\r
&& val > fh_vals[localLin - zoff ]\r
&& val > fh_vals[localLin - zoff - 1]\r
&& val > fh_vals[localLin - zoff + blockDim.x + 1]\r
&& val > fh_vals[localLin - zoff + blockDim.x ]\r
&& val > fh_vals[localLin - zoff + blockDim.x - 1]\r
- \r
+\r
&& val > fh_vals[localLin + zoff + 1]\r
&& val > fh_vals[localLin + zoff ]\r
&& val > fh_vals[localLin + zoff - 1]\r
if(condmax) \r
{\r
unsigned int i = atomicInc(maxCounter,(unsigned int) -1);\r
- \r
+\r
if (i < c_max_candidates) \r
{\r
- int4 f = {hidx_x, hidx_y, threadIdx.z, c_octave};\r
- maxPosBuffer[i] = f; \r
+ int4 f = {hidx_x, hidx_y, threadIdx.z, c_octave};\r
+ maxPosBuffer[i] = f;\r
}\r
}\r
- } \r
+ }\r
\r
#endif\r
}\r
nonmaxonly<WithMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);\r
else\r
nonmaxonly<WithOutMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);\r
- \r
+\r
cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaThreadSynchronize() );\r
#define MID_IDX 1\r
__global__ void fh_interp_extremum(PtrStepf hessianBuffer, const int4* maxPosBuffer, \r
KeyPoint_GPU* featuresBuffer, unsigned int* featureCounter)\r
- { \r
+ {\r
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
\r
int hidx_x = maxPosBuffer[blockIdx.x].x - 1 + threadIdx.x;\r
\r
//dxx\r
H[0][0] = fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX ] \r
- - 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ]\r
- + fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX ];\r
+ - 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ]\r
+ + fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX ];\r
\r
//dyy\r
- H[1][1] = fh_vals[MID_IDX ][MID_IDX ][MID_IDX + 1] \r
- - 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ]\r
- + fh_vals[MID_IDX ][MID_IDX ][MID_IDX - 1];\r
- \r
+ H[1][1] = fh_vals[MID_IDX ][MID_IDX ][MID_IDX + 1]\r
+ - 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ]\r
+ + fh_vals[MID_IDX ][MID_IDX ][MID_IDX - 1];\r
+\r
//dss\r
H[2][2] = fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX ] \r
- - 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ]\r
- + fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX ];\r
+ - 2.0f*fh_vals[MID_IDX ][MID_IDX ][MID_IDX ]\r
+ + fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX ];\r
\r
//dxy\r
H[0][1]= 0.25f*\r
(fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX + 1] -\r
- fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX + 1] -\r
- fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX - 1] + \r
- fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX - 1]);\r
- \r
+ fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX + 1] -\r
+ fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX - 1] +\r
+ fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX - 1]);\r
+\r
//dxs\r
H[0][2]= 0.25f*\r
(fh_vals[MID_IDX + 1][MID_IDX + 1][MID_IDX ] -\r
- fh_vals[MID_IDX + 1][MID_IDX - 1][MID_IDX ] -\r
- fh_vals[MID_IDX - 1][MID_IDX + 1][MID_IDX ] + \r
- fh_vals[MID_IDX - 1][MID_IDX - 1][MID_IDX ]);\r
+ fh_vals[MID_IDX + 1][MID_IDX - 1][MID_IDX ] -\r
+ fh_vals[MID_IDX - 1][MID_IDX + 1][MID_IDX ] +\r
+ fh_vals[MID_IDX - 1][MID_IDX - 1][MID_IDX ]);\r
\r
//dys\r
H[1][2]= 0.25f*\r
(fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX + 1] -\r
- fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX - 1] -\r
- fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX + 1] + \r
- fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX - 1]);\r
+ fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX - 1] -\r
+ fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX + 1] +\r
+ fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX - 1]);\r
\r
//dyx = dxy\r
H[1][0] = H[0][1];\r
\r
//dx\r
dD[0] = 0.5f*(fh_vals[MID_IDX ][MID_IDX + 1][MID_IDX ] -\r
- fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX ]);\r
+ fh_vals[MID_IDX ][MID_IDX - 1][MID_IDX ]);\r
//dy\r
dD[1] = 0.5f*(fh_vals[MID_IDX ][MID_IDX ][MID_IDX + 1] -\r
- fh_vals[MID_IDX ][MID_IDX ][MID_IDX - 1]);\r
+ fh_vals[MID_IDX ][MID_IDX ][MID_IDX - 1]);\r
//ds\r
dD[2] = 0.5f*(fh_vals[MID_IDX + 1][MID_IDX ][MID_IDX ] -\r
- fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX ]);\r
+ fh_vals[MID_IDX - 1][MID_IDX ][MID_IDX ]);\r
\r
__shared__ float invdet;\r
invdet = 1.f /\r
x[2] = -(Hinv[2][0]*(dD[0]) + Hinv[2][1]*(dD[1]) + Hinv[2][2]*(dD[2]));\r
\r
if (fabs(x[0]) < 1.f && fabs(x[1]) < 1.f && fabs(x[2]) < 1.f) \r
- { \r
+ {\r
// if the step is within the interpolation region, perform it\r
- \r
- // Get a new feature index.\r
- unsigned int i = atomicInc(featureCounter, (unsigned int)-1);\r
\r
- if (i < c_max_features) \r
- { \r
- p.x = ((float)maxPosBuffer[blockIdx.x].x + x[1]) * (float)c_step + c_border;\r
- p.y = ((float)maxPosBuffer[blockIdx.x].y + x[0]) * (float)c_step + c_border;\r
+ // Get a new feature index.\r
+ unsigned int i = atomicInc(featureCounter, (unsigned int)-1);\r
+\r
+ if (i < c_max_features)\r
+ {\r
+ p.x = ((float)maxPosBuffer[blockIdx.x].x + x[1]) * (float)c_step + c_border;\r
+ p.y = ((float)maxPosBuffer[blockIdx.x].y + x[0]) * (float)c_step + c_border;\r
\r
- if (x[2] > 0)\r
- {\r
+ if (x[2] > 0)\r
+ {\r
float a = calcScale(maxPosBuffer[blockIdx.x].z);\r
float b = calcScale(maxPosBuffer[blockIdx.x].z + 1);\r
\r
- p.size = (1.f - x[2]) * a + x[2] * b;\r
- } \r
- else\r
- {\r
+ p.size = (1.f - x[2]) * a + x[2] * b;\r
+ }\r
+ else\r
+ {\r
float a = calcScale(maxPosBuffer[blockIdx.x].z);\r
float b = calcScale(maxPosBuffer[blockIdx.x].z - 1);\r
\r
- p.size = (1.f + x[2]) * a - x[2] * b;\r
- }\r
+ p.size = (1.f + x[2]) * a - x[2] * b;\r
+ }\r
+\r
+ p.octave = c_octave;\r
\r
- p.octave = c_octave;\r
- \r
- p.response = fh_vals[MID_IDX][MID_IDX][MID_IDX];\r
+ p.response = fh_vals[MID_IDX][MID_IDX][MID_IDX];\r
\r
- // Should we split up this transfer over many threads?\r
- featuresBuffer[i] = p;\r
- }\r
+ // Should we split up this transfer over many threads?\r
+ featuresBuffer[i] = p;\r
+ }\r
} // If the subpixel interpolation worked\r
} // If this is thread 0.\r
\r
threads.x = 3;\r
threads.y = 3;\r
threads.z = 3;\r
- \r
+\r
dim3 grid;\r
grid.x = maxCounter;\r
\r
DeviceReference<unsigned int> featureCounterWrapper(featureCounter);\r
- \r
+\r
fh_interp_extremum<<<grid, threads>>>(hessianBuffer, maxPosBuffer, featuresBuffer, featureCounterWrapper);\r
cudaSafeCall( cudaGetLastError() );\r
\r
}\r
\r
__shared__ float texLookups[17][17];\r
- \r
+\r
__shared__ float Edx[13*13];\r
__shared__ float Edy[13*13];\r
__shared__ float xys[3];\r
// Read my x, y, size.\r
if (tid < 3)\r
{\r
- xys[tid] = ((float*)(&features[blockIdx.x]))[tid];\r
+ xys[tid] = ((float*)(&features[blockIdx.x]))[tid];\r
}\r
__syncthreads();\r
\r
\r
float dx = 0.f;\r
float dy = 0.f;\r
- \r
- // Computes lookups for all points in a 13x13 lattice.\r
- // - SURF says to only use a circle, but the branching logic would slow it down\r
- // - Gaussian weighting should reduce the effects of the outer points anyway\r
- if (tid2 < 169)\r
\r
+ // Computes lookups for all points in a 13x13 lattice.\r
+ // - SURF says to only use a circle, but the branching logic would slow it down\r
+ // - Gaussian weighting should reduce the effects of the outer points anyway\r
+ if (tid2 < 169)\r
{\r
- dx -= texLookups[threadIdx.x ][threadIdx.y ];\r
- dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ];\r
- dx -= texLookups[threadIdx.x + 4][threadIdx.y ];\r
- dx += texLookups[threadIdx.x ][threadIdx.y + 4];\r
- dx -= 2.f*texLookups[threadIdx.x + 2][threadIdx.y + 4];\r
- dx += texLookups[threadIdx.x + 4][threadIdx.y + 4];\r
-\r
- dy -= texLookups[threadIdx.x ][threadIdx.y ];\r
- dy += 2.f*texLookups[threadIdx.x ][threadIdx.y + 2];\r
- dy -= texLookups[threadIdx.x ][threadIdx.y + 4];\r
- dy += texLookups[threadIdx.x + 4][threadIdx.y ];\r
- dy -= 2.f*texLookups[threadIdx.x + 4][threadIdx.y + 2];\r
- dy += texLookups[threadIdx.x + 4][threadIdx.y + 4];\r
-\r
- float g = c_gauss1D[threadIdx.x] * c_gauss1D[threadIdx.y];\r
-\r
- Edx[tid2] = dx * g;\r
- Edy[tid2] = dy * g;\r
+ dx -= texLookups[threadIdx.x ][threadIdx.y ];\r
+ dx += 2.f*texLookups[threadIdx.x + 2][threadIdx.y ];\r
+ dx -= texLookups[threadIdx.x + 4][threadIdx.y ];\r
+ dx += texLookups[threadIdx.x ][threadIdx.y + 4];\r
+ dx -= 2.f*texLookups[threadIdx.x + 2][threadIdx.y + 4];\r
+ dx += texLookups[threadIdx.x + 4][threadIdx.y + 4];\r
+\r
+ dy -= texLookups[threadIdx.x ][threadIdx.y ];\r
+ dy += 2.f*texLookups[threadIdx.x ][threadIdx.y + 2];\r
+ dy -= texLookups[threadIdx.x ][threadIdx.y + 4];\r
+ dy += texLookups[threadIdx.x + 4][threadIdx.y ];\r
+ dy -= 2.f*texLookups[threadIdx.x + 4][threadIdx.y + 2];\r
+ dy += texLookups[threadIdx.x + 4][threadIdx.y + 4];\r
+\r
+ float g = c_gauss1D[threadIdx.x] * c_gauss1D[threadIdx.y];\r
+\r
+ Edx[tid2] = dx * g;\r
+ Edy[tid2] = dy * g;\r
}\r
\r
__syncthreads();\r
// Gets 128-168\r
if (tid < 41)\r
{\r
- Edx[tid] += Edx[tid + 128]; \r
+ Edx[tid] += Edx[tid + 128];\r
} \r
- __syncthreads(); \r
- if (tid < 64) \r
+ __syncthreads();\r
+ if (tid < 64)\r
{\r
- Edx[tid] += Edx[tid + 64]; \r
- } \r
- __syncthreads(); \r
- if (tid < 32) \r
+ Edx[tid] += Edx[tid + 64];\r
+ }\r
+ __syncthreads();\r
+ if (tid < 32)\r
{\r
volatile float* smem = Edx;\r
\r
// Gets 128-168\r
if (tid < 41) \r
{\r
- Edy[tid] += Edy[tid + 128]; \r
- } \r
- __syncthreads(); \r
- if (tid < 64) \r
+ Edy[tid] += Edy[tid + 128];\r
+ }\r
+ __syncthreads();\r
+ if (tid < 64)\r
{\r
- Edy[tid] += Edy[tid + 64]; \r
- } \r
- __syncthreads(); \r
- if (tid < 32) \r
+ Edy[tid] += Edy[tid + 64];\r
+ }\r
+ __syncthreads();\r
+ if (tid < 32)\r
{\r
volatile float* smem = Edy;\r
\r
smem[tid] += smem[tid + 2];\r
smem[tid] += smem[tid + 1];\r
}\r
- \r
+\r
// Thread 0 saves back the result.\r
if (tid == 0)\r
{\r
- features[blockIdx.x].angle = -atan2(Edy[0], Edx[0]) * (180.0f / CV_PI);\r
+ features[blockIdx.x].angle = -atan2(Edy[0], Edx[0]) * (180.0f / CV_PI);\r
}\r
}\r
\r
\r
// precomputed values for a Gaussian with a standard deviation of 3.3
// (symmetric 20-tap 1D kernel; the 2D weight is the separable product)
// - it appears SURF uses a different value, but not sure what it is
// NOTE(review): unresolved diff markers were removed; the resolved
// definition is kept, values unchanged.
__constant__ float c_3p3gauss1D[20] =
{
    0.001917811039f, 0.004382549939f, 0.009136246641f, 0.017375153068f, 0.030144587513f,
    0.047710056854f, 0.068885910797f, 0.090734146446f, 0.109026229640f, 0.119511889092f,
    0.119511889092f, 0.109026229640f, 0.090734146446f, 0.068885910797f, 0.047710056854f,
    0.030144587513f, 0.017375153068f, 0.009136246641f, 0.004382549939f, 0.001917811039f
};
\r
template <int BLOCK_DIM_X>\r
__global__ void normalize_descriptors(PtrStepf descriptors)\r
if (BLOCK_DIM_X >= 128)\r
{\r
if (threadIdx.x < 64)\r
- sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];\r
+ sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];\r
__syncthreads();\r
}\r
\r
{\r
volatile float* smem = sqDesc;\r
\r
- smem[threadIdx.x] += smem[threadIdx.x + 32];\r
- smem[threadIdx.x] += smem[threadIdx.x + 16];\r
- smem[threadIdx.x] += smem[threadIdx.x + 8];\r
- smem[threadIdx.x] += smem[threadIdx.x + 4];\r
- smem[threadIdx.x] += smem[threadIdx.x + 2];\r
- smem[threadIdx.x] += smem[threadIdx.x + 1];\r
+ smem[threadIdx.x] += smem[threadIdx.x + 32];\r
+ smem[threadIdx.x] += smem[threadIdx.x + 16];\r
+ smem[threadIdx.x] += smem[threadIdx.x + 8];\r
+ smem[threadIdx.x] += smem[threadIdx.x + 4];\r
+ smem[threadIdx.x] += smem[threadIdx.x + 2];\r
+ smem[threadIdx.x] += smem[threadIdx.x + 1];\r
}\r
\r
// compute length (square root)\r
__shared__ float len;\r
if (threadIdx.x == 0)\r
{\r
- len = sqrtf(sqDesc[0]);\r
+ len = sqrtf(sqDesc[0]);\r
}\r
__syncthreads();\r
\r
// normalize and store in output\r
- descriptor_base[threadIdx.x] = lookup / len; \r
+ descriptor_base[threadIdx.x] = lookup / len;\r
}\r
\r
- __device__ void calc_dx_dy(float sdx[4][4][25], float sdy[4][4][25], const KeyPoint_GPU* features)\r
+ __device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const float* ipt,\r
+ int xIndex, int yIndex, int tid)\r
{\r
- // get the interest point parameters (x, y, size, response, angle)\r
- __shared__ float ipt[5];\r
- if (threadIdx.x < 5 && threadIdx.y == 0 && threadIdx.z == 0)\r
- {\r
- ipt[threadIdx.x] = ((float*)(&features[blockIdx.x]))[threadIdx.x];\r
- }\r
- __syncthreads();\r
-\r
float sin_theta, cos_theta;\r
sincosf(ipt[SF_ANGLE] * (CV_PI / 180.0f), &sin_theta, &cos_theta);\r
\r
- // Compute sampling points\r
- // since grids are 2D, need to compute xBlock and yBlock indices\r
- const int xIndex = threadIdx.y * 5 + threadIdx.x % 5;\r
- const int yIndex = threadIdx.z * 5 + threadIdx.x / 5;\r
-\r
// Compute rotated sampling points\r
// (clockwise rotation since we are rotating the lattice)\r
// (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel)\r
- const float sample_x = ipt[SF_X] + (cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE] \r
+ const float sample_x = ipt[SF_X] + (cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]\r
+ sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);\r
- const float sample_y = ipt[SF_Y] + (-sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE] \r
+ const float sample_y = ipt[SF_Y] + (-sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]\r
+ cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);\r
\r
// gather integral image lookups for Haar wavelets at each point (some lookups are shared between dx and dy)\r
- // a b c\r
- // d f\r
- // g h i\r
-\r
+ // a b c\r
+ // d f\r
+ // g h i\r
const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);\r
const float b = tex2D(sumTex, sample_x, sample_y - ipt[SF_SIZE]);\r
const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);\r
\r
// rotate responses (store all dxs then all dys)\r
// - counterclockwise rotation to rotate back to zero orientation\r
- sdx[threadIdx.z][threadIdx.y][threadIdx.x] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx\r
- sdy[threadIdx.z][threadIdx.y][threadIdx.x] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy\r
+ sdx_bin[tid] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx\r
+ sdy_bin[tid] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy\r
}\r
\r
// Loads the interest point parameters for this block into shared memory and
// maps the (threadIdx.x, threadIdx.y) layout — 25 sample threads x 16 sub-region
// bins — onto 20x20-lattice sample coordinates, then delegates to the low-level
// calc_dx_dy overload to compute the rotated Haar responses for this bin.
// Expects a dim3(25, 16, 1) block; one block per interest point (blockIdx.x).
// NOTE(review): unresolved diff markers (old reduce_sum body interleaved with
// this new overload) were removed; the resolved version is kept.
__device__ void calc_dx_dy(float* sdx_bin, float* sdy_bin, const KeyPoint_GPU* features)
{
    // get the interest point parameters (x, y, size, response, angle)
    __shared__ float ipt[5];
    if (threadIdx.x < 5 && threadIdx.y == 0)
    {
        ipt[threadIdx.x] = ((float*)(&features[blockIdx.x]))[threadIdx.x];
    }
    __syncthreads();

    // Compute sampling points
    // since grids are 2D, need to compute xBlock and yBlock indices
    const int xBlock = (threadIdx.y & 3);  // threadIdx.y % 4
    const int yBlock = (threadIdx.y >> 2); // floor(threadIdx.y / 4)
    const int xIndex = (xBlock * 5) + (threadIdx.x % 5);
    const int yIndex = (yBlock * 5) + (threadIdx.x / 5);

    calc_dx_dy(sdx_bin, sdy_bin, ipt, xIndex, yIndex, threadIdx.x);
}
\r
// Sums 25 per-sample values down to element [0] for four arrays at once.
// First folds elements 16..24 into 0..8, then does an unrolled 16-to-1
// reduction on the volatile pointers.
// NOTE(review): no barrier between the two steps or between unrolled stages —
// this relies on the 25 participating threads living in a single warp and on
// pre-Volta implicit warp synchrony (volatile forces the re-reads). On Volta+
// independent thread scheduling this needs __syncwarp() — confirm target arch.
// NOTE(review): unresolved diff markers were removed; the resolved (new)
// flat-pointer version is kept.
__device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2,
                             volatile float* sdata3, volatile float* sdata4, int tid)
{
    // first step is to reduce from 25 to 16
    if (tid < 9) // use 9 threads
    {
        sdata1[tid] += sdata1[tid + 16];
        sdata2[tid] += sdata2[tid + 16];
        sdata3[tid] += sdata3[tid + 16];
        sdata4[tid] += sdata4[tid + 16];
    }

    // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
    if (tid < 16)
    {
        sdata1[tid] += sdata1[tid + 8];
        sdata1[tid] += sdata1[tid + 4];
        sdata1[tid] += sdata1[tid + 2];
        sdata1[tid] += sdata1[tid + 1];

        sdata2[tid] += sdata2[tid + 8];
        sdata2[tid] += sdata2[tid + 4];
        sdata2[tid] += sdata2[tid + 2];
        sdata2[tid] += sdata2[tid + 1];

        sdata3[tid] += sdata3[tid + 8];
        sdata3[tid] += sdata3[tid + 4];
        sdata3[tid] += sdata3[tid + 2];
        sdata3[tid] += sdata3[tid + 1];

        sdata4[tid] += sdata4[tid + 8];
        sdata4[tid] += sdata4[tid + 4];
        sdata4[tid] += sdata4[tid + 2];
        sdata4[tid] += sdata4[tid + 1];
    }
}
\r
// Spawn one block (25x16 threads) per interest point
// - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location
// Computes the unnormalized 64-dimensional SURF descriptor for one interest
// point per block and writes it to row blockIdx.x of `descriptors`.
// Expects a dim3(25, 16, 1) block: threadIdx.x = sample within the 5x5
// sub-region, threadIdx.y = sub-region bin (16 bins x 4 values = 64 floats).
// NOTE(review): unresolved diff markers were removed from this body;
// the resolved (new) version is kept.
__global__ void compute_descriptors64(PtrStepf descriptors, const KeyPoint_GPU* features)
{
    // 2 floats (dx, dy) for each thread (5x5 sample points in each sub-region)
    __shared__ float sdx [16 * 25];
    __shared__ float sdy [16 * 25];
    __shared__ float sdxabs[16 * 25];
    __shared__ float sdyabs[16 * 25];

    // Staging buffer for the assembled descriptor before the coalesced write
    __shared__ float sdesc[64];

    // Per-bin views into the flat shared arrays
    float* sdx_bin = sdx + (threadIdx.y * 25);
    float* sdy_bin = sdy + (threadIdx.y * 25);
    float* sdxabs_bin = sdxabs + (threadIdx.y * 25);
    float* sdyabs_bin = sdyabs + (threadIdx.y * 25);

    calc_dx_dy(sdx_bin, sdy_bin, features);
    __syncthreads();

    sdxabs_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]); // |dx| array
    sdyabs_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]); // |dy| array
    __syncthreads();

    reduce_sum25(sdx_bin, sdy_bin, sdxabs_bin, sdyabs_bin, threadIdx.x);
    __syncthreads();

    // 4 descriptor values per bin
    float* sdesc_bin = sdesc + (threadIdx.y << 2);

    // write dx, dy, |dx|, |dy|
    if (threadIdx.x == 0)
    {
        sdesc_bin[0] = sdx_bin[0];
        sdesc_bin[1] = sdy_bin[0];
        sdesc_bin[2] = sdxabs_bin[0];
        sdesc_bin[3] = sdyabs_bin[0];
    }
    __syncthreads();

    // Coalesced write of the whole descriptor by the first 64 threads
    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
    if (tid < 64)
        descriptors.ptr(blockIdx.x)[tid] = sdesc[tid];
}
\r
// Spawn one block (25x16 threads) per interest point
// - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location
// Computes the unnormalized 128-dimensional (extended) SURF descriptor for one
// interest point per block and writes it to row blockIdx.x of `descriptors`.
// Expects a dim3(25, 16, 1) block: threadIdx.x = sample within the 5x5
// sub-region, threadIdx.y = sub-region bin (16 bins x 8 values = 128 floats).
// dx responses are split by the sign of dy (and vice versa) before reduction.
// NOTE(review): unresolved diff markers were removed from this body. Also
// fixes a bug in the residue: sdabs2 used fabs(sdx[threadIdx.x]) — the base
// array instead of the per-bin pointer — which read bin 0's data for every
// other bin; it now uses sdx_bin like the matching branch above.
__global__ void compute_descriptors128(PtrStepf descriptors, const KeyPoint_GPU* features)
{
    // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
    __shared__ float sdx[16 * 25];
    __shared__ float sdy[16 * 25];

    // sum (reduce) 5x5 area response
    __shared__ float sd1[16 * 25];
    __shared__ float sd2[16 * 25];
    __shared__ float sdabs1[16 * 25];
    __shared__ float sdabs2[16 * 25];

    // Staging buffer for the assembled descriptor before the coalesced write
    __shared__ float sdesc[128];

    // Per-bin views into the flat shared arrays
    float* sdx_bin = sdx + (threadIdx.y * 25);
    float* sdy_bin = sdy + (threadIdx.y * 25);
    float* sd1_bin = sd1 + (threadIdx.y * 25);
    float* sd2_bin = sd2 + (threadIdx.y * 25);
    float* sdabs1_bin = sdabs1 + (threadIdx.y * 25);
    float* sdabs2_bin = sdabs2 + (threadIdx.y * 25);

    calc_dx_dy(sdx_bin, sdy_bin, features);
    __syncthreads();

    // Split dx by the sign of dy
    if (sdy_bin[threadIdx.x] >= 0)
    {
        sd1_bin[threadIdx.x] = sdx_bin[threadIdx.x];
        sdabs1_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]);
        sd2_bin[threadIdx.x] = 0;
        sdabs2_bin[threadIdx.x] = 0;
    }
    else
    {
        sd1_bin[threadIdx.x] = 0;
        sdabs1_bin[threadIdx.x] = 0;
        sd2_bin[threadIdx.x] = sdx_bin[threadIdx.x];
        sdabs2_bin[threadIdx.x] = fabs(sdx_bin[threadIdx.x]); // was fabs(sdx[threadIdx.x]): wrong bin
    }
    __syncthreads();

    reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x);
    __syncthreads();

    // 8 descriptor values per bin
    float* sdesc_bin = sdesc + (threadIdx.y << 3);

    // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
    if (threadIdx.x == 0)
    {
        sdesc_bin[0] = sd1_bin[0];
        sdesc_bin[1] = sdabs1_bin[0];
        sdesc_bin[2] = sd2_bin[0];
        sdesc_bin[3] = sdabs2_bin[0];
    }
    __syncthreads();

    // Split dy by the sign of dx (reusing the reduction buffers)
    if (sdx_bin[threadIdx.x] >= 0)
    {
        sd1_bin[threadIdx.x] = sdy_bin[threadIdx.x];
        sdabs1_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]);
        sd2_bin[threadIdx.x] = 0;
        sdabs2_bin[threadIdx.x] = 0;
    }
    else
    {
        sd1_bin[threadIdx.x] = 0;
        sdabs1_bin[threadIdx.x] = 0;
        sd2_bin[threadIdx.x] = sdy_bin[threadIdx.x];
        sdabs2_bin[threadIdx.x] = fabs(sdy_bin[threadIdx.x]);
    }
    __syncthreads();

    reduce_sum25(sd1_bin, sd2_bin, sdabs1_bin, sdabs2_bin, threadIdx.x);
    __syncthreads();

    // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
    if (threadIdx.x == 0)
    {
        sdesc_bin[4] = sd1_bin[0];
        sdesc_bin[5] = sdabs1_bin[0];
        sdesc_bin[6] = sd2_bin[0];
        sdesc_bin[7] = sdabs2_bin[0];
    }
    __syncthreads();

    // Coalesced write of the whole descriptor by the first 128 threads
    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
    if (tid < 128)
        descriptors.ptr(blockIdx.x)[tid] = sdesc[tid];
}
\r
void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)\r
\r
if (descriptors.cols == 64)\r
{\r
- compute_descriptors64<<<dim3(nFeatures, 1, 1), dim3(25, 4, 4)>>>(descriptors, features);\r
+ compute_descriptors64<<<dim3(nFeatures, 1, 1), dim3(25, 16, 1)>>>(descriptors, features);\r
cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
else\r
{\r
- compute_descriptors128<<<dim3(nFeatures, 1, 1), dim3(25, 4, 4)>>>(descriptors, features);\r
+ compute_descriptors128<<<dim3(nFeatures, 1, 1), dim3(25, 16, 1)>>>(descriptors, features);\r
cudaSafeCall( cudaGetLastError() );\r
\r
cudaSafeCall( cudaThreadSynchronize() );\r
}\r
__syncthreads();\r
\r
- float sin_theta, cos_theta;\r
- sincosf(ipt[SF_ANGLE] * (CV_PI / 180.0f), &sin_theta, &cos_theta);\r
-\r
// Compute sampling points\r
// since grids are 2D, need to compute xBlock and yBlock indices\r
- const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4\r
+ const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4\r
const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)\r
const int xIndex = xBlock * blockDim.x + threadIdx.x;\r
const int yIndex = yBlock * blockDim.y + threadIdx.y;\r
\r
- // Compute rotated sampling points\r
- // (clockwise rotation since we are rotating the lattice)\r
- // (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel)\r
- const float sample_x = ipt[SF_X] + (cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE] \r
- + sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);\r
- const float sample_y = ipt[SF_Y] + (-sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE] \r
- + cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);\r
-\r
- // gather integral image lookups for Haar wavelets at each point (some lookups are shared between dx and dy)\r
- // a b c\r
- // d f\r
- // g h i\r
- const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);\r
- const float b = tex2D(sumTex, sample_x, sample_y - ipt[SF_SIZE]);\r
- const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);\r
- const float d = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y);\r
- const float f = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y);\r
- const float g = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);\r
- const float h = tex2D(sumTex, sample_x, sample_y + ipt[SF_SIZE]);\r
- const float i = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y + ipt[SF_SIZE]); \r
-\r
- // compute axis-aligned HaarX, HaarY\r
- // (could group the additions together into multiplications)\r
- const float gauss = c_3p3gauss1D[xIndex] * c_3p3gauss1D[yIndex]; // separable because independent (circular)\r
- const float aa_dx = gauss * (-(a-b-g+h) + (b-c-h+i)); // unrotated dx\r
- const float aa_dy = gauss * (-(a-c-d+f) + (d-f-g+i)); // unrotated dy\r
-\r
- // rotate responses (store all dxs then all dys)\r
- // - counterclockwise rotation to rotate back to zero orientation\r
- sdx[tid] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx\r
- sdy[tid] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy\r
- }\r
-\r
- __device__ void reduce_sum_old(float sdata[25], int tid)\r
- {\r
- // first step is to reduce from 25 to 16\r
- if (tid < 9) // use 9 threads\r
- sdata[tid] += sdata[tid + 16];\r
- __syncthreads();\r
-\r
- // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)\r
- if (tid < 16)\r
- {\r
- volatile float* smem = sdata;\r
-\r
- smem[tid] += smem[tid + 8];\r
- smem[tid] += smem[tid + 4];\r
- smem[tid] += smem[tid + 2];\r
- smem[tid] += smem[tid + 1];\r
- }\r
+ calc_dx_dy(sdx, sdy, ipt, xIndex, yIndex, tid);\r
}\r
\r
// Spawn 16 blocks per interest point
// - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location
// Expected launch: grid (nFeatures, 16), block of 25 threads (5x5 sample lattice per sub-region).
__global__ void compute_descriptors64_old(PtrStepf descriptors, const KeyPoint_GPU* features)
{
    // Per-sample Haar responses for this sub-region's 5x5 lattice,
    // plus their absolute values (one float per thread).
    __shared__ float s_dx[25];
    __shared__ float s_dy[25];
    __shared__ float s_abs_dx[25];
    __shared__ float s_abs_dy[25];

    // Flat thread index over the 5x5 sample lattice.
    const int tid = threadIdx.y * blockDim.x + threadIdx.x;

    // Fill s_dx/s_dy with the (rotated) Haar wavelet responses.
    calc_dx_dy_old(s_dx, s_dy, features, tid);
    __syncthreads();

    s_abs_dx[tid] = fabs(s_dx[tid]); // |dx|
    s_abs_dy[tid] = fabs(s_dy[tid]); // |dy|
    __syncthreads();

    // Reduce all four 25-element arrays so element 0 holds each sum.
    reduce_sum25(s_dx, s_dy, s_abs_dx, s_abs_dy, tid);
    __syncthreads();

    // A single thread writes the four sums (dx, dy, |dx|, |dy|)
    // into this sub-region's slot of the 64-dim descriptor.
    if (tid == 0)
    {
        float* desc_out = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);

        desc_out[0] = s_dx[0];
        desc_out[1] = s_dy[0];
        desc_out[2] = s_abs_dx[0];
        desc_out[3] = s_abs_dy[0];
    }
}
\r
// - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location\r
__global__ void compute_descriptors128_old(PtrStepf descriptors, const KeyPoint_GPU* features)\r
{\r
- float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);\r
-\r
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
- \r
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)\r
- __shared__ float sdx[25]; \r
+ __shared__ float sdx[25];\r
__shared__ float sdy[25];\r
- \r
- calc_dx_dy_old(sdx, sdy, features, tid);\r
- __syncthreads();\r
\r
// sum (reduce) 5x5 area response\r
__shared__ float sd1[25];\r
__shared__ float sd2[25];\r
- __shared__ float sdabs1[25]; \r
+ __shared__ float sdabs1[25];\r
__shared__ float sdabs2[25];\r
\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+\r
+ calc_dx_dy_old(sdx, sdy, features, tid);\r
+ __syncthreads();\r
+\r
if (sdy[tid] >= 0)\r
{\r
sd1[tid] = sdx[tid];\r
sdabs2[tid] = fabs(sdx[tid]);\r
}\r
__syncthreads();\r
- \r
- reduce_sum_old(sd1, tid);\r
- reduce_sum_old(sd2, tid);\r
- reduce_sum_old(sdabs1, tid);\r
- reduce_sum_old(sdabs2, tid);\r
+\r
+ reduce_sum25(sd1, sd1, sdabs1, sdabs2, tid);\r
+ __syncthreads();\r
+\r
+ float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);\r
\r
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)\r
if (tid == 0)\r
sdabs2[tid] = fabs(sdy[tid]);\r
}\r
__syncthreads();\r
- \r
- reduce_sum_old(sd1, tid);\r
- reduce_sum_old(sd2, tid);\r
- reduce_sum_old(sdabs1, tid);\r
- reduce_sum_old(sdabs2, tid);\r
+\r
+ reduce_sum25(sd1, sd1, sdabs1, sdabs2, tid);\r
+ __syncthreads();\r
\r
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)\r
if (tid == 0)\r