__constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};\r
__constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};\r
\r
+ __device__ void reduceSum32(volatile float* v_sum, float& sum)\r
+ {\r
+ v_sum[threadIdx.x] = sum;\r
+\r
+ if (threadIdx.x < 16)\r
+ {\r
+ v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 16];\r
+ v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 8];\r
+ v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 4];\r
+ v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 2];\r
+ v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 1];\r
+ }\r
+ }\r
+\r
__global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)\r
{ \r
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
__shared__ float s_Y[128];\r
__shared__ float s_angle[128];\r
\r
- __shared__ float s_sumx[64 * 4];\r
- __shared__ float s_sumy[64 * 4];\r
+ __shared__ float s_sum[32 * 4];\r
\r
/* The sampling intervals and wavelet sized for selecting an orientation\r
and building the keypoint descriptor are defined relative to 's' */\r
if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)\r
{\r
// Calc X, Y, angle and store it to shared memory\r
- {\r
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
\r
- float X = 0.0f, Y = 0.0f, angle = 0.0f;\r
+ float X = 0.0f, Y = 0.0f, angle = 0.0f;\r
\r
- if (tid < ORI_SAMPLES)\r
- {\r
- const float margin = (float)(grad_wav_size - 1) / 2.0f;\r
- const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);\r
- const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);\r
+ if (tid < ORI_SAMPLES)\r
+ {\r
+ const float margin = (float)(grad_wav_size - 1) / 2.0f;\r
+ const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);\r
+ const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);\r
\r
- if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))\r
- {\r
- X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);\r
- Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);\r
- \r
- angle = atan2f(Y, X);\r
- if (angle < 0)\r
- angle += 2.0f * CV_PI;\r
- angle *= 180.0f / CV_PI;\r
- }\r
- }\r
- if (tid < 128)\r
+ if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))\r
{\r
- s_X[tid] = X;\r
- s_Y[tid] = Y;\r
- s_angle[tid] = angle;\r
+ X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);\r
+ Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);\r
+ \r
+ angle = atan2f(Y, X);\r
+ if (angle < 0)\r
+ angle += 2.0f * CV_PI;\r
+ angle *= 180.0f / CV_PI;\r
}\r
}\r
+ s_X[tid] = X;\r
+ s_Y[tid] = Y;\r
+ s_angle[tid] = angle;\r
__syncthreads();\r
\r
float bestx = 0, besty = 0, best_mod = 0;\r
sumx = s_X[threadIdx.x];\r
sumy = s_Y[threadIdx.x];\r
}\r
+ d = abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);\r
+ if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
+ {\r
+ sumx += s_X[threadIdx.x + 32];\r
+ sumy += s_Y[threadIdx.x + 32];\r
+ }\r
d = abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);\r
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
{\r
sumx += s_X[threadIdx.x + 64];\r
sumy += s_Y[threadIdx.x + 64];\r
}\r
-\r
- float* s_sumx_row = s_sumx + threadIdx.y * 64;\r
- float* s_sumy_row = s_sumy + threadIdx.y * 64;\r
-\r
- s_sumx_row[threadIdx.x] = sumx;\r
- s_sumy_row[threadIdx.x] = sumy;\r
- __syncthreads();\r
-\r
- if (threadIdx.x < 32)\r
+ d = abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);\r
+ if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)\r
{\r
- volatile float* v_sumx_row = s_sumx_row;\r
- volatile float* v_sumy_row = s_sumy_row;\r
-\r
- v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 32];\r
- v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 32];\r
-\r
- v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 16];\r
- v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 16];\r
-\r
- v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 8];\r
- v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 8];\r
-\r
- v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 4];\r
- v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 4];\r
+ sumx += s_X[threadIdx.x + 96];\r
+ sumy += s_Y[threadIdx.x + 96];\r
+ }\r
\r
- v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 2];\r
- v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 2];\r
+ float* s_sum_row = s_sum + threadIdx.y * 32;\r
\r
- v_sumx_row[threadIdx.x] = sumx += v_sumx_row[threadIdx.x + 1];\r
- v_sumy_row[threadIdx.x] = sumy += v_sumy_row[threadIdx.x + 1];\r
- }\r
+ reduceSum32(s_sum_row, sumx);\r
+ reduceSum32(s_sum_row, sumy);\r
\r
const float temp_mod = sumx * sumx + sumy * sumy;\r
if (temp_mod > best_mod)\r
bestx = sumx;\r
besty = sumy;\r
}\r
- __syncthreads();\r
}\r
\r
if (threadIdx.x == 0)\r
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures) \r
{\r
dim3 threads;\r
- threads.x = 64;\r
+ threads.x = 32;\r
threads.y = 4;\r
\r
dim3 grid;\r
} \r
\r
__device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], \r
- const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, \r
- int tid)\r
+ const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)\r
{\r
__shared__ float s_PATCH[6][6];\r
\r
\r
if (threadIdx.x < 5 && threadIdx.y < 5)\r
{\r
- tid = threadIdx.y * 5 + threadIdx.x;\r
+ const int tid = threadIdx.y * 5 + threadIdx.x;\r
\r
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];\r
\r
__shared__ float sdxabs[25];\r
__shared__ float sdyabs[25];\r
\r
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
-\r
- calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir, tid);\r
+ calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);\r
__syncthreads();\r
\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+\r
sdxabs[tid] = fabs(sdx[tid]); // |dx| array\r
sdyabs[tid] = fabs(sdy[tid]); // |dy| array\r
__syncthreads();\r
__shared__ float sdabs1[25];\r
__shared__ float sdabs2[25];\r
\r
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
-\r
- calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir, tid);\r
+ calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);\r
__syncthreads();\r
\r
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+\r
if (sdy[tid] >= 0)\r
{\r
sd1[tid] = sdx[tid];\r