\r
const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
\r
- sdxabs[tid] = fabs(sdx[tid]); // |dx| array\r
- sdyabs[tid] = fabs(sdy[tid]); // |dy| array\r
- __syncthreads();\r
+ if (tid < 25)\r
+ {\r
+ sdxabs[tid] = fabs(sdx[tid]); // |dx| array\r
+ sdyabs[tid] = fabs(sdy[tid]); // |dy| array\r
+ __syncthreads();\r
\r
- reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);\r
- __syncthreads();\r
+ reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);\r
+ __syncthreads();\r
\r
- float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);\r
+ float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);\r
\r
- // write dx, dy, |dx|, |dy|\r
- if (tid == 0)\r
- {\r
- descriptors_block[0] = sdx[0];\r
- descriptors_block[1] = sdy[0];\r
- descriptors_block[2] = sdxabs[0];\r
- descriptors_block[3] = sdyabs[0];\r
+ // write dx, dy, |dx|, |dy|\r
+ if (tid == 0)\r
+ {\r
+ descriptors_block[0] = sdx[0];\r
+ descriptors_block[1] = sdy[0];\r
+ descriptors_block[2] = sdxabs[0];\r
+ descriptors_block[3] = sdyabs[0];\r
+ }\r
}\r
}\r
\r
\r
const int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
\r
- if (sdy[tid] >= 0)\r
- {\r
- sd1[tid] = sdx[tid];\r
- sdabs1[tid] = fabs(sdx[tid]);\r
- sd2[tid] = 0;\r
- sdabs2[tid] = 0;\r
- }\r
- else\r
+ if (tid < 25)\r
{\r
- sd1[tid] = 0;\r
- sdabs1[tid] = 0;\r
- sd2[tid] = sdx[tid];\r
- sdabs2[tid] = fabs(sdx[tid]);\r
- }\r
- __syncthreads();\r
+ if (sdy[tid] >= 0)\r
+ {\r
+ sd1[tid] = sdx[tid];\r
+ sdabs1[tid] = fabs(sdx[tid]);\r
+ sd2[tid] = 0;\r
+ sdabs2[tid] = 0;\r
+ }\r
+ else\r
+ {\r
+ sd1[tid] = 0;\r
+ sdabs1[tid] = 0;\r
+ sd2[tid] = sdx[tid];\r
+ sdabs2[tid] = fabs(sdx[tid]);\r
+ }\r
+ __syncthreads();\r
\r
- reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);\r
- __syncthreads();\r
+ reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);\r
+ __syncthreads();\r
\r
- float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);\r
+ float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);\r
\r
- // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)\r
- if (tid == 0)\r
- {\r
- descriptors_block[0] = sd1[0];\r
- descriptors_block[1] = sdabs1[0];\r
- descriptors_block[2] = sd2[0];\r
- descriptors_block[3] = sdabs2[0];\r
- }\r
- __syncthreads();\r
+ // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)\r
+ if (tid == 0)\r
+ {\r
+ descriptors_block[0] = sd1[0];\r
+ descriptors_block[1] = sdabs1[0];\r
+ descriptors_block[2] = sd2[0];\r
+ descriptors_block[3] = sdabs2[0];\r
+ }\r
+ __syncthreads();\r
\r
- if (sdx[tid] >= 0)\r
- {\r
- sd1[tid] = sdy[tid];\r
- sdabs1[tid] = fabs(sdy[tid]);\r
- sd2[tid] = 0;\r
- sdabs2[tid] = 0;\r
- }\r
- else\r
- {\r
- sd1[tid] = 0;\r
- sdabs1[tid] = 0;\r
- sd2[tid] = sdy[tid];\r
- sdabs2[tid] = fabs(sdy[tid]);\r
- }\r
- __syncthreads();\r
+ if (sdx[tid] >= 0)\r
+ {\r
+ sd1[tid] = sdy[tid];\r
+ sdabs1[tid] = fabs(sdy[tid]);\r
+ sd2[tid] = 0;\r
+ sdabs2[tid] = 0;\r
+ }\r
+ else\r
+ {\r
+ sd1[tid] = 0;\r
+ sdabs1[tid] = 0;\r
+ sd2[tid] = sdy[tid];\r
+ sdabs2[tid] = fabs(sdy[tid]);\r
+ }\r
+ __syncthreads();\r
\r
- reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);\r
- __syncthreads();\r
+ reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);\r
+ __syncthreads();\r
\r
- // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)\r
- if (tid == 0)\r
- {\r
- descriptors_block[4] = sd1[0];\r
- descriptors_block[5] = sdabs1[0];\r
- descriptors_block[6] = sd2[0];\r
- descriptors_block[7] = sdabs2[0];\r
+ // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)\r
+ if (tid == 0)\r
+ {\r
+ descriptors_block[4] = sd1[0];\r
+ descriptors_block[5] = sdabs1[0];\r
+ descriptors_block[6] = sd2[0];\r
+ descriptors_block[7] = sdabs2[0];\r
+ }\r
}\r
}\r
\r