\r
\r
__global__ void matchTemplatePreparedKernel_8U_SQDIFF(\r
- int w, int h, const PtrStepf image_sumsq, float templ_sumsq,\r
+ int w, int h, const PtrStep_<unsigned long long> image_sqsum, float templ_sqsum,\r
DevMem2Df result)\r
{\r
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

if (x < result.cols && y < result.rows)
{\r
- float image_sq = image_sumsq.ptr(y + h)[x + w] \r
- - image_sumsq.ptr(y)[x + w]\r
- - image_sumsq.ptr(y + h)[x]\r
- + image_sumsq.ptr(y)[x];\r
+ float image_sq = (float)(\r
+ (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -\r
+ (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));\r
float ccorr = result.ptr(y)[x];\r
- result.ptr(y)[x] = image_sq - 2.f * ccorr + templ_sumsq;\r
+ result.ptr(y)[x] = image_sq - 2.f * ccorr + templ_sqsum;\r
}\r
}\r
\r
\r
void matchTemplatePrepared_8U_SQDIFF(\r
- int w, int h, const DevMem2Df image_sumsq, float templ_sumsq,\r
+ int w, int h, const DevMem2D_<unsigned long long> image_sqsum, float templ_sqsum,\r
DevMem2Df result)\r
{\r
dim3 threads(32, 8);\r
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
matchTemplatePreparedKernel_8U_SQDIFF<<<grid, threads>>>(\r
- w, h, image_sumsq, templ_sumsq, result);\r
+ w, h, image_sqsum, templ_sqsum, result);\r
+ cudaSafeCall(cudaThreadSynchronize());\r
+}\r
+\r
+\r
+__global__ void matchTemplatePreparedKernel_8U_SQDIFF_NORMED(\r
+ int w, int h, const PtrStep_<unsigned long long> image_sqsum, float templ_sqsum,\r
+ DevMem2Df result)\r
+{\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if (x < result.cols && y < result.rows)\r
+ {\r
+ float image_sq = (float)(\r
+ (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -\r
+ (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));\r
+ float ccorr = result.ptr(y)[x];\r
+ result.ptr(y)[x] = (image_sq - 2.f * ccorr + templ_sqsum) * \r
+ rsqrtf(image_sq * templ_sqsum);\r
+ }\r
+}\r
+\r
+\r
+void matchTemplatePrepared_8U_SQDIFF_NORMED(\r
+ int w, int h, const DevMem2D_<unsigned long long> image_sqsum, float templ_sqsum,\r
+ DevMem2Df result)\r
+{\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+ matchTemplatePreparedKernel_8U_SQDIFF_NORMED<<<grid, threads>>>(\r
+ w, h, image_sqsum, templ_sqsum, result);\r
+ cudaSafeCall(cudaThreadSynchronize());\r
+}\r
+\r
+\r
+__global__ void normalizeKernel_8U(int w, int h, const PtrStep_<unsigned long long> image_sqsum, \r
+ float templ_sqsum, DevMem2Df result)\r
+{\r
+ const int x = blockIdx.x * blockDim.x + threadIdx.x;\r
+ const int y = blockIdx.y * blockDim.y + threadIdx.y;\r
+\r
+ if (x < result.cols && y < result.rows)\r
+ {\r
+ float image_sq = (float)(\r
+ (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -\r
+ (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));\r
+ result.ptr(y)[x] *= rsqrtf(image_sq * templ_sqsum);\r
+ }\r
+}\r
+\r
+\r
+void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, \r
+ float templ_sqsum, DevMem2Df result)\r
+{\r
+ dim3 threads(32, 8);\r
+ dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));\r
+ normalizeKernel_8U<<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);\r
cudaSafeCall(cudaThreadSynchronize());\r
}\r
\r