From: yao Date: Tue, 26 Mar 2013 06:10:29 +0000 (+0800) Subject: more fix of mismatch X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~1314^2~1399^2~1 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f36db3a037cae9f495bb7ca0fc0d17a4ba0726fb;p=platform%2Fupstream%2Fopencv.git more fix of mismatch --- diff --git a/modules/ocl/src/match_template.cpp b/modules/ocl/src/match_template.cpp index ab867d4..1f76d63 100644 --- a/modules/ocl/src/match_template.cpp +++ b/modules/ocl/src/match_template.cpp @@ -71,6 +71,9 @@ namespace cv void matchTemplate_SQDIFF_NORMED( const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf); + void convolve_32F( + const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf); + void matchTemplate_CCORR( const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf); @@ -90,41 +93,65 @@ namespace cv void matchTemplateNaive_CCORR( const oclMat &image, const oclMat &templ, oclMat &result, int cn); + void extractFirstChannel_32F( + const oclMat &image, oclMat &result); + // Evaluates optimal template's area threshold. If // template's area is less than the threshold, we use naive match // template version, otherwise FFT-based (if available) - static int getTemplateThreshold(int method, int depth) + static bool useNaive(int , int , Size ) { - switch (method) - { - case CV_TM_CCORR: - if (depth == CV_32F) return 250; - if (depth == CV_8U) return 300; - break; - case CV_TM_SQDIFF: - if (depth == CV_32F) return 0x7fffffff; // do naive SQDIFF for CV_32F - if (depth == CV_8U) return 300; - break; - } - CV_Error(CV_StsBadArg, "getTemplateThreshold: unsupported match template mode"); - return 0; + // FIXME! + // always use naive until convolve is imported + return true; } ////////////////////////////////////////////////////////////////////// // SQDIFF void matchTemplate_SQDIFF( - const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &) + const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf & buf) { result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); - if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth())) + if (useNaive(CV_TM_SQDIFF, image.depth(), templ.size())) { matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels()); return; } else { - // TODO - CV_Error(CV_StsBadArg, "Not supported yet for this size template"); + buf.image_sqsums.resize(1); + + // TODO, add double support for ocl::integral + // use CPU integral temporarily + Mat sums, sqsums; + cv::integral(Mat(image.reshape(1)), sums, sqsums); + buf.image_sqsums[0] = sqsums; + + unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0]; + matchTemplate_CCORR(image, templ, result, buf); + + //port CUDA's matchTemplatePrepared_SQDIFF_8U + Context *clCxt = image.clCxt; + string kernelName = "matchTemplate_Prepared_SQDIFF"; + vector< pair > args; + + args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data)); + args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data)); + args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows)); + args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols)); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); + + size_t globalThreads[3] = {result.cols, result.rows, 1}; + size_t localThreads[3] = {16, 16, 1}; + + const char * build_opt = image.oclchannels() == 4 ? "-D CN4" : ""; + openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U, build_opt); } } @@ -134,7 +161,6 @@ namespace cv matchTemplate_CCORR(image, templ, result, buf); buf.image_sums.resize(1); - integral(image.reshape(1), buf.image_sums[0]); unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0]; @@ -156,7 +182,7 @@ namespace cv args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); size_t globalThreads[3] = {result.cols, result.rows, 1}; - size_t localThreads[3] = {32, 8, 1}; + size_t localThreads[3] = {16, 16, 1}; openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U); } @@ -191,33 +217,39 @@ namespace cv args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); size_t globalThreads[3] = {result.cols, result.rows, 1}; - size_t localThreads[3] = {32, 8, 1}; + size_t localThreads[3] = {16, 16, 1}; openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth()); } ////////////////////////////////////////////////////////////////////// // CCORR + void convolve_32F( + const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &) + { + CV_Error(-1, "convolve is not fully implemented yet"); + } + void matchTemplate_CCORR( const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf) { result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); - if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth())) + if (useNaive(CV_TM_CCORR, image.depth(), templ.size())) { matchTemplateNaive_CCORR(image, templ, result, image.oclchannels()); return; } else { - CV_Error(CV_StsBadArg, "Not supported yet for this size template"); if(image.depth() == CV_8U && templ.depth() == CV_8U) { image.convertTo(buf.imagef, CV_32F); templ.convertTo(buf.templf, CV_32F); + convolve_32F(buf.imagef, buf.templf, result, buf); + } + else + { + convolve_32F(image, templ, result, buf); } - CV_Assert(image.oclchannels() == 1); - oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.oclchannels())); - filter2D(buf.imagef, o_result, CV_32F, buf.templf, Point(0, 0)); - result = o_result(Rect(0, 0, image.rows - templ.rows + 1, image.cols - templ.cols + 1)); } } @@ -249,7 +281,7 @@ namespace cv args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); size_t globalThreads[3] = {result.cols, result.rows, 1}; - size_t localThreads[3] = {32, 8, 1}; + size_t localThreads[3] = {16, 16, 1}; openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U); } @@ -284,7 +316,7 @@ namespace cv args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); size_t globalThreads[3] = {result.cols, result.rows, 1}; - size_t localThreads[3] = {32, 8, 1}; + size_t localThreads[3] = {16, 16, 1}; openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth()); } ////////////////////////////////////////////////////////////////////// @@ -301,7 +333,7 @@ namespace cv kernelName = "matchTemplate_Prepared_CCOFF"; size_t globalThreads[3] = {result.cols, result.rows, 1}; - size_t localThreads[3] = {32, 8, 1}; + size_t localThreads[3] = {16, 16, 1}; vector< pair > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) ); @@ -313,22 +345,22 @@ namespace cv args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) ); args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset)); args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); + Vec4f templ_sum = Vec4f::all(0); // to be continued in the following section if(image.oclchannels() == 1) { buf.image_sums.resize(1); integral(image, buf.image_sums[0]); - float templ_sum = 0; - templ_sum = (float)sum(templ)[0] / templ.size().area(); + templ_sum[0] = (float)sum(templ)[0] / templ.size().area(); args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data) ); args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset) ); args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step) ); - args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum) ); + args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) ); } else { - Vec4f templ_sum = Vec4f::all(0); + split(image, buf.images); templ_sum = sum(templ) / templ.size().area(); buf.image_sums.resize(buf.images.size()); @@ -374,7 +406,7 @@ namespace cv kernelName = "matchTemplate_Prepared_CCOFF_NORMED"; size_t globalThreads[3] = {result.cols, result.rows, 1}; - size_t localThreads[3] = {32, 8, 1}; + size_t localThreads[3] = {16, 16, 1}; vector< pair > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) ); @@ -387,20 +419,22 @@ namespace cv args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset)); args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); args.push_back( make_pair( sizeof(cl_float), (void *)&scale) ); + + Vec4f templ_sum = Vec4f::all(0); + Vec4f templ_sqsum = Vec4f::all(0); // to be continued in the following section if(image.oclchannels() == 1) { buf.image_sums.resize(1); buf.image_sqsums.resize(1); integral(image, buf.image_sums[0], buf.image_sqsums[0]); - float templ_sum = 0; - float templ_sqsum = 0; - templ_sum = (float)sum(templ)[0]; - templ_sqsum = sqrSum(templ)[0]; + templ_sum[0] = (float)sum(templ)[0]; - templ_sqsum -= scale * templ_sum * templ_sum; - templ_sum *= scale; + templ_sqsum[0] = sqrSum(templ)[0]; + + templ_sqsum[0] -= scale * templ_sum[0] * templ_sum[0]; + templ_sum[0] *= scale; args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data) ); args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset) ); @@ -408,13 +442,11 @@ namespace cv args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data) ); args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset) ); args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step) ); - args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum) ); - args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sqsum) ); + args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) ); + args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sqsum[0]) ); } else { - Vec4f templ_sum = Vec4f::all(0); - Vec4f templ_sqsum = Vec4f::all(0); split(image, buf.images); templ_sum = sum(templ); @@ -465,7 +497,27 @@ namespace cv } openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth()); } + void extractFirstChannel_32F(const oclMat &image, oclMat &result) + { + Context *clCxt = image.clCxt; + string kernelName; + + kernelName = "extractFirstChannel"; + size_t globalThreads[3] = {result.cols, result.rows, 1}; + size_t localThreads[3] = {16, 16, 1}; + vector< pair > args; + args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data) ); + args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) ); + args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset)); + args.push_back( make_pair( sizeof(cl_int), (void *)&image.step)); + args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); + + openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, -1, -1); + } }/*ocl*/ } /*cv*/ diff --git a/modules/ocl/src/opencl/match_template.cl b/modules/ocl/src/opencl/match_template.cl index 3133e62..857f891 100644 --- a/modules/ocl/src/opencl/match_template.cl +++ b/modules/ocl/src/opencl/match_template.cl @@ -45,22 +45,28 @@ #pragma OPENCL EXTENSION cl_amd_printf : enable -#if defined (__ATI__) -#pragma OPENCL EXTENSION cl_amd_fp64:enable +#if defined (DOUBLE_SUPPORT) -#elif defined (__NVIDIA__) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable #endif -#if !defined(USE_SQR_INTEGRAL) && (defined (__ATI__) || defined (__NVIDIA__)) #define TYPE_IMAGE_SQSUM double #else -#define TYPE_IMAGE_SQSUM ulong +#define TYPE_IMAGE_SQSUM float +#endif + +#ifndef CN4 +#define CN4 1 +#else +#define CN4 4 #endif ////////////////////////////////////////////////// // utilities -#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, gidx + img_sqsums_offset + ox) +#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, (gidx + img_sqsums_offset + ox) * CN4) #define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox) // normAcc* are accurate normalization routines which make GPU matchTemplate // consistent with CPU one @@ -95,7 +101,7 @@ float normAcc_SQDIFF(float num, float denum) __kernel void normalizeKernel_C1_D0 ( - __global const TYPE_IMAGE_SQSUM * img_sqsums, + __global const float * img_sqsums, __global float * res, ulong tpl_sqsum, int res_rows, @@ -119,8 +125,8 @@ void normalizeKernel_C1_D0 if(gidx < res_cols && gidy < res_rows) { float image_sqsum_ = (float)( - (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); + (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); res[res_idx] = normAcc(res[res_idx], sqrt(image_sqsum_ * tpl_sqsum)); } } @@ -152,8 +158,8 @@ void matchTemplate_Prepared_SQDIFF_C1_D0 if(gidx < res_cols && gidy < res_rows) { float image_sqsum_ = (float)( - (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); + (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); res[res_idx] = image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum; } } @@ -161,7 +167,7 @@ void matchTemplate_Prepared_SQDIFF_C1_D0 __kernel void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0 ( - __global const TYPE_IMAGE_SQSUM * img_sqsums, + __global const float * img_sqsums, __global float * res, ulong tpl_sqsum, int res_rows, @@ -185,10 +191,10 @@ void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0 if(gidx < res_cols && gidy < res_rows) { float image_sqsum_ = (float)( - (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); + (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); res[res_idx] = normAcc_SQDIFF(image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum, - sqrt(image_sqsum_ * tpl_sqsum)); + sqrt(image_sqsum_ * tpl_sqsum)); } } @@ -628,8 +634,8 @@ void matchTemplate_Prepared_CCOFF_C1_D0 if(gidx < res_cols && gidy < res_rows) { float sum = (float)( - (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) - - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); + (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) + - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); res[res_idx] -= sum * tpl_sum; } } @@ -671,17 +677,17 @@ void matchTemplate_Prepared_CCOFF_C4_D0 { float ccorr = res[res_idx]; ccorr -= tpl_sum_c0*(float)( - (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)])); + (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)])); ccorr -= tpl_sum_c1*(float)( - (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)])); + (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)])); ccorr -= tpl_sum_c2*(float)( - (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)])); + (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)])); ccorr -= tpl_sum_c3*(float)( - (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)])); + (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)])); res[res_idx] = ccorr; } } @@ -702,7 +708,7 @@ void matchTemplate_Prepared_CCOFF_NORMED_C1_D0 __global const uint * img_sums, int img_sums_offset, int img_sums_step, - __global const TYPE_IMAGE_SQSUM * img_sqsums, + __global const float * img_sqsums, int img_sqsums_offset, int img_sqsums_step, float tpl_sum, @@ -725,12 +731,12 @@ void matchTemplate_Prepared_CCOFF_NORMED_C1_D0 if(gidx < res_cols && gidy < res_rows) { float image_sum_ = (float)( - (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) - - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); + (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) + - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); float image_sqsum_ = (float)( - (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); + (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); res[res_idx] = normAcc(res[res_idx] - image_sum_ * tpl_sum, sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_))); } @@ -754,10 +760,10 @@ void matchTemplate_Prepared_CCOFF_NORMED_C4_D0 __global const uint * img_sums_c3, int img_sums_offset, int img_sums_step, - __global const TYPE_IMAGE_SQSUM * img_sqsums_c0, - __global const TYPE_IMAGE_SQSUM * img_sqsums_c1, - __global const TYPE_IMAGE_SQSUM * img_sqsums_c2, - __global const TYPE_IMAGE_SQSUM * img_sqsums_c3, + __global const float * img_sqsums_c0, + __global const float * img_sqsums_c1, + __global const float * img_sqsums_c2, + __global const float * img_sqsums_c3, int img_sqsums_offset, int img_sqsums_step, float tpl_sum_c0, @@ -782,42 +788,71 @@ void matchTemplate_Prepared_CCOFF_NORMED_C4_D0 if(gidx < res_cols && gidy < res_rows) { float image_sum_c0 = (float)( - (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)])); + (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)])); float image_sum_c1 = (float)( - (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)])); + (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)])); float image_sum_c2 = (float)( - (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)])); + (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)])); float image_sum_c3 = (float)( - (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)])); + (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)]) + - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)])); float image_sqsum_c0 = (float)( - (img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)])); + (img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)])); float image_sqsum_c1 = (float)( - (img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)])); + (img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)])); float image_sqsum_c2 = (float)( - (img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)])); + (img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)])); float image_sqsum_c3 = (float)( - (img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)])); + (img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) - + (img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)])); float num = res[res_idx] - - image_sum_c0 * tpl_sum_c0 - - image_sum_c1 * tpl_sum_c1 - - image_sum_c2 * tpl_sum_c2 - - image_sum_c3 * tpl_sum_c3; + image_sum_c0 * tpl_sum_c0 - + image_sum_c1 * tpl_sum_c1 - + image_sum_c2 * tpl_sum_c2 - + image_sum_c3 * tpl_sum_c3; float denum = sqrt( tpl_sqsum * ( - image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 + - image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 + - image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 + - image_sqsum_c3 - weight * image_sum_c0 * image_sum_c3) - ); + image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 + + image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 + + image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 + + image_sqsum_c3 - weight * image_sum_c0 * image_sum_c3) + ); res[res_idx] = normAcc(num, denum); } } + +////////////////////////////////////////////////////////////////////// +// extractFirstChannel +__kernel +void extractFirstChannel +( + const __global float4* img, + __global float* res, + int rows, + int cols, + int img_offset, + int res_offset, + int img_step, + int res_step +) +{ + img_step /= sizeof(float4); + res_step /= sizeof(float); + img_offset /= sizeof(float4); + res_offset /= sizeof(float); + img += img_offset; + res += res_offset; + int gidx = get_global_id(0); + int gidy = get_global_id(1); + if(gidx < cols && gidy < rows) + { + res[gidx + gidy * res_step] = img[gidx + gidy * img_step].x; + } +} diff --git a/modules/ocl/test/test_match_template.cpp b/modules/ocl/test/test_match_template.cpp index 2fc6a10..5da7f01 100644 --- a/modules/ocl/test/test_match_template.cpp +++ b/modules/ocl/test/test_match_template.cpp @@ -75,7 +75,7 @@ PARAM_TEST_CASE(MatchTemplate8U, cv::Size, TemplateSize, Channels, TemplateMetho } }; -TEST_P(MatchTemplate8U, DISABLED_Accuracy) +TEST_P(MatchTemplate8U, Accuracy) { std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl; @@ -138,18 +138,18 @@ TEST_P(MatchTemplate32F, Accuracy) EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1, sss); } -INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U, +INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MatchTemplate8U, testing::Combine( MTEMP_SIZES, - testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/), + testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))), testing::Values(Channels(1), Channels(3), Channels(4)), ALL_TEMPLATE_METHODS ) ); -INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine( +INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MatchTemplate32F, testing::Combine( MTEMP_SIZES, - testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/), + testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))), testing::Values(Channels(1), Channels(3), Channels(4)), testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR)))); #endif