modules/ocl/src/match_template.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Peng Xiao, pengxiao@multicorewareinc.com
  19 //
  20 // Redistribution and use in source and binary forms, with or without modification,
  21 // are permitted provided that the following conditions are met:
  22 //
  23 //   * Redistribution's of source code must retain the above copyright notice,
  24 //     this list of conditions and the following disclaimer.
  25 //
  26 //   * Redistribution's in binary form must reproduce the above copyright notice,
  27 //     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
  29 //
  30 //   * The name of the copyright holders may not be used to endorse or promote products
  31 //     derived from this software without specific prior written permission.
  32 //
  33 // This software is provided by the copyright holders and contributors as is and
  34 // any express or implied warranties, including, but not limited to, the implied
  35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  36 // In no event shall the Intel Corporation or contributors be liable for any direct,
  37 // indirect, incidental, special, exemplary, or consequential damages
  38 // (including, but not limited to, procurement of substitute goods or services;
  39 // loss of use, data, or profits; or business interruption) however caused
  40 // and on any theory of liability, whether in contract, strict liability,
  41 // or tort (including negligence or otherwise) arising in any way out of
  42 // the use of this software, even if advised of the possibility of such damage.
  43 //
  44 //M*/
  45
  46
  47 #include "precomp.hpp"
  48 #include "opencl_kernels.hpp"
  49
  50 using namespace cv;
  51 using namespace cv::ocl;
  52
  53 namespace cv
  54 {
  55     namespace ocl
  56     {
        // Forward declarations of the helpers defined later in this file.
        //
        // The six matchTemplate_* functions below are the per-method implementations
        // that cv::ocl::matchTemplate selects through its dispatch table; they all
        // take the image, the template, the output map and a reusable scratch buffer.
        void matchTemplate_SQDIFF(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

        void matchTemplate_SQDIFF_NORMED(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

        // Convolution-based helper used by matchTemplate_CCORR on its non-naive path.
        void convolve_32F(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

        void matchTemplate_CCORR(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

        void matchTemplate_CCORR_NORMED(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

        void matchTemplate_CCOFF(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

        void matchTemplate_CCOFF_NORMED(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);


        // Naive (direct-kernel) variants; the trailing int is a channel count that
        // the definitions in this file leave unnamed and unused.
        void matchTemplateNaive_SQDIFF(
            const oclMat &image, const oclMat &templ, oclMat &result, int cn);

        void matchTemplateNaive_CCORR(
            const oclMat &image, const oclMat &templ, oclMat &result, int cn);

        // Runs the "extractFirstChannel" kernel from image into an already sized result.
        void extractFirstChannel_32F(
            const oclMat &image, oclMat &result);
  87
  88         // Evaluates optimal template's area threshold. If
  89         // template's area is less  than the threshold, we use naive match
  90         // template version, otherwise FFT-based (if available)
  91         static bool useNaive(int method, int depth, Size size)
  92         {
  93 #ifdef HAVE_CLAMDFFT
  94             if (method == TM_SQDIFF && (depth == CV_32F || !Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE)))
  95             {
  96                 return true;
  97             }
  98             else if(method == TM_CCORR || (method == TM_SQDIFF && depth == CV_8U))
  99             {
 100                 return size.height < 18 && size.width < 18;
 101             }
 102             else
 103                 return false;
 104 #else
 105 #define UNUSED(x) (void)(x);
 106             UNUSED(method) UNUSED(depth) UNUSED(size)
 107 #undef  UNUSED
 108             return true;
 109 #endif
 110         }
 111
 112         //////////////////////////////////////////////////////////////////////
 113         // SQDIFF
 114         void matchTemplate_SQDIFF(
 115             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf & buf)
 116         {
 117             result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
 118             if (useNaive(TM_SQDIFF, image.depth(), templ.size()))
 119             {
 120                 matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels());
 121                 return;
 122             }
 123             else
 124             {
 125                 buf.image_sqsums.resize(1);
 126
 127                 // TODO, add double support for ocl::integral
 128                 // use CPU integral temporarily
 129                 Mat sums, sqsums;
 130                 cv::integral(Mat(image.reshape(1)), sums, sqsums);
 131                 buf.image_sqsums[0] = sqsums;
 132
 133                 unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 134                 matchTemplate_CCORR(image, templ, result, buf);
 135
 136                 //port CUDA's matchTemplatePrepared_SQDIFF_8U
 137                 Context *clCxt = image.clCxt;
 138                 String kernelName = "matchTemplate_Prepared_SQDIFF";
 139                 std::vector< std::pair<size_t, const void *> > args;
 140
 141                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
 142                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
 143                 args.push_back( std::make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 144                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
 145                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
 146                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
 147                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
 148                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
 149                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
 150                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 151                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 152
 153                 size_t globalThreads[3] = {result.cols, result.rows, 1};
 154                 size_t localThreads[3]  = {16, 16, 1};
 155
 156                 const char * build_opt = image.oclchannels() == 4 ? "-D CN4" : "";
 157                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U, build_opt);
 158             }
 159         }
 160
 161         void matchTemplate_SQDIFF_NORMED(
 162             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 163         {
 164             matchTemplate_CCORR(image, templ, result, buf);
 165             buf.image_sums.resize(1);
 166
 167             integral(image.reshape(1), buf.image_sums[0]);
 168
 169             unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 170
 171             Context *clCxt = image.clCxt;
 172             String kernelName = "matchTemplate_Prepared_SQDIFF_NORMED";
 173             std::vector< std::pair<size_t, const void *> > args;
 174
 175             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data));
 176             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
 177             args.push_back( std::make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 178             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
 179             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
 180             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
 181             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
 182             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset));
 183             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step));
 184             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 185             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 186
 187             size_t globalThreads[3] = {result.cols, result.rows, 1};
 188             size_t localThreads[3]  = {16, 16, 1};
 189             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
 190         }
 191
 192         void matchTemplateNaive_SQDIFF(
 193             const oclMat &image, const oclMat &templ, oclMat &result, int)
 194         {
 195             CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
 196                       || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
 197                      );
 198             CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
 199             CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
 200
 201             Context *clCxt = image.clCxt;
 202             String kernelName = "matchTemplate_Naive_SQDIFF";
 203
 204             std::vector< std::pair<size_t, const void *> > args;
 205
 206             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&image.data));
 207             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&templ.data));
 208             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
 209             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows));
 210             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols));
 211             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
 212             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
 213             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
 214             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
 215             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.offset));
 216             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.offset));
 217             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 218             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.step));
 219             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.step));
 220             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 221
 222             size_t globalThreads[3] = {result.cols, result.rows, 1};
 223             size_t localThreads[3]  = {16, 16, 1};
 224             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 225         }
 226
 227         //////////////////////////////////////////////////////////////////////
 228         // CCORR
 229         void convolve_32F(
 230             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 231         {
 232             ConvolveBuf convolve_buf;
 233             convolve_buf.user_block_size = buf.user_block_size;
 234             if (image.oclchannels() == 1)
 235                 convolve(image, templ, result, true, convolve_buf);
 236             else
 237             {
 238                 oclMat result_;
 239                 convolve(image.reshape(1), templ.reshape(1), result_, true, convolve_buf);
 240                 extractFirstChannel_32F(result_, result);
 241             }
 242         }
 243
 244         void matchTemplate_CCORR(
 245             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 246         {
 247             result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
 248             if (useNaive(TM_CCORR, image.depth(), templ.size()))
 249             {
 250                 matchTemplateNaive_CCORR(image, templ, result, image.oclchannels());
 251                 return;
 252             }
 253             else
 254             {
 255                 if(image.depth() == CV_8U && templ.depth() == CV_8U)
 256                 {
 257                     image.convertTo(buf.imagef, CV_32F);
 258                     templ.convertTo(buf.templf, CV_32F);
 259                     convolve_32F(buf.imagef, buf.templf, result, buf);
 260                 }
 261                 else
 262                 {
 263                     convolve_32F(image, templ, result, buf);
 264                 }
 265             }
 266         }
 267
 268         void matchTemplate_CCORR_NORMED(
 269             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 270         {
 271             matchTemplate_CCORR(image, templ, result, buf);
 272             buf.image_sums.resize(1);
 273             buf.image_sqsums.resize(1);
 274
 275             integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
 276
 277             unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 278
 279             Context *clCxt = image.clCxt;
 280             String kernelName = "normalizeKernel";
 281             std::vector< std::pair<size_t, const void *> > args;
 282
 283             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
 284             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
 285             args.push_back( std::make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 286             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
 287             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
 288             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
 289             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
 290             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
 291             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
 292             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 293             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 294
 295             size_t globalThreads[3] = {result.cols, result.rows, 1};
 296             size_t localThreads[3]  = {16, 16, 1};
 297             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
 298         }
 299
 300         void matchTemplateNaive_CCORR(
 301             const oclMat &image, const oclMat &templ, oclMat &result, int)
 302         {
 303             CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
 304                       || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
 305                      );
 306             CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
 307             CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
 308
 309             Context *clCxt = image.clCxt;
 310             String kernelName = "matchTemplate_Naive_CCORR";
 311
 312             std::vector< std::pair<size_t, const void *> > args;
 313
 314             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&image.data));
 315             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&templ.data));
 316             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
 317             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows));
 318             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols));
 319             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
 320             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
 321             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
 322             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
 323             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.offset));
 324             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.offset));
 325             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 326             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.step));
 327             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.step));
 328             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 329
 330             size_t globalThreads[3] = {result.cols, result.rows, 1};
 331             size_t localThreads[3]  = {16, 16, 1};
 332             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 333         }
 334         //////////////////////////////////////////////////////////////////////
 335         // CCOFF
 336         void matchTemplate_CCOFF(
 337             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 338         {
 339             CV_Assert(image.depth() == CV_8U && templ.depth() == CV_8U);
 340
 341             matchTemplate_CCORR(image, templ, result, buf);
 342
 343             Context *clCxt = image.clCxt;
 344             String kernelName;
 345
 346             kernelName = "matchTemplate_Prepared_CCOFF";
 347             size_t globalThreads[3] = {result.cols, result.rows, 1};
 348             size_t localThreads[3]  = {16, 16, 1};
 349
 350             std::vector< std::pair<size_t, const void *> > args;
 351             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data) );
 352             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows) );
 353             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols) );
 354             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows) );
 355             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols) );
 356             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows) );
 357             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols) );
 358             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 359             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 360             Vec4f templ_sum = Vec4f::all(0);
 361             // to be continued in the following section
 362             if(image.oclchannels() == 1)
 363             {
 364                 buf.image_sums.resize(1);
 365                 integral(image, buf.image_sums[0]);
 366
 367                 templ_sum[0] = (float)sum(templ)[0] / templ.size().area();
 368                 args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 369                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 370                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 371                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 372             }
 373             else
 374             {
 375
 376                 split(image, buf.images);
 377                 templ_sum = sum(templ) / templ.size().area();
 378                 buf.image_sums.resize(buf.images.size());
 379
 380
 381                 for(int i = 0; i < image.oclchannels(); i ++)
 382                 {
 383                     integral(buf.images[i], buf.image_sums[i]);
 384                 }
 385                 switch(image.oclchannels())
 386                 {
 387                 case 4:
 388                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 389                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
 390                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
 391                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
 392                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 393                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 394                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 395                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
 396                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
 397                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
 398                     break;
 399                 default:
 400                     CV_Error(Error::StsBadArg, "matchTemplate: unsupported number of channels");
 401                     break;
 402                 }
 403             }
 404             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 405         }
 406
 407         void matchTemplate_CCOFF_NORMED(
 408             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 409         {
 410             image.convertTo(buf.imagef, CV_32F);
 411             templ.convertTo(buf.templf, CV_32F);
 412
 413             matchTemplate_CCORR(buf.imagef, buf.templf, result, buf);
 414             float scale = 1.f / templ.size().area();
 415
 416             Context *clCxt = image.clCxt;
 417             String kernelName;
 418
 419             kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
 420             size_t globalThreads[3] = {result.cols, result.rows, 1};
 421             size_t localThreads[3]  = {16, 16, 1};
 422
 423             std::vector< std::pair<size_t, const void *> > args;
 424             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data) );
 425             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows) );
 426             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols) );
 427             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows) );
 428             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols) );
 429             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows) );
 430             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols) );
 431             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 432             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 433             args.push_back( std::make_pair( sizeof(cl_float), (void *)&scale) );
 434
 435             Vec4f templ_sum   = Vec4f::all(0);
 436             Vec4f templ_sqsum = Vec4f::all(0);
 437             // to be continued in the following section
 438             if(image.oclchannels() == 1)
 439             {
 440                 buf.image_sums.resize(1);
 441                 buf.image_sqsums.resize(1);
 442                 integral(image, buf.image_sums[0], buf.image_sqsums[0]);
 443
 444                 templ_sum[0]   = (float)sum(templ)[0];
 445
 446                 templ_sqsum[0] = sqrSum(templ)[0];
 447
 448                 templ_sqsum[0] -= scale * templ_sum[0] * templ_sum[0];
 449                 templ_sum[0]   *= scale;
 450
 451                 args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 452                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 453                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 454                 args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
 455                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
 456                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
 457                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 458                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sqsum[0]) );
 459             }
 460             else
 461             {
 462
 463                 split(image, buf.images);
 464                 templ_sum   = sum(templ);
 465
 466                 templ_sqsum = sqrSum(templ);
 467
 468                 templ_sqsum -= scale * templ_sum * templ_sum;
 469
 470                 float templ_sqsum_sum = 0;
 471                 for(int i = 0; i < image.oclchannels(); i ++)
 472                 {
 473                     templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i];
 474                 }
 475                 templ_sum   *= scale;
 476                 buf.image_sums.resize(buf.images.size());
 477                 buf.image_sqsums.resize(buf.images.size());
 478
 479                 for(int i = 0; i < image.oclchannels(); i ++)
 480                 {
 481                     integral(buf.images[i], buf.image_sums[i], buf.image_sqsums[i]);
 482                 }
 483
 484                 switch(image.oclchannels())
 485                 {
 486                 case 4:
 487                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 488                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
 489                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
 490                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
 491                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 492                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 493                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
 494                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[1].data) );
 495                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[2].data) );
 496                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[3].data) );
 497                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
 498                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
 499                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 500                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
 501                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
 502                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
 503                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sqsum_sum) );
 504                     break;
 505                 default:
 506                     CV_Error(Error::StsBadArg, "matchTemplate: unsupported number of channels");
 507                     break;
 508                 }
 509             }
 510             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 511         }
 512         void extractFirstChannel_32F(const oclMat &image, oclMat &result)
 513         {
 514             Context *clCxt = image.clCxt;
 515             String kernelName;
 516
 517             kernelName = "extractFirstChannel";
 518             size_t globalThreads[3] = {result.cols, result.rows, 1};
 519             size_t localThreads[3]  = {16, 16, 1};
 520
 521             std::vector< std::pair<size_t, const void *> > args;
 522             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&image.data) );
 523             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data) );
 524             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows) );
 525             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols) );
 526             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.offset));
 527             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 528             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.step));
 529             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 530
 531             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, -1, -1);
 532         }
 533     }/*ocl*/
 534 } /*cv*/
 535
 536 void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method)
 537 {
 538     MatchTemplateBuf buf;
 539     matchTemplate(image, templ, result, method, buf);
 540 }
 541 void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method, MatchTemplateBuf &buf)
 542 {
 543     CV_Assert(image.type() == templ.type());
 544     CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
 545
 546     typedef void (*Caller)(const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &);
 547
 548     const Caller callers[] =
 549     {
 550         ::matchTemplate_SQDIFF, ::matchTemplate_SQDIFF_NORMED,
 551         ::matchTemplate_CCORR, ::matchTemplate_CCORR_NORMED,
 552         ::matchTemplate_CCOFF, ::matchTemplate_CCOFF_NORMED
 553     };
 554
 555     Caller caller = callers[method];
 556     CV_Assert(caller);
 557     caller(image, templ, result, buf);
 558 }