modules/ocl/src/match_template.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Peng Xiao, pengxiao@multicorewareinc.com
  19 //
  20 // Redistribution and use in source and binary forms, with or without modification,
  21 // are permitted provided that the following conditions are met:
  22 //
  23 //   * Redistribution's of source code must retain the above copyright notice,
  24 //     this list of conditions and the following disclaimer.
  25 //
  26 //   * Redistribution's in binary form must reproduce the above copyright notice,
  27 //     this list of conditions and the following disclaimer in the documentation
   28 //     and/or other materials provided with the distribution.
  29 //
  30 //   * The name of the copyright holders may not be used to endorse or promote products
  31 //     derived from this software without specific prior written permission.
  32 //
  33 // This software is provided by the copyright holders and contributors as is and
  34 // any express or implied warranties, including, but not limited to, the implied
  35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  36 // In no event shall the Intel Corporation or contributors be liable for any direct,
  37 // indirect, incidental, special, exemplary, or consequential damages
  38 // (including, but not limited to, procurement of substitute goods or services;
  39 // loss of use, data, or profits; or business interruption) however caused
  40 // and on any theory of liability, whether in contract, strict liability,
  41 // or tort (including negligence or otherwise) arising in any way out of
  42 // the use of this software, even if advised of the possibility of such damage.
  43 //
  44 //M*/
  45
  46
  47 #include <iomanip>
  48 #include "precomp.hpp"
  49
  50 using namespace cv;
  51 using namespace cv::ocl;
  52
  53 //helper routines
  54 namespace cv
  55 {
  56     namespace ocl
  57     {
  58         ///////////////////////////OpenCL kernel strings///////////////////////////
  59         extern const char *match_template;
  60     }
  61 }
  62
  63 namespace cv
  64 {
  65     namespace ocl
  66     {
  67         void matchTemplate_SQDIFF(
  68             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  69
  70         void matchTemplate_SQDIFF_NORMED(
  71             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  72
  73         void convolve_32F(
  74             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  75
  76         void matchTemplate_CCORR(
  77             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  78
  79         void matchTemplate_CCORR_NORMED(
  80             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  81
  82         void matchTemplate_CCOFF(
  83             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  84
  85         void matchTemplate_CCOFF_NORMED(
  86             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  87
  88
  89         void matchTemplateNaive_SQDIFF(
  90             const oclMat &image, const oclMat &templ, oclMat &result, int cn);
  91
  92         void matchTemplateNaive_CCORR(
  93             const oclMat &image, const oclMat &templ, oclMat &result, int cn);
  94
  95         void extractFirstChannel_32F(
  96             const oclMat &image, oclMat &result);
  97
  98         // Evaluates optimal template's area threshold. If
  99         // template's area is less  than the threshold, we use naive match
 100         // template version, otherwise FFT-based (if available)
 101         static bool useNaive(int method, int depth, Size size)
 102         {
 103 #ifdef HAVE_CLAMDFFT
 104             if (method == TM_SQDIFF && (depth == CV_32F || !Context::getContext()->supportsFeature(Context::CL_DOUBLE)))
 105             {
 106                 return true;
 107             }
 108             else if(method == TM_CCORR || (method == TM_SQDIFF && depth == CV_8U))
 109             {
 110                 return size.height < 18 && size.width < 18;
 111             }
 112             else
 113                 return false;
 114 #else
 115 #define UNUSED(x) (void)(x);
 116             UNUSED(method) UNUSED(depth) UNUSED(size)
 117 #undef  UNUSED
 118             return true;
 119 #endif
 120         }
 121
 122         //////////////////////////////////////////////////////////////////////
 123         // SQDIFF
 124         void matchTemplate_SQDIFF(
 125             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf & buf)
 126         {
 127             result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
 128             if (useNaive(TM_SQDIFF, image.depth(), templ.size()))
 129             {
 130                 matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels());
 131                 return;
 132             }
 133             else
 134             {
 135                 buf.image_sqsums.resize(1);
 136
 137                 // TODO, add double support for ocl::integral
 138                 // use CPU integral temporarily
 139                 Mat sums, sqsums;
 140                 cv::integral(Mat(image.reshape(1)), sums, sqsums);
 141                 buf.image_sqsums[0] = sqsums;
 142
 143                 unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 144                 matchTemplate_CCORR(image, templ, result, buf);
 145
 146                 //port CUDA's matchTemplatePrepared_SQDIFF_8U
 147                 Context *clCxt = image.clCxt;
 148                 String kernelName = "matchTemplate_Prepared_SQDIFF";
 149                 std::vector< std::pair<size_t, const void *> > args;
 150
 151                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
 152                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
 153                 args.push_back( std::make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 154                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
 155                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
 156                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
 157                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
 158                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
 159                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
 160                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 161                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 162
 163                 size_t globalThreads[3] = {result.cols, result.rows, 1};
 164                 size_t localThreads[3]  = {16, 16, 1};
 165
 166                 const char * build_opt = image.oclchannels() == 4 ? "-D CN4" : "";
 167                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U, build_opt);
 168             }
 169         }
 170
 171         void matchTemplate_SQDIFF_NORMED(
 172             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 173         {
 174             matchTemplate_CCORR(image, templ, result, buf);
 175             buf.image_sums.resize(1);
 176
 177             integral(image.reshape(1), buf.image_sums[0]);
 178
 179             unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 180
 181             Context *clCxt = image.clCxt;
 182             String kernelName = "matchTemplate_Prepared_SQDIFF_NORMED";
 183             std::vector< std::pair<size_t, const void *> > args;
 184
 185             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data));
 186             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
 187             args.push_back( std::make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 188             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
 189             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
 190             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
 191             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
 192             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset));
 193             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step));
 194             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 195             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 196
 197             size_t globalThreads[3] = {result.cols, result.rows, 1};
 198             size_t localThreads[3]  = {16, 16, 1};
 199             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
 200         }
 201
 202         void matchTemplateNaive_SQDIFF(
 203             const oclMat &image, const oclMat &templ, oclMat &result, int)
 204         {
 205             CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
 206                       || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
 207                      );
 208             CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
 209             CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
 210
 211             Context *clCxt = image.clCxt;
 212             String kernelName = "matchTemplate_Naive_SQDIFF";
 213
 214             std::vector< std::pair<size_t, const void *> > args;
 215
 216             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&image.data));
 217             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&templ.data));
 218             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
 219             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows));
 220             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols));
 221             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
 222             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
 223             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
 224             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
 225             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.offset));
 226             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.offset));
 227             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 228             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.step));
 229             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.step));
 230             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 231
 232             size_t globalThreads[3] = {result.cols, result.rows, 1};
 233             size_t localThreads[3]  = {16, 16, 1};
 234             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 235         }
 236
 237         //////////////////////////////////////////////////////////////////////
 238         // CCORR
 239         void convolve_32F(
 240             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 241         {
 242             ConvolveBuf convolve_buf;
 243             convolve_buf.user_block_size = buf.user_block_size;
 244             if (image.oclchannels() == 1)
 245                 convolve(image, templ, result, true, convolve_buf);
 246             else
 247             {
 248                 oclMat result_;
 249                 convolve(image.reshape(1), templ.reshape(1), result_, true, convolve_buf);
 250                 extractFirstChannel_32F(result_, result);
 251             }
 252         }
 253
 254         void matchTemplate_CCORR(
 255             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 256         {
 257             result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
 258             if (useNaive(TM_CCORR, image.depth(), templ.size()))
 259             {
 260                 matchTemplateNaive_CCORR(image, templ, result, image.oclchannels());
 261                 return;
 262             }
 263             else
 264             {
 265                 if(image.depth() == CV_8U && templ.depth() == CV_8U)
 266                 {
 267                     image.convertTo(buf.imagef, CV_32F);
 268                     templ.convertTo(buf.templf, CV_32F);
 269                     convolve_32F(buf.imagef, buf.templf, result, buf);
 270                 }
 271                 else
 272                 {
 273                     convolve_32F(image, templ, result, buf);
 274                 }
 275             }
 276         }
 277
 278         void matchTemplate_CCORR_NORMED(
 279             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 280         {
 281             matchTemplate_CCORR(image, templ, result, buf);
 282             buf.image_sums.resize(1);
 283             buf.image_sqsums.resize(1);
 284
 285             integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
 286
 287             unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 288
 289             Context *clCxt = image.clCxt;
 290             String kernelName = "normalizeKernel";
 291             std::vector< std::pair<size_t, const void *> > args;
 292
 293             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
 294             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
 295             args.push_back( std::make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 296             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
 297             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
 298             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
 299             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
 300             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
 301             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
 302             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 303             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 304
 305             size_t globalThreads[3] = {result.cols, result.rows, 1};
 306             size_t localThreads[3]  = {16, 16, 1};
 307             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
 308         }
 309
 310         void matchTemplateNaive_CCORR(
 311             const oclMat &image, const oclMat &templ, oclMat &result, int)
 312         {
 313             CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
 314                       || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
 315                      );
 316             CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
 317             CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
 318
 319             Context *clCxt = image.clCxt;
 320             String kernelName = "matchTemplate_Naive_CCORR";
 321
 322             std::vector< std::pair<size_t, const void *> > args;
 323
 324             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&image.data));
 325             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&templ.data));
 326             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
 327             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows));
 328             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols));
 329             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
 330             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
 331             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
 332             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
 333             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.offset));
 334             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.offset));
 335             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 336             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.step));
 337             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.step));
 338             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 339
 340             size_t globalThreads[3] = {result.cols, result.rows, 1};
 341             size_t localThreads[3]  = {16, 16, 1};
 342             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 343         }
 344         //////////////////////////////////////////////////////////////////////
 345         // CCOFF
 346         void matchTemplate_CCOFF(
 347             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 348         {
 349             CV_Assert(image.depth() == CV_8U && templ.depth() == CV_8U);
 350
 351             matchTemplate_CCORR(image, templ, result, buf);
 352
 353             Context *clCxt = image.clCxt;
 354             String kernelName;
 355
 356             kernelName = "matchTemplate_Prepared_CCOFF";
 357             size_t globalThreads[3] = {result.cols, result.rows, 1};
 358             size_t localThreads[3]  = {16, 16, 1};
 359
 360             std::vector< std::pair<size_t, const void *> > args;
 361             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data) );
 362             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows) );
 363             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols) );
 364             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows) );
 365             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols) );
 366             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows) );
 367             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols) );
 368             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 369             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 370             Vec4f templ_sum = Vec4f::all(0);
 371             // to be continued in the following section
 372             if(image.oclchannels() == 1)
 373             {
 374                 buf.image_sums.resize(1);
 375                 integral(image, buf.image_sums[0]);
 376
 377                 templ_sum[0] = (float)sum(templ)[0] / templ.size().area();
 378                 args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 379                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 380                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 381                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 382             }
 383             else
 384             {
 385
 386                 split(image, buf.images);
 387                 templ_sum = sum(templ) / templ.size().area();
 388                 buf.image_sums.resize(buf.images.size());
 389
 390
 391                 for(int i = 0; i < image.oclchannels(); i ++)
 392                 {
 393                     integral(buf.images[i], buf.image_sums[i]);
 394                 }
 395                 switch(image.oclchannels())
 396                 {
 397                 case 4:
 398                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 399                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
 400                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
 401                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
 402                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 403                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 404                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 405                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
 406                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
 407                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
 408                     break;
 409                 default:
 410                     CV_Error(Error::StsBadArg, "matchTemplate: unsupported number of channels");
 411                     break;
 412                 }
 413             }
 414             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 415         }
 416
 417         void matchTemplate_CCOFF_NORMED(
 418             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 419         {
 420             image.convertTo(buf.imagef, CV_32F);
 421             templ.convertTo(buf.templf, CV_32F);
 422
 423             matchTemplate_CCORR(buf.imagef, buf.templf, result, buf);
 424             float scale = 1.f / templ.size().area();
 425
 426             Context *clCxt = image.clCxt;
 427             String kernelName;
 428
 429             kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
 430             size_t globalThreads[3] = {result.cols, result.rows, 1};
 431             size_t localThreads[3]  = {16, 16, 1};
 432
 433             std::vector< std::pair<size_t, const void *> > args;
 434             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data) );
 435             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows) );
 436             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols) );
 437             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows) );
 438             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols) );
 439             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows) );
 440             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols) );
 441             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 442             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 443             args.push_back( std::make_pair( sizeof(cl_float), (void *)&scale) );
 444
 445             Vec4f templ_sum   = Vec4f::all(0);
 446             Vec4f templ_sqsum = Vec4f::all(0);
 447             // to be continued in the following section
 448             if(image.oclchannels() == 1)
 449             {
 450                 buf.image_sums.resize(1);
 451                 buf.image_sqsums.resize(1);
 452                 integral(image, buf.image_sums[0], buf.image_sqsums[0]);
 453
 454                 templ_sum[0]   = (float)sum(templ)[0];
 455
 456                 templ_sqsum[0] = sqrSum(templ)[0];
 457
 458                 templ_sqsum[0] -= scale * templ_sum[0] * templ_sum[0];
 459                 templ_sum[0]   *= scale;
 460
 461                 args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 462                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 463                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 464                 args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
 465                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
 466                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
 467                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 468                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sqsum[0]) );
 469             }
 470             else
 471             {
 472
 473                 split(image, buf.images);
 474                 templ_sum   = sum(templ);
 475
 476                 templ_sqsum = sqrSum(templ);
 477
 478                 templ_sqsum -= scale * templ_sum * templ_sum;
 479
 480                 float templ_sqsum_sum = 0;
 481                 for(int i = 0; i < image.oclchannels(); i ++)
 482                 {
 483                     templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i];
 484                 }
 485                 templ_sum   *= scale;
 486                 buf.image_sums.resize(buf.images.size());
 487                 buf.image_sqsums.resize(buf.images.size());
 488
 489                 for(int i = 0; i < image.oclchannels(); i ++)
 490                 {
 491                     integral(buf.images[i], buf.image_sums[i], buf.image_sqsums[i]);
 492                 }
 493
 494                 switch(image.oclchannels())
 495                 {
 496                 case 4:
 497                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 498                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
 499                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
 500                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
 501                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 502                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 503                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
 504                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[1].data) );
 505                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[2].data) );
 506                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[3].data) );
 507                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
 508                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
 509                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 510                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
 511                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
 512                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
 513                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sqsum_sum) );
 514                     break;
 515                 default:
 516                     CV_Error(Error::StsBadArg, "matchTemplate: unsupported number of channels");
 517                     break;
 518                 }
 519             }
 520             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 521         }
 522         void extractFirstChannel_32F(const oclMat &image, oclMat &result)
 523         {
 524             Context *clCxt = image.clCxt;
 525             String kernelName;
 526
 527             kernelName = "extractFirstChannel";
 528             size_t globalThreads[3] = {result.cols, result.rows, 1};
 529             size_t localThreads[3]  = {16, 16, 1};
 530
 531             std::vector< std::pair<size_t, const void *> > args;
 532             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&image.data) );
 533             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data) );
 534             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows) );
 535             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols) );
 536             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.offset));
 537             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 538             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.step));
 539             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 540
 541             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, -1, -1);
 542         }
 543     }/*ocl*/
 544 } /*cv*/
 545
 546 void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method)
 547 {
 548     MatchTemplateBuf buf;
 549     matchTemplate(image, templ, result, method, buf);
 550 }
 551 void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method, MatchTemplateBuf &buf)
 552 {
 553     CV_Assert(image.type() == templ.type());
 554     CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
 555
 556     typedef void (*Caller)(const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &);
 557
 558     const Caller callers[] =
 559     {
 560         ::matchTemplate_SQDIFF, ::matchTemplate_SQDIFF_NORMED,
 561         ::matchTemplate_CCORR, ::matchTemplate_CCORR_NORMED,
 562         ::matchTemplate_CCOFF, ::matchTemplate_CCOFF_NORMED
 563     };
 564
 565     Caller caller = callers[method];
 566     CV_Assert(caller);
 567     caller(image, templ, result, buf);
 568 }