modules/ocl/src/match_template.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Peng Xiao, pengxiao@multicorewareinc.com
  19 //
  20 // Redistribution and use in source and binary forms, with or without modification,
  21 // are permitted provided that the following conditions are met:
  22 //
  23 //   * Redistribution's of source code must retain the above copyright notice,
  24 //     this list of conditions and the following disclaimer.
  25 //
  26 //   * Redistribution's in binary form must reproduce the above copyright notice,
  27 //     this list of conditions and the following disclaimer in the documentation
  28 //     and/or other materials provided with the distribution.
  29 //
  30 //   * The name of the copyright holders may not be used to endorse or promote products
  31 //     derived from this software without specific prior written permission.
  32 //
  33 // This software is provided by the copyright holders and contributors as is and
  34 // any express or implied warranties, including, but not limited to, the implied
  35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  36 // In no event shall the Intel Corporation or contributors be liable for any direct,
  37 // indirect, incidental, special, exemplary, or consequential damages
  38 // (including, but not limited to, procurement of substitute goods or services;
  39 // loss of use, data, or profits; or business interruption) however caused
  40 // and on any theory of liability, whether in contract, strict liability,
  41 // or tort (including negligence or otherwise) arising in any way out of
  42 // the use of this software, even if advised of the possibility of such damage.
  43 //
  44 //M*/
  45
  46
  47 #include "precomp.hpp"
  48 #include "opencl_kernels.hpp"
  49
  50 using namespace cv;
  51 using namespace cv::ocl;
  52
  53 namespace cv
  54 {
  55     namespace ocl
  56     {
        // Forward declarations of the implementations defined further down in
        // this file, so that they can call each other regardless of definition
        // order.

        // Buffered per-method entry point (target of the dispatch table in
        // cv::ocl::matchTemplate).
        void matchTemplate_SQDIFF(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

        // Buffered per-method entry point (target of the dispatch table in
        // cv::ocl::matchTemplate).
        void matchTemplate_SQDIFF_NORMED(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

        // Cross-correlation through ocl::convolve; used by matchTemplate_CCORR
        // when the naive kernel is not selected.
        void convolve_32F(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

        // Buffered per-method entry point; also called by the other methods to
        // produce the raw cross-correlation they post-process.
        void matchTemplate_CCORR(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

        // Buffered per-method entry point (target of the dispatch table in
        // cv::ocl::matchTemplate).
        void matchTemplate_CCORR_NORMED(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

        // Buffered per-method entry point (target of the dispatch table in
        // cv::ocl::matchTemplate).
        void matchTemplate_CCOFF(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

        // Buffered per-method entry point (target of the dispatch table in
        // cv::ocl::matchTemplate).
        void matchTemplate_CCOFF_NORMED(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);


        // Direct (non-FFT) kernels, selected when useNaive() returns true.
        // The trailing int parameter is ignored by both definitions.
        void matchTemplateNaive_SQDIFF(
            const oclMat &image, const oclMat &templ, oclMat &result, int cn);

        void matchTemplateNaive_CCORR(
            const oclMat &image, const oclMat &templ, oclMat &result, int cn);

        // Launches the "extractFirstChannel" kernel; used by convolve_32F for
        // multi-channel input.
        void extractFirstChannel_32F(
            const oclMat &image, oclMat &result);
  87
  88         // Evaluates optimal template's area threshold. If
  89         // template's area is less  than the threshold, we use naive match
  90         // template version, otherwise FFT-based (if available)
  91         static bool useNaive(int method, int depth, Size size)
  92         {
  93 #ifdef HAVE_CLAMDFFT
  94             if (method == TM_SQDIFF && (depth == CV_32F || !Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE)))
  95             {
  96                 return true;
  97             }
  98             else if(method == TM_CCORR || (method == TM_SQDIFF && depth == CV_8U))
  99             {
 100                 return size.height < 18 && size.width < 18;
 101             }
 102             else
 103                 return false;
 104 #else
 105 #define UNUSED(x) (void)(x);
 106             UNUSED(method) UNUSED(depth) UNUSED(size)
 107 #undef  UNUSED
 108             return true;
 109 #endif
 110         }
 111
 112         //////////////////////////////////////////////////////////////////////
 113         // SQDIFF
 114         void matchTemplate_SQDIFF(
 115             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf & buf)
 116         {
 117             result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
 118             if (useNaive(TM_SQDIFF, image.depth(), templ.size()))
 119             {
 120                 matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels());
 121                 return;
 122             }
 123             else
 124             {
 125                 buf.image_sqsums.resize(1);
 126
 127                 // TODO, add double support for ocl::integral
 128                 // use CPU integral temporarily
 129                 Mat sums, sqsums;
 130                 cv::integral(Mat(image.reshape(1)), sums, sqsums);
 131                 buf.image_sqsums[0] = sqsums;
 132
 133                 unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 134                 matchTemplate_CCORR(image, templ, result, buf);
 135
 136                 //port CUDA's matchTemplatePrepared_SQDIFF_8U
 137                 Context *clCxt = image.clCxt;
 138                 String kernelName = "matchTemplate_Prepared_SQDIFF";
 139                 std::vector< std::pair<size_t, const void *> > args;
 140
 141                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
 142                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
 143                 args.push_back( std::make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 144                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
 145                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
 146                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
 147                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
 148                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
 149                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
 150                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 151                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 152
 153                 size_t globalThreads[3] = {result.cols, result.rows, 1};
 154                 size_t localThreads[3]  = {16, 16, 1};
 155
 156                 const char * build_opt = image.oclchannels() == 4 ? "-D CN4" : "";
 157                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U, build_opt);
 158             }
 159         }
 160
 161         void matchTemplate_SQDIFF_NORMED(
 162             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 163         {
 164             matchTemplate_CCORR(image, templ, result, buf);
 165             buf.image_sums.resize(1);
 166
 167             integral(image.reshape(1), buf.image_sums[0]);
 168
 169             unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 170
 171             Context *clCxt = image.clCxt;
 172             String kernelName = "matchTemplate_Prepared_SQDIFF_NORMED";
 173             std::vector< std::pair<size_t, const void *> > args;
 174
 175             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data));
 176             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
 177             args.push_back( std::make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 178             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
 179             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
 180             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
 181             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
 182             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset));
 183             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step));
 184             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 185             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 186
 187             size_t globalThreads[3] = {result.cols, result.rows, 1};
 188             size_t localThreads[3]  = {16, 16, 1};
 189             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
 190         }
 191
 192         void matchTemplateNaive_SQDIFF(
 193             const oclMat &image, const oclMat &templ, oclMat &result, int)
 194         {
 195             CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
 196                       || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
 197                      );
 198             CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
 199             CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
 200
 201             Context *clCxt = image.clCxt;
 202             String kernelName = "matchTemplate_Naive_SQDIFF";
 203
 204             std::vector< std::pair<size_t, const void *> > args;
 205
 206             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&image.data));
 207             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&templ.data));
 208             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
 209             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows));
 210             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols));
 211             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
 212             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
 213             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
 214             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
 215             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.offset));
 216             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.offset));
 217             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 218             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.step));
 219             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.step));
 220             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 221
 222             size_t globalThreads[3] = {result.cols, result.rows, 1};
 223             size_t localThreads[3]  = {16, 16, 1};
 224             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 225         }
 226
 227         //////////////////////////////////////////////////////////////////////
 228         // CCORR
 229         void convolve_32F(
 230             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 231         {
 232             ConvolveBuf convolve_buf;
 233             convolve_buf.user_block_size = buf.user_block_size;
 234             if (image.oclchannels() == 1)
 235                 convolve(image, templ, result, true, convolve_buf);
 236             else
 237             {
 238                 oclMat result_;
 239                 convolve(image.reshape(1), templ.reshape(1), result_, true, convolve_buf);
 240                 extractFirstChannel_32F(result_, result);
 241             }
 242         }
 243
 244         void matchTemplate_CCORR(
 245             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 246         {
 247             result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
 248             if (useNaive(TM_CCORR, image.depth(), templ.size()))
 249             {
 250                 matchTemplateNaive_CCORR(image, templ, result, image.oclchannels());
 251                 return;
 252             }
 253             else
 254             {
 255                 if(image.depth() == CV_8U && templ.depth() == CV_8U)
 256                 {
 257                     image.convertTo(buf.imagef, CV_32F);
 258                     templ.convertTo(buf.templf, CV_32F);
 259                     convolve_32F(buf.imagef, buf.templf, result, buf);
 260                 }
 261                 else
 262                 {
 263                     convolve_32F(image, templ, result, buf);
 264                 }
 265             }
 266         }
 267
 268         void matchTemplate_CCORR_NORMED(
 269             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 270         {
 271             cv::ocl::oclMat temp;
 272             matchTemplate_CCORR(image, templ, result, buf);
 273             buf.image_sums.resize(1);
 274             buf.image_sqsums.resize(1);
 275             integral(image.reshape(1), buf.image_sums[0], temp);
 276             if(temp.depth() == CV_64F)
 277                 temp.convertTo(buf.image_sqsums[0], CV_32FC1);
 278             else
 279                 buf.image_sqsums[0] = temp;
 280             unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 281
 282             Context *clCxt = image.clCxt;
 283             String kernelName = "normalizeKernel";
 284             std::vector< std::pair<size_t, const void *> > args;
 285
 286             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
 287             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
 288             args.push_back( std::make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 289             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
 290             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
 291             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
 292             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
 293             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
 294             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
 295             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 296             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 297
 298             size_t globalThreads[3] = {result.cols, result.rows, 1};
 299             size_t localThreads[3]  = {16, 16, 1};
 300             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
 301         }
 302
 303         void matchTemplateNaive_CCORR(
 304             const oclMat &image, const oclMat &templ, oclMat &result, int)
 305         {
 306             CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
 307                       || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
 308                      );
 309             CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
 310             CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
 311
 312             Context *clCxt = image.clCxt;
 313             String kernelName = "matchTemplate_Naive_CCORR";
 314
 315             std::vector< std::pair<size_t, const void *> > args;
 316
 317             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&image.data));
 318             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&templ.data));
 319             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
 320             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows));
 321             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols));
 322             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
 323             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
 324             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
 325             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
 326             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.offset));
 327             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.offset));
 328             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 329             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.step));
 330             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.step));
 331             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 332
 333             size_t globalThreads[3] = {result.cols, result.rows, 1};
 334             size_t localThreads[3]  = {16, 16, 1};
 335             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 336         }
 337         //////////////////////////////////////////////////////////////////////
 338         // CCOFF
 339         void matchTemplate_CCOFF(
 340             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 341         {
 342             CV_Assert(image.depth() == CV_8U && templ.depth() == CV_8U);
 343
 344             matchTemplate_CCORR(image, templ, result, buf);
 345
 346             Context *clCxt = image.clCxt;
 347             String kernelName;
 348
 349             kernelName = "matchTemplate_Prepared_CCOFF";
 350             size_t globalThreads[3] = {result.cols, result.rows, 1};
 351             size_t localThreads[3]  = {16, 16, 1};
 352
 353             std::vector< std::pair<size_t, const void *> > args;
 354             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data) );
 355             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows) );
 356             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols) );
 357             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows) );
 358             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols) );
 359             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows) );
 360             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols) );
 361             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 362             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 363             Vec4f templ_sum = Vec4f::all(0);
 364             // to be continued in the following section
 365             if(image.oclchannels() == 1)
 366             {
 367                 buf.image_sums.resize(1);
 368                 integral(image, buf.image_sums[0]);
 369
 370                 templ_sum[0] = (float)sum(templ)[0] / templ.size().area();
 371                 args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 372                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 373                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 374                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 375             }
 376             else
 377             {
 378
 379                 split(image, buf.images);
 380                 templ_sum = sum(templ) / templ.size().area();
 381                 buf.image_sums.resize(buf.images.size());
 382
 383
 384                 for(int i = 0; i < image.oclchannels(); i ++)
 385                 {
 386                     integral(buf.images[i], buf.image_sums[i]);
 387                 }
 388                 switch(image.oclchannels())
 389                 {
 390                 case 4:
 391                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 392                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
 393                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
 394                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
 395                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 396                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 397                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 398                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
 399                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
 400                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
 401                     break;
 402                 default:
 403                     CV_Error(Error::StsBadArg, "matchTemplate: unsupported number of channels");
 404                     break;
 405                 }
 406             }
 407             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 408         }
 409
 410         void matchTemplate_CCOFF_NORMED(
 411             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 412         {
 413             image.convertTo(buf.imagef, CV_32F);
 414             templ.convertTo(buf.templf, CV_32F);
 415
 416             matchTemplate_CCORR(buf.imagef, buf.templf, result, buf);
 417             float scale = 1.f / templ.size().area();
 418
 419             Context *clCxt = image.clCxt;
 420             String kernelName;
 421
 422             kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
 423             size_t globalThreads[3] = {result.cols, result.rows, 1};
 424             size_t localThreads[3]  = {16, 16, 1};
 425
 426             std::vector< std::pair<size_t, const void *> > args;
 427             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data) );
 428             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows) );
 429             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols) );
 430             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows) );
 431             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols) );
 432             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows) );
 433             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols) );
 434             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 435             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 436             args.push_back( std::make_pair( sizeof(cl_float), (void *)&scale) );
 437
 438             Vec4f templ_sum   = Vec4f::all(0);
 439             Vec4f templ_sqsum = Vec4f::all(0);
 440             // to be continued in the following section
 441             if(image.oclchannels() == 1)
 442             {
 443                 buf.image_sums.resize(1);
 444                 buf.image_sqsums.resize(1);
 445                 cv::ocl::oclMat temp;
 446                 integral(image, buf.image_sums[0], temp);
 447                 if(temp.depth() == CV_64F)
 448                     temp.convertTo(buf.image_sqsums[0], CV_32FC1);
 449                 else
 450                     buf.image_sqsums[0] = temp;
 451
 452                 templ_sum[0]   = (float)sum(templ)[0];
 453
 454                 templ_sqsum[0] = sqrSum(templ)[0];
 455
 456                 templ_sqsum[0] -= scale * templ_sum[0] * templ_sum[0];
 457                 templ_sum[0]   *= scale;
 458
 459                 args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 460                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 461                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 462                 args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
 463                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
 464                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
 465                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 466                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sqsum[0]) );
 467             }
 468             else
 469             {
 470
 471                 split(image, buf.images);
 472                 templ_sum   = sum(templ);
 473
 474                 templ_sqsum = sqrSum(templ);
 475
 476                 templ_sqsum -= scale * templ_sum * templ_sum;
 477
 478                 float templ_sqsum_sum = 0;
 479                 for(int i = 0; i < image.oclchannels(); i ++)
 480                 {
 481                     templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i];
 482                 }
 483                 templ_sum   *= scale;
 484                 buf.image_sums.resize(buf.images.size());
 485                 buf.image_sqsums.resize(buf.images.size());
 486                 cv::ocl::oclMat temp;
 487                 for(int i = 0; i < image.oclchannels(); i ++)
 488                 {
 489                     integral(buf.images[i], buf.image_sums[i], temp);
 490                     if(temp.depth() == CV_64F)
 491                         temp.convertTo(buf.image_sqsums[i], CV_32FC1);
 492                     else
 493                         buf.image_sqsums[i] = temp;
 494                 }
 495
 496                 switch(image.oclchannels())
 497                 {
 498                 case 4:
 499                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 500                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
 501                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
 502                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
 503                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 504                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 505                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
 506                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[1].data) );
 507                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[2].data) );
 508                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[3].data) );
 509                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
 510                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
 511                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 512                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
 513                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
 514                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
 515                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sqsum_sum) );
 516                     break;
 517                 default:
 518                     CV_Error(Error::StsBadArg, "matchTemplate: unsupported number of channels");
 519                     break;
 520                 }
 521             }
 522             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 523         }
 524         void extractFirstChannel_32F(const oclMat &image, oclMat &result)
 525         {
 526             Context *clCxt = image.clCxt;
 527             String kernelName;
 528
 529             kernelName = "extractFirstChannel";
 530             size_t globalThreads[3] = {result.cols, result.rows, 1};
 531             size_t localThreads[3]  = {16, 16, 1};
 532
 533             std::vector< std::pair<size_t, const void *> > args;
 534             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&image.data) );
 535             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data) );
 536             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows) );
 537             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols) );
 538             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.offset));
 539             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
 540             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.step));
 541             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
 542
 543             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, -1, -1);
 544         }
 545     }/*ocl*/
 546 } /*cv*/
 547
 548 void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method)
 549 {
 550     MatchTemplateBuf buf;
 551     matchTemplate(image, templ, result, method, buf);
 552 }
 553 void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method, MatchTemplateBuf &buf)
 554 {
 555     CV_Assert(image.type() == templ.type());
 556     CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
 557
 558     typedef void (*Caller)(const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &);
 559
 560     const Caller callers[] =
 561     {
 562         ::matchTemplate_SQDIFF, ::matchTemplate_SQDIFF_NORMED,
 563         ::matchTemplate_CCORR, ::matchTemplate_CCORR_NORMED,
 564         ::matchTemplate_CCOFF, ::matchTemplate_CCOFF_NORMED
 565     };
 566
 567     Caller caller = callers[method];
 568     CV_Assert(caller);
 569     caller(image, templ, result, buf);
 570 }