modules/ocl/src/match_template.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Peng Xiao, pengxiao@multicorewareinc.com
  19 //
  20 // Redistribution and use in source and binary forms, with or without modification,
  21 // are permitted provided that the following conditions are met:
  22 //
  23 //   * Redistribution's of source code must retain the above copyright notice,
  24 //     this list of conditions and the following disclaimer.
  25 //
  26 //   * Redistribution's in binary form must reproduce the above copyright notice,
  27 //     this list of conditions and the following disclaimer in the documentation
  28 //     and/or other materials provided with the distribution.
  29 //
  30 //   * The name of the copyright holders may not be used to endorse or promote products
  31 //     derived from this software without specific prior written permission.
  32 //
  33 // This software is provided by the copyright holders and contributors as is and
  34 // any express or implied warranties, including, but not limited to, the implied
  35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  36 // In no event shall the Intel Corporation or contributors be liable for any direct,
  37 // indirect, incidental, special, exemplary, or consequential damages
  38 // (including, but not limited to, procurement of substitute goods or services;
  39 // loss of use, data, or profits; or business interruption) however caused
  40 // and on any theory of liability, whether in contract, strict liability,
  41 // or tort (including negligence or otherwise) arising in any way out of
  42 // the use of this software, even if advised of the possibility of such damage.
  43 //
  44 //M*/
  45
  46
  47 #include "precomp.hpp"
  48 #include "opencl_kernels.hpp"
  49
  50 using namespace cv;
  51 using namespace cv::ocl;
  52
  53 namespace cv
  54 {
  55     namespace ocl
  56     {
  57         void matchTemplate_SQDIFF(
  58             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  59
  60         void matchTemplate_SQDIFF_NORMED(
  61             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  62
  63         void convolve_32F(
  64             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  65
  66         void matchTemplate_CCORR(
  67             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  68
  69         void matchTemplate_CCORR_NORMED(
  70             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  71
  72         void matchTemplate_CCOFF(
  73             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  74
  75         void matchTemplate_CCOFF_NORMED(
  76             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  77
  78
  79         void matchTemplateNaive_SQDIFF(
  80             const oclMat &image, const oclMat &templ, oclMat &result, int cn);
  81
  82         void matchTemplateNaive_CCORR(
  83             const oclMat &image, const oclMat &templ, oclMat &result, int cn);
  84
  85         void extractFirstChannel_32F(
  86             const oclMat &image, oclMat &result);
  87
  88         // Evaluates optimal template's area threshold. If
  89         // template's area is less  than the threshold, we use naive match
  90         // template version, otherwise FFT-based (if available)
  91         static bool useNaive(int , int , Size )
  92         {
  93             // FIXME!
  94             //   always use naive until convolve is imported
  95             return true;
  96         }
  97
  98         //////////////////////////////////////////////////////////////////////
  99         // SQDIFF
 100         void matchTemplate_SQDIFF(
 101             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf & buf)
 102         {
 103             result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
 104             if (useNaive(CV_TM_SQDIFF, image.depth(), templ.size()))
 105             {
 106                 matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels());
 107                 return;
 108             }
 109             else
 110             {
 111                 buf.image_sqsums.resize(1);
 112
 113                 // TODO, add double support for ocl::integral
 114                 // use CPU integral temporarily
 115                 Mat sums, sqsums;
 116                 cv::integral(Mat(image.reshape(1)), sums, sqsums);
 117                 buf.image_sqsums[0] = sqsums;
 118
 119                 unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 120                 matchTemplate_CCORR(image, templ, result, buf);
 121
 122                 //port CUDA's matchTemplatePrepared_SQDIFF_8U
 123                 Context *clCxt = image.clCxt;
 124                 string kernelName = "matchTemplate_Prepared_SQDIFF";
 125                 vector< pair<size_t, const void *> > args;
 126
 127                 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
 128                 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 129                 args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 130                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 131                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 132                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 133                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 134                 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
 135                 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
 136                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 137                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 138
 139                 size_t globalThreads[3] = {result.cols, result.rows, 1};
 140                 size_t localThreads[3]  = {16, 16, 1};
 141
 142                 const char * build_opt = image.oclchannels() == 4 ? "-D CN4" : "";
 143                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U, build_opt);
 144             }
 145         }
 146
 147         void matchTemplate_SQDIFF_NORMED(
 148             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 149         {
 150             matchTemplate_CCORR(image, templ, result, buf);
 151             buf.image_sums.resize(1);
 152
 153             integral(image.reshape(1), buf.image_sums[0]);
 154
 155             unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 156
 157             Context *clCxt = image.clCxt;
 158             string kernelName = "matchTemplate_Prepared_SQDIFF_NORMED";
 159             vector< pair<size_t, const void *> > args;
 160
 161             args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data));
 162             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 163             args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 164             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 165             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 166             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 167             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 168             args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset));
 169             args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step));
 170             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 171             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 172
 173             size_t globalThreads[3] = {result.cols, result.rows, 1};
 174             size_t localThreads[3]  = {16, 16, 1};
 175             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
 176         }
 177
 178         void matchTemplateNaive_SQDIFF(
 179             const oclMat &image, const oclMat &templ, oclMat &result, int)
 180         {
 181             CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
 182                       || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
 183                      );
 184             CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
 185             CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
 186
 187             Context *clCxt = image.clCxt;
 188             string kernelName = "matchTemplate_Naive_SQDIFF";
 189
 190             vector< pair<size_t, const void *> > args;
 191
 192             args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
 193             args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
 194             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 195             args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
 196             args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
 197             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 198             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 199             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 200             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 201             args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
 202             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
 203             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 204             args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
 205             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
 206             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 207
 208             size_t globalThreads[3] = {result.cols, result.rows, 1};
 209             size_t localThreads[3]  = {16, 16, 1};
 210             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 211         }
 212
 213         //////////////////////////////////////////////////////////////////////
 214         // CCORR
 215         void convolve_32F(
 216             const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &)
 217         {
 218             CV_Error(-1, "convolve is not fully implemented yet");
 219         }
 220
 221         void matchTemplate_CCORR(
 222             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 223         {
 224             result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
 225             if (useNaive(CV_TM_CCORR, image.depth(), templ.size()))
 226             {
 227                 matchTemplateNaive_CCORR(image, templ, result, image.oclchannels());
 228                 return;
 229             }
 230             else
 231             {
 232                 if(image.depth() == CV_8U && templ.depth() == CV_8U)
 233                 {
 234                     image.convertTo(buf.imagef, CV_32F);
 235                     templ.convertTo(buf.templf, CV_32F);
 236                     convolve_32F(buf.imagef, buf.templf, result, buf);
 237                 }
 238                 else
 239                 {
 240                     convolve_32F(image, templ, result, buf);
 241                 }
 242             }
 243         }
 244
 245         void matchTemplate_CCORR_NORMED(
 246             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 247         {
 248             cv::ocl::oclMat temp;
 249             matchTemplate_CCORR(image, templ, result, buf);
 250             buf.image_sums.resize(1);
 251             buf.image_sqsums.resize(1);
 252             integral(image.reshape(1), buf.image_sums[0], temp);
 253             if(temp.depth() == CV_64F)
 254                 temp.convertTo(buf.image_sqsums[0], CV_32FC1);
 255             else
 256                 buf.image_sqsums[0] = temp;
 257             unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 258
 259             Context *clCxt = image.clCxt;
 260             string kernelName = "normalizeKernel";
 261             vector< pair<size_t, const void *> > args;
 262
 263             args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
 264             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 265             args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 266             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 267             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 268             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 269             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 270             args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
 271             args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
 272             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 273             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 274
 275             size_t globalThreads[3] = {result.cols, result.rows, 1};
 276             size_t localThreads[3]  = {16, 16, 1};
 277             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
 278         }
 279
 280         void matchTemplateNaive_CCORR(
 281             const oclMat &image, const oclMat &templ, oclMat &result, int)
 282         {
 283             CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
 284                       || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
 285                      );
 286             CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
 287             CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
 288
 289             Context *clCxt = image.clCxt;
 290             string kernelName = "matchTemplate_Naive_CCORR";
 291
 292             vector< pair<size_t, const void *> > args;
 293
 294             args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
 295             args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
 296             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 297             args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
 298             args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
 299             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 300             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 301             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 302             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 303             args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
 304             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
 305             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 306             args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
 307             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
 308             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 309
 310             size_t globalThreads[3] = {result.cols, result.rows, 1};
 311             size_t localThreads[3]  = {16, 16, 1};
 312             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 313         }
 314         //////////////////////////////////////////////////////////////////////
 315         // CCOFF
 316         void matchTemplate_CCOFF(
 317             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 318         {
 319             CV_Assert(image.depth() == CV_8U && templ.depth() == CV_8U);
 320
 321             matchTemplate_CCORR(image, templ, result, buf);
 322
 323             Context *clCxt = image.clCxt;
 324             string kernelName;
 325
 326             kernelName = "matchTemplate_Prepared_CCOFF";
 327             size_t globalThreads[3] = {result.cols, result.rows, 1};
 328             size_t localThreads[3]  = {16, 16, 1};
 329
 330             vector< pair<size_t, const void *> > args;
 331             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
 332             args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) );
 333             args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
 334             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
 335             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
 336             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
 337             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
 338             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 339             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 340             Vec4f templ_sum = Vec4f::all(0);
 341             // to be continued in the following section
 342             if(image.oclchannels() == 1)
 343             {
 344                 buf.image_sums.resize(1);
 345                 integral(image, buf.image_sums[0]);
 346
 347                 templ_sum[0] = (float)sum(templ)[0] / templ.size().area();
 348                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 349                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 350                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 351                 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 352             }
 353             else
 354             {
 355
 356                 split(image, buf.images);
 357                 templ_sum = sum(templ) / templ.size().area();
 358                 buf.image_sums.resize(buf.images.size());
 359
 360
 361                 for(int i = 0; i < image.oclchannels(); i ++)
 362                 {
 363                     integral(buf.images[i], buf.image_sums[i]);
 364                 }
 365                 switch(image.oclchannels())
 366                 {
 367                 case 4:
 368                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 369                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
 370                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
 371                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
 372                     args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 373                     args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 374                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 375                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
 376                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
 377                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
 378                     break;
 379                 default:
 380                     CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
 381                     break;
 382                 }
 383             }
 384             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 385         }
 386
 387         void matchTemplate_CCOFF_NORMED(
 388             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 389         {
 390             image.convertTo(buf.imagef, CV_32F);
 391             templ.convertTo(buf.templf, CV_32F);
 392
 393             matchTemplate_CCORR(buf.imagef, buf.templf, result, buf);
 394             float scale = 1.f / templ.size().area();
 395
 396             Context *clCxt = image.clCxt;
 397             string kernelName;
 398
 399             kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
 400             size_t globalThreads[3] = {result.cols, result.rows, 1};
 401             size_t localThreads[3]  = {16, 16, 1};
 402
 403             vector< pair<size_t, const void *> > args;
 404             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
 405             args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) );
 406             args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
 407             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
 408             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
 409             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
 410             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
 411             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 412             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 413             args.push_back( make_pair( sizeof(cl_float), (void *)&scale) );
 414
 415             Vec4f templ_sum   = Vec4f::all(0);
 416             Vec4f templ_sqsum = Vec4f::all(0);
 417             // to be continued in the following section
 418             if(image.oclchannels() == 1)
 419             {
 420                 buf.image_sums.resize(1);
 421                 buf.image_sqsums.resize(1);
 422                 cv::ocl::oclMat temp;
 423                 integral(image, buf.image_sums[0], temp);
 424                 if(temp.depth() == CV_64F)
 425                     temp.convertTo(buf.image_sqsums[0], CV_32FC1);
 426                 else
 427                     buf.image_sqsums[0] = temp;
 428
 429                 templ_sum[0]   = (float)sum(templ)[0];
 430
 431                 templ_sqsum[0] = sqrSum(templ)[0];
 432
 433                 templ_sqsum[0] -= scale * templ_sum[0] * templ_sum[0];
 434                 templ_sum[0]   *= scale;
 435
 436                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 437                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 438                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 439                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
 440                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
 441                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
 442                 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 443                 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sqsum[0]) );
 444             }
 445             else
 446             {
 447
 448                 split(image, buf.images);
 449                 templ_sum   = sum(templ);
 450
 451                 templ_sqsum = sqrSum(templ);
 452
 453                 templ_sqsum -= scale * templ_sum * templ_sum;
 454
 455                 float templ_sqsum_sum = 0;
 456                 for(int i = 0; i < image.oclchannels(); i ++)
 457                 {
 458                     templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i];
 459                 }
 460                 templ_sum   *= scale;
 461                 buf.image_sums.resize(buf.images.size());
 462                 buf.image_sqsums.resize(buf.images.size());
 463                 cv::ocl::oclMat temp;
 464                 for(int i = 0; i < image.oclchannels(); i ++)
 465                 {
 466                     integral(buf.images[i], buf.image_sums[i], temp);
 467                     if(temp.depth() == CV_64F)
 468                         temp.convertTo(buf.image_sqsums[i], CV_32FC1);
 469                     else
 470                         buf.image_sqsums[i] = temp;
 471                 }
 472
 473                 switch(image.oclchannels())
 474                 {
 475                 case 4:
 476                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 477                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
 478                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
 479                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
 480                     args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 481                     args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 482                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
 483                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[1].data) );
 484                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[2].data) );
 485                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[3].data) );
 486                     args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
 487                     args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
 488                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 489                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
 490                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
 491                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
 492                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sqsum_sum) );
 493                     break;
 494                 default:
 495                     CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
 496                     break;
 497                 }
 498             }
 499             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 500         }
 501         void extractFirstChannel_32F(const oclMat &image, oclMat &result)
 502         {
 503             Context *clCxt = image.clCxt;
 504             string kernelName;
 505
 506             kernelName = "extractFirstChannel";
 507             size_t globalThreads[3] = {result.cols, result.rows, 1};
 508             size_t localThreads[3]  = {16, 16, 1};
 509
 510             vector< pair<size_t, const void *> > args;
 511             args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data) );
 512             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
 513             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
 514             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
 515             args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
 516             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 517             args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
 518             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 519
 520             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, -1, -1);
 521         }
 522     }/*ocl*/
 523 } /*cv*/
 524
 525 void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method)
 526 {
 527     MatchTemplateBuf buf;
 528     matchTemplate(image, templ, result, method, buf);
 529 }
 530 void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method, MatchTemplateBuf &buf)
 531 {
 532     CV_Assert(image.type() == templ.type());
 533     CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
 534
 535     typedef void (*Caller)(const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &);
 536
 537     const Caller callers[] =
 538     {
 539         ::matchTemplate_SQDIFF, ::matchTemplate_SQDIFF_NORMED,
 540         ::matchTemplate_CCORR, ::matchTemplate_CCORR_NORMED,
 541         ::matchTemplate_CCOFF, ::matchTemplate_CCOFF_NORMED
 542     };
 543
 544     Caller caller = callers[method];
 545     CV_Assert(caller);
 546     caller(image, templ, result, buf);
 547 }