modules/ocl/src/match_template.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Peng Xiao, pengxiao@multicorewareinc.com
  19 //
  20 // Redistribution and use in source and binary forms, with or without modification,
  21 // are permitted provided that the following conditions are met:
  22 //
  23 //   * Redistribution's of source code must retain the above copyright notice,
  24 //     this list of conditions and the following disclaimer.
  25 //
  26 //   * Redistribution's in binary form must reproduce the above copyright notice,
  27 //     this list of conditions and the following disclaimer in the documentation
  28 //     and/or other materials provided with the distribution.
  29 //
  30 //   * The name of the copyright holders may not be used to endorse or promote products
  31 //     derived from this software without specific prior written permission.
  32 //
  33 // This software is provided by the copyright holders and contributors as is and
  34 // any express or implied warranties, including, but not limited to, the implied
  35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  36 // In no event shall the Intel Corporation or contributors be liable for any direct,
  37 // indirect, incidental, special, exemplary, or consequential damages
  38 // (including, but not limited to, procurement of substitute goods or services;
  39 // loss of use, data, or profits; or business interruption) however caused
  40 // and on any theory of liability, whether in contract, strict liability,
  41 // or tort (including negligence or otherwise) arising in any way out of
  42 // the use of this software, even if advised of the possibility of such damage.
  43 //
  44 //M*/
  45
  46
  47 #include <iomanip>
  48 #include "precomp.hpp"
  49
  50 using namespace cv;
  51 using namespace cv::ocl;
  52 using namespace std;
  53
  54 //helper routines
  55 namespace cv
  56 {
  57     namespace ocl
  58     {
  59         ///////////////////////////OpenCL kernel strings///////////////////////////
  60         extern const char *match_template;
  61     }
  62 }
  63
  64 namespace cv
  65 {
  66     namespace ocl
  67     {
  68         void matchTemplate_SQDIFF(
  69             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  70
  71         void matchTemplate_SQDIFF_NORMED(
  72             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  73
  74         void convolve_32F(
  75             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  76
  77         void matchTemplate_CCORR(
  78             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  79
  80         void matchTemplate_CCORR_NORMED(
  81             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  82
  83         void matchTemplate_CCOFF(
  84             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  85
  86         void matchTemplate_CCOFF_NORMED(
  87             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
  88
  89
  90         void matchTemplateNaive_SQDIFF(
  91             const oclMat &image, const oclMat &templ, oclMat &result, int cn);
  92
  93         void matchTemplateNaive_CCORR(
  94             const oclMat &image, const oclMat &templ, oclMat &result, int cn);
  95
  96         void extractFirstChannel_32F(
  97             const oclMat &image, oclMat &result);
  98
  99         // Evaluates optimal template's area threshold. If
 100         // template's area is less  than the threshold, we use naive match
 101         // template version, otherwise FFT-based (if available)
 102         static bool useNaive(int , int , Size )
 103         {
 104             // FIXME!
 105             //   always use naive until convolve is imported
 106             return true;
 107         }
 108
 109         //////////////////////////////////////////////////////////////////////
 110         // SQDIFF
 111         void matchTemplate_SQDIFF(
 112             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf & buf)
 113         {
 114             result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
 115             if (useNaive(CV_TM_SQDIFF, image.depth(), templ.size()))
 116             {
 117                 matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels());
 118                 return;
 119             }
 120             else
 121             {
 122                 buf.image_sqsums.resize(1);
 123
 124                 // TODO, add double support for ocl::integral
 125                 // use CPU integral temporarily
 126                 Mat sums, sqsums;
 127                 cv::integral(Mat(image.reshape(1)), sums, sqsums);
 128                 buf.image_sqsums[0] = sqsums;
 129
 130                 unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 131                 matchTemplate_CCORR(image, templ, result, buf);
 132
 133                 //port CUDA's matchTemplatePrepared_SQDIFF_8U
 134                 Context *clCxt = image.clCxt;
 135                 string kernelName = "matchTemplate_Prepared_SQDIFF";
 136                 vector< pair<size_t, const void *> > args;
 137
 138                 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
 139                 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 140                 args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 141                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 142                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 143                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 144                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 145                 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
 146                 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
 147                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 148                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 149
 150                 size_t globalThreads[3] = {result.cols, result.rows, 1};
 151                 size_t localThreads[3]  = {16, 16, 1};
 152
 153                 const char * build_opt = image.oclchannels() == 4 ? "-D CN4" : "";
 154                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U, build_opt);
 155             }
 156         }
 157
 158         void matchTemplate_SQDIFF_NORMED(
 159             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 160         {
 161             matchTemplate_CCORR(image, templ, result, buf);
 162             buf.image_sums.resize(1);
 163
 164             integral(image.reshape(1), buf.image_sums[0]);
 165
 166             unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 167
 168             Context *clCxt = image.clCxt;
 169             string kernelName = "matchTemplate_Prepared_SQDIFF_NORMED";
 170             vector< pair<size_t, const void *> > args;
 171
 172             args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data));
 173             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 174             args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 175             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 176             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 177             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 178             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 179             args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset));
 180             args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step));
 181             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 182             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 183
 184             size_t globalThreads[3] = {result.cols, result.rows, 1};
 185             size_t localThreads[3]  = {16, 16, 1};
 186             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
 187         }
 188
 189         void matchTemplateNaive_SQDIFF(
 190             const oclMat &image, const oclMat &templ, oclMat &result, int)
 191         {
 192             CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
 193                       || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
 194                      );
 195             CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
 196             CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
 197
 198             Context *clCxt = image.clCxt;
 199             string kernelName = "matchTemplate_Naive_SQDIFF";
 200
 201             vector< pair<size_t, const void *> > args;
 202
 203             args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
 204             args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
 205             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 206             args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
 207             args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
 208             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 209             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 210             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 211             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 212             args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
 213             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
 214             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 215             args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
 216             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
 217             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 218
 219             size_t globalThreads[3] = {result.cols, result.rows, 1};
 220             size_t localThreads[3]  = {16, 16, 1};
 221             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 222         }
 223
 224         //////////////////////////////////////////////////////////////////////
 225         // CCORR
 226         void convolve_32F(
 227             const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &)
 228         {
 229             CV_Error(-1, "convolve is not fully implemented yet");
 230         }
 231
 232         void matchTemplate_CCORR(
 233             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 234         {
 235             result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
 236             if (useNaive(CV_TM_CCORR, image.depth(), templ.size()))
 237             {
 238                 matchTemplateNaive_CCORR(image, templ, result, image.oclchannels());
 239                 return;
 240             }
 241             else
 242             {
 243                 if(image.depth() == CV_8U && templ.depth() == CV_8U)
 244                 {
 245                     image.convertTo(buf.imagef, CV_32F);
 246                     templ.convertTo(buf.templf, CV_32F);
 247                     convolve_32F(buf.imagef, buf.templf, result, buf);
 248                 }
 249                 else
 250                 {
 251                     convolve_32F(image, templ, result, buf);
 252                 }
 253             }
 254         }
 255
 256         void matchTemplate_CCORR_NORMED(
 257             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 258         {
 259             matchTemplate_CCORR(image, templ, result, buf);
 260             buf.image_sums.resize(1);
 261             buf.image_sqsums.resize(1);
 262
 263             integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
 264
 265             unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 266
 267             Context *clCxt = image.clCxt;
 268             string kernelName = "normalizeKernel";
 269             vector< pair<size_t, const void *> > args;
 270
 271             args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
 272             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 273             args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 274             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 275             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 276             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 277             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 278             args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
 279             args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
 280             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 281             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 282
 283             size_t globalThreads[3] = {result.cols, result.rows, 1};
 284             size_t localThreads[3]  = {16, 16, 1};
 285             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
 286         }
 287
 288         void matchTemplateNaive_CCORR(
 289             const oclMat &image, const oclMat &templ, oclMat &result, int)
 290         {
 291             CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
 292                       || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
 293                      );
 294             CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
 295             CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
 296
 297             Context *clCxt = image.clCxt;
 298             string kernelName = "matchTemplate_Naive_CCORR";
 299
 300             vector< pair<size_t, const void *> > args;
 301
 302             args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
 303             args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
 304             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 305             args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
 306             args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
 307             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 308             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 309             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 310             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 311             args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
 312             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
 313             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 314             args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
 315             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
 316             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 317
 318             size_t globalThreads[3] = {result.cols, result.rows, 1};
 319             size_t localThreads[3]  = {16, 16, 1};
 320             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 321         }
 322         //////////////////////////////////////////////////////////////////////
 323         // CCOFF
 324         void matchTemplate_CCOFF(
 325             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 326         {
 327             CV_Assert(image.depth() == CV_8U && templ.depth() == CV_8U);
 328
 329             matchTemplate_CCORR(image, templ, result, buf);
 330
 331             Context *clCxt = image.clCxt;
 332             string kernelName;
 333
 334             kernelName = "matchTemplate_Prepared_CCOFF";
 335             size_t globalThreads[3] = {result.cols, result.rows, 1};
 336             size_t localThreads[3]  = {16, 16, 1};
 337
 338             vector< pair<size_t, const void *> > args;
 339             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
 340             args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) );
 341             args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
 342             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
 343             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
 344             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
 345             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
 346             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 347             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 348             Vec4f templ_sum = Vec4f::all(0);
 349             // to be continued in the following section
 350             if(image.oclchannels() == 1)
 351             {
 352                 buf.image_sums.resize(1);
 353                 integral(image, buf.image_sums[0]);
 354
 355                 templ_sum[0] = (float)sum(templ)[0] / templ.size().area();
 356                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 357                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 358                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 359                 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 360             }
 361             else
 362             {
 363
 364                 split(image, buf.images);
 365                 templ_sum = sum(templ) / templ.size().area();
 366                 buf.image_sums.resize(buf.images.size());
 367
 368
 369                 for(int i = 0; i < image.oclchannels(); i ++)
 370                 {
 371                     integral(buf.images[i], buf.image_sums[i]);
 372                 }
 373                 switch(image.oclchannels())
 374                 {
 375                 case 4:
 376                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 377                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
 378                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
 379                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
 380                     args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 381                     args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 382                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 383                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
 384                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
 385                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
 386                     break;
 387                 default:
 388                     CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
 389                     break;
 390                 }
 391             }
 392             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 393         }
 394
 395         void matchTemplate_CCOFF_NORMED(
 396             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 397         {
 398             image.convertTo(buf.imagef, CV_32F);
 399             templ.convertTo(buf.templf, CV_32F);
 400
 401             matchTemplate_CCORR(buf.imagef, buf.templf, result, buf);
 402             float scale = 1.f / templ.size().area();
 403
 404             Context *clCxt = image.clCxt;
 405             string kernelName;
 406
 407             kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
 408             size_t globalThreads[3] = {result.cols, result.rows, 1};
 409             size_t localThreads[3]  = {16, 16, 1};
 410
 411             vector< pair<size_t, const void *> > args;
 412             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
 413             args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) );
 414             args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
 415             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
 416             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
 417             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
 418             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
 419             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 420             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 421             args.push_back( make_pair( sizeof(cl_float), (void *)&scale) );
 422
 423             Vec4f templ_sum   = Vec4f::all(0);
 424             Vec4f templ_sqsum = Vec4f::all(0);
 425             // to be continued in the following section
 426             if(image.oclchannels() == 1)
 427             {
 428                 buf.image_sums.resize(1);
 429                 buf.image_sqsums.resize(1);
 430                 integral(image, buf.image_sums[0], buf.image_sqsums[0]);
 431
 432                 templ_sum[0]   = (float)sum(templ)[0];
 433
 434                 templ_sqsum[0] = sqrSum(templ)[0];
 435
 436                 templ_sqsum[0] -= scale * templ_sum[0] * templ_sum[0];
 437                 templ_sum[0]   *= scale;
 438
 439                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 440                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 441                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 442                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
 443                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
 444                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
 445                 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 446                 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sqsum[0]) );
 447             }
 448             else
 449             {
 450
 451                 split(image, buf.images);
 452                 templ_sum   = sum(templ);
 453
 454                 templ_sqsum = sqrSum(templ);
 455
 456                 templ_sqsum -= scale * templ_sum * templ_sum;
 457
 458                 float templ_sqsum_sum = 0;
 459                 for(int i = 0; i < image.oclchannels(); i ++)
 460                 {
 461                     templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i];
 462                 }
 463                 templ_sum   *= scale;
 464                 buf.image_sums.resize(buf.images.size());
 465                 buf.image_sqsums.resize(buf.images.size());
 466
 467                 for(int i = 0; i < image.oclchannels(); i ++)
 468                 {
 469                     integral(buf.images[i], buf.image_sums[i], buf.image_sqsums[i]);
 470                 }
 471
 472                 switch(image.oclchannels())
 473                 {
 474                 case 4:
 475                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 476                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
 477                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
 478                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
 479                     args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 480                     args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 481                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
 482                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[1].data) );
 483                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[2].data) );
 484                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[3].data) );
 485                     args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
 486                     args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
 487                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 488                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
 489                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
 490                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
 491                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sqsum_sum) );
 492                     break;
 493                 default:
 494                     CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
 495                     break;
 496                 }
 497             }
 498             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 499         }
 500         void extractFirstChannel_32F(const oclMat &image, oclMat &result)
 501         {
 502             Context *clCxt = image.clCxt;
 503             string kernelName;
 504
 505             kernelName = "extractFirstChannel";
 506             size_t globalThreads[3] = {result.cols, result.rows, 1};
 507             size_t localThreads[3]  = {16, 16, 1};
 508
 509             vector< pair<size_t, const void *> > args;
 510             args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data) );
 511             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
 512             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
 513             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
 514             args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
 515             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 516             args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
 517             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 518
 519             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, -1, -1);
 520         }
 521     }/*ocl*/
 522 } /*cv*/
 523
 524 void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method)
 525 {
 526     MatchTemplateBuf buf;
 527     matchTemplate(image, templ, result, method, buf);
 528 }
 529 void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method, MatchTemplateBuf &buf)
 530 {
 531     CV_Assert(image.type() == templ.type());
 532     CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
 533
 534     typedef void (*Caller)(const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &);
 535
 536     const Caller callers[] =
 537     {
 538         ::matchTemplate_SQDIFF, ::matchTemplate_SQDIFF_NORMED,
 539         ::matchTemplate_CCORR, ::matchTemplate_CCORR_NORMED,
 540         ::matchTemplate_CCOFF, ::matchTemplate_CCOFF_NORMED
 541     };
 542
 543     Caller caller = callers[method];
 544     CV_Assert(caller);
 545     caller(image, templ, result, buf);
 546 }