modules/ocl/src/match_template.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Peng Xiao, pengxiao@multicorewareinc.com
  19 //
  20 // Redistribution and use in source and binary forms, with or without modification,
  21 // are permitted provided that the following conditions are met:
  22 //
  23 //   * Redistribution's of source code must retain the above copyright notice,
  24 //     this list of conditions and the following disclaimer.
  25 //
  26 //   * Redistribution's in binary form must reproduce the above copyright notice,
  27 //     this list of conditions and the following disclaimer in the documentation
  28 //     and/or other materials provided with the distribution.
  29 //
  30 //   * The name of the copyright holders may not be used to endorse or promote products
  31 //     derived from this software without specific prior written permission.
  32 //
  33 // This software is provided by the copyright holders and contributors as is and
  34 // any express or implied warranties, including, but not limited to, the implied
  35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  36 // In no event shall the Intel Corporation or contributors be liable for any direct,
  37 // indirect, incidental, special, exemplary, or consequential damages
  38 // (including, but not limited to, procurement of substitute goods or services;
  39 // loss of use, data, or profits; or business interruption) however caused
  40 // and on any theory of liability, whether in contract, strict liability,
  41 // or tort (including negligence or otherwise) arising in any way out of
  42 // the use of this software, even if advised of the possibility of such damage.
  43 //
  44 //M*/
  45
  46
  47 #include "precomp.hpp"
  48 #include "opencl_kernels.hpp"
  49
  50 using namespace cv;
  51 using namespace cv::ocl;
  52
  53 namespace cv
  54 {
  55     namespace ocl
  56     {
        // Forward declarations of the implementations defined later in this file.

        // Per-method entry points. Each takes the source image, the template, the
        // output map and a scratch buffer object; these six are the functions
        // stored in the dispatch table of cv::ocl::matchTemplate.
        void matchTemplate_SQDIFF(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

        void matchTemplate_SQDIFF_NORMED(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

        // Convolution-based path used by matchTemplate_CCORR when the naive kernel
        // is not selected (currently a stub that raises an error).
        void convolve_32F(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

        void matchTemplate_CCORR(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

        void matchTemplate_CCORR_NORMED(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

        void matchTemplate_CCOFF(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);

        void matchTemplate_CCOFF_NORMED(
            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);


        // Naive kernels. The trailing int (cn) receives image.oclchannels() from
        // the callers in this file; the definitions leave it unnamed and unused.
        void matchTemplateNaive_SQDIFF(
            const oclMat &image, const oclMat &templ, oclMat &result, int cn);

        void matchTemplateNaive_CCORR(
            const oclMat &image, const oclMat &templ, oclMat &result, int cn);

        // Launches the "extractFirstChannel" kernel from image into result.
        void extractFirstChannel_32F(
            const oclMat &image, oclMat &result);
  87
  88         // Evaluates optimal template's area threshold. If
  89         // template's area is less  than the threshold, we use naive match
  90         // template version, otherwise FFT-based (if available)
  91         static bool useNaive(int , int , Size )
  92         {
  93             // FIXME!
  94             //   always use naive until convolve is imported
  95             return true;
  96         }
  97
  98         //////////////////////////////////////////////////////////////////////
  99         // SQDIFF
 100         void matchTemplate_SQDIFF(
 101             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf & buf)
 102         {
 103             result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
 104             if (useNaive(CV_TM_SQDIFF, image.depth(), templ.size()))
 105             {
 106                 matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels());
 107                 return;
 108             }
 109             else
 110             {
 111                 buf.image_sqsums.resize(1);
 112
 113                 // TODO, add double support for ocl::integral
 114                 // use CPU integral temporarily
 115                 Mat sums, sqsums;
 116                 cv::integral(Mat(image.reshape(1)), sums, sqsums);
 117                 buf.image_sqsums[0] = sqsums;
 118
 119                 unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 120                 matchTemplate_CCORR(image, templ, result, buf);
 121
 122                 //port CUDA's matchTemplatePrepared_SQDIFF_8U
 123                 Context *clCxt = image.clCxt;
 124                 string kernelName = "matchTemplate_Prepared_SQDIFF";
 125                 vector< pair<size_t, const void *> > args;
 126
 127                 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
 128                 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 129                 args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 130                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 131                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 132                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 133                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 134                 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
 135                 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
 136                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 137                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 138
 139                 size_t globalThreads[3] = {result.cols, result.rows, 1};
 140                 size_t localThreads[3]  = {16, 16, 1};
 141
 142                 const char * build_opt = image.oclchannels() == 4 ? "-D CN4" : "";
 143                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U, build_opt);
 144             }
 145         }
 146
 147         void matchTemplate_SQDIFF_NORMED(
 148             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 149         {
 150             matchTemplate_CCORR(image, templ, result, buf);
 151             buf.image_sums.resize(1);
 152
 153             integral(image.reshape(1), buf.image_sums[0]);
 154
 155             unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 156
 157             Context *clCxt = image.clCxt;
 158             string kernelName = "matchTemplate_Prepared_SQDIFF_NORMED";
 159             vector< pair<size_t, const void *> > args;
 160
 161             args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data));
 162             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 163             args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 164             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 165             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 166             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 167             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 168             args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset));
 169             args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step));
 170             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 171             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 172
 173             size_t globalThreads[3] = {result.cols, result.rows, 1};
 174             size_t localThreads[3]  = {16, 16, 1};
 175             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
 176         }
 177
 178         void matchTemplateNaive_SQDIFF(
 179             const oclMat &image, const oclMat &templ, oclMat &result, int)
 180         {
 181             CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
 182                       || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
 183                      );
 184             CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
 185             CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
 186
 187             Context *clCxt = image.clCxt;
 188             string kernelName = "matchTemplate_Naive_SQDIFF";
 189
 190             vector< pair<size_t, const void *> > args;
 191
 192             args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
 193             args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
 194             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 195             args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
 196             args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
 197             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 198             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 199             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 200             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 201             args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
 202             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
 203             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 204             args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
 205             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
 206             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 207
 208             size_t globalThreads[3] = {result.cols, result.rows, 1};
 209             size_t localThreads[3]  = {16, 16, 1};
 210             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 211         }
 212
 213         //////////////////////////////////////////////////////////////////////
 214         // CCORR
 215         void convolve_32F(
 216             const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &)
 217         {
 218             CV_Error(-1, "convolve is not fully implemented yet");
 219         }
 220
 221         void matchTemplate_CCORR(
 222             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 223         {
 224             result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
 225             if (useNaive(CV_TM_CCORR, image.depth(), templ.size()))
 226             {
 227                 matchTemplateNaive_CCORR(image, templ, result, image.oclchannels());
 228                 return;
 229             }
 230             else
 231             {
 232                 if(image.depth() == CV_8U && templ.depth() == CV_8U)
 233                 {
 234                     image.convertTo(buf.imagef, CV_32F);
 235                     templ.convertTo(buf.templf, CV_32F);
 236                     convolve_32F(buf.imagef, buf.templf, result, buf);
 237                 }
 238                 else
 239                 {
 240                     convolve_32F(image, templ, result, buf);
 241                 }
 242             }
 243         }
 244
 245         void matchTemplate_CCORR_NORMED(
 246             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 247         {
 248             matchTemplate_CCORR(image, templ, result, buf);
 249             buf.image_sums.resize(1);
 250             buf.image_sqsums.resize(1);
 251
 252             integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
 253
 254             unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 255
 256             Context *clCxt = image.clCxt;
 257             string kernelName = "normalizeKernel";
 258             vector< pair<size_t, const void *> > args;
 259
 260             args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
 261             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 262             args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 263             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 264             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 265             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 266             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 267             args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
 268             args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
 269             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 270             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 271
 272             size_t globalThreads[3] = {result.cols, result.rows, 1};
 273             size_t localThreads[3]  = {16, 16, 1};
 274             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
 275         }
 276
 277         void matchTemplateNaive_CCORR(
 278             const oclMat &image, const oclMat &templ, oclMat &result, int)
 279         {
 280             CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
 281                       || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
 282                      );
 283             CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
 284             CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
 285
 286             Context *clCxt = image.clCxt;
 287             string kernelName = "matchTemplate_Naive_CCORR";
 288
 289             vector< pair<size_t, const void *> > args;
 290
 291             args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
 292             args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
 293             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 294             args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
 295             args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
 296             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 297             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 298             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 299             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 300             args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
 301             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
 302             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 303             args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
 304             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
 305             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 306
 307             size_t globalThreads[3] = {result.cols, result.rows, 1};
 308             size_t localThreads[3]  = {16, 16, 1};
 309             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 310         }
 311         //////////////////////////////////////////////////////////////////////
 312         // CCOFF
 313         void matchTemplate_CCOFF(
 314             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 315         {
 316             CV_Assert(image.depth() == CV_8U && templ.depth() == CV_8U);
 317
 318             matchTemplate_CCORR(image, templ, result, buf);
 319
 320             Context *clCxt = image.clCxt;
 321             string kernelName;
 322
 323             kernelName = "matchTemplate_Prepared_CCOFF";
 324             size_t globalThreads[3] = {result.cols, result.rows, 1};
 325             size_t localThreads[3]  = {16, 16, 1};
 326
 327             vector< pair<size_t, const void *> > args;
 328             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
 329             args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) );
 330             args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
 331             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
 332             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
 333             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
 334             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
 335             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 336             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 337             Vec4f templ_sum = Vec4f::all(0);
 338             // to be continued in the following section
 339             if(image.oclchannels() == 1)
 340             {
 341                 buf.image_sums.resize(1);
 342                 integral(image, buf.image_sums[0]);
 343
 344                 templ_sum[0] = (float)sum(templ)[0] / templ.size().area();
 345                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 346                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 347                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 348                 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 349             }
 350             else
 351             {
 352
 353                 split(image, buf.images);
 354                 templ_sum = sum(templ) / templ.size().area();
 355                 buf.image_sums.resize(buf.images.size());
 356
 357
 358                 for(int i = 0; i < image.oclchannels(); i ++)
 359                 {
 360                     integral(buf.images[i], buf.image_sums[i]);
 361                 }
 362                 switch(image.oclchannels())
 363                 {
 364                 case 4:
 365                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 366                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
 367                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
 368                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
 369                     args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 370                     args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 371                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 372                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
 373                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
 374                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
 375                     break;
 376                 default:
 377                     CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
 378                     break;
 379                 }
 380             }
 381             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 382         }
 383
 384         void matchTemplate_CCOFF_NORMED(
 385             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
 386         {
 387             image.convertTo(buf.imagef, CV_32F);
 388             templ.convertTo(buf.templf, CV_32F);
 389
 390             matchTemplate_CCORR(buf.imagef, buf.templf, result, buf);
 391             float scale = 1.f / templ.size().area();
 392
 393             Context *clCxt = image.clCxt;
 394             string kernelName;
 395
 396             kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
 397             size_t globalThreads[3] = {result.cols, result.rows, 1};
 398             size_t localThreads[3]  = {16, 16, 1};
 399
 400             vector< pair<size_t, const void *> > args;
 401             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
 402             args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) );
 403             args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
 404             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
 405             args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
 406             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
 407             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
 408             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 409             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 410             args.push_back( make_pair( sizeof(cl_float), (void *)&scale) );
 411
 412             Vec4f templ_sum   = Vec4f::all(0);
 413             Vec4f templ_sqsum = Vec4f::all(0);
 414             // to be continued in the following section
 415             if(image.oclchannels() == 1)
 416             {
 417                 buf.image_sums.resize(1);
 418                 buf.image_sqsums.resize(1);
 419                 integral(image, buf.image_sums[0], buf.image_sqsums[0]);
 420
 421                 templ_sum[0]   = (float)sum(templ)[0];
 422
 423                 templ_sqsum[0] = sqrSum(templ)[0];
 424
 425                 templ_sqsum[0] -= scale * templ_sum[0] * templ_sum[0];
 426                 templ_sum[0]   *= scale;
 427
 428                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 429                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 430                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 431                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
 432                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
 433                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
 434                 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 435                 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sqsum[0]) );
 436             }
 437             else
 438             {
 439
 440                 split(image, buf.images);
 441                 templ_sum   = sum(templ);
 442
 443                 templ_sqsum = sqrSum(templ);
 444
 445                 templ_sqsum -= scale * templ_sum * templ_sum;
 446
 447                 float templ_sqsum_sum = 0;
 448                 for(int i = 0; i < image.oclchannels(); i ++)
 449                 {
 450                     templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i];
 451                 }
 452                 templ_sum   *= scale;
 453                 buf.image_sums.resize(buf.images.size());
 454                 buf.image_sqsums.resize(buf.images.size());
 455
 456                 for(int i = 0; i < image.oclchannels(); i ++)
 457                 {
 458                     integral(buf.images[i], buf.image_sums[i], buf.image_sqsums[i]);
 459                 }
 460
 461                 switch(image.oclchannels())
 462                 {
 463                 case 4:
 464                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 465                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
 466                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
 467                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
 468                     args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 469                     args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 470                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
 471                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[1].data) );
 472                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[2].data) );
 473                     args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[3].data) );
 474                     args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
 475                     args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
 476                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
 477                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
 478                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
 479                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
 480                     args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sqsum_sum) );
 481                     break;
 482                 default:
 483                     CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
 484                     break;
 485                 }
 486             }
 487             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
 488         }
 489         void extractFirstChannel_32F(const oclMat &image, oclMat &result)
 490         {
 491             Context *clCxt = image.clCxt;
 492             string kernelName;
 493
 494             kernelName = "extractFirstChannel";
 495             size_t globalThreads[3] = {result.cols, result.rows, 1};
 496             size_t localThreads[3]  = {16, 16, 1};
 497
 498             vector< pair<size_t, const void *> > args;
 499             args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data) );
 500             args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
 501             args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
 502             args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
 503             args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
 504             args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 505             args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
 506             args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 507
 508             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, -1, -1);
 509         }
 510     }/*ocl*/
 511 } /*cv*/
 512
 513 void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method)
 514 {
 515     MatchTemplateBuf buf;
 516     matchTemplate(image, templ, result, method, buf);
 517 }
 518 void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method, MatchTemplateBuf &buf)
 519 {
 520     CV_Assert(image.type() == templ.type());
 521     CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
 522
 523     typedef void (*Caller)(const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &);
 524
 525     const Caller callers[] =
 526     {
 527         ::matchTemplate_SQDIFF, ::matchTemplate_SQDIFF_NORMED,
 528         ::matchTemplate_CCORR, ::matchTemplate_CCORR_NORMED,
 529         ::matchTemplate_CCOFF, ::matchTemplate_CCOFF_NORMED
 530     };
 531
 532     Caller caller = callers[method];
 533     CV_Assert(caller);
 534     caller(image, templ, result, buf);
 535 }