modules/ocl/src/match_template.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Peng Xiao, pengxiao@multicorewareinc.com
  19 //
  20 // Redistribution and use in source and binary forms, with or without modification,
  21 // are permitted provided that the following conditions are met:
  22 //
  23 //   * Redistribution's of source code must retain the above copyright notice,
  24 //     this list of conditions and the following disclaimer.
  25 //
  26 //   * Redistribution's in binary form must reproduce the above copyright notice,
  27 //     this list of conditions and the following disclaimer in the documentation
  28 //     and/or other materials provided with the distribution.
  29 //
  30 //   * The name of the copyright holders may not be used to endorse or promote products
  31 //     derived from this software without specific prior written permission.
  32 //
  33 // This software is provided by the copyright holders and contributors as is and
  34 // any express or implied warranties, including, but not limited to, the implied
  35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  36 // In no event shall the Intel Corporation or contributors be liable for any direct,
  37 // indirect, incidental, special, exemplary, or consequential damages
  38 // (including, but not limited to, procurement of substitute goods or services;
  39 // loss of use, data, or profits; or business interruption) however caused
  40 // and on any theory of liability, whether in contract, strict liability,
  41 // or tort (including negligence or otherwise) arising in any way out of
  42 // the use of this software, even if advised of the possibility of such damage.
  43 //
  44 //M*/
  45
  46
  47 #include <iomanip>
  48 #include "precomp.hpp"
  49
  50 using namespace cv;
  51 using namespace cv::ocl;
  52 using namespace std;
  53
  54 #define EXT_FP64 0
  55
  56 #if !defined (HAVE_OPENCL)
  57 void cv::ocl::matchTemplate(const oclMat&, const oclMat&, oclMat&) { throw_nogpu(); }
  58 #else
  59 //helper routines
  60 namespace cv
  61 {
  62         namespace ocl
  63         {
  64                 ///////////////////////////OpenCL kernel strings///////////////////////////
  65                 extern const char *match_template;
  66         }
  67 }
  68
  69 namespace cv { namespace ocl
  70 {
  71         void matchTemplate_SQDIFF(
  72                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
  73
  74         void matchTemplate_SQDIFF_NORMED(
  75                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
  76
  77         void matchTemplate_CCORR(
  78                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
  79
  80         void matchTemplate_CCORR_NORMED(
  81                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
  82
  83         void matchTemplate_CCOFF(
  84                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
  85
  86         void matchTemplate_CCOFF_NORMED(
  87                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
  88
  89
  90         void matchTemplateNaive_SQDIFF(
  91                 const oclMat& image, const oclMat& templ, oclMat& result, int cn);
  92
  93         void matchTemplateNaive_CCORR(
  94                 const oclMat& image, const oclMat& templ, oclMat& result, int cn);
  95
  96         // Evaluates optimal template's area threshold. If
  97         // template's area is less  than the threshold, we use naive match
  98         // template version, otherwise FFT-based (if available)
  99         int getTemplateThreshold(int method, int depth)
 100         {
 101                 switch (method)
 102                 {
 103                 case CV_TM_CCORR:
 104                         if (depth == CV_32F) return 250;
 105                         if (depth == CV_8U) return 300;
 106                         break;
 107                 case CV_TM_SQDIFF:
 108                         if (depth == CV_32F) return 0x7fffffff; // do naive SQDIFF for CV_32F
 109                         if (depth == CV_8U) return 300;
 110                         break;
 111                 }
 112                 CV_Error(CV_StsBadArg, "getTemplateThreshold: unsupported match template mode");
 113                 return 0;
 114         }
 115
 116
 117         //////////////////////////////////////////////////////////////////////
 118         // SQDIFF
 119         void matchTemplate_SQDIFF(
 120                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
 121         {
 122                 result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
 123                 if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
 124                 {
 125                         matchTemplateNaive_SQDIFF(image, templ, result, image.channels());
 126                         return;
 127                 }
 128                 else
 129                 {
 130                         // TODO
 131                         CV_Error(CV_StsBadArg, "Not supported yet for this size template");
 132                 }
 133         }
 134
 135         void matchTemplate_SQDIFF_NORMED(
 136                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
 137         {
 138                 matchTemplate_CCORR(image,templ,result,buf);
 139                 buf.image_sums.resize(1);
 140                 buf.image_sqsums.resize(1);
 141
 142                 integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
 143
 144 #if EXT_FP64 && SQRSUM_FIXED
 145                 unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 146 #else
 147                 Mat sqr_mat = templ.reshape(1);
 148                 unsigned long long templ_sqsum = (unsigned long long)sum(sqr_mat.mul(sqr_mat))[0];
 149 #endif
 150
 151                 Context *clCxt = image.clCxt;
 152                 string kernelName = "matchTemplate_Prepared_SQDIFF_NORMED";
 153                 vector< pair<size_t, const void *> > args;
 154
 155                 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data));
 156                 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 157                 args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 158                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 159                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 160                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 161                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 162                 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset));
 163                 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step));
 164                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 165                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 166
 167                 size_t globalThreads[3] = {result.cols, result.rows, 1};
 168                 size_t localThreads[3]  = {32, 8, 1};
 169                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
 170         }
 171
 172         void matchTemplateNaive_SQDIFF(
 173                 const oclMat& image, const oclMat& templ, oclMat& result, int cn)
 174         {
 175                 CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
 176                         || (image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F);
 177                 CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.channels() == 4) && result.channels() == 1);
 178                 CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
 179
 180                 Context *clCxt = image.clCxt;
 181                 string kernelName = "matchTemplate_Naive_SQDIFF";
 182
 183                 vector< pair<size_t, const void *> > args;
 184
 185                 args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
 186                 args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
 187                 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 188                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
 189                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
 190                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 191                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 192                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 193                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 194                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
 195                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
 196                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 197                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
 198                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
 199                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 200
 201                 size_t globalThreads[3] = {result.cols, result.rows, 1};
 202                 size_t localThreads[3]  = {32, 8, 1};
 203                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
 204         }
 205
 206         //////////////////////////////////////////////////////////////////////
 207         // CCORR
 208         void matchTemplate_CCORR(
 209                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
 210         {
 211                 result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
 212                 if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
 213                 {
 214                         matchTemplateNaive_CCORR(image, templ, result, image.channels());
 215                         return;
 216                 }
 217                 else
 218                 {
 219                         CV_Error(CV_StsBadArg, "Not supported yet for this size template");
 220                         if(image.depth() == CV_8U && templ.depth() == CV_8U)
 221                         {
 222                                 image.convertTo(buf.imagef, CV_32F);
 223                                 templ.convertTo(buf.templf, CV_32F);
 224                         }
 225                         CV_Assert(image.channels() == 1);
 226                         oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.channels()));
 227                         filter2D(buf.imagef,o_result,CV_32F,buf.templf, Point(0,0));
 228                         result = o_result(Rect(0,0,image.rows - templ.rows + 1, image.cols - templ.cols + 1));
 229                 }
 230         }
 231
 232         void matchTemplate_CCORR_NORMED(
 233                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
 234         {
 235                 matchTemplate_CCORR(image,templ,result,buf);
 236                 buf.image_sums.resize(1);
 237                 buf.image_sqsums.resize(1);
 238
 239                 integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
 240 #if EXT_FP64 && SQRSUM_FIXED
 241                 unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
 242 #elif EXT_FP64
 243                 oclMat templ_c1 = templ.reshape(1);
 244                 multiply(templ_c1, templ_c1, templ_c1);
 245                 unsigned long long templ_sqsum = (unsigned long long)sum(templ_c1)[0];
 246 #else
 247                 Mat m_templ_c1 = templ.reshape(1);
 248                 multiply(m_templ_c1, m_templ_c1, m_templ_c1);
 249                 unsigned long long templ_sqsum = (unsigned long long)sum(m_templ_c1)[0];
 250 #endif
 251                 Context *clCxt = image.clCxt;
 252                 string kernelName = "normalizeKernel";
 253                 vector< pair<size_t, const void *> > args;
 254
 255                 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
 256                 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 257                 args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
 258                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 259                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 260                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 261                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 262                 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
 263                 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
 264                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 265                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 266
 267                 size_t globalThreads[3] = {result.cols, result.rows, 1};
 268                 size_t localThreads[3]  = {32, 8, 1};
 269                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
 270         }
 271
 272         void matchTemplateNaive_CCORR(
 273                 const oclMat& image, const oclMat& templ, oclMat& result, int cn)
 274         {
 275                 CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
 276                         || (image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F);
 277                 CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.channels() == 4) && result.channels() == 1);
 278                 CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
 279
 280                 Context *clCxt = image.clCxt;
 281                 string kernelName = "matchTemplate_Naive_CCORR";
 282
 283                 vector< pair<size_t, const void *> > args;
 284
 285                 args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
 286                 args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
 287                 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
 288                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
 289                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
 290                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
 291                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
 292                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
 293                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
 294                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
 295                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
 296                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 297                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
 298                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
 299                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 300
 301                 size_t globalThreads[3] = {result.cols, result.rows, 1};
 302                 size_t localThreads[3]  = {32, 8, 1};
 303                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
 304         }
 305         //////////////////////////////////////////////////////////////////////
 306         // CCOFF
 307         void matchTemplate_CCOFF(
 308                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
 309         {
 310                 CV_Assert(image.depth() == CV_8U && templ.depth() == CV_8U);
 311
 312                 matchTemplate_CCORR(image,templ,result,buf);
 313
 314                 Context *clCxt = image.clCxt;
 315                 string kernelName;
 316
 317                 kernelName = "matchTemplate_Prepared_CCOFF";
 318                 size_t globalThreads[3] = {result.cols, result.rows, 1};
 319                 size_t localThreads[3]  = {32, 8, 1};
 320
 321                 vector< pair<size_t, const void *> > args;
 322                 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
 323                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) );
 324                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
 325                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
 326                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
 327                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
 328                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
 329                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 330                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 331                 // to be continued in the following section
 332                 if(image.channels() == 1)
 333                 {
 334                         buf.image_sums.resize(1);
 335                         // FIXME: temp fix for incorrect integral kernel
 336                         oclMat tmp_oclmat;
 337                         integral(image, buf.image_sums[0], tmp_oclmat);
 338
 339                         float templ_sum = 0;
 340 #if EXT_FP64
 341                         templ_sum = (float)sum(templ)[0] / templ.size().area();
 342 #else
 343                         Mat o_templ = templ;
 344                         templ_sum = (float)sum(o_templ)[0] / o_templ.size().area(); // temp fix for non-double supported machine
 345 #endif
 346                         args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 347                         args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 348                         args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 349                         args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum) );
 350                 }
 351                 else
 352                 {
 353                         Vec4f templ_sum = Vec4f::all(0);
 354 #if EXT_FP64
 355                         split(image,buf.images);
 356                         templ_sum = sum(templ) / templ.size().area();
 357 #else
 358                         // temp fix for non-double supported machine
 359                         Mat o_templ = templ, o_image = image;
 360                         vector<Mat> o_mat_vector;
 361                         o_mat_vector.resize(image.channels());
 362                         buf.images.resize(image.channels());
 363                         split(o_image, o_mat_vector);
 364                         for(int i = 0; i < o_mat_vector.size(); i ++)
 365                         {
 366                                 buf.images[i] = oclMat(o_mat_vector[i]);
 367                         }
 368                         templ_sum = sum(o_templ) / templ.size().area();
 369 #endif
 370                         buf.image_sums.resize(buf.images.size());
 371
 372                         for(int i = 0; i < image.channels(); i ++)
 373                         {
 374                                 // FIXME: temp fix for incorrect integral kernel
 375                                 oclMat omat_temp;
 376                                 integral(buf.images[i], buf.image_sums[i], omat_temp);
 377                         }
 378                         switch(image.channels())
 379                         {
 380                         case 4:
 381                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 382                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
 383                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
 384                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
 385                                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 386                                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 387                                 args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[0]) );
 388                                 args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[1]) );
 389                                 args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[2]) );
 390                                 args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[3]) );
 391                                 break;
 392                         default:
 393                                 CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
 394                                 break;
 395                         }
 396                 }
 397                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
 398         }
 399
 400         void matchTemplate_CCOFF_NORMED(
 401                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
 402         {
 403                 image.convertTo(buf.imagef, CV_32F);
 404                 templ.convertTo(buf.templf, CV_32F);
 405
 406                 matchTemplate_CCORR(buf.imagef, buf.templf, result, buf);
 407                 float scale = 1.f/templ.size().area();
 408
 409                 Context *clCxt = image.clCxt;
 410                 string kernelName;
 411
 412                 kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
 413                 size_t globalThreads[3] = {result.cols, result.rows, 1};
 414                 size_t localThreads[3]  = {32, 8, 1};
 415
 416                 vector< pair<size_t, const void *> > args;
 417                 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
 418                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) );
 419                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
 420                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
 421                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
 422                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
 423                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
 424                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
 425                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
 426                 args.push_back( make_pair( sizeof(cl_float),(void *)&scale) );
 427                 // to be continued in the following section
 428                 if(image.channels() == 1)
 429                 {
 430                         buf.image_sums.resize(1);
 431                         buf.image_sqsums.resize(1);
 432                         integral(image, buf.image_sums[0], buf.image_sqsums[0]);
 433                         float templ_sum = 0;
 434                         float templ_sqsum = 0;
 435 #if EXT_FP64
 436                         templ_sum   = (float)sum(templ)[0];
 437 #if SQRSUM_FIXED
 438                         templ_sqsum = sqrSum(templ);
 439 #else
 440                         oclMat templ_sqr = templ;
 441                         multiply(templ,templ, templ_sqr);
 442                         templ_sqsum  = sum(templ_sqr)[0];
 443 #endif //SQRSUM_FIXED
 444                         templ_sqsum -= scale * templ_sum * templ_sum;
 445                         templ_sum   *= scale;
 446 #else
 447                         // temp fix for non-double supported machine
 448                         Mat o_templ = templ;
 449                         templ_sum   = (float)sum(o_templ)[0];
 450                         templ_sqsum = sum(o_templ.mul(o_templ))[0] - scale * templ_sum * templ_sum;
 451                         templ_sum  *= scale;
 452 #endif
 453                         args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 454                         args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 455                         args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 456                         args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
 457                         args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
 458                         args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
 459                         args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum) );
 460                         args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sqsum) );
 461                 }
 462                 else
 463                 {
 464                         Vec4f templ_sum   = Vec4f::all(0);
 465                         Vec4f templ_sqsum = Vec4f::all(0);
 466 #if EXT_FP64
 467                         split(image,buf.images);
 468                         templ_sum   = sum(templ);
 469 #if SQRSUM_FIXED
 470                         templ_sqsum = sqrSum(templ);
 471 #else
 472                         oclMat templ_sqr = templ;
 473                         multiply(templ,templ, templ_sqr);
 474                         templ_sqsum  = sum(templ_sqr);
 475 #endif //SQRSUM_FIXED
 476                         templ_sqsum -= scale * templ_sum * templ_sum;
 477
 478 #else
 479                         // temp fix for non-double supported machine
 480                         Mat o_templ = templ, o_image = image;
 481
 482                         vector<Mat> o_mat_vector;
 483                         o_mat_vector.resize(image.channels());
 484                         buf.images.resize(image.channels());
 485                         split(o_image, o_mat_vector);
 486                         for(int i = 0; i < o_mat_vector.size(); i ++)
 487                         {
 488                                 buf.images[i] = oclMat(o_mat_vector[i]);
 489                         }
 490                         templ_sum    = sum(o_templ);
 491                         templ_sqsum  = sum(o_templ.mul(o_templ));
 492 #endif
 493                         float templ_sqsum_sum = 0;
 494                         for(int i = 0; i < image.channels(); i ++)
 495                         {
 496                                 templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i];
 497                         }
 498                         templ_sum   *= scale;
 499                         buf.image_sums.resize(buf.images.size());
 500                         buf.image_sqsums.resize(buf.images.size());
 501
 502                         for(int i = 0; i < image.channels(); i ++)
 503                         {
 504                                 integral(buf.images[i], buf.image_sums[i], buf.image_sqsums[i]);
 505                         }
 506
 507                         switch(image.channels())
 508                         {
 509                         case 4:
 510                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
 511                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
 512                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
 513                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
 514                                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
 515                                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
 516                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
 517                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[1].data) );
 518                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[2].data) );
 519                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[3].data) );
 520                                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
 521                                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
 522                                 args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[0]) );
 523                                 args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[1]) );
 524                                 args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[2]) );
 525                                 args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[3]) );
 526                                 args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sqsum_sum) );
 527                                 break;
 528                         default:
 529                                 CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
 530                                 break;
 531                         }
 532                 }
 533                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
 534         }
 535
 536 }/*ocl*/} /*cv*/
 537
 538 void cv::ocl::matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method)
 539 {
 540         MatchTemplateBuf buf;
 541         matchTemplate(image,templ, result, method,buf);
 542 }
 543 void cv::ocl::matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method, MatchTemplateBuf& buf)
 544 {
 545         CV_Assert(image.type() == templ.type());
 546         CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
 547
 548         typedef void (*Caller)(const oclMat&, const oclMat&, oclMat&, MatchTemplateBuf&);
 549
 550         const Caller callers[] = {
 551                 ::matchTemplate_SQDIFF, ::matchTemplate_SQDIFF_NORMED,
 552                 ::matchTemplate_CCORR, ::matchTemplate_CCORR_NORMED,
 553                 ::matchTemplate_CCOFF, ::matchTemplate_CCOFF_NORMED
 554         };
 555
 556         Caller caller = callers[method];
 557         CV_Assert(caller);
 558         caller(image, templ, result, buf);
 559 }
 560 #endif //