fix ocl/match_template compiling error on Linux
[profile/ivi/opencv.git] / modules / ocl / src / match_template.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // @Authors
18 //    Peng Xiao, pengxiao@multicorewareinc.com
19 //
20 // Redistribution and use in source and binary forms, with or without modification,
21 // are permitted provided that the following conditions are met:
22 //
23 //   * Redistribution's of source code must retain the above copyright notice,
24 //     this list of conditions and the following disclaimer.
25 //
26 //   * Redistribution's in binary form must reproduce the above copyright notice,
27 //     this list of conditions and the following disclaimer in the documentation
28 //     and/or other oclMaterials provided with the distribution.
29 //
30 //   * The name of the copyright holders may not be used to endorse or promote products
31 //     derived from this software without specific prior written permission.
32 //
33 // This software is provided by the copyright holders and contributors as is and
34 // any express or implied warranties, including, but not limited to, the implied
35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
36 // In no event shall the Intel Corporation or contributors be liable for any direct,
37 // indirect, incidental, special, exemplary, or consequential damages
38 // (including, but not limited to, procurement of substitute goods or services;
39 // loss of use, data, or profits; or business interruption) however caused
40 // and on any theory of liability, whether in contract, strict liability,
41 // or tort (including negligence or otherwise) arising in any way out of
42 // the use of this software, even if advised of the possibility of such damage.
43 //
44 //M*/
45
46
47 #include <iomanip>
48 #include "precomp.hpp"
49
50 using namespace cv;
51 using namespace cv::ocl;
52 using namespace std;
53
54 #define EXT_FP64 0
55
56 #if !defined (HAVE_OPENCL)
57 void cv::ocl::matchTemplate(const oclMat&, const oclMat&, oclMat&) { throw_nogpu(); }
58 #else
59 //helper routines
// OpenCL kernel string used by the routines in this file; it is only
// declared here (extern), the definition lives in another translation unit.
namespace cv { namespace ocl
{
        extern const char *match_template;
}/*ocl*/} /*cv*/
68
69 namespace cv { namespace ocl
70 {
71         void matchTemplate_SQDIFF(
72                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
73
74         void matchTemplate_SQDIFF_NORMED(
75                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
76
77         void matchTemplate_CCORR(
78                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
79
80         void matchTemplate_CCORR_NORMED(
81                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
82
83         void matchTemplate_CCOFF(
84                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
85
86         void matchTemplate_CCOFF_NORMED(
87                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf);
88
89
90         void matchTemplateNaive_SQDIFF(
91                 const oclMat& image, const oclMat& templ, oclMat& result, int cn);
92
93         void matchTemplateNaive_CCORR(
94                 const oclMat& image, const oclMat& templ, oclMat& result, int cn);
95
96         // Evaluates optimal template's area threshold. If 
97         // template's area is less  than the threshold, we use naive match 
98         // template version, otherwise FFT-based (if available)
99         int getTemplateThreshold(int method, int depth)
100         {
101                 switch (method)
102                 {
103                 case CV_TM_CCORR: 
104                         if (depth == CV_32F) return 250;
105                         if (depth == CV_8U) return 300;
106                         break;
107                 case CV_TM_SQDIFF:
108                         if (depth == CV_32F) return 0x7fffffff; // do naive SQDIFF for CV_32F
109                         if (depth == CV_8U) return 300;
110                         break;
111                 }
112                 CV_Error(CV_StsBadArg, "getTemplateThreshold: unsupported match template mode");
113                 return 0;
114         }
115
116
117         //////////////////////////////////////////////////////////////////////
118         // SQDIFF
119         void matchTemplate_SQDIFF(
120                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
121         {
122                 result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
123                 if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
124                 {
125                         matchTemplateNaive_SQDIFF(image, templ, result, image.channels());
126                         return;
127                 }
128                 else
129                 {
130                         // TODO
131                         CV_Error(CV_StsBadArg, "Not supported yet for this size template");
132                 }
133         }
134
135         void matchTemplate_SQDIFF_NORMED(
136                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
137         {
138                 matchTemplate_CCORR(image,templ,result,buf);
139                 buf.image_sums.resize(1);
140                 buf.image_sqsums.resize(1);
141
142                 integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
143
144 #if EXT_FP64 && SQRSUM_FIXED
145                 unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
146 #else
147                 Mat sqr_mat = templ.reshape(1);
148                 unsigned long long templ_sqsum = (unsigned long long)sum(sqr_mat.mul(sqr_mat))[0];
149 #endif
150
151                 Context *clCxt = image.clCxt;
152                 string kernelName = "matchTemplate_Prepared_SQDIFF_NORMED";
153                 vector< pair<size_t, const void *> > args;
154
155                 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data));
156                 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
157                 args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
158                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
159                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
160                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
161                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
162                 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset));
163                 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step));
164                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
165                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
166
167                 size_t globalThreads[3] = {result.cols, result.rows, 1};
168                 size_t localThreads[3]  = {32, 8, 1};
169                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
170         }
171
172         void matchTemplateNaive_SQDIFF(
173                 const oclMat& image, const oclMat& templ, oclMat& result, int cn)
174         {
175                 CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
176                         || (image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F);
177                 CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.channels() == 4) && result.channels() == 1);
178                 CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
179
180                 Context *clCxt = image.clCxt;
181                 string kernelName = "matchTemplate_Naive_SQDIFF";
182
183                 vector< pair<size_t, const void *> > args;
184
185                 args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
186                 args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
187                 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
188                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
189                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
190                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
191                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
192                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
193                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
194                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
195                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
196                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
197                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
198                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
199                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
200
201                 size_t globalThreads[3] = {result.cols, result.rows, 1};
202                 size_t localThreads[3]  = {32, 8, 1};
203                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
204         }
205
206         //////////////////////////////////////////////////////////////////////
207         // CCORR
208         void matchTemplate_CCORR(
209                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
210         {
211                 result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
212                 if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
213                 {
214                         matchTemplateNaive_CCORR(image, templ, result, image.channels());
215                         return;
216                 }
217                 else
218                 {
219                         CV_Error(CV_StsBadArg, "Not supported yet for this size template");
220                         if(image.depth() == CV_8U && templ.depth() == CV_8U)
221                         {
222                                 image.convertTo(buf.imagef, CV_32F);
223                                 templ.convertTo(buf.templf, CV_32F);
224                         }
225                         CV_Assert(image.channels() == 1);
226                         oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.channels()));
227                         filter2D(buf.imagef,o_result,CV_32F,buf.templf, Point(0,0));
228                         result = o_result(Rect(0,0,image.rows - templ.rows + 1, image.cols - templ.cols + 1));
229                 }
230         }
231
232         void matchTemplate_CCORR_NORMED(
233                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
234         {
235                 matchTemplate_CCORR(image,templ,result,buf);
236                 buf.image_sums.resize(1);
237                 buf.image_sqsums.resize(1);
238
239                 integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
240 #if EXT_FP64 && SQRSUM_FIXED
241                 unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
242 #elif EXT_FP64
243                 oclMat templ_c1 = templ.reshape(1);
244                 multiply(templ_c1, templ_c1, templ_c1);
245                 unsigned long long templ_sqsum = (unsigned long long)sum(templ_c1)[0];
246 #else
247                 Mat m_templ_c1 = templ.reshape(1);
248                 multiply(m_templ_c1, m_templ_c1, m_templ_c1);
249                 unsigned long long templ_sqsum = (unsigned long long)sum(m_templ_c1)[0];
250 #endif
251                 Context *clCxt = image.clCxt;
252                 string kernelName = "normalizeKernel";
253                 vector< pair<size_t, const void *> > args;
254
255                 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
256                 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
257                 args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
258                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
259                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
260                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
261                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
262                 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
263                 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
264                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
265                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
266
267                 size_t globalThreads[3] = {result.cols, result.rows, 1};
268                 size_t localThreads[3]  = {32, 8, 1};
269                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
270         }
271
272         void matchTemplateNaive_CCORR(
273                 const oclMat& image, const oclMat& templ, oclMat& result, int cn)
274         {
275                 CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
276                         || (image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F);
277                 CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.channels() == 4) && result.channels() == 1);
278                 CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
279
280                 Context *clCxt = image.clCxt;
281                 string kernelName = "matchTemplate_Naive_CCORR";
282
283                 vector< pair<size_t, const void *> > args;
284
285                 args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
286                 args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
287                 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
288                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
289                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
290                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
291                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
292                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
293                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
294                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
295                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
296                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
297                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
298                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
299                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
300
301                 size_t globalThreads[3] = {result.cols, result.rows, 1};
302                 size_t localThreads[3]  = {32, 8, 1};
303                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
304         }
305         //////////////////////////////////////////////////////////////////////
306         // CCOFF
307         void matchTemplate_CCOFF(
308                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
309         {
310                 CV_Assert(image.depth() == CV_8U && templ.depth() == CV_8U);
311
312                 matchTemplate_CCORR(image,templ,result,buf);
313
314                 Context *clCxt = image.clCxt;
315                 string kernelName;
316
317                 kernelName = "matchTemplate_Prepared_CCOFF";
318                 size_t globalThreads[3] = {result.cols, result.rows, 1};
319                 size_t localThreads[3]  = {32, 8, 1};
320
321                 vector< pair<size_t, const void *> > args;
322                 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
323                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) ); 
324                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
325                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
326                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
327                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
328                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
329                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
330                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
331                 // to be continued in the following section
332                 if(image.channels() == 1)
333                 {
334                         buf.image_sums.resize(1);
335                         // FIXME: temp fix for incorrect integral kernel
336                         oclMat tmp_oclmat;
337                         integral(image, buf.image_sums[0], tmp_oclmat);
338
339                         float templ_sum = 0;
340 #if EXT_FP64
341                         templ_sum = (float)sum(templ)[0] / templ.size().area();
342 #else
343                         Mat o_templ = templ;
344                         templ_sum = (float)sum(o_templ)[0] / o_templ.size().area(); // temp fix for non-double supported machine
345 #endif
346                         args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
347                         args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
348                         args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
349                         args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum) );
350                 }
351                 else
352                 {
353                         Vec4f templ_sum = Vec4f::all(0);
354 #if EXT_FP64
355                         split(image,buf.images);
356                         templ_sum = sum(templ) / templ.size().area();
357 #else 
358                         // temp fix for non-double supported machine
359                         Mat o_templ = templ, o_image = image;
360                         vector<Mat> o_mat_vector;
361                         o_mat_vector.resize(image.channels());
362                         buf.images.resize(image.channels());
363                         split(o_image, o_mat_vector);
364                         for(int i = 0; i < o_mat_vector.size(); i ++)
365                         {
366                                 buf.images[i] = oclMat(o_mat_vector[i]);
367                         }
368                         templ_sum = sum(o_templ) / templ.size().area();
369 #endif
370                         buf.image_sums.resize(buf.images.size());
371
372                         for(int i = 0; i < image.channels(); i ++)
373                         {
374                                 // FIXME: temp fix for incorrect integral kernel
375                                 oclMat omat_temp;
376                                 integral(buf.images[i], buf.image_sums[i], omat_temp);
377                         }
378                         switch(image.channels())
379                         {
380                         case 4:
381                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
382                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
383                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
384                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
385                                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
386                                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
387                                 args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[0]) );
388                                 args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[1]) );
389                                 args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[2]) );
390                                 args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[3]) );
391                                 break;
392                         default:
393                                 CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
394                                 break;
395                         }
396                 }
397                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
398         }
399
400         void matchTemplate_CCOFF_NORMED(
401                 const oclMat& image, const oclMat& templ, oclMat& result, MatchTemplateBuf &buf)
402         {
403                 image.convertTo(buf.imagef, CV_32F);
404                 templ.convertTo(buf.templf, CV_32F);
405
406                 matchTemplate_CCORR(buf.imagef, buf.templf, result, buf);
407                 float scale = 1.f/templ.size().area();
408
409                 Context *clCxt = image.clCxt;
410                 string kernelName;
411
412                 kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
413                 size_t globalThreads[3] = {result.cols, result.rows, 1};
414                 size_t localThreads[3]  = {32, 8, 1};
415
416                 vector< pair<size_t, const void *> > args;
417                 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
418                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) ); 
419                 args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
420                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
421                 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
422                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
423                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
424                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
425                 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
426                 args.push_back( make_pair( sizeof(cl_float),(void *)&scale) );
427                 // to be continued in the following section
428                 if(image.channels() == 1)
429                 {
430                         buf.image_sums.resize(1);
431                         buf.image_sqsums.resize(1);
432                         integral(image, buf.image_sums[0], buf.image_sqsums[0]);
433                         float templ_sum = 0;
434                         float templ_sqsum = 0;
435 #if EXT_FP64
436                         templ_sum   = (float)sum(templ)[0];
437 #if SQRSUM_FIXED
438                         templ_sqsum = sqrSum(templ);
439 #else
440                         oclMat templ_sqr = templ;
441                         multiply(templ,templ, templ_sqr);
442                         templ_sqsum  = sum(templ_sqr)[0];
443 #endif //SQRSUM_FIXED
444                         templ_sqsum -= scale * templ_sum * templ_sum;
445                         templ_sum   *= scale;
446 #else
447                         // temp fix for non-double supported machine
448                         Mat o_templ = templ;
449                         templ_sum   = (float)sum(o_templ)[0]; 
450                         templ_sqsum = sum(o_templ.mul(o_templ))[0] - scale * templ_sum * templ_sum;
451                         templ_sum  *= scale;
452 #endif
453                         args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
454                         args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
455                         args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
456                         args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
457                         args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
458                         args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
459                         args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum) );
460                         args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sqsum) );
461                 }
462                 else
463                 {
464                         Vec4f templ_sum   = Vec4f::all(0);
465                         Vec4f templ_sqsum = Vec4f::all(0);
466 #if EXT_FP64
467                         split(image,buf.images);
468                         templ_sum   = sum(templ);
469 #if SQRSUM_FIXED
470                         templ_sqsum = sqrSum(templ);
471 #else
472                         oclMat templ_sqr = templ;
473                         multiply(templ,templ, templ_sqr);
474                         templ_sqsum  = sum(templ_sqr);
475 #endif //SQRSUM_FIXED
476                         templ_sqsum -= scale * templ_sum * templ_sum;
477                         
478 #else 
479                         // temp fix for non-double supported machine
480                         Mat o_templ = templ, o_image = image;
481                         
482                         vector<Mat> o_mat_vector;
483                         o_mat_vector.resize(image.channels());
484                         buf.images.resize(image.channels());
485                         split(o_image, o_mat_vector);
486                         for(int i = 0; i < o_mat_vector.size(); i ++)
487                         {
488                                 buf.images[i] = oclMat(o_mat_vector[i]);
489                         }
490                         templ_sum    = sum(o_templ);
491                         templ_sqsum  = sum(o_templ.mul(o_templ));
492 #endif
493                         float templ_sqsum_sum = 0;
494                         for(int i = 0; i < image.channels(); i ++)
495                         {
496                                 templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i];
497                         }
498                         templ_sum   *= scale;
499                         buf.image_sums.resize(buf.images.size());
500                         buf.image_sqsums.resize(buf.images.size());
501                         
502                         for(int i = 0; i < image.channels(); i ++)
503                         {
504                                 integral(buf.images[i], buf.image_sums[i], buf.image_sqsums[i]);
505                         }
506                         
507                         switch(image.channels())
508                         {
509                         case 4:
510                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
511                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
512                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
513                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
514                                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
515                                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
516                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
517                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[1].data) );
518                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[2].data) );
519                                 args.push_back( make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[3].data) );
520                                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
521                                 args.push_back( make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
522                                 args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[0]) );
523                                 args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[1]) );
524                                 args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[2]) );
525                                 args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sum[3]) );
526                                 args.push_back( make_pair( sizeof(cl_float),(void *)&templ_sqsum_sum) );
527                                 break;
528                         default:
529                                 CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
530                                 break;
531                         }
532                 }
533                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.channels(), image.depth());
534         }
535
536 }/*ocl*/} /*cv*/
537
538 void cv::ocl::matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method)
539 {
540         MatchTemplateBuf buf;
541         matchTemplate(image,templ, result, method,buf);
542 }
543 void cv::ocl::matchTemplate(const oclMat& image, const oclMat& templ, oclMat& result, int method, MatchTemplateBuf& buf)
544 {
545         CV_Assert(image.type() == templ.type());
546         CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
547
548         typedef void (*Caller)(const oclMat&, const oclMat&, oclMat&, MatchTemplateBuf&);
549
550         const Caller callers[] = { 
551                 ::matchTemplate_SQDIFF, ::matchTemplate_SQDIFF_NORMED, 
552                 ::matchTemplate_CCORR, ::matchTemplate_CCORR_NORMED, 
553                 ::matchTemplate_CCOFF, ::matchTemplate_CCOFF_NORMED
554         };
555
556         Caller caller = callers[method];
557         CV_Assert(caller);
558         caller(image, templ, result, buf);
559 }
560 #endif //