Merge remote-tracking branch 'origin/2.4' into merge-2.4
[profile/ivi/opencv.git] / modules / ocl / src / match_template.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // @Authors
18 //    Peng Xiao, pengxiao@multicorewareinc.com
19 //
20 // Redistribution and use in source and binary forms, with or without modification,
21 // are permitted provided that the following conditions are met:
22 //
23 //   * Redistribution's of source code must retain the above copyright notice,
24 //     this list of conditions and the following disclaimer.
25 //
26 //   * Redistribution's in binary form must reproduce the above copyright notice,
27 //     this list of conditions and the following disclaimer in the documentation
28 //     and/or other materials provided with the distribution.
29 //
30 //   * The name of the copyright holders may not be used to endorse or promote products
31 //     derived from this software without specific prior written permission.
32 //
33 // This software is provided by the copyright holders and contributors as is and
34 // any express or implied warranties, including, but not limited to, the implied
35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
36 // In no event shall the Intel Corporation or contributors be liable for any direct,
37 // indirect, incidental, special, exemplary, or consequential damages
38 // (including, but not limited to, procurement of substitute goods or services;
39 // loss of use, data, or profits; or business interruption) however caused
40 // and on any theory of liability, whether in contract, strict liability,
41 // or tort (including negligence or otherwise) arising in any way out of
42 // the use of this software, even if advised of the possibility of such damage.
43 //
44 //M*/
45
46
47 #include <iomanip>
48 #include "precomp.hpp"
49
50 using namespace cv;
51 using namespace cv::ocl;
52
53 //helper routines
// Declarations of the OpenCL kernel strings used by this file.
namespace cv { namespace ocl {

// Kernel source string handed to openCLExecuteKernel by the routines below.
extern const char *match_template;

} } // namespace cv::ocl
62
63 namespace cv
64 {
65     namespace ocl
66     {
67         void matchTemplate_SQDIFF(
68             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
69
70         void matchTemplate_SQDIFF_NORMED(
71             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
72
73         void convolve_32F(
74             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
75
76         void matchTemplate_CCORR(
77             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
78
79         void matchTemplate_CCORR_NORMED(
80             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
81
82         void matchTemplate_CCOFF(
83             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
84
85         void matchTemplate_CCOFF_NORMED(
86             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
87
88
89         void matchTemplateNaive_SQDIFF(
90             const oclMat &image, const oclMat &templ, oclMat &result, int cn);
91
92         void matchTemplateNaive_CCORR(
93             const oclMat &image, const oclMat &templ, oclMat &result, int cn);
94
95         void extractFirstChannel_32F(
96             const oclMat &image, oclMat &result);
97
98         // Evaluates optimal template's area threshold. If
99         // template's area is less  than the threshold, we use naive match
100         // template version, otherwise FFT-based (if available)
101         static bool useNaive(int method, int depth, Size size)
102         {
103 #ifdef HAVE_CLAMDFFT
104             if (method == TM_SQDIFF && (depth == CV_32F || !Context::getContext()->supportsFeature(Context::CL_DOUBLE)))
105             {
106                 return true;
107             }
108             else if(method == TM_CCORR || (method == TM_SQDIFF && depth == CV_8U))
109             {
110                 return size.height < 18 && size.width < 18;
111             }
112             else
113                 return false;
114 #else
115 #define UNUSED(x) (void)(x);
116             UNUSED(method) UNUSED(depth) UNUSED(size)
117 #undef  UNUSED
118             return true;
119 #endif
120         }
121
122         //////////////////////////////////////////////////////////////////////
123         // SQDIFF
124         void matchTemplate_SQDIFF(
125             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf & buf)
126         {
127             result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
128             if (useNaive(TM_SQDIFF, image.depth(), templ.size()))
129             {
130                 matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels());
131                 return;
132             }
133             else
134             {
135                 buf.image_sqsums.resize(1);
136
137                 // TODO, add double support for ocl::integral
138                 // use CPU integral temporarily
139                 Mat sums, sqsums;
140                 cv::integral(Mat(image.reshape(1)), sums, sqsums);
141                 buf.image_sqsums[0] = sqsums;
142
143                 unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
144                 matchTemplate_CCORR(image, templ, result, buf);
145
146                 //port CUDA's matchTemplatePrepared_SQDIFF_8U
147                 Context *clCxt = image.clCxt;
148                 String kernelName = "matchTemplate_Prepared_SQDIFF";
149                 std::vector< std::pair<size_t, const void *> > args;
150
151                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
152                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
153                 args.push_back( std::make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
154                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
155                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
156                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
157                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
158                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
159                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
160                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
161                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
162
163                 size_t globalThreads[3] = {result.cols, result.rows, 1};
164                 size_t localThreads[3]  = {16, 16, 1};
165
166                 const char * build_opt = image.oclchannels() == 4 ? "-D CN4" : "";
167                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U, build_opt);
168             }
169         }
170
171         void matchTemplate_SQDIFF_NORMED(
172             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
173         {
174             matchTemplate_CCORR(image, templ, result, buf);
175             buf.image_sums.resize(1);
176
177             integral(image.reshape(1), buf.image_sums[0]);
178
179             unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
180
181             Context *clCxt = image.clCxt;
182             String kernelName = "matchTemplate_Prepared_SQDIFF_NORMED";
183             std::vector< std::pair<size_t, const void *> > args;
184
185             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data));
186             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
187             args.push_back( std::make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
188             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
189             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
190             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
191             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
192             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset));
193             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step));
194             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
195             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
196
197             size_t globalThreads[3] = {result.cols, result.rows, 1};
198             size_t localThreads[3]  = {16, 16, 1};
199             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
200         }
201
202         void matchTemplateNaive_SQDIFF(
203             const oclMat &image, const oclMat &templ, oclMat &result, int)
204         {
205             CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
206                       || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
207                      );
208             CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
209             CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
210
211             Context *clCxt = image.clCxt;
212             String kernelName = "matchTemplate_Naive_SQDIFF";
213
214             std::vector< std::pair<size_t, const void *> > args;
215
216             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&image.data));
217             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&templ.data));
218             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
219             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows));
220             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols));
221             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
222             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
223             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
224             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
225             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.offset));
226             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.offset));
227             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
228             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.step));
229             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.step));
230             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
231
232             size_t globalThreads[3] = {result.cols, result.rows, 1};
233             size_t localThreads[3]  = {16, 16, 1};
234             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
235         }
236
237         //////////////////////////////////////////////////////////////////////
238         // CCORR
239         void convolve_32F(
240             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
241         {
242             ConvolveBuf convolve_buf;
243             convolve_buf.user_block_size = buf.user_block_size;
244             if (image.oclchannels() == 1)
245                 convolve(image, templ, result, true, convolve_buf);
246             else
247             {
248                 oclMat result_;
249                 convolve(image.reshape(1), templ.reshape(1), result_, true, convolve_buf);
250                 extractFirstChannel_32F(result_, result);
251             }
252         }
253
254         void matchTemplate_CCORR(
255             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
256         {
257             result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
258             if (useNaive(TM_CCORR, image.depth(), templ.size()))
259             {
260                 matchTemplateNaive_CCORR(image, templ, result, image.oclchannels());
261                 return;
262             }
263             else
264             {
265                 if(image.depth() == CV_8U && templ.depth() == CV_8U)
266                 {
267                     image.convertTo(buf.imagef, CV_32F);
268                     templ.convertTo(buf.templf, CV_32F);
269                     convolve_32F(buf.imagef, buf.templf, result, buf);
270                 }
271                 else
272                 {
273                     convolve_32F(image, templ, result, buf);
274                 }
275             }
276         }
277
278         void matchTemplate_CCORR_NORMED(
279             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
280         {
281             matchTemplate_CCORR(image, templ, result, buf);
282             buf.image_sums.resize(1);
283             buf.image_sqsums.resize(1);
284
285             integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
286
287             unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
288
289             Context *clCxt = image.clCxt;
290             String kernelName = "normalizeKernel";
291             std::vector< std::pair<size_t, const void *> > args;
292
293             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
294             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
295             args.push_back( std::make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
296             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
297             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
298             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
299             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
300             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
301             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
302             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
303             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
304
305             size_t globalThreads[3] = {result.cols, result.rows, 1};
306             size_t localThreads[3]  = {16, 16, 1};
307             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
308         }
309
310         void matchTemplateNaive_CCORR(
311             const oclMat &image, const oclMat &templ, oclMat &result, int)
312         {
313             CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
314                       || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
315                      );
316             CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
317             CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
318
319             Context *clCxt = image.clCxt;
320             String kernelName = "matchTemplate_Naive_CCORR";
321
322             std::vector< std::pair<size_t, const void *> > args;
323
324             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&image.data));
325             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&templ.data));
326             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
327             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows));
328             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols));
329             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
330             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
331             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
332             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
333             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.offset));
334             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.offset));
335             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
336             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.step));
337             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.step));
338             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
339
340             size_t globalThreads[3] = {result.cols, result.rows, 1};
341             size_t localThreads[3]  = {16, 16, 1};
342             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
343         }
344         //////////////////////////////////////////////////////////////////////
345         // CCOFF
346         void matchTemplate_CCOFF(
347             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
348         {
349             CV_Assert(image.depth() == CV_8U && templ.depth() == CV_8U);
350
351             matchTemplate_CCORR(image, templ, result, buf);
352
353             Context *clCxt = image.clCxt;
354             String kernelName;
355
356             kernelName = "matchTemplate_Prepared_CCOFF";
357             size_t globalThreads[3] = {result.cols, result.rows, 1};
358             size_t localThreads[3]  = {16, 16, 1};
359
360             std::vector< std::pair<size_t, const void *> > args;
361             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data) );
362             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows) );
363             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols) );
364             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows) );
365             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols) );
366             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows) );
367             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols) );
368             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
369             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
370             Vec4f templ_sum = Vec4f::all(0);
371             // to be continued in the following section
372             if(image.oclchannels() == 1)
373             {
374                 buf.image_sums.resize(1);
375                 integral(image, buf.image_sums[0]);
376
377                 templ_sum[0] = (float)sum(templ)[0] / templ.size().area();
378                 args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
379                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
380                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
381                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
382             }
383             else
384             {
385
386                 split(image, buf.images);
387                 templ_sum = sum(templ) / templ.size().area();
388                 buf.image_sums.resize(buf.images.size());
389
390
391                 for(int i = 0; i < image.oclchannels(); i ++)
392                 {
393                     integral(buf.images[i], buf.image_sums[i]);
394                 }
395                 switch(image.oclchannels())
396                 {
397                 case 4:
398                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
399                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
400                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
401                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
402                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
403                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
404                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
405                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
406                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
407                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
408                     break;
409                 default:
410                     CV_Error(Error::StsBadArg, "matchTemplate: unsupported number of channels");
411                     break;
412                 }
413             }
414             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
415         }
416
417         void matchTemplate_CCOFF_NORMED(
418             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
419         {
420             image.convertTo(buf.imagef, CV_32F);
421             templ.convertTo(buf.templf, CV_32F);
422
423             matchTemplate_CCORR(buf.imagef, buf.templf, result, buf);
424             float scale = 1.f / templ.size().area();
425
426             Context *clCxt = image.clCxt;
427             String kernelName;
428
429             kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
430             size_t globalThreads[3] = {result.cols, result.rows, 1};
431             size_t localThreads[3]  = {16, 16, 1};
432
433             std::vector< std::pair<size_t, const void *> > args;
434             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data) );
435             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows) );
436             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols) );
437             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows) );
438             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols) );
439             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows) );
440             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols) );
441             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
442             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
443             args.push_back( std::make_pair( sizeof(cl_float), (void *)&scale) );
444
445             Vec4f templ_sum   = Vec4f::all(0);
446             Vec4f templ_sqsum = Vec4f::all(0);
447             // to be continued in the following section
448             if(image.oclchannels() == 1)
449             {
450                 buf.image_sums.resize(1);
451                 buf.image_sqsums.resize(1);
452                 integral(image, buf.image_sums[0], buf.image_sqsums[0]);
453
454                 templ_sum[0]   = (float)sum(templ)[0];
455
456                 templ_sqsum[0] = sqrSum(templ)[0];
457
458                 templ_sqsum[0] -= scale * templ_sum[0] * templ_sum[0];
459                 templ_sum[0]   *= scale;
460
461                 args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
462                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
463                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
464                 args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
465                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
466                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
467                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
468                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sqsum[0]) );
469             }
470             else
471             {
472
473                 split(image, buf.images);
474                 templ_sum   = sum(templ);
475
476                 templ_sqsum = sqrSum(templ);
477
478                 templ_sqsum -= scale * templ_sum * templ_sum;
479
480                 float templ_sqsum_sum = 0;
481                 for(int i = 0; i < image.oclchannels(); i ++)
482                 {
483                     templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i];
484                 }
485                 templ_sum   *= scale;
486                 buf.image_sums.resize(buf.images.size());
487                 buf.image_sqsums.resize(buf.images.size());
488
489                 for(int i = 0; i < image.oclchannels(); i ++)
490                 {
491                     integral(buf.images[i], buf.image_sums[i], buf.image_sqsums[i]);
492                 }
493
494                 switch(image.oclchannels())
495                 {
496                 case 4:
497                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
498                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
499                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
500                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
501                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
502                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
503                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
504                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[1].data) );
505                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[2].data) );
506                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[3].data) );
507                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
508                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
509                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
510                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
511                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
512                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
513                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sqsum_sum) );
514                     break;
515                 default:
516                     CV_Error(Error::StsBadArg, "matchTemplate: unsupported number of channels");
517                     break;
518                 }
519             }
520             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
521         }
522         void extractFirstChannel_32F(const oclMat &image, oclMat &result)
523         {
524             Context *clCxt = image.clCxt;
525             String kernelName;
526
527             kernelName = "extractFirstChannel";
528             size_t globalThreads[3] = {result.cols, result.rows, 1};
529             size_t localThreads[3]  = {16, 16, 1};
530
531             std::vector< std::pair<size_t, const void *> > args;
532             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&image.data) );
533             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data) );
534             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows) );
535             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols) );
536             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.offset));
537             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
538             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.step));
539             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
540
541             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, -1, -1);
542         }
543     }/*ocl*/
544 } /*cv*/
545
546 void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method)
547 {
548     MatchTemplateBuf buf;
549     matchTemplate(image, templ, result, method, buf);
550 }
551 void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method, MatchTemplateBuf &buf)
552 {
553     CV_Assert(image.type() == templ.type());
554     CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
555
556     typedef void (*Caller)(const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &);
557
558     const Caller callers[] =
559     {
560         ::matchTemplate_SQDIFF, ::matchTemplate_SQDIFF_NORMED,
561         ::matchTemplate_CCORR, ::matchTemplate_CCORR_NORMED,
562         ::matchTemplate_CCOFF, ::matchTemplate_CCOFF_NORMED
563     };
564
565     Caller caller = callers[method];
566     CV_Assert(caller);
567     caller(image, templ, result, buf);
568 }