Merge remote-tracking branch 'origin/2.4' into merge-2.4
[profile/ivi/opencv.git] / modules / ocl / src / match_template.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // @Authors
18 //    Peng Xiao, pengxiao@multicorewareinc.com
19 //
20 // Redistribution and use in source and binary forms, with or without modification,
21 // are permitted provided that the following conditions are met:
22 //
23 //   * Redistribution's of source code must retain the above copyright notice,
24 //     this list of conditions and the following disclaimer.
25 //
26 //   * Redistribution's in binary form must reproduce the above copyright notice,
27 //     this list of conditions and the following disclaimer in the documentation
28 //     and/or other materials provided with the distribution.
29 //
30 //   * The name of the copyright holders may not be used to endorse or promote products
31 //     derived from this software without specific prior written permission.
32 //
33 // This software is provided by the copyright holders and contributors as is and
34 // any express or implied warranties, including, but not limited to, the implied
35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
36 // In no event shall the Intel Corporation or contributors be liable for any direct,
37 // indirect, incidental, special, exemplary, or consequential damages
38 // (including, but not limited to, procurement of substitute goods or services;
39 // loss of use, data, or profits; or business interruption) however caused
40 // and on any theory of liability, whether in contract, strict liability,
41 // or tort (including negligence or otherwise) arising in any way out of
42 // the use of this software, even if advised of the possibility of such damage.
43 //
44 //M*/
45
46
47 #include "precomp.hpp"
48 #include "opencl_kernels.hpp"
49
50 using namespace cv;
51 using namespace cv::ocl;
52
53 namespace cv
54 {
55     namespace ocl
56     {
57         void matchTemplate_SQDIFF(
58             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
59
60         void matchTemplate_SQDIFF_NORMED(
61             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
62
63         void convolve_32F(
64             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
65
66         void matchTemplate_CCORR(
67             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
68
69         void matchTemplate_CCORR_NORMED(
70             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
71
72         void matchTemplate_CCOFF(
73             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
74
75         void matchTemplate_CCOFF_NORMED(
76             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
77
78
79         void matchTemplateNaive_SQDIFF(
80             const oclMat &image, const oclMat &templ, oclMat &result, int cn);
81
82         void matchTemplateNaive_CCORR(
83             const oclMat &image, const oclMat &templ, oclMat &result, int cn);
84
85         void extractFirstChannel_32F(
86             const oclMat &image, oclMat &result);
87
88         // Evaluates optimal template's area threshold. If
89         // template's area is less  than the threshold, we use naive match
90         // template version, otherwise FFT-based (if available)
91         static bool useNaive(int method, int depth, Size size)
92         {
93 #ifdef HAVE_CLAMDFFT
94             if (method == TM_SQDIFF && (depth == CV_32F || !Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE)))
95             {
96                 return true;
97             }
98             else if(method == TM_CCORR || (method == TM_SQDIFF && depth == CV_8U))
99             {
100                 return size.height < 18 && size.width < 18;
101             }
102             else
103                 return false;
104 #else
105 #define UNUSED(x) (void)(x);
106             UNUSED(method) UNUSED(depth) UNUSED(size)
107 #undef  UNUSED
108             return true;
109 #endif
110         }
111
112         //////////////////////////////////////////////////////////////////////
113         // SQDIFF
114         void matchTemplate_SQDIFF(
115             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf & buf)
116         {
117             result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
118             if (useNaive(TM_SQDIFF, image.depth(), templ.size()))
119             {
120                 matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels());
121                 return;
122             }
123             else
124             {
125                 buf.image_sqsums.resize(1);
126
127                 // TODO, add double support for ocl::integral
128                 // use CPU integral temporarily
129                 Mat sums, sqsums;
130                 cv::integral(Mat(image.reshape(1)), sums, sqsums);
131                 buf.image_sqsums[0] = sqsums;
132
133                 unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
134                 matchTemplate_CCORR(image, templ, result, buf);
135
136                 //port CUDA's matchTemplatePrepared_SQDIFF_8U
137                 Context *clCxt = image.clCxt;
138                 String kernelName = "matchTemplate_Prepared_SQDIFF";
139                 std::vector< std::pair<size_t, const void *> > args;
140
141                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
142                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
143                 args.push_back( std::make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
144                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
145                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
146                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
147                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
148                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
149                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
150                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
151                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
152
153                 size_t globalThreads[3] = {result.cols, result.rows, 1};
154                 size_t localThreads[3]  = {16, 16, 1};
155
156                 const char * build_opt = image.oclchannels() == 4 ? "-D CN4" : "";
157                 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U, build_opt);
158             }
159         }
160
161         void matchTemplate_SQDIFF_NORMED(
162             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
163         {
164             matchTemplate_CCORR(image, templ, result, buf);
165             buf.image_sums.resize(1);
166
167             integral(image.reshape(1), buf.image_sums[0]);
168
169             unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
170
171             Context *clCxt = image.clCxt;
172             String kernelName = "matchTemplate_Prepared_SQDIFF_NORMED";
173             std::vector< std::pair<size_t, const void *> > args;
174
175             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data));
176             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
177             args.push_back( std::make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
178             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
179             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
180             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
181             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
182             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset));
183             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step));
184             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
185             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
186
187             size_t globalThreads[3] = {result.cols, result.rows, 1};
188             size_t localThreads[3]  = {16, 16, 1};
189             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
190         }
191
192         void matchTemplateNaive_SQDIFF(
193             const oclMat &image, const oclMat &templ, oclMat &result, int)
194         {
195             CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
196                       || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
197                      );
198             CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
199             CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
200
201             Context *clCxt = image.clCxt;
202             String kernelName = "matchTemplate_Naive_SQDIFF";
203
204             std::vector< std::pair<size_t, const void *> > args;
205
206             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&image.data));
207             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&templ.data));
208             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
209             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows));
210             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols));
211             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
212             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
213             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
214             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
215             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.offset));
216             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.offset));
217             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
218             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.step));
219             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.step));
220             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
221
222             size_t globalThreads[3] = {result.cols, result.rows, 1};
223             size_t localThreads[3]  = {16, 16, 1};
224             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
225         }
226
227         //////////////////////////////////////////////////////////////////////
228         // CCORR
229         void convolve_32F(
230             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
231         {
232             ConvolveBuf convolve_buf;
233             convolve_buf.user_block_size = buf.user_block_size;
234             if (image.oclchannels() == 1)
235                 convolve(image, templ, result, true, convolve_buf);
236             else
237             {
238                 oclMat result_;
239                 convolve(image.reshape(1), templ.reshape(1), result_, true, convolve_buf);
240                 extractFirstChannel_32F(result_, result);
241             }
242         }
243
244         void matchTemplate_CCORR(
245             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
246         {
247             result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
248             if (useNaive(TM_CCORR, image.depth(), templ.size()))
249             {
250                 matchTemplateNaive_CCORR(image, templ, result, image.oclchannels());
251                 return;
252             }
253             else
254             {
255                 if(image.depth() == CV_8U && templ.depth() == CV_8U)
256                 {
257                     image.convertTo(buf.imagef, CV_32F);
258                     templ.convertTo(buf.templf, CV_32F);
259                     convolve_32F(buf.imagef, buf.templf, result, buf);
260                 }
261                 else
262                 {
263                     convolve_32F(image, templ, result, buf);
264                 }
265             }
266         }
267
268         void matchTemplate_CCORR_NORMED(
269             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
270         {
271             cv::ocl::oclMat temp;
272             matchTemplate_CCORR(image, templ, result, buf);
273             buf.image_sums.resize(1);
274             buf.image_sqsums.resize(1);
275             integral(image.reshape(1), buf.image_sums[0], temp);
276             if(temp.depth() == CV_64F)
277                 temp.convertTo(buf.image_sqsums[0], CV_32FC1);
278             else
279                 buf.image_sqsums[0] = temp;
280             unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
281
282             Context *clCxt = image.clCxt;
283             String kernelName = "normalizeKernel";
284             std::vector< std::pair<size_t, const void *> > args;
285
286             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
287             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
288             args.push_back( std::make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
289             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
290             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
291             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
292             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
293             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
294             args.push_back( std::make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
295             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
296             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
297
298             size_t globalThreads[3] = {result.cols, result.rows, 1};
299             size_t localThreads[3]  = {16, 16, 1};
300             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
301         }
302
303         void matchTemplateNaive_CCORR(
304             const oclMat &image, const oclMat &templ, oclMat &result, int)
305         {
306             CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
307                       || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
308                      );
309             CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
310             CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
311
312             Context *clCxt = image.clCxt;
313             String kernelName = "matchTemplate_Naive_CCORR";
314
315             std::vector< std::pair<size_t, const void *> > args;
316
317             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&image.data));
318             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&templ.data));
319             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data));
320             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows));
321             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols));
322             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows));
323             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols));
324             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows));
325             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols));
326             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.offset));
327             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.offset));
328             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
329             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.step));
330             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.step));
331             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
332
333             size_t globalThreads[3] = {result.cols, result.rows, 1};
334             size_t localThreads[3]  = {16, 16, 1};
335             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
336         }
337         //////////////////////////////////////////////////////////////////////
338         // CCOFF
339         void matchTemplate_CCOFF(
340             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
341         {
342             CV_Assert(image.depth() == CV_8U && templ.depth() == CV_8U);
343
344             matchTemplate_CCORR(image, templ, result, buf);
345
346             Context *clCxt = image.clCxt;
347             String kernelName;
348
349             kernelName = "matchTemplate_Prepared_CCOFF";
350             size_t globalThreads[3] = {result.cols, result.rows, 1};
351             size_t localThreads[3]  = {16, 16, 1};
352
353             std::vector< std::pair<size_t, const void *> > args;
354             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data) );
355             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows) );
356             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols) );
357             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows) );
358             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols) );
359             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows) );
360             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols) );
361             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
362             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
363             Vec4f templ_sum = Vec4f::all(0);
364             // to be continued in the following section
365             if(image.oclchannels() == 1)
366             {
367                 buf.image_sums.resize(1);
368                 integral(image, buf.image_sums[0]);
369
370                 templ_sum[0] = (float)sum(templ)[0] / templ.size().area();
371                 args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
372                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
373                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
374                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
375             }
376             else
377             {
378
379                 split(image, buf.images);
380                 templ_sum = sum(templ) / templ.size().area();
381                 buf.image_sums.resize(buf.images.size());
382
383
384                 for(int i = 0; i < image.oclchannels(); i ++)
385                 {
386                     integral(buf.images[i], buf.image_sums[i]);
387                 }
388                 switch(image.oclchannels())
389                 {
390                 case 4:
391                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
392                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
393                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
394                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
395                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
396                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
397                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
398                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
399                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
400                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
401                     break;
402                 default:
403                     CV_Error(Error::StsBadArg, "matchTemplate: unsupported number of channels");
404                     break;
405                 }
406             }
407             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
408         }
409
410         void matchTemplate_CCOFF_NORMED(
411             const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
412         {
413             image.convertTo(buf.imagef, CV_32F);
414             templ.convertTo(buf.templf, CV_32F);
415
416             matchTemplate_CCORR(buf.imagef, buf.templf, result, buf);
417             float scale = 1.f / templ.size().area();
418
419             Context *clCxt = image.clCxt;
420             String kernelName;
421
422             kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
423             size_t globalThreads[3] = {result.cols, result.rows, 1};
424             size_t localThreads[3]  = {16, 16, 1};
425
426             std::vector< std::pair<size_t, const void *> > args;
427             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data) );
428             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.rows) );
429             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.cols) );
430             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.rows) );
431             args.push_back( std::make_pair( sizeof(cl_int), (void *)&templ.cols) );
432             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows) );
433             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols) );
434             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
435             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
436             args.push_back( std::make_pair( sizeof(cl_float), (void *)&scale) );
437
438             Vec4f templ_sum   = Vec4f::all(0);
439             Vec4f templ_sqsum = Vec4f::all(0);
440             // to be continued in the following section
441             if(image.oclchannels() == 1)
442             {
443                 buf.image_sums.resize(1);
444                 buf.image_sqsums.resize(1);
445                 cv::ocl::oclMat temp;
446                 integral(image, buf.image_sums[0], temp);
447                 if(temp.depth() == CV_64F)
448                     temp.convertTo(buf.image_sqsums[0], CV_32FC1);
449                 else
450                     buf.image_sqsums[0] = temp;
451
452                 templ_sum[0]   = (float)sum(templ)[0];
453
454                 templ_sqsum[0] = sqrSum(templ)[0];
455
456                 templ_sqsum[0] -= scale * templ_sum[0] * templ_sum[0];
457                 templ_sum[0]   *= scale;
458
459                 args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
460                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
461                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
462                 args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
463                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
464                 args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
465                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
466                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sqsum[0]) );
467             }
468             else
469             {
470
471                 split(image, buf.images);
472                 templ_sum   = sum(templ);
473
474                 templ_sqsum = sqrSum(templ);
475
476                 templ_sqsum -= scale * templ_sum * templ_sum;
477
478                 float templ_sqsum_sum = 0;
479                 for(int i = 0; i < image.oclchannels(); i ++)
480                 {
481                     templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i];
482                 }
483                 templ_sum   *= scale;
484                 buf.image_sums.resize(buf.images.size());
485                 buf.image_sqsums.resize(buf.images.size());
486                 cv::ocl::oclMat temp;
487                 for(int i = 0; i < image.oclchannels(); i ++)
488                 {
489                     integral(buf.images[i], buf.image_sums[i], temp);
490                     if(temp.depth() == CV_64F)
491                         temp.convertTo(buf.image_sqsums[i], CV_32FC1);
492                     else
493                         buf.image_sqsums[i] = temp;
494                 }
495
496                 switch(image.oclchannels())
497                 {
498                 case 4:
499                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[0].data) );
500                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[1].data) );
501                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[2].data) );
502                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sums[3].data) );
503                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].offset) );
504                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sums[0].step) );
505                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[0].data) );
506                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[1].data) );
507                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[2].data) );
508                     args.push_back( std::make_pair( sizeof(cl_mem),  (void *)&buf.image_sqsums[3].data) );
509                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].offset) );
510                     args.push_back( std::make_pair( sizeof(cl_int),  (void *)&buf.image_sqsums[0].step) );
511                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
512                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
513                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
514                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
515                     args.push_back( std::make_pair( sizeof(cl_float), (void *)&templ_sqsum_sum) );
516                     break;
517                 default:
518                     CV_Error(Error::StsBadArg, "matchTemplate: unsupported number of channels");
519                     break;
520                 }
521             }
522             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
523         }
524         void extractFirstChannel_32F(const oclMat &image, oclMat &result)
525         {
526             Context *clCxt = image.clCxt;
527             String kernelName;
528
529             kernelName = "extractFirstChannel";
530             size_t globalThreads[3] = {result.cols, result.rows, 1};
531             size_t localThreads[3]  = {16, 16, 1};
532
533             std::vector< std::pair<size_t, const void *> > args;
534             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&image.data) );
535             args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data) );
536             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.rows) );
537             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.cols) );
538             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.offset));
539             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.offset));
540             args.push_back( std::make_pair( sizeof(cl_int), (void *)&image.step));
541             args.push_back( std::make_pair( sizeof(cl_int), (void *)&result.step));
542
543             openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, -1, -1);
544         }
545     }/*ocl*/
546 } /*cv*/
547
548 void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method)
549 {
550     MatchTemplateBuf buf;
551     matchTemplate(image, templ, result, method, buf);
552 }
553 void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method, MatchTemplateBuf &buf)
554 {
555     CV_Assert(image.type() == templ.type());
556     CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
557
558     typedef void (*Caller)(const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &);
559
560     const Caller callers[] =
561     {
562         ::matchTemplate_SQDIFF, ::matchTemplate_SQDIFF_NORMED,
563         ::matchTemplate_CCORR, ::matchTemplate_CCORR_NORMED,
564         ::matchTemplate_CCOFF, ::matchTemplate_CCOFF_NORMED
565     };
566
567     Caller caller = callers[method];
568     CV_Assert(caller);
569     caller(image, templ, result, buf);
570 }