1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
18 // Peng Xiao, pengxiao@multicorewareinc.com
20 // Redistribution and use in source and binary forms, with or without modification,
21 // are permitted provided that the following conditions are met:
23 // * Redistribution's of source code must retain the above copyright notice,
24 // this list of conditions and the following disclaimer.
26 // * Redistribution's in binary form must reproduce the above copyright notice,
27 // this list of conditions and the following disclaimer in the documentation
28 // and/or other materials provided with the distribution.
30 // * The name of the copyright holders may not be used to endorse or promote products
31 // derived from this software without specific prior written permission.
33 // This software is provided by the copyright holders and contributors as is and
34 // any express or implied warranties, including, but not limited to, the implied
35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
36 // In no event shall the Intel Corporation or contributors be liable for any direct,
37 // indirect, incidental, special, exemplary, or consequential damages
38 // (including, but not limited to, procurement of substitute goods or services;
39 // loss of use, data, or profits; or business interruption) however caused
40 // and on any theory of liability, whether in contract, strict liability,
41 // or tort (including negligence or otherwise) arising in any way out of
42 // the use of this software, even if advised of the possibility of such damage.
48 #include "precomp.hpp"
51 using namespace cv::ocl;
54 #if !defined (HAVE_OPENCL)
// Variant of cv::ocl::matchTemplate that appears under the
// !defined(HAVE_OPENCL) guard. All three parameters are unnamed, so the
// arguments cannot be referenced by this definition. Its body is not visible
// in this view.
55 void cv::ocl::matchTemplate(const oclMat &, const oclMat &, oclMat &)
65 ///////////////////////////OpenCL kernel strings///////////////////////////
// Declared here and defined elsewhere. Every openCLExecuteKernel call in this
// file passes the address of this pointer as its second argument.
66 extern const char *match_template;
// Forward declarations of the per-method implementations. These six functions
// share the (image, templ, result, buf) signature that the dispatch table in
// cv::ocl::matchTemplate expects.
74 void matchTemplate_SQDIFF(
75 const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
77 void matchTemplate_SQDIFF_NORMED(
78 const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
80 void matchTemplate_CCORR(
81 const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
83 void matchTemplate_CCORR_NORMED(
84 const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
86 void matchTemplate_CCOFF(
87 const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
89 void matchTemplate_CCOFF_NORMED(
90 const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
// Forward declarations of the naive kernels. The trailing int (cn) receives
// image.oclchannels() from the callers in this file.
93 void matchTemplateNaive_SQDIFF(
94 const oclMat &image, const oclMat &templ, oclMat &result, int cn);
96 void matchTemplateNaive_CCORR(
97 const oclMat &image, const oclMat &templ, oclMat &result, int cn);
99 // Evaluates optimal template's area threshold. If
100 // template's area is less than the threshold, we use naive match
101 // template version, otherwise FFT-based (if available)
//
// Parameters:
//   method - match mode constant; the callers in this file pass CV_TM_SQDIFF.
//   depth  - element depth; only CV_32F and CV_8U have visible return values.
// Returns the area threshold as an int. The final statement raises
// CV_StsBadArg for an unsupported match template mode.
// NOTE(review): the lines that select between the two groups of returns are
// not visible in this view. Presumably the first group belongs to CV_TM_CCORR
// and the second to CV_TM_SQDIFF (see the inline comment on the 0x7fffffff
// line) - confirm against the full file.
102 static int getTemplateThreshold(int method, int depth)
107 if (depth == CV_32F) return 250;
108 if (depth == CV_8U) return 300;
// 0x7fffffff is the largest positive 32-bit int, so any template area
// compares as below the threshold for CV_32F in this group.
111 if (depth == CV_32F) return 0x7fffffff; // do naive SQDIFF for CV_32F
112 if (depth == CV_8U) return 300;
115 CV_Error(CV_StsBadArg, "getTemplateThreshold: unsupported match template mode");
119 //////////////////////////////////////////////////////////////////////
// Squared-difference matching entry point.
// Allocates result as CV_32F with (image.rows - templ.rows + 1) rows and
// (image.cols - templ.cols + 1) columns, then forwards to
// matchTemplateNaive_SQDIFF when the template area is below
// getTemplateThreshold(CV_TM_SQDIFF, image.depth()).
// The MatchTemplateBuf parameter is unnamed and therefore unused here.
121 void matchTemplate_SQDIFF(
122 const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &)
124 result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
125 if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
127 matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels());
// The only other visible statement raises CV_StsBadArg.
// NOTE(review): the branch structure between the naive call and this error is
// not visible in this view - confirm that the naive path returns before it.
133 CV_Error(CV_StsBadArg, "Not supported yet for this size template");
// Normalized squared-difference matching.
// Steps visible in this block:
//   1. matchTemplate_CCORR writes the raw correlation into result.
//   2. An integral image of the single-channel view of image is stored in
//      buf.image_sums[0].
//   3. The squared sum of the single-channel view of templ is taken on the
//      host and truncated to unsigned long long.
//   4. The matchTemplate_Prepared_SQDIFF_NORMED kernel is launched with
//      result as one of its buffers.
137 void matchTemplate_SQDIFF_NORMED(
138 const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
140 matchTemplate_CCORR(image, templ, result, buf);
141 buf.image_sums.resize(1);
// reshape(1) presents the data as one channel before integration.
144 integral(image.reshape(1), buf.image_sums[0]);
// Element [0] of the sqrSum result is cast to an integer type, so any
// fractional part is discarded.
146 unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
148 Context *clCxt = image.clCxt;
149 string kernelName = "matchTemplate_Prepared_SQDIFF_NORMED";
// Each entry pairs an argument size with a pointer to the argument value.
// The order of the push_back calls is the order handed to
// openCLExecuteKernel, so it must not be rearranged.
150 vector< pair<size_t, const void *> > args;
152 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data));
153 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
154 args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
155 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
156 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
157 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
158 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
159 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset));
160 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step));
161 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
162 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
// Global size is result.cols x result.rows x 1; local size is 32 x 8 x 1.
164 size_t globalThreads[3] = {result.cols, result.rows, 1};
165 size_t localThreads[3] = {32, 8, 1};
// The last two arguments are fixed at 1 and CV_8U here, independent of the
// channel count and depth of image.
166 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
// Launches the matchTemplate_Naive_SQDIFF kernel on image and templ, writing
// into a result matrix that the caller has already allocated.
// The trailing int parameter is unnamed and unused; the channel count is read
// from image.oclchannels() instead.
// Preconditions checked by the visible assertions:
//   - image and templ are both CV_8U, or both CV_32F with a CV_32F result
//     (the end of this first assertion is not visible in this view);
//   - image and templ have the same channel count, which is 1 or 4, and
//     result has one channel;
//   - result is (image.rows - templ.rows + 1) x (image.cols - templ.cols + 1).
169 void matchTemplateNaive_SQDIFF(
170 const oclMat &image, const oclMat &templ, oclMat &result, int)
172 CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
173 || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
175 CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
176 CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
178 Context *clCxt = image.clCxt;
179 string kernelName = "matchTemplate_Naive_SQDIFF";
// Argument list in push order: three buffers, then rows/cols for image, templ
// and result, then the three offsets, then the three steps.
181 vector< pair<size_t, const void *> > args;
183 args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
184 args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
185 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
186 args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
187 args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
188 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
189 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
190 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
191 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
192 args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
193 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
194 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
195 args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
196 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
197 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
// Global size is result.cols x result.rows x 1; local size is 32 x 8 x 1.
199 size_t globalThreads[3] = {result.cols, result.rows, 1};
200 size_t localThreads[3] = {32, 8, 1};
// Channel count and depth of image are forwarded as the last two arguments.
201 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
204 //////////////////////////////////////////////////////////////////////
// Cross-correlation matching entry point.
// Allocates result as CV_32F with (image.rows - templ.rows + 1) rows and
// (image.cols - templ.cols + 1) columns, then forwards to
// matchTemplateNaive_CCORR when the template area is below the threshold.
// The remaining statements raise CV_StsBadArg and then describe a
// filter2D-based path that uses buf.imagef and buf.templf.
206 void matchTemplate_CCORR(
207 const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
209 result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
// NOTE(review): the threshold is looked up with CV_TM_SQDIFF although this is
// the CCORR path - confirm whether that is intentional.
210 if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
212 matchTemplateNaive_CCORR(image, templ, result, image.oclchannels());
// NOTE(review): the statements below follow this CV_Error call. If CV_Error
// does not return, they never execute - confirm against the full file.
217 CV_Error(CV_StsBadArg, "Not supported yet for this size template");
// buf.imagef and buf.templf are written only by these conversions, which are
// requested for 8-bit input; filter2D below reads both buffers.
218 if(image.depth() == CV_8U && templ.depth() == CV_8U)
220 image.convertTo(buf.imagef, CV_32F);
221 templ.convertTo(buf.templf, CV_32F);
223 CV_Assert(image.oclchannels() == 1);
224 oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.oclchannels()));
225 filter2D(buf.imagef, o_result, CV_32F, buf.templf, Point(0, 0));
// NOTE(review): Rect is presumably (x, y, width, height); the row-based extent
// is passed third and the column-based extent fourth, which looks swapped
// relative to result.create above - confirm.
226 result = o_result(Rect(0, 0, image.rows - templ.rows + 1, image.cols - templ.cols + 1));
// Normalized cross-correlation matching.
// Steps visible in this block:
//   1. matchTemplate_CCORR writes the raw correlation into result.
//   2. Sum and squared-sum integral images of the single-channel view of
//      image are stored in buf.image_sums[0] and buf.image_sqsums[0].
//   3. The squared sum of the single-channel view of templ is taken on the
//      host and truncated to unsigned long long.
//   4. The normalizeKernel kernel is launched with the squared-sum integral
//      and result as its buffers.
230 void matchTemplate_CCORR_NORMED(
231 const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
233 matchTemplate_CCORR(image, templ, result, buf);
234 buf.image_sums.resize(1);
235 buf.image_sqsums.resize(1);
// Both integrals are computed, but only image_sqsums[0] is passed to the
// kernel below.
237 integral(image.reshape(1), buf.image_sums[0], buf.image_sqsums[0]);
// Element [0] of the sqrSum result is cast to an integer type, so any
// fractional part is discarded.
239 unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
241 Context *clCxt = image.clCxt;
242 string kernelName = "normalizeKernel";
// Each entry pairs an argument size with a pointer to the argument value; the
// push order is the order handed to openCLExecuteKernel.
243 vector< pair<size_t, const void *> > args;
245 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
246 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
247 args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
248 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
249 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
250 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
251 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
252 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
253 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
254 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
255 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
// Global size is result.cols x result.rows x 1; local size is 32 x 8 x 1.
257 size_t globalThreads[3] = {result.cols, result.rows, 1};
258 size_t localThreads[3] = {32, 8, 1};
// The last two arguments are fixed at 1 and CV_8U here, independent of the
// channel count and depth of image.
259 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
// Launches the matchTemplate_Naive_CCORR kernel on image and templ, writing
// into a result matrix that the caller has already allocated.
// The trailing int parameter is unnamed and unused; the channel count is read
// from image.oclchannels() instead.
// Preconditions checked by the visible assertions:
//   - image and templ are both CV_8U, or both CV_32F with a CV_32F result
//     (the end of this first assertion is not visible in this view);
//   - image and templ have the same channel count, which is 1 or 4, and
//     result has one channel;
//   - result is (image.rows - templ.rows + 1) x (image.cols - templ.cols + 1).
262 void matchTemplateNaive_CCORR(
263 const oclMat &image, const oclMat &templ, oclMat &result, int)
265 CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
266 || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
268 CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
269 CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
271 Context *clCxt = image.clCxt;
272 string kernelName = "matchTemplate_Naive_CCORR";
// Argument list in push order: three buffers, then rows/cols for image, templ
// and result, then the three offsets, then the three steps.
274 vector< pair<size_t, const void *> > args;
276 args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data));
277 args.push_back( make_pair( sizeof(cl_mem), (void *)&templ.data));
278 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
279 args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows));
280 args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols));
281 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
282 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
283 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
284 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
285 args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
286 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.offset));
287 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
288 args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
289 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.step));
290 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
// Global size is result.cols x result.rows x 1; local size is 32 x 8 x 1.
292 size_t globalThreads[3] = {result.cols, result.rows, 1};
293 size_t localThreads[3] = {32, 8, 1};
// Channel count and depth of image are forwarded as the last two arguments.
294 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
296 //////////////////////////////////////////////////////////////////////
// CCOFF matching for CV_8U input.
// Runs matchTemplate_CCORR into result, then launches the
// matchTemplate_Prepared_CCOFF kernel with result, the matrix dimensions,
// integral images of the input and the template sum divided by the template
// area (one value for single-channel input, four values otherwise).
// NOTE(review): args stores addresses, not copies. templ_sum is declared
// after the single-channel test in the multi-channel section; if its scope
// ends before the openCLExecuteKernel call at the bottom, the stored pointers
// would dangle. The enclosing braces are not visible in this view - confirm.
298 void matchTemplate_CCOFF(
299 const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
// Only 8-bit image and template are accepted.
301 CV_Assert(image.depth() == CV_8U && templ.depth() == CV_8U);
303 matchTemplate_CCORR(image, templ, result, buf);
305 Context *clCxt = image.clCxt;
308 kernelName = "matchTemplate_Prepared_CCOFF";
// Global size is result.cols x result.rows x 1; local size is 32 x 8 x 1.
309 size_t globalThreads[3] = {result.cols, result.rows, 1};
310 size_t localThreads[3] = {32, 8, 1};
// Arguments common to both channel layouts: the result buffer, then rows/cols
// of image, templ and result, then the result offset and step.
312 vector< pair<size_t, const void *> > args;
313 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
314 args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) );
315 args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
316 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
317 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
318 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
319 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
320 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
321 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
322 // to be continued in the following section
// Single-channel section: one integral image and one scalar template mean.
323 if(image.oclchannels() == 1)
325 buf.image_sums.resize(1);
326 integral(image, buf.image_sums[0]);
// Sum of the template divided by its pixel count.
329 templ_sum = (float)sum(templ)[0] / templ.size().area();
330 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data) );
331 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset) );
332 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step) );
333 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum) );
// Multi-channel section: the image is split into planes and each plane gets
// its own integral image.
337 Vec4f templ_sum = Vec4f::all(0);
338 split(image, buf.images);
339 templ_sum = sum(templ) / templ.size().area();
340 buf.image_sums.resize(buf.images.size());
343 for(int i = 0; i < image.oclchannels(); i ++)
345 integral(buf.images[i], buf.image_sums[i]);
347 switch(image.oclchannels())
// Four integral buffers are passed, but only the offset and step of the first
// one; the other three are presumably expected to share that layout.
350 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data) );
351 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[1].data) );
352 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[2].data) );
353 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[3].data) );
354 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset) );
355 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step) );
356 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
357 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
358 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
359 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
// Raised for channel counts the switch does not handle; the case labels are
// not visible in this view.
362 CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
// Channel count and depth of image are forwarded as the last two arguments.
366 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
// Normalized CCOFF matching.
// Converts image and templ to CV_32F in buf.imagef and buf.templf, runs
// matchTemplate_CCORR on the converted copies, then launches the
// matchTemplate_Prepared_CCOFF_NORMED kernel with result, the matrix
// dimensions, scale (1 / template area), sum and squared-sum integral images
// of the original image, and template statistics computed on the host.
// NOTE(review): args stores addresses, not copies. templ_sum, templ_sqsum and
// templ_sqsum_sum are declared after the single-channel test; if their scope
// ends before the openCLExecuteKernel call at the bottom, the stored pointers
// would dangle. The enclosing braces are not visible in this view - confirm.
369 void matchTemplate_CCOFF_NORMED(
370 const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
372 image.convertTo(buf.imagef, CV_32F);
373 templ.convertTo(buf.templf, CV_32F);
375 matchTemplate_CCORR(buf.imagef, buf.templf, result, buf);
// Reciprocal of the template pixel count.
376 float scale = 1.f / templ.size().area();
378 Context *clCxt = image.clCxt;
381 kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
// Global size is result.cols x result.rows x 1; local size is 32 x 8 x 1.
382 size_t globalThreads[3] = {result.cols, result.rows, 1};
383 size_t localThreads[3] = {32, 8, 1};
// Arguments common to both channel layouts: the result buffer, rows/cols of
// image, templ and result, the result offset and step, then scale.
385 vector< pair<size_t, const void *> > args;
386 args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
387 args.push_back( make_pair( sizeof(cl_int), (void *)&image.rows) );
388 args.push_back( make_pair( sizeof(cl_int), (void *)&image.cols) );
389 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows) );
390 args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols) );
391 args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
392 args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
393 args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
394 args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
395 args.push_back( make_pair( sizeof(cl_float), (void *)&scale) );
396 // to be continued in the following section
// Single-channel section: one pair of integral images and scalar statistics.
397 if(image.oclchannels() == 1)
399 buf.image_sums.resize(1);
400 buf.image_sqsums.resize(1);
401 integral(image, buf.image_sums[0], buf.image_sqsums[0]);
403 float templ_sqsum = 0;
404 templ_sum = (float)sum(templ)[0];
406 templ_sqsum = sqrSum(templ)[0];
// Squared sum minus (sum squared divided by the pixel count).
408 templ_sqsum -= scale * templ_sum * templ_sum;
411 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data) );
412 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset) );
413 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step) );
414 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data) );
415 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset) );
416 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step) );
417 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum) );
418 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sqsum) );
// Multi-channel section: per-channel statistics held in Vec4f values.
422 Vec4f templ_sum = Vec4f::all(0);
423 Vec4f templ_sqsum = Vec4f::all(0);
425 split(image, buf.images);
426 templ_sum = sum(templ);
428 templ_sqsum = sqrSum(templ);
// NOTE(review): this subtracts a product of two Vec4f values from templ_sqsum,
// and the loop below subtracts scale * templ_sum[i] * templ_sum[i] from each
// element again. The correction appears to be applied twice, and the
// Vec4f-by-Vec4f product may not be element-wise - confirm the intended
// formula.
430 templ_sqsum -= scale * templ_sum * templ_sum;
// Accumulates the per-channel values into the single scalar that is passed to
// the kernel.
432 float templ_sqsum_sum = 0;
433 for(int i = 0; i < image.oclchannels(); i ++)
435 templ_sqsum_sum += templ_sqsum[i] - scale * templ_sum[i] * templ_sum[i];
438 buf.image_sums.resize(buf.images.size());
439 buf.image_sqsums.resize(buf.images.size());
441 for(int i = 0; i < image.oclchannels(); i ++)
443 integral(buf.images[i], buf.image_sums[i], buf.image_sqsums[i]);
446 switch(image.oclchannels())
// Four sum buffers and four squared-sum buffers are passed, each group
// followed by the offset and step of its first buffer only.
449 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data) );
450 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[1].data) );
451 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[2].data) );
452 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[3].data) );
453 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset) );
454 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step) );
455 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data) );
456 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[1].data) );
457 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[2].data) );
458 args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[3].data) );
459 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset) );
460 args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step) );
461 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
462 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[1]) );
463 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[2]) );
464 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[3]) );
465 args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sqsum_sum) );
// Raised for channel counts the switch does not handle; the case labels are
// not visible in this view.
468 CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
// Channel count and depth of the original image (not the converted copy) are
// forwarded as the last two arguments.
472 openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
// Convenience overload: creates a local MatchTemplateBuf and forwards to the
// five-argument overload, so the scratch buffers do not outlive this call.
478 void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method)
480 MatchTemplateBuf buf;
481 matchTemplate(image, templ, result, method, buf);
// Dispatching overload: validates the inputs, then calls the implementation
// selected by method with the caller-supplied buffer.
// Preconditions checked by the visible assertions: image and templ have the
// same type, and templ is no larger than image in either dimension.
483 void cv::ocl::matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method, MatchTemplateBuf &buf)
485 CV_Assert(image.type() == templ.type());
486 CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
488 typedef void (*Caller)(const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &);
// Table order: SQDIFF, SQDIFF_NORMED, CCORR, CCORR_NORMED, CCOFF,
// CCOFF_NORMED. method is used directly as the index, so this order must
// match the numeric values of the method constants.
490 const Caller callers[] =
492 ::matchTemplate_SQDIFF, ::matchTemplate_SQDIFF_NORMED,
493 ::matchTemplate_CCORR, ::matchTemplate_CCORR_NORMED,
494 ::matchTemplate_CCOFF, ::matchTemplate_CCOFF_NORMED
// NOTE(review): no range check on method is visible before this lookup; a
// value outside 0..5 would index past the six-entry table - confirm.
497 Caller caller = callers[method];
499 caller(image, templ, result, buf);