/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Niko Li, newlife20080214@gmail.com
//    Jia Haipeng, jiahaipeng95@gmail.com
//    Shengen Yan, yanshengen@gmail.com
//    Jiang Liyuan, jlyuan001.good@163.com
//    Rock Li, Rock.Li@amd.com
//    Zailong Wu, bullet@yeah.net
//    Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "opencl_kernels.hpp"

using namespace cv;
using namespace cv::ocl;
59 static std::vector<uchar> scalarToVector(const cv::Scalar & sc, int depth, int ocn, int cn)
61 CV_Assert(ocn == cn || (ocn == 4 && cn == 3));
63 static const int sizeMap[] = { sizeof(uchar), sizeof(char), sizeof(ushort),
64 sizeof(short), sizeof(int), sizeof(float), sizeof(double) };
66 int elemSize1 = sizeMap[depth];
67 int bufSize = elemSize1 * ocn;
68 std::vector<uchar> _buf(bufSize);
69 uchar * buf = &_buf[0];
70 scalarToRawData(sc, buf, CV_MAKE_TYPE(depth, cn));
71 memset(buf + elemSize1 * cn, 0, (ocn - cn) * elemSize1);
//////////////////////////////////////////////////////////////////////////////
/////////////// add subtract multiply divide min max /////////////////////////
//////////////////////////////////////////////////////////////////////////////
// Operation selectors accepted by arithmetic_run_generic(); the value is used
// there as an index into the table of FUNC_* build defines, so the order matters.
enum { ADD = 0, SUB, MUL, DIV, ABS, ABS_DIFF, MIN, MAX };
82 static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const Scalar & scalar, const oclMat & mask,
83 oclMat &dst, int op_type, bool use_scalar = false)
85 Context *clCxt = src1.clCxt;
86 bool hasDouble = clCxt->supportsFeature(FEATURE_CL_DOUBLE);
87 if (!hasDouble && (src1.depth() == CV_64F || src2.depth() == CV_64F || dst.depth() == CV_64F))
89 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
93 CV_Assert(src2.empty() || (!src2.empty() && src1.type() == src2.type() && src1.size() == src2.size()));
94 CV_Assert(mask.empty() || (!mask.empty() && mask.type() == CV_8UC1 && mask.size() == src1.size()));
95 CV_Assert(op_type >= ADD && op_type <= MAX);
97 dst.create(src1.size(), src1.type());
99 int oclChannels = src1.oclchannels(), depth = src1.depth();
100 int src1step1 = src1.step / src1.elemSize(), src1offset1 = src1.offset / src1.elemSize();
101 int src2step1 = src2.step / src2.elemSize(), src2offset1 = src2.offset / src2.elemSize();
102 int maskstep1 = mask.step, maskoffset1 = mask.offset / mask.elemSize();
103 int dststep1 = dst.step / dst.elemSize(), dstoffset1 = dst.offset / dst.elemSize();
104 std::vector<uchar> m;
106 size_t localThreads[3] = { 16, 16, 1 };
107 size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
109 std::string kernelName = "arithm_binary_op";
111 const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
112 const char * const WTypeMap[] = { "short", "short", "int", "int", "int", "float", "double" };
113 const char * const funcMap[] = { "FUNC_ADD", "FUNC_SUB", "FUNC_MUL", "FUNC_DIV", "FUNC_ABS", "FUNC_ABS_DIFF", "FUNC_MIN", "FUNC_MAX" };
114 const char * const channelMap[] = { "", "", "2", "4", "4" };
115 bool haveScalar = use_scalar || src2.empty();
119 WDepth = hasDouble && WDepth == CV_64F ? CV_64F : CV_32F;
121 WDepth = hasDouble ? CV_64F : CV_32F;
122 else if (op_type == MUL)
123 WDepth = hasDouble && (depth == CV_32S || depth == CV_64F) ? CV_64F : CV_32F;
125 std::string buildOptions = format("-D T=%s%s -D WT=%s%s -D convertToT=convert_%s%s%s -D %s "
126 "-D convertToWT=convert_%s%s",
127 typeMap[depth], channelMap[oclChannels],
128 WTypeMap[WDepth], channelMap[oclChannels],
129 typeMap[depth], channelMap[oclChannels], (depth >= CV_32F ? "" : (depth == CV_32S ? "_rte" : "_sat_rte")),
130 funcMap[op_type], WTypeMap[WDepth], channelMap[oclChannels]);
132 std::vector<std::pair<size_t , const void *> > args;
133 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
134 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1step1 ));
135 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1offset1 ));
139 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
140 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2step1 ));
141 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2offset1 ));
143 kernelName += "_mat";
146 buildOptions += " -D HAVE_SCALAR";
151 const int WDepthMap[] = { CV_16S, CV_16S, CV_32S, CV_32S, CV_32S, CV_32F, CV_64F };
152 m = scalarToVector(scalar, WDepthMap[WDepth], oclChannels, src1.channels());
154 args.push_back( std::make_pair( m.size(), (void *)&m[0]));
156 kernelName += "_scalar";
161 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
162 args.push_back( std::make_pair( sizeof(cl_int), (void *)&maskstep1 ));
163 args.push_back( std::make_pair( sizeof(cl_int), (void *)&maskoffset1 ));
165 kernelName += "_mask";
168 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
169 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep1 ));
170 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstoffset1 ));
172 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.cols ));
173 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
175 openCLExecuteKernel(clCxt, mask.empty() ?
176 (!src2.empty() ? &arithm_add : &arithm_add_scalar) :
177 (!src2.empty() ? &arithm_add_mask : &arithm_add_scalar_mask),
178 kernelName, globalThreads, localThreads,
179 args, -1, -1, buildOptions.c_str());
182 void cv::ocl::add(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
184 arithmetic_run_generic(src1, src2, Scalar(), mask, dst, ADD);
187 void cv::ocl::add(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
189 arithmetic_run_generic(src1, oclMat(), src2, mask, dst, ADD);
192 void cv::ocl::subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
194 arithmetic_run_generic(src1, src2, Scalar(), mask, dst, SUB);
197 void cv::ocl::subtract(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
199 arithmetic_run_generic(src1, oclMat(), src2, mask, dst, SUB);
202 void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
204 const bool use_scalar = !(std::abs(scalar - 1.0) < std::numeric_limits<double>::epsilon());
205 arithmetic_run_generic(src1, src2, Scalar::all(scalar), oclMat(), dst, MUL, use_scalar);
208 void cv::ocl::multiply(double scalar, const oclMat &src, oclMat &dst)
210 arithmetic_run_generic(src, oclMat(), Scalar::all(scalar), oclMat(), dst, MUL);
213 void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
215 const bool use_scalar = !(std::abs(scalar - 1.0) < std::numeric_limits<double>::epsilon());
216 arithmetic_run_generic(src1, src2, Scalar::all(scalar), oclMat(), dst, DIV, use_scalar);
219 void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst)
221 arithmetic_run_generic(src, oclMat(), Scalar::all(scalar), oclMat(), dst, DIV);
224 void cv::ocl::min(const oclMat &src1, const oclMat &src2, oclMat &dst)
226 arithmetic_run_generic(src1, src2, Scalar::all(0), oclMat(), dst, MIN);
229 void cv::ocl::max(const oclMat &src1, const oclMat &src2, oclMat &dst)
231 arithmetic_run_generic(src1, src2, Scalar::all(0), oclMat(), dst, MAX);
//////////////////////////////////////////////////////////////////////////////
/////////////////////////////Abs, Absdiff ////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
238 void cv::ocl::abs(const oclMat &src, oclMat &dst)
240 // explicitly uses use_scalar (even if zero) so that the correct kernel is used
241 arithmetic_run_generic(src, oclMat(), Scalar(), oclMat(), dst, ABS, true);
244 void cv::ocl::absdiff(const oclMat &src1, const oclMat &src2, oclMat &dst)
246 arithmetic_run_generic(src1, src2, Scalar(), oclMat(), dst, ABS_DIFF);
249 void cv::ocl::absdiff(const oclMat &src1, const Scalar &src2, oclMat &dst)
251 arithmetic_run_generic(src1, oclMat(), src2, oclMat(), dst, ABS_DIFF);
//////////////////////////////////////////////////////////////////////////////
/////////////////////////////////  compare ///////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
258 static void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpOp,
259 String kernelName, const cv::ocl::ProgramEntry* source)
261 dst.create(src1.size(), CV_8UC1);
263 int depth = src1.depth();
264 size_t localThreads[3] = { 64, 4, 1 };
265 size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
267 int src1step1 = src1.step1(), src1offset1 = src1.offset / src1.elemSize1();
268 int src2step1 = src2.step1(), src2offset1 = src2.offset / src2.elemSize1();
269 int dststep1 = dst.step1(), dstoffset1 = dst.offset / dst.elemSize1();
271 const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
272 const char * operationMap[] = { "==", ">", ">=", "<", "<=", "!=" };
273 std::string buildOptions = format("-D T=%s -D Operation=%s", typeMap[depth], operationMap[cmpOp]);
275 std::vector<std::pair<size_t , const void *> > args;
276 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
277 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1step1 ));
278 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1offset1 ));
279 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
280 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2step1 ));
281 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2offset1 ));
282 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
283 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep1 ));
284 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstoffset1 ));
285 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.cols ));
286 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
288 openCLExecuteKernel(src1.clCxt, source, kernelName, globalThreads, localThreads,
289 args, -1, -1, buildOptions.c_str());
292 void cv::ocl::compare(const oclMat &src1, const oclMat &src2, oclMat &dst , int cmpOp)
294 if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.depth() == CV_64F)
296 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
300 CV_Assert(src1.type() == src2.type() && src1.channels() == 1);
301 CV_Assert(cmpOp >= CMP_EQ && cmpOp <= CMP_NE);
303 compare_run(src1, src2, dst, cmpOp, "arithm_compare", &arithm_compare);
//////////////////////////////////////////////////////////////////////////////
////////////////////////////////// sum  //////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
// Reduction selectors for the sum kernels; the value is used as an index into
// the FUNC_* table of arithmetic_sum_buffer_run(), so the order matters.
enum { SUM = 0, ABS_SUM, SQR_SUM };
312 static void arithmetic_sum_buffer_run(const oclMat &src, cl_mem &dst, int groupnum, int type, int ddepth)
314 int ochannels = src.oclchannels();
315 int all_cols = src.step / src.elemSize();
316 int pre_cols = (src.offset % src.step) / src.elemSize();
317 int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1;
318 int invalid_cols = pre_cols + sec_cols;
319 int cols = all_cols - invalid_cols , elemnum = cols * src.rows;;
320 int offset = src.offset / src.elemSize();
322 const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
323 const char * const funcMap[] = { "FUNC_SUM", "FUNC_ABS_SUM", "FUNC_SQR_SUM" };
324 const char * const channelMap[] = { " ", " ", "2", "4", "4" };
325 String buildOptions = format("-D srcT=%s%s -D dstT=%s%s -D convertToDstT=convert_%s%s -D %s",
326 typeMap[src.depth()], channelMap[ochannels],
327 typeMap[ddepth], channelMap[ochannels],
328 typeMap[ddepth], channelMap[ochannels],
331 std::vector<std::pair<size_t , const void *> > args;
332 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols ));
333 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
334 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset));
335 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&elemnum));
336 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum));
337 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
338 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst ));
339 size_t globalThreads[3] = { groupnum * 256, 1, 1 };
340 size_t localThreads[3] = { 256, 1, 1 };
342 openCLExecuteKernel(src.clCxt, &arithm_sum, "arithm_op_sum", globalThreads, localThreads,
343 args, -1, -1, buildOptions.c_str());
346 template <typename T>
347 Scalar arithmetic_sum(const oclMat &src, int type, int ddepth)
349 CV_Assert(src.step % src.elemSize() == 0);
351 size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
352 CV_Assert(groupnum != 0);
354 int dbsize = groupnum * src.oclchannels();
355 Context *clCxt = src.clCxt;
357 AutoBuffer<T> _buf(dbsize);
359 memset(p, 0, dbsize * sizeof(T));
361 cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize * sizeof(T));
362 arithmetic_sum_buffer_run(src, dstBuffer, groupnum, type, ddepth);
363 openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize * sizeof(T));
364 openCLFree(dstBuffer);
366 Scalar s = Scalar::all(0.0);
367 for (int i = 0; i < dbsize;)
368 for (int j = 0; j < src.oclchannels(); j++, i++)
374 typedef Scalar (*sumFunc)(const oclMat &src, int type, int ddepth);
376 Scalar cv::ocl::sum(const oclMat &src)
378 if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
380 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
381 return Scalar::all(0);
383 static sumFunc functab[3] =
386 arithmetic_sum<float>,
387 arithmetic_sum<double>
390 int ddepth = std::max(src.depth(), CV_32S);
391 sumFunc func = functab[ddepth - CV_32S];
392 return func(src, SUM, ddepth);
395 Scalar cv::ocl::absSum(const oclMat &src)
397 if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
399 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
400 return cv::Scalar::all(0);
403 static sumFunc functab[3] =
406 arithmetic_sum<float>,
407 arithmetic_sum<double>
410 int ddepth = std::max(src.depth(), CV_32S);
411 sumFunc func = functab[ddepth - CV_32S];
412 return func(src, ABS_SUM, ddepth);
415 Scalar cv::ocl::sqrSum(const oclMat &src)
417 if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
419 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
420 return cv::Scalar::all(0);
422 static sumFunc functab[3] =
425 arithmetic_sum<float>,
426 arithmetic_sum<double>
429 int ddepth = std::max(src.depth(), CV_32S);
430 sumFunc func = functab[ddepth - CV_32S];
431 return func(src, SQR_SUM, ddepth);
//////////////////////////////////////////////////////////////////////////////
//////////////////////////////// meanStdDev //////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
438 void cv::ocl::meanStdDev(const oclMat &src, Scalar &mean, Scalar &stddev)
440 if (src.depth() == CV_64F && !src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
442 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
446 double total = 1.0 / src.size().area();
449 stddev = sqrSum(src);
451 for (int i = 0; i < 4; ++i)
454 stddev[i] = std::sqrt(std::max(stddev[i] * total - mean.val[i] * mean.val[i] , 0.));
//////////////////////////////////////////////////////////////////////////////
//////////////////////////////////// minMax  /////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
462 template <typename T, typename WT>
463 static void arithmetic_minMax_run(const oclMat &src, const oclMat & mask, cl_mem &dst, int groupnum, String kernelName)
465 int all_cols = src.step / src.elemSize();
466 int pre_cols = (src.offset % src.step) / src.elemSize();
467 int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1;
468 int invalid_cols = pre_cols + sec_cols;
469 int cols = all_cols - invalid_cols , elemnum = cols * src.rows;
470 int offset = src.offset / src.elemSize();
472 const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
473 const char * const channelMap[] = { " ", " ", "2", "4", "4" };
475 std::ostringstream stream;
476 stream << "-D T=" << typeMap[src.depth()] << channelMap[src.channels()];
477 if (std::numeric_limits<T>::is_integer)
479 stream << " -D MAX_VAL=" << (WT)std::numeric_limits<T>::max();
480 stream << " -D MIN_VAL=" << (WT)std::numeric_limits<T>::min();
483 stream << " -D DEPTH_" << src.depth();
484 std::string buildOptions = stream.str();
486 std::vector<std::pair<size_t , const void *> > args;
487 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
488 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst ));
489 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols ));
490 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
491 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset));
492 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&elemnum));
493 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum));
495 int minvalid_cols = 0, moffset = 0;
498 int mall_cols = mask.step / mask.elemSize();
499 int mpre_cols = (mask.offset % mask.step) / mask.elemSize();
500 int msec_cols = mall_cols - (mask.offset % mask.step + mask.cols * mask.elemSize() - 1) / mask.elemSize() - 1;
501 minvalid_cols = mpre_cols + msec_cols;
502 moffset = mask.offset / mask.elemSize();
504 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&mask.data ));
505 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&minvalid_cols ));
506 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&moffset ));
508 kernelName = kernelName + "_mask";
511 size_t globalThreads[3] = {groupnum * 256, 1, 1};
512 size_t localThreads[3] = {256, 1, 1};
514 openCLExecuteKernel(src.clCxt, &arithm_minMax, kernelName, globalThreads, localThreads,
515 args, -1, -1, buildOptions.c_str());
518 template <typename T, typename WT>
519 void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
521 size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
522 CV_Assert(groupnum != 0);
524 int dbsize = groupnum * 2 * src.elemSize();
526 ensureSizeIsEnough(1, dbsize, CV_8UC1, buf);
528 cl_mem buf_data = reinterpret_cast<cl_mem>(buf.data);
529 arithmetic_minMax_run<T, WT>(src, mask, buf_data, groupnum, "arithm_op_minMax");
531 Mat matbuf = Mat(buf);
532 T *p = matbuf.ptr<T>();
535 *minVal = std::numeric_limits<double>::max();
536 for (int i = 0, end = src.oclchannels() * (int)groupnum; i < end; i++)
537 *minVal = *minVal < p[i] ? *minVal : p[i];
541 *maxVal = -std::numeric_limits<double>::max();
542 for (int i = src.oclchannels() * (int)groupnum, end = i << 1; i < end; i++)
543 *maxVal = *maxVal > p[i] ? *maxVal : p[i];
547 typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask);
549 void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
551 CV_Assert(src.channels() == 1);
552 CV_Assert(src.size() == mask.size() || mask.empty());
553 CV_Assert(src.step % src.elemSize() == 0);
555 if (minVal == NULL && maxVal == NULL)
558 if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
560 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
564 static minMaxFunc functab[] =
566 arithmetic_minMax<uchar, int>,
567 arithmetic_minMax<char, int>,
568 arithmetic_minMax<ushort, int>,
569 arithmetic_minMax<short, int>,
570 arithmetic_minMax<int, int>,
571 arithmetic_minMax<float, float>,
572 arithmetic_minMax<double, double>,
576 minMaxFunc func = functab[src.depth()];
577 CV_Assert(func != 0);
579 func(src, minVal, maxVal, mask);
//////////////////////////////////////////////////////////////////////////////
/////////////////////////////////// norm /////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
586 double cv::ocl::norm(const oclMat &src1, int normType)
588 CV_Assert((normType & NORM_RELATIVE) == 0);
589 return norm(src1, oclMat(), normType);
592 static void arithm_absdiff_nonsaturate_run(const oclMat & src1, const oclMat & src2, oclMat & diff, int ntype)
594 Context *clCxt = src1.clCxt;
595 if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.depth() == CV_64F)
597 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
600 CV_Assert(src1.step % src1.elemSize() == 0 && (src2.empty() || src2.step % src2.elemSize() == 0));
602 int ddepth = std::max(src1.depth(), CV_32S);
603 if (ntype == NORM_L2)
604 ddepth = std::max<int>(CV_32F, ddepth);
606 diff.create(src1.size(), CV_MAKE_TYPE(ddepth, src1.channels()));
607 CV_Assert(diff.step % diff.elemSize() == 0);
609 int oclChannels = src1.oclchannels(), sdepth = src1.depth();
610 int src1step1 = src1.step / src1.elemSize(), src1offset1 = src1.offset / src1.elemSize();
611 int src2step1 = src2.step / src2.elemSize(), src2offset1 = src2.offset / src2.elemSize();
612 int diffstep1 = diff.step / diff.elemSize(), diffoffset1 = diff.offset / diff.elemSize();
614 String kernelName = "arithm_absdiff_nonsaturate";
615 size_t localThreads[3] = { 16, 16, 1 };
616 size_t globalThreads[3] = { diff.cols, diff.rows, 1 };
618 const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
619 const char * const channelMap[] = { "", "", "2", "4", "4" };
621 std::string buildOptions = format("-D srcT=%s%s -D dstT=%s%s -D convertToDstT=convert_%s%s",
622 typeMap[sdepth], channelMap[oclChannels],
623 typeMap[ddepth], channelMap[oclChannels],
624 typeMap[ddepth], channelMap[oclChannels]);
626 std::vector<std::pair<size_t , const void *> > args;
627 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
628 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1step1 ));
629 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1offset1 ));
633 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
634 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2step1 ));
635 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2offset1 ));
637 kernelName = kernelName + "_binary";
640 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&diff.data ));
641 args.push_back( std::make_pair( sizeof(cl_int), (void *)&diffstep1 ));
642 args.push_back( std::make_pair( sizeof(cl_int), (void *)&diffoffset1 ));
644 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.cols ));
645 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
647 openCLExecuteKernel(clCxt, &arithm_absdiff_nonsaturate,
648 kernelName, globalThreads, localThreads,
649 args, -1, -1, buildOptions.c_str());
652 double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)
654 if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.depth() == CV_64F)
656 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
659 CV_Assert(src2.empty() || (src1.type() == src2.type() && src1.size() == src2.size()));
661 bool isRelative = (normType & NORM_RELATIVE) != 0;
662 normType &= NORM_TYPE_MASK;
663 CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
666 int cn = src1.channels();
670 arithm_absdiff_nonsaturate_run(src1, src2, diff, normType);
675 diff = diff.reshape(1);
676 minMax(diff, NULL, &r);
680 for (int i = 0; i < cn; ++i)
685 for (int i = 0; i < cn; ++i)
691 r = r / (norm(src2, normType) + DBL_EPSILON);
//////////////////////////////////////////////////////////////////////////////
////////////////////////////////// flip //////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
// Flip selectors for arithmetic_flip_run(). They are bit flags: FLIP_BOTH is
// the union of the two single-axis flags, and the code tests `flipType & FLIP_ROWS`.
enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS };
702 static void arithmetic_flip_run(const oclMat &src, oclMat &dst, String kernelName, int flipType)
704 int cols = dst.cols, rows = dst.rows;
705 if ((cols == 1 && flipType == FLIP_COLS) ||
706 (rows == 1 && flipType == FLIP_ROWS) ||
707 (rows == 1 && cols == 1 && flipType == FLIP_BOTH))
713 cols = flipType == FLIP_COLS ? divUp(cols, 2) : cols;
714 rows = flipType & FLIP_ROWS ? divUp(rows, 2) : rows;
716 const char * const channelMap[] = { "", "", "2", "4", "4" };
717 const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
718 std::string buildOptions = format("-D T=%s%s", typeMap[dst.depth()], channelMap[dst.oclchannels()]);
720 size_t localThreads[3] = { 64, 4, 1 };
721 size_t globalThreads[3] = { cols, rows, 1 };
723 int elemSize = src.elemSize();
724 int src_step = src.step / elemSize, src_offset = src.offset / elemSize;
725 int dst_step = dst.step / elemSize, dst_offset = dst.offset / elemSize;
727 std::vector<std::pair<size_t , const void *> > args;
728 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
729 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step ));
730 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset ));
731 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
732 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step ));
733 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset ));
734 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
735 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols ));
736 args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows ));
737 args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
739 openCLExecuteKernel(src.clCxt, &arithm_flip, kernelName, globalThreads, localThreads, args,
740 -1, -1, buildOptions.c_str());
743 void cv::ocl::flip(const oclMat &src, oclMat &dst, int flipCode)
745 if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
747 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
751 dst.create(src.size(), src.type());
754 arithmetic_flip_run(src, dst, "arithm_flip_rows", FLIP_ROWS);
755 else if (flipCode > 0)
756 arithmetic_flip_run(src, dst, "arithm_flip_cols", FLIP_COLS);
758 arithmetic_flip_run(src, dst, "arithm_flip_rows_cols", FLIP_BOTH);
//////////////////////////////////////////////////////////////////////////////
////////////////////////////////// LUT  //////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
765 static void arithmetic_lut_run(const oclMat &src, const oclMat &lut, oclMat &dst, String kernelName)
767 int sdepth = src.depth();
768 int src_step1 = src.step1(), dst_step1 = dst.step1();
769 int src_offset1 = src.offset / src.elemSize1(), dst_offset1 = dst.offset / dst.elemSize1();
770 int lut_offset1 = lut.offset / lut.elemSize1() + (sdepth == CV_8U ? 0 : 128) * lut.channels();
771 int cols1 = src.cols * src.oclchannels();
773 size_t localSize[] = { 16, 16, 1 };
774 size_t globalSize[] = { lut.channels() == 1 ? cols1 : src.cols, src.rows, 1 };
776 const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
777 std::string buildOptions = format("-D srcT=%s -D dstT=%s", typeMap[sdepth], typeMap[dst.depth()]);
779 std::vector<std::pair<size_t , const void *> > args;
780 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
781 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&lut.data ));
782 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
783 args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols1));
784 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
785 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset1 ));
786 args.push_back( std::make_pair( sizeof(cl_int), (void *)&lut_offset1 ));
787 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset1 ));
788 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step1 ));
789 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
791 openCLExecuteKernel(src.clCxt, &arithm_LUT, kernelName, globalSize, localSize,
792 args, lut.oclchannels(), -1, buildOptions.c_str());
795 void cv::ocl::LUT(const oclMat &src, const oclMat &lut, oclMat &dst)
797 if (!lut.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && lut.depth() == CV_64F)
799 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
803 int cn = src.channels(), depth = src.depth();
805 CV_Assert(depth == CV_8U || depth == CV_8S);
806 CV_Assert(lut.channels() == 1 || lut.channels() == src.channels());
807 CV_Assert(lut.rows == 1 && lut.cols == 256);
809 dst.create(src.size(), CV_MAKETYPE(lut.depth(), cn));
810 arithmetic_lut_run(src, lut, dst, "LUT");
//////////////////////////////////////////////////////////////////////////////
//////////////////////////////// exp log /////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
// Shared host-side launcher for the element-wise exp/log kernels.
//   src        - input; depth must be CV_32F or CV_64F (asserted below).
//   dst        - output; (re)created with the size and type of src.
//   kernelName - name of the kernel to run.
//   source     - program entry the kernel is taken from.
// Raises Error::OpenCLDoubleNotSupported for CV_64F input when the context
// does not report FEATURE_CL_DOUBLE.
817 static void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, String kernelName, const cv::ocl::ProgramEntry* source)
819 Context *clCxt = src.clCxt;
820 if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
822 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
826 CV_Assert( src.depth() == CV_32F || src.depth() == CV_64F);
827 dst.create(src.size(), src.type());
829 int ddepth = dst.depth();
// Row width counted in single-channel elements.
830 int cols1 = src.cols * src.oclchannels();
// Offsets and steps are passed to the kernel in elemSize1() units, not bytes.
831 int srcoffset1 = src.offset / src.elemSize1(), dstoffset1 = dst.offset / dst.elemSize1();
832 int srcstep1 = src.step1(), dststep1 = dst.step1();
834 size_t localThreads[3] = { 64, 4, 1 };
// NOTE(review): the global size uses dst.cols while the kernel receives cols1
// (cols * channels) - confirm the kernel covers multi-channel rows.
835 size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
// srcT is the only build-time type parameter; anything other than CV_32F maps to double.
837 std::string buildOptions = format("-D srcT=%s",
838 ddepth == CV_32F ? "float" : "double");
// Argument order below must match the kernel signature (not visible here).
840 std::vector<std::pair<size_t , const void *> > args;
841 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
842 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
843 args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols1 ));
844 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
845 args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcoffset1 ));
846 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstoffset1 ));
847 args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcstep1 ));
848 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep1 ));
850 openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads,
851 args, src.oclchannels(), -1, buildOptions.c_str());
// Runs the "arithm_exp" kernel from the arithm_exp program on src, writing dst.
// Type checks and dst allocation happen inside arithmetic_exp_log_run.
854 void cv::ocl::exp(const oclMat &src, oclMat &dst)
856 arithmetic_exp_log_run(src, dst, "arithm_exp", &arithm_exp);
// Runs the "arithm_log" kernel from the arithm_log program on src, writing dst.
// Type checks and dst allocation happen inside arithmetic_exp_log_run.
859 void cv::ocl::log(const oclMat &src, oclMat &dst)
861 arithmetic_exp_log_run(src, dst, "arithm_log", &arithm_log);
864 //////////////////////////////////////////////////////////////////////////////
865 ////////////////////////////// magnitude phase ///////////////////////////////
866 //////////////////////////////////////////////////////////////////////////////
// Launches a kernel from the arithm_magnitude program on two inputs.
//   src1, src2 - inputs; their data, step and offset are passed to the kernel as-is.
//   dst        - output; must already be allocated by the caller (not created here).
//   kernelName - name of the kernel to run.
// The kernel is dispatched by dst's depth (last argument of openCLExecuteKernel).
868 static void arithmetic_magnitude_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName)
870 int channels = dst.oclchannels();
871 int depth = dst.depth();
// With vector_length fixed at 1 the mask below is 0, so offset_cols is always 0.
873 size_t vector_length = 1;
874 int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1);
875 int cols = divUp(dst.cols * channels + offset_cols, vector_length);
877 size_t localThreads[3] = { 64, 4, 1 };
878 size_t globalThreads[3] = { cols, dst.rows, 1 };
// Each matrix is passed as (buffer, step, offset); rows and cols come last.
880 std::vector<std::pair<size_t , const void *> > args;
881 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
882 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
883 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
884 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
885 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
886 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
887 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
888 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
889 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
890 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
891 args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
893 openCLExecuteKernel(src1.clCxt, &arithm_magnitude, kernelName, globalThreads, localThreads, args, -1, depth);
// Runs the "arithm_magnitude" kernel on src1 and src2, writing dst.
//   src1, src2 - inputs of identical type and size, depth CV_32F or CV_64F
//                (asserted below).
//   dst        - output; (re)created with src1's size and type.
// Raises Error::OpenCLDoubleNotSupported for CV_64F input when src1's context
// does not report FEATURE_CL_DOUBLE.
896 void cv::ocl::magnitude(const oclMat &src1, const oclMat &src2, oclMat &dst)
898 if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src1.depth() == CV_64F)
900 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
904 CV_Assert(src1.type() == src2.type() && src1.size() == src2.size() &&
905 (src1.depth() == CV_32F || src1.depth() == CV_64F));
907 dst.create(src1.size(), src1.type());
908 arithmetic_magnitude_phase_run(src1, src2, dst, "arithm_magnitude");
// Launches a phase kernel on two inputs.
//   src1, src2 - inputs.
//   dst        - output; must already be allocated by the caller (not created here).
//   kernelName - name of the kernel to run.
//   source     - program entry the kernel is taken from.
// The kernel is dispatched by dst's depth (last argument of openCLExecuteKernel).
911 static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName, const cv::ocl::ProgramEntry* source)
// cols1 is the row width in single-channel elements; all steps and offsets
// below are converted from bytes to elemSize1() units before being passed on.
913 int depth = dst.depth(), cols1 = src1.cols * src1.oclchannels();
914 int src1step1 = src1.step / src1.elemSize1(), src1offset1 = src1.offset / src1.elemSize1();
915 int src2step1 = src2.step / src2.elemSize1(), src2offset1 = src2.offset / src2.elemSize1();
916 int dststep1 = dst.step / dst.elemSize1(), dstoffset1 = dst.offset / dst.elemSize1();
918 size_t localThreads[3] = { 64, 4, 1 };
919 size_t globalThreads[3] = { cols1, dst.rows, 1 };
// Each matrix is passed as (buffer, step, offset); cols and rows come last.
921 std::vector<std::pair<size_t , const void *> > args;
922 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
923 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1step1 ));
924 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1offset1 ));
925 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
926 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2step1 ));
927 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2offset1 ));
928 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
929 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep1 ));
930 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstoffset1 ));
931 args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols1 ));
932 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
934 openCLExecuteKernel(src1.clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth);
// Runs a phase kernel from the arithm_phase program on x and y, writing Angle.
//   x, y           - inputs of identical type and size, depth CV_32F or CV_64F;
//                    their step must be a multiple of elemSize() (asserted below).
//   Angle          - output; (re)created with x's size and type.
//   angleInDegrees - selects "arithm_phase_indegrees" when true,
//                    "arithm_phase_inradians" otherwise.
// Raises Error::OpenCLDoubleNotSupported for CV_64F input when x's context
// does not report FEATURE_CL_DOUBLE.
937 void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle, bool angleInDegrees)
939 if (!x.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && x.depth() == CV_64F)
941 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
945 CV_Assert(x.type() == y.type() && x.size() == y.size() && (x.depth() == CV_32F || x.depth() == CV_64F));
946 CV_Assert(x.step % x.elemSize() == 0 && y.step % y.elemSize() == 0);
948 Angle.create(x.size(), x.type());
949 arithmetic_phase_run(x, y, Angle, angleInDegrees ? "arithm_phase_indegrees" : "arithm_phase_inradians", &arithm_phase);
952 //////////////////////////////////////////////////////////////////////////////
953 ////////////////////////////////// cartToPolar ///////////////////////////////
954 //////////////////////////////////////////////////////////////////////////////
// Launches a kernel from the arithm_cartToPolar program.
//   src1, src2     - inputs.
//   dst_mag        - first output; must already be allocated by the caller.
//   dst_cart       - second output; must already be allocated by the caller
//                    (the public caller in this file passes its `angle` matrix here).
//   kernelName     - name of the kernel to run.
//   angleInDegrees - forwarded to the kernel as an integer flag (1 or 0).
// The kernel is dispatched by src1's depth.
956 static void arithmetic_cartToPolar_run(const oclMat &src1, const oclMat &src2, oclMat &dst_mag, oclMat &dst_cart,
957 String kernelName, bool angleInDegrees)
959 int channels = src1.oclchannels();
960 int depth = src1.depth();
// Row width counted in single-channel elements.
962 int cols = src1.cols * channels;
964 size_t localThreads[3] = { 64, 4, 1 };
965 size_t globalThreads[3] = { cols, src1.rows, 1 };
// bool is widened to int so it can be passed as a cl_int argument.
967 int tmp = angleInDegrees ? 1 : 0;
// Each matrix is passed as (buffer, step, offset); rows, cols and the flag come last.
968 std::vector<std::pair<size_t , const void *> > args;
969 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
970 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
971 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
972 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
973 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
974 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
975 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst_mag.data ));
976 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_mag.step ));
977 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_mag.offset ));
978 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst_cart.data ));
979 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_cart.step ));
980 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_cart.offset ));
981 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
982 args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
983 args.push_back( std::make_pair( sizeof(cl_int), (void *)&tmp ));
985 openCLExecuteKernel(src1.clCxt, &arithm_cartToPolar, kernelName, globalThreads, localThreads, args, -1, depth);
// Runs the "arithm_cartToPolar" kernel on x and y, writing mag and angle.
//   x, y           - inputs of identical type and size, depth CV_32F or CV_64F
//                    (asserted below).
//   mag, angle     - outputs; both (re)created with x's size and type.
//   angleInDegrees - forwarded to the kernel launcher.
// Raises Error::OpenCLDoubleNotSupported for CV_64F input when x's context
// does not report FEATURE_CL_DOUBLE.
988 void cv::ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &mag, oclMat &angle, bool angleInDegrees)
990 if (!x.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && x.depth() == CV_64F)
992 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
996 CV_Assert(x.type() == y.type() && x.size() == y.size() && (x.depth() == CV_32F || x.depth() == CV_64F));
998 mag.create(x.size(), x.type());
999 angle.create(x.size(), x.type());
1001 arithmetic_cartToPolar_run(x, y, mag, angle, "arithm_cartToPolar", angleInDegrees);
1004 //////////////////////////////////////////////////////////////////////////////
1005 ////////////////////////////////// polarToCart ///////////////////////////////
1006 //////////////////////////////////////////////////////////////////////////////
// Launches a kernel from the arithm_polarToCart program.
//   src1, src2     - inputs; geometry (rows, cols, depth) is taken from src2.
//   dst1, dst2     - outputs; must already be allocated by the caller.
//   angleInDegrees - forwarded to the kernel as an integer flag (1 or 0).
// NOTE(review): the parameter list continues on a line not visible in this
// excerpt; `kernelName`, used in the launch call below, is presumably declared there.
1008 static void arithmetic_ptc_run(const oclMat &src1, const oclMat &src2, oclMat &dst1, oclMat &dst2, bool angleInDegrees,
1011 int channels = src2.oclchannels();
1012 int depth = src2.depth();
// Row width counted in single-channel elements.
1014 int cols = src2.cols * channels;
1015 int rows = src2.rows;
1017 size_t localThreads[3] = { 64, 4, 1 };
1018 size_t globalThreads[3] = { cols, rows, 1 };
// bool is widened to int so it can be passed as a cl_int argument.
1020 int tmp = angleInDegrees ? 1 : 0;
1021 std::vector<std::pair<size_t , const void *> > args;
// NOTE(review): the src1 arguments may be pushed conditionally - the lines
// around them are not visible here; confirm against the full file.
1024 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
1025 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
1026 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
1028 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
1029 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
1030 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
1031 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst1.data ));
1032 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst1.step ));
1033 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst1.offset ));
1034 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst2.data ));
1035 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst2.step ));
1036 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst2.offset ));
1037 args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows ));
1038 args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
1039 args.push_back( std::make_pair( sizeof(cl_int), (void *)&tmp ));
1041 openCLExecuteKernel(src1.clCxt, &arithm_polarToCart, kernelName, globalThreads, localThreads, args, -1, depth);
// Runs a polarToCart kernel on magnitude and angle, writing x and y.
//   magnitude, angle - inputs of identical size and type; angle must have
//                      depth CV_32F or CV_64F (asserted below).
//   x, y             - outputs; both (re)created with angle's size and type.
//   angleInDegrees   - forwarded to the kernel launcher.
// Raises Error::OpenCLDoubleNotSupported when magnitude is CV_64F and its
// context does not report FEATURE_CL_DOUBLE.
1044 void cv::ocl::polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat &x, oclMat &y, bool angleInDegrees)
1046 if (!magnitude.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && magnitude.depth() == CV_64F)
1048 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
1052 CV_Assert(angle.depth() == CV_32F || angle.depth() == CV_64F);
1053 CV_Assert(magnitude.size() == angle.size() && magnitude.type() == angle.type());
1055 x.create(angle.size(), angle.type());
1056 y.create(angle.size(), angle.type());
// The "_mag" kernel variant is used when a magnitude buffer is present.
// NOTE(review): the second call is presumably the else branch - the `else`
// line is not visible in this excerpt.
1058 if ( magnitude.data )
1059 arithmetic_ptc_run(magnitude, angle, x, y, angleInDegrees, "arithm_polarToCart_mag");
1061 arithmetic_ptc_run(magnitude, angle, x, y, angleInDegrees, "arithm_polarToCart");
1064 //////////////////////////////////////////////////////////////////////////////
1065 /////////////////////////////////// minMaxLoc ////////////////////////////////
1066 //////////////////////////////////////////////////////////////////////////////
// Launches the unmasked "arithm_op_minMaxLoc" reduction kernel.
//   src      - input matrix.
//   dst      - device buffer that receives the per-group partial results.
//   vlen     - number of elements the kernel handles per vector.
//   groupnum - number of work-groups; each has 256 work-items.
1068 static void arithmetic_minMaxLoc_run(const oclMat &src, cl_mem &dst, int vlen , int groupnum)
1070 std::vector<std::pair<size_t , const void *> > args;
// All column quantities below are counted in vectors of vlen elements.
// all_cols: vectors per full row (step); pre_cols / sec_cols: vectors lying
// before / after the ROI inside a row; cols: vectors that overlap the ROI.
1071 int all_cols = src.step / (vlen * src.elemSize1());
1072 int pre_cols = (src.offset % src.step) / (vlen * src.elemSize1());
1073 int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize1() - 1) / (vlen * src.elemSize1()) - 1;
1074 int invalid_cols = pre_cols + sec_cols;
1075 int cols = all_cols - invalid_cols , elemnum = cols * src.rows;;
1076 int offset = src.offset / (vlen * src.elemSize1());
// repeat_s / repeat_e: elements of the first / last vector of a row that fall
// outside the ROI because the ROI is not aligned to a vlen boundary.
1077 int repeat_s = src.offset / src.elemSize1() - offset * vlen;
1078 int repeat_e = (offset + cols) * vlen - src.offset / src.elemSize1() - src.cols;
1079 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols ));
1080 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
1081 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset));
1082 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&elemnum));
1083 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum));
1084 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
1085 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst ));
// NOTE(review): sprintf into a fixed 50-byte buffer is unbounded; the fixed
// text is 33 characters, leaving 16 for the three numbers - confirm the value
// ranges or switch to snprintf/format.
1086 char build_options[50];
1087 sprintf(build_options, "-D DEPTH_%d -D REPEAT_S%d -D REPEAT_E%d", src.depth(), repeat_s, repeat_e);
1088 size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1};
1089 openCLExecuteKernel(src.clCxt, &arithm_minMaxLoc, "arithm_op_minMaxLoc", gt, lt, args, -1, -1, build_options);
// Launches the masked "arithm_op_minMaxLoc_mask" reduction kernel.
//   src      - input matrix; work is only set up when it has one OpenCL channel.
//   mask     - mask matrix, passed to the kernel alongside src.
//   dst      - device buffer that receives the per-group partial results.
//   vlen     - number of elements the kernel handles per vector.
//   groupnum - number of work-groups; each has 256 work-items.
1092 static void arithmetic_minMaxLoc_mask_run(const oclMat &src, const oclMat &mask, cl_mem &dst, int vlen, int groupnum)
1094 std::vector<std::pair<size_t , const void *> > args;
1095 size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1};
// NOTE(review): unbounded sprintf into a fixed 50-byte buffer (see below).
1096 char build_options[50];
1097 if (src.oclchannels() == 1)
// cols: vectors of vlen elements needed per ROI row (rounded up);
// invalid_cols / minvalid_cols: remaining vectors per row of src / mask.
1099 int cols = (src.cols - 1) / vlen + 1;
1100 int invalid_cols = src.step / (vlen * src.elemSize1()) - cols;
// Unlike the unmasked launcher, offsets here are in elements, not vectors.
1101 int offset = src.offset / src.elemSize1();
// repeat_me: padding elements in the last vector when mask.cols is not a
// multiple of vlen (0 when it is).
1102 int repeat_me = vlen - (mask.cols % vlen == 0 ? vlen : mask.cols % vlen);
1103 int minvalid_cols = mask.step / (vlen * mask.elemSize1()) - cols;
1104 int moffset = mask.offset / mask.elemSize1();
1105 int elemnum = cols * src.rows;
1106 sprintf(build_options, "-D DEPTH_%d -D REPEAT_E%d", src.depth(), repeat_me);
1107 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols ));
1108 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
1109 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset));
1110 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&elemnum));
1111 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum));
1112 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
1113 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&minvalid_cols ));
1114 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&moffset ));
1115 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&mask.data ));
1116 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst ));
1118 openCLExecuteKernel(src.clCxt, &arithm_minMaxLoc_mask, "arithm_op_minMaxLoc_mask", gt, lt, args, -1, -1, build_options);
// Host-side driver for minMaxLoc: runs the reduction kernel, reads back the
// per-group partial results and reduces them to the final values/locations.
//   T              - element type of the host/device result buffer.
//   src            - single-channel input (asserted below).
//   minVal, maxVal - receive the extreme values.
//                    NOTE(review): both are dereferenced unconditionally, so
//                    they must not be null.
//   minLoc, maxLoc - receive the positions relative to the ROI, or (-1, -1).
//                    NOTE(review): the guarding conditions are on lines not
//                    visible in this excerpt.
//   mask           - optional mask, forwarded to the masked kernel launcher.
1122 template <typename T>
1123 void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
1124 Point *minLoc, Point *maxLoc, const oclMat &mask)
1126 CV_Assert(src.oclchannels() == 1);
// One work-group per compute unit reported by the device.
1127 size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
1128 CV_Assert(groupnum != 0);
1129 int minloc = -1 , maxloc = -1;
// Result buffer holds 4 sections of (groupnum * vlen) values each: mins, maxs,
// min locations and max locations, as indexed by the two loops below.
1130 int vlen = 4, dbsize = groupnum * vlen * 4 * sizeof(T) ;
1131 Context *clCxt = src.clCxt;
1132 cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize);
1133 *minVal = std::numeric_limits<double>::max() , *maxVal = -std::numeric_limits<double>::max();
// NOTE(review): the branch choosing between the two launchers is not visible
// here; presumably the masked variant runs only when mask is non-empty.
1136 arithmetic_minMaxLoc_run(src, dstBuffer, vlen, groupnum);
1138 arithmetic_minMaxLoc_mask_run(src, mask, dstBuffer, vlen, groupnum);
1140 AutoBuffer<T> _buf(groupnum * vlen * 4);
// `p` is declared on a line not visible here; presumably it points at _buf.
1142 memset(p, 0, dbsize);
1144 openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize);
// Fold the partial minima; an entry whose location slot is -1 is ignored.
1145 for (int i = 0; i < vlen * (int)groupnum; i++)
1147 *minVal = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? *minVal : p[i];
1148 minloc = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? minloc : cvRound(p[i + 2 * vlen * groupnum]);
// Fold the partial maxima the same way from the second section.
1150 for (int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
1152 *maxVal = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? *maxVal : p[i];
1153 maxloc = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? maxloc : cvRound(p[i + 2 * vlen * groupnum]);
// Locations are linear element indices into the whole buffer; convert them to
// ROI-relative (x, y) by removing the rows/columns that precede the ROI.
1156 int pre_rows = src.offset / src.step;
1157 int pre_cols = (src.offset % src.step) / src.elemSize1();
1158 int wholecols = src.step / src.elemSize1();
1163 minLoc->y = minloc / wholecols - pre_rows;
1164 minLoc->x = minloc % wholecols - pre_cols;
1167 minLoc->x = minLoc->y = -1;
1173 maxLoc->y = maxloc / wholecols - pre_rows;
1174 maxLoc->x = maxloc % wholecols - pre_cols;
1177 maxLoc->x = maxLoc->y = -1;
1180 openCLSafeCall(clReleaseMemObject(dstBuffer));
// Pointer type matching the arithmetic_minMaxLoc<T> instantiations, used for
// the dispatch table in cv::ocl::minMaxLoc.
1183 typedef void (*minMaxLocFunc)(const oclMat &src, double *minVal, double *maxVal,
1184 Point *minLoc, Point *maxLoc, const oclMat &mask);
// Public entry point: finds the minimum/maximum of src and their locations.
// Arguments are forwarded unchanged to arithmetic_minMaxLoc<T>.
// Raises Error::OpenCLDoubleNotSupported for CV_64F input when the context
// does not report FEATURE_CL_DOUBLE.
1186 void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
1187 Point *minLoc, Point *maxLoc, const oclMat &mask)
1189 if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
1191 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
// Index 0 (float) is used without double support, index 1 (double) with it.
1195 static minMaxLocFunc functab[2] =
1197 arithmetic_minMaxLoc<float>,
1198 arithmetic_minMaxLoc<double>
// `func` is declared on a line not visible in this excerpt.
1202 func = functab[(int)src.clCxt->supportsFeature(FEATURE_CL_DOUBLE)];
1203 func(src, minVal, maxVal, minLoc, maxLoc, mask);
1206 //////////////////////////////////////////////////////////////////////////////
1207 ///////////////////////////// countNonZero ///////////////////////////////////
1208 //////////////////////////////////////////////////////////////////////////////
// Launches a kernel from the arithm_nonzero program.
//   src        - input matrix.
//   dst        - device buffer that receives the per-group results.
//   groupnum   - number of work-groups; each has 256 work-items.
//   kernelName - name of the kernel to run.
1210 static void arithmetic_countNonZero_run(const oclMat &src, cl_mem &dst, int groupnum, String kernelName)
1212 int ochannels = src.oclchannels();
// All column quantities are counted in whole elements (elemSize() bytes).
// pre_cols / sec_cols: elements before / after the ROI inside a row.
1213 int all_cols = src.step / src.elemSize();
1214 int pre_cols = (src.offset % src.step) / src.elemSize();
1215 int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1;
1216 int invalid_cols = pre_cols + sec_cols;
1217 int cols = all_cols - invalid_cols , elemnum = cols * src.rows;;
1218 int offset = src.offset / src.elemSize();
// typeMap is indexed by depth, channelMap by OpenCL channel count
// (a count of 3 maps to the "4" suffix).
1220 const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
1221 const char * const channelMap[] = { " ", " ", "2", "4", "4" };
1222 String buildOptions = format("-D srcT=%s%s -D dstT=int%s", typeMap[src.depth()], channelMap[ochannels],
1223 channelMap[ochannels]);
1225 std::vector<std::pair<size_t , const void *> > args;
1226 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols ));
1227 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
1228 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset));
1229 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&elemnum));
1230 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum));
1231 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
1232 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst ));
1234 size_t globalThreads[3] = { groupnum * 256, 1, 1 };
1235 size_t localThreads[3] = { 256, 1, 1 };
1237 openCLExecuteKernel(src.clCxt, &arithm_nonzero, kernelName, globalThreads, localThreads,
1238 args, -1, -1, buildOptions.c_str());
// Counts non-zero elements of a single-channel matrix.
//   src - input; must have one channel and a step that is a multiple of
//         elemSize() (asserted below).
// Raises Error::OpenCLDoubleNotSupported for CV_64F input when the context
// does not report FEATURE_CL_DOUBLE.
// NOTE(review): the loop body and the return statement are on lines not
// visible in this excerpt; presumably the per-group counts are summed into
// `nonzero` and returned.
1241 int cv::ocl::countNonZero(const oclMat &src)
1243 CV_Assert(src.step % src.elemSize() == 0);
1244 CV_Assert(src.channels() == 1);
1246 Context *clCxt = src.clCxt;
1247 if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
1249 CV_Error(Error::OpenCLDoubleNotSupported, "selected device doesn't support double");
// One work-group per compute unit, and one int result slot per group.
1253 size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
1254 CV_Assert(groupnum != 0);
1255 int dbsize = groupnum;
1257 String kernelName = "arithm_op_nonzero";
1259 AutoBuffer<int> _buf(dbsize);
1260 int *p = (int*)_buf, nonzero = 0;
1261 memset(p, 0, dbsize * sizeof(int));
1263 cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize * sizeof(int));
1264 arithmetic_countNonZero_run(src, dstBuffer, groupnum, kernelName);
1265 openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize * sizeof(int));
1267 for (int i = 0; i < dbsize; i++)
1270 openCLSafeCall(clReleaseMemObject(dstBuffer));
1275 //////////////////////////////////////////////////////////////////////////////
1276 ////////////////////////////////bitwise_op////////////////////////////////////
1277 //////////////////////////////////////////////////////////////////////////////
// Launches a unary bitwise kernel.
//   src1       - input matrix.
//   dst        - output; (re)created here with src1's size and type.
//   kernelName - name of the kernel to run.
//   source     - program entry the kernel is taken from.
// The kernel is dispatched by dst's depth.
1279 static void bitwise_unary_run(const oclMat &src1, oclMat &dst, String kernelName, const cv::ocl::ProgramEntry* source)
1281 dst.create(src1.size(), src1.type());
1283 int channels = dst.oclchannels();
1284 int depth = dst.depth();
// Vector width per [channels - 1][depth]: 4 for depths 0..3, 1 for depths 4..6;
// all four channel rows are identical.
1286 int vector_lengths[4][7] = {{4, 4, 4, 4, 1, 1, 1},
1287 {4, 4, 4, 4, 1, 1, 1},
1288 {4, 4, 4, 4, 1, 1, 1},
1289 {4, 4, 4, 4, 1, 1, 1}
1292 size_t vector_length = vector_lengths[channels - 1][depth];
// offset_cols: how far (in elemSize1() units) dst's start is past a
// vector_length boundary; cols: work-items needed per row after rounding up.
1293 int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1);
1294 int cols = divUp(dst.cols * channels + offset_cols, vector_length);
1296 size_t localThreads[3] = { 64, 4, 1 };
1297 size_t globalThreads[3] = { cols, dst.rows, 1 };
// Despite the name, this is the byte width of one ROI row (cols * elemSize()).
1299 int dst_step1 = dst.cols * dst.elemSize();
1300 std::vector<std::pair<size_t , const void *> > args;
1301 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
1302 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
1303 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
1304 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
1305 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
1306 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
1307 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
1308 args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
1309 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
1311 openCLExecuteKernel(src1.clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth);
// Operation selectors for bitwise_binary_run; the values index its
// operationMap table ('&', '|', '^').
1314 enum { AND = 0, OR, XOR };
// Launches a binary bitwise kernel (AND / OR / XOR) in one of four variants:
// matrix or scalar second operand, with or without a mask.
//   src1          - first operand.
//   src2          - second operand matrix; when empty the scalar program is
//                   selected (see the program choice in the launch call).
//   src3          - scalar second operand.
//                   NOTE(review): its use is on lines not visible here.
//   mask          - optional CV_8UC1 mask of src1's size (asserted below).
//   dst           - output; (re)created with src1's size and type.
//   operationType - one of AND, OR, XOR (asserted below).
1316 static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Scalar& src3, const oclMat &mask,
1317 oclMat &dst, int operationType)
1319 CV_Assert(operationType >= AND && operationType <= XOR);
1320 CV_Assert(src2.empty() || (!src2.empty() && src1.type() == src2.type() && src1.size() == src2.size()));
1321 CV_Assert(mask.empty() || (!mask.empty() && mask.type() == CV_8UC1 && mask.size() == src1.size()));
1323 dst.create(src1.size(), src1.type());
1326 const char operationMap[] = { '&', '|', '^' };
1327 std::string kernelName("arithm_bitwise_binary");
// vlen: bytes processed per element, capped at 8; an empty suffix selects the
// scalar vload/vstore/uchar forms when vlen is 1.
1329 int vlen = std::min<int>(8, src1.elemSize1() * src1.oclchannels());
1330 std::string vlenstr = vlen > 1 ? format("%d", vlen) : "";
1331 std::string buildOptions = format("-D Operation=%c -D vloadn=vload%s -D vstoren=vstore%s -D elemSize=%d -D vlen=%d"
1332 " -D ucharv=uchar%s",
1333 operationMap[operationType], vlenstr.c_str(), vlenstr.c_str(),
1334 (int)src1.elemSize(), vlen, vlenstr.c_str());
1336 size_t localThreads[3] = { 16, 16, 1 };
1337 size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
1339 std::vector<std::pair<size_t , const void *> > args;
1340 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
1341 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
1342 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
// NOTE(review): `m` is declared, filled and branched on in lines not visible
// here; presumably a 1x1 matrix holding the scalar operand, used when src2 is
// empty (kernel name gains "_scalar").
1346 m.create(1, 1, dst.type());
1349 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&m.data ));
1351 kernelName += "_scalar";
// Presumably the matrix-operand branch (the `else` is not visible).
1355 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
1356 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
1357 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset ));
// Presumably only when a mask is supplied (the condition is not visible);
// the kernel name gains "_mask".
1362 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
1363 args.push_back( std::make_pair( sizeof(cl_int), (void *)&mask.step ));
1364 args.push_back( std::make_pair( sizeof(cl_int), (void *)&mask.offset ));
1366 kernelName += "_mask";
1369 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
1370 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
1371 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
1373 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.cols ));
1374 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
// Program selection mirrors the kernel-name suffixes: mask empty or not,
// src2 empty (scalar) or not.
1376 openCLExecuteKernel(src1.clCxt, mask.empty() ? (!src2.empty() ? &arithm_bitwise_binary : &arithm_bitwise_binary_scalar) :
1377 (!src2.empty() ? &arithm_bitwise_binary_mask : &arithm_bitwise_binary_scalar_mask),
1378 kernelName, globalThreads, localThreads,
1379 args, -1, -1, buildOptions.c_str());
// Runs the "arithm_bitwise_not" kernel on src, writing dst.
// Raises Error::OpenCLDoubleNotSupported for CV_64F input when the context
// does not report FEATURE_CL_DOUBLE.
1382 void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst)
1384 if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
1386 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
// bitwise_unary_run issues the same dst.create call again.
1390 dst.create(src.size(), src.type());
1391 bitwise_unary_run(src, dst, "arithm_bitwise_not", &arithm_bitwise_not);
// Matrix | matrix, with optional mask; a default Scalar fills the unused scalar operand.
1394 void cv::ocl::bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
1396 bitwise_binary_run(src1, src2, Scalar(), mask, dst, OR);
// Matrix | scalar, with optional mask; an empty oclMat selects the scalar variant.
1399 void cv::ocl::bitwise_or(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
1401 bitwise_binary_run(src1, oclMat(), src2, mask, dst, OR);
// Matrix & matrix, with optional mask; a default Scalar fills the unused scalar operand.
1404 void cv::ocl::bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
1406 bitwise_binary_run(src1, src2, Scalar(), mask, dst, AND);
// Matrix & scalar, with optional mask; an empty oclMat selects the scalar variant.
1409 void cv::ocl::bitwise_and(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
1411 bitwise_binary_run(src1, oclMat(), src2, mask, dst, AND);
// Matrix ^ matrix, with optional mask; a default Scalar fills the unused scalar operand.
1414 void cv::ocl::bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
1416 bitwise_binary_run(src1, src2, Scalar(), mask, dst, XOR);
// Matrix ^ scalar, with optional mask; an empty oclMat selects the scalar variant.
1419 void cv::ocl::bitwise_xor(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
1421 bitwise_binary_run(src1, oclMat(), src2, mask, dst, XOR);
// Builds a MAT_NOT expression over src with an empty second operand. The
// declared return type is oclMat, so the oclMatExpr is converted on return
// (presumably through oclMatExpr::operator oclMat).
1424 oclMat cv::ocl::operator ~ (const oclMat &src)
1426 return oclMatExpr(src, oclMat(), MAT_NOT);
// Builds a MAT_OR expression over src1 and src2; converted to oclMat on return.
1429 oclMat cv::ocl::operator | (const oclMat &src1, const oclMat &src2)
1431 return oclMatExpr(src1, src2, MAT_OR);
// Builds a MAT_AND expression over src1 and src2; converted to oclMat on return.
1434 oclMat cv::ocl::operator & (const oclMat &src1, const oclMat &src2)
1436 return oclMatExpr(src1, src2, MAT_AND);
// Builds a MAT_XOR expression over src1 and src2; converted to oclMat on return.
1439 oclMat cv::ocl::operator ^ (const oclMat &src1, const oclMat &src2)
1441 return oclMatExpr(src1, src2, MAT_XOR);
// Returns an unevaluated MAT_ADD expression over src1 and src2.
1444 cv::ocl::oclMatExpr cv::ocl::operator + (const oclMat &src1, const oclMat &src2)
1446 return oclMatExpr(src1, src2, cv::ocl::MAT_ADD);
// Returns an unevaluated MAT_SUB expression over src1 and src2.
1449 cv::ocl::oclMatExpr cv::ocl::operator - (const oclMat &src1, const oclMat &src2)
1451 return oclMatExpr(src1, src2, cv::ocl::MAT_SUB);
// Returns an unevaluated MAT_MUL expression over src1 and src2.
1454 cv::ocl::oclMatExpr cv::ocl::operator * (const oclMat &src1, const oclMat &src2)
1456 return oclMatExpr(src1, src2, cv::ocl::MAT_MUL);
// Returns an unevaluated MAT_DIV expression over src1 and src2.
1459 cv::ocl::oclMatExpr cv::ocl::operator / (const oclMat &src1, const oclMat &src2)
1461 return oclMatExpr(src1, src2, cv::ocl::MAT_DIV);
// Evaluates this expression into m.
// NOTE(review): most of the body (presumably a switch on the stored operation)
// is on lines not visible in this excerpt; only the bitwise cases appear
// below, each writing the result of the operands a and b into m.
1464 void oclMatExpr::assign(oclMat& m) const
1484 bitwise_and(a, b, m);
1487 bitwise_or(a, b, m);
1490 bitwise_xor(a, b, m);
// Conversion from an expression to a concrete oclMat.
// NOTE(review): the body is not visible in this excerpt; presumably it
// evaluates the expression via assign() into a temporary and returns it.
1495 oclMatExpr::operator oclMat() const
1502 //////////////////////////////////////////////////////////////////////////////
1503 /////////////////////////////// transpose ////////////////////////////////////
1504 //////////////////////////////////////////////////////////////////////////////
// Work-group geometry used by transpose_run: TILE_DIM x BLOCK_ROWS work-items
// (32 x 8 = 256 per group).
1506 #define TILE_DIM (32)
1507 #define BLOCK_ROWS (256 / TILE_DIM)
// Launches a kernel from the arithm_transpose program.
//   src        - input matrix.
//   dst        - output; must already be allocated by the caller.
//   kernelName - name of the kernel to run.
//   inplace    - when true the global row count is src.rows; otherwise it is
//                divUp(src.rows, TILE_DIM) * BLOCK_ROWS.
1509 static void transpose_run(const oclMat &src, oclMat &dst, String kernelName, bool inplace = false)
// The kernel element type T is built from depth (typeMap) and channel count
// (channelsString; 3 channels map to the '4' suffix).
1511 const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
1512 const char channelsString[] = { ' ', ' ', '2', '4', '4' };
1513 std::string buildOptions = format("-D T=%s%c", typeMap[src.depth()],
1514 channelsString[src.channels()]);
1516 size_t localThreads[3] = { TILE_DIM, BLOCK_ROWS, 1 };
1517 size_t globalThreads[3] = { src.cols, inplace ? src.rows : divUp(src.rows, TILE_DIM) * BLOCK_ROWS, 1 };
// Steps and offsets are passed in whole elements (elemSize() bytes), not bytes.
1519 int srcstep1 = src.step / src.elemSize(), dststep1 = dst.step / dst.elemSize();
1520 int srcoffset1 = src.offset / src.elemSize(), dstoffset1 = dst.offset / dst.elemSize();
1522 std::vector<std::pair<size_t , const void *> > args;
1523 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
1524 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
1525 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols ));
1526 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
1527 args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcstep1 ));
1528 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep1 ));
1529 args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcoffset1 ));
1530 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstoffset1 ));
1532 openCLExecuteKernel(src.clCxt, &arithm_transpose, kernelName, globalThreads, localThreads,
1533 args, -1, -1, buildOptions.c_str());
// Transposes src into dst.
// Raises Error::OpenCLDoubleNotSupported for CV_64F input when the context
// does not report FEATURE_CL_DOUBLE.
1536 void cv::ocl::transpose(const oclMat &src, oclMat &dst)
1538 if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
1540 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
// In-place path: same buffer, same offset and size, and a square matrix.
1544 if ( src.data == dst.data && src.cols == src.rows && dst.offset == src.offset
1545 && dst.size() == src.size())
1546 transpose_run( src, dst, "transpose_inplace", true);
// Otherwise (presumably the else branch; the `else` line is not visible) dst
// is (re)created with swapped dimensions and the out-of-place kernel is used.
1549 dst.create(src.cols, src.rows, src.type());
1550 transpose_run( src, dst, "transpose");
1554 //////////////////////////////////////////////////////////////////////////////
1555 ////////////////////////////// addWeighted ///////////////////////////////////
1556 //////////////////////////////////////////////////////////////////////////////
// Runs the "addWeighted" kernel on src1 and src2 with the weights alpha and
// beta and the additive term gama, writing dst.
//   src1, src2 - inputs of identical size and type (asserted below).
//   dst        - output; (re)created with src1's size and type.
// Raises Error::OpenCLDoubleNotSupported for CV_64F input when the context
// does not report FEATURE_CL_DOUBLE.
1558 void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, double beta, double gama, oclMat &dst)
1560 Context *clCxt = src1.clCxt;
1561 bool hasDouble = clCxt->supportsFeature(FEATURE_CL_DOUBLE);
1562 if (!hasDouble && src1.depth() == CV_64F)
1564 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
1568 CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
1569 dst.create(src1.size(), src1.type());
1571 int channels = dst.oclchannels();
1572 int depth = dst.depth();
// Row width, steps and offsets are all expressed in elemSize1() units.
1574 int cols1 = src1.cols * channels;
1575 int src1step1 = src1.step1(), src1offset1 = src1.offset / src1.elemSize1();
// NOTE(review): src2's offset is divided by src1.elemSize1(); presumably
// harmless because both types are asserted equal above, but src2.elemSize1()
// would be the consistent choice.
1576 int src2step1 = src2.step1(), src2offset1 = src2.offset / src1.elemSize1();
1577 int dststep1 = dst.step1(), dstoffset1 = dst.offset / dst.elemSize1();
// T: element type by depth; WT: working type (double only when supported);
// convertToT: saturating round-to-even conversion for depths below CV_32F,
// plain conversion otherwise.
1579 const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
1580 std::string buildOptions = format("-D T=%s -D WT=%s -D convertToT=convert_%s%s",
1581 typeMap[depth], hasDouble ? "double" : "float", typeMap[depth],
1582 depth >= CV_32F ? "" : "_sat_rte");
1584 size_t localThreads[3] = { 256, 1, 1 };
1585 size_t globalThreads[3] = { cols1, dst.rows, 1};
// Single-precision copies of the weights, needed when they are passed as cl_float.
1587 float alpha_f = static_cast<float>(alpha),
1588 beta_f = static_cast<float>(beta),
1589 gama_f = static_cast<float>(gama);
1591 std::vector<std::pair<size_t , const void *> > args;
1592 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
1593 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1step1 ));
1594 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1offset1));
1595 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
1596 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2step1 ));
1597 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2offset1));
1598 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
1599 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dststep1 ));
1600 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstoffset1));
// NOTE(review): the float and double triples below are presumably pushed in
// mutually exclusive branches keyed on hasDouble (matching WT above); the
// branch lines are not visible in this excerpt.
1604 args.push_back( std::make_pair( sizeof(cl_float), (void *)&alpha_f ));
1605 args.push_back( std::make_pair( sizeof(cl_float), (void *)&beta_f ));
1606 args.push_back( std::make_pair( sizeof(cl_float), (void *)&gama_f ));
1610 args.push_back( std::make_pair( sizeof(cl_double), (void *)&alpha ));
1611 args.push_back( std::make_pair( sizeof(cl_double), (void *)&beta ));
1612 args.push_back( std::make_pair( sizeof(cl_double), (void *)&gama ));
1615 args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols1 ));
1616 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
1618 openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads,
1619 args, -1, -1, buildOptions.c_str());
1622 //////////////////////////////////////////////////////////////////////////////
1623 /////////////////////////////////// Pow //////////////////////////////////////
1624 //////////////////////////////////////////////////////////////////////////////
1626 static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, String kernelName, const cv::ocl::ProgramEntry* source)
1628 int channels = dst.oclchannels();
1629 int depth = dst.depth();
1631 size_t vector_length = 1;
1632 int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1);
1633 int cols = divUp(dst.cols * channels + offset_cols, vector_length);
1634 int rows = dst.rows;
1636 size_t localThreads[3] = { 64, 4, 1 };
1637 size_t globalThreads[3] = { cols, rows, 1 };
1639 int dst_step1 = dst.cols * dst.elemSize();
1640 std::vector<std::pair<size_t , const void *> > args;
1641 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
1642 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
1643 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset ));
1644 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
1645 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
1646 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset ));
1647 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows ));
1648 args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
1649 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
1651 float pf = static_cast<float>(p);
1652 if (!src1.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
1653 args.push_back( std::make_pair( sizeof(cl_float), (void *)&pf ));
1655 args.push_back( std::make_pair( sizeof(cl_double), (void *)&p ));
1657 openCLExecuteKernel(src1.clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth);
1660 void cv::ocl::pow(const oclMat &x, double p, oclMat &y)
1662 if (!x.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && x.depth() == CV_64F)
1664 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
1668 CV_Assert(x.depth() == CV_32F || x.depth() == CV_64F);
1669 y.create(x.size(), x.type());
1671 arithmetic_pow_run(x, p, y, "arithm_pow", &arithm_pow);
1674 //////////////////////////////////////////////////////////////////////////////
1675 /////////////////////////////// setIdentity //////////////////////////////////
1676 //////////////////////////////////////////////////////////////////////////////
1678 void cv::ocl::setIdentity(oclMat& src, const Scalar & scalar)
1680 if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
1682 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
1686 CV_Assert(src.step % src.elemSize() == 0);
1688 int src_step1 = src.step / src.elemSize(), src_offset1 = src.offset / src.elemSize();
1689 size_t local_threads[] = { 16, 16, 1 };
1690 size_t global_threads[] = { src.cols, src.rows, 1 };
1692 const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
1693 const char * const channelMap[] = { "", "", "2", "4", "4" };
1694 String buildOptions = format("-D T=%s%s", typeMap[src.depth()], channelMap[src.oclchannels()]);
1696 std::vector<std::pair<size_t , const void *> > args;
1697 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
1698 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step1 ));
1699 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset1 ));
1700 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols));
1701 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows));
1703 oclMat sc(1, 1, src.type(), scalar);
1704 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&sc.data ));
1706 openCLExecuteKernel(src.clCxt, &arithm_setidentity, "setIdentity", global_threads, local_threads,
1707 args, -1, -1, buildOptions.c_str());
1710 //////////////////////////////////////////////////////////////////////////////
1711 ////////////////////////////////// Repeat ////////////////////////////////////
1712 //////////////////////////////////////////////////////////////////////////////
1714 void cv::ocl::repeat(const oclMat & src, int ny, int nx, oclMat & dst)
1716 CV_Assert(nx > 0 && ny > 0);
1717 dst.create(src.rows * ny, src.cols * nx, src.type());
1719 for (int y = 0; y < ny; ++y)
1720 for (int x = 0; x < nx; ++x)
1722 Rect roi(x * src.cols, y * src.rows, src.cols, src.rows);
1723 oclMat hdr = dst(roi);