48 #include "precomp.hpp"
49 #include "opencl_kernels.hpp"
52 using namespace cv::ocl;
// Rounds `step` up to the next multiple of ALIGN (ALIGN is defined elsewhere).
// The whole expansion is parenthesized so that it behaves as a single operand
// when the macro is used inside a larger expression such as `x / MACRO(y)`.
#define GPU_MATRIX_MALLOC_STEP(step) ((((step) + ALIGN - 1) / ALIGN) * ALIGN)
62 extern DevMemType gDeviceMemType;
63 extern DevMemRW gDeviceMemRW;
67 ////////////////////////////////////////////////////////////////////////
// Launches the "convertC3C4" kernel, which takes the raw buffer `src` as its
// first argument and `dst.data` as its second. The program is built with
// GENTYPE4 defined as the 4-element vector type matching dst's depth.
70 static void convert_C3C4(const cl_mem &src, oclMat &dst)
72 Context *clCxt = dst.clCxt;
// wholecols * wholerows - 1: passed to the kernel as its end-of-data bound.
73 int pixel_end = dst.wholecols * dst.wholerows - 1;
// step1() divided by the channel count; presumably the row step in pixels, as the name says.
74 int dstStep_in_pixel = dst.step1() / dst.oclchannels();
// Type names indexed by depth code. The table has 7 entries, so dst.depth()
// must be in [0, 6]; nothing in this function checks that.
76 const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
77 std::string buildOptions = format("-D GENTYPE4=%s4", typeMap[dst.depth()]);
// Kernel arguments, in order: src buffer, dst buffer, whole width, whole
// height, dst step in pixels, end bound.
79 std::vector< std::pair<size_t, const void *> > args;
80 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src));
81 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data));
82 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.wholecols));
83 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.wholerows));
84 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstStep_in_pixel));
85 args.push_back( std::make_pair( sizeof(cl_int), (void *)&pixel_end));
// Global size is divUp(pixel count, 4) (presumably a rounded-up division), i.e.
// one work-item per group of 4 pixels.
// NOTE(review): the global size is not rounded to a multiple of the local size
// (256) here; presumably openCLExecuteKernel handles that - confirm.
87 size_t globalThreads[3] = { divUp(dst.wholecols * dst.wholerows, 4), 1, 1 };
88 size_t localThreads[3] = { 256, 1, 1 };
90 openCLExecuteKernel(clCxt, &convertC3C4, "convertC3C4", globalThreads, localThreads,
91 args, -1, -1, buildOptions.c_str());
94 ////////////////////////////////////////////////////////////////////////
// Launches the "convertC4C3" kernel, which takes `src.data` as its first
// argument and the raw buffer `dst` as its second. The program is built with
// GENTYPE4 defined as the 4-element vector type matching src's depth.
97 static void convert_C4C3(const oclMat &src, cl_mem &dst)
// step1() divided by the channel count; presumably the row step in pixels, as the name says.
99 int srcStep_in_pixel = src.step1() / src.oclchannels();
// wholecols * wholerows - 1: passed to the kernel as its end-of-data bound.
100 int pixel_end = src.wholecols * src.wholerows - 1;
101 Context *clCxt = src.clCxt;
// Type names indexed by depth code. The table has 7 entries, so src.depth()
// must be in [0, 6]; nothing in this function checks that.
103 const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
104 std::string buildOptions = format("-D GENTYPE4=%s4", typeMap[src.depth()]);
// Kernel arguments, in order: src buffer, dst buffer, whole width, whole
// height, src step in pixels, end bound.
106 std::vector< std::pair<size_t, const void *> > args;
107 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
108 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst));
109 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.wholecols));
110 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.wholerows));
111 args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcStep_in_pixel));
112 args.push_back( std::make_pair( sizeof(cl_int), (void *)&pixel_end));
// Global size is divUp(pixel count, 4): one work-item per group of 4 pixels.
114 size_t globalThreads[3] = { divUp(src.wholecols * src.wholerows, 4), 1, 1};
115 size_t localThreads[3] = { 256, 1, 1 };
// The kernel "convertC4C3" is looked up in the same convertC3C4 program
// object that convert_C3C4 uses.
117 openCLExecuteKernel(clCxt, &convertC3C4, "convertC4C3", globalThreads, localThreads, args, -1, -1, buildOptions.c_str());
// Copies the host matrix `m` into this oclMat.
// The device matrix is created at the size of m's parent (as reported by
// m.locateROI), and `offset` is then set so that it addresses m's ROI.
// Raises Error::OpenCLDoubleNotSupported for CV_64F input when the current
// context does not support doubles.
120 void cv::ocl::oclMat::upload(const Mat &m)
122 if (!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && m.depth() == CV_64F)
124 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
// Debug-only check: an empty host matrix is not expected here.
128 CV_DbgAssert(!m.empty());
// wholeSize receives the parent size and ofs the ROI origin within it.
131 m.locateROI(wholeSize, ofs);
132 create(wholeSize, m.type());
// 3-channel input goes through a tightly packed temporary device buffer and
// is then converted by convert_C3C4 into this matrix.
134 if (m.channels() == 3)
// Row size in bytes of the packed 3-channel data.
136 int pitch = wholeSize.width * 3 * m.elemSize1();
// The temporary buffer size is rounded up to a multiple of this value.
137 int tail_padding = m.elemSize1() * 3072;
// NOTE(review): pitch * wholeSize.height is evaluated in int and could
// overflow for very large matrices - confirm whether that is a concern.
139 cl_mem temp = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE,
140 (pitch * wholeSize.height + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
141 openCLVerifyCall(err);
// Host -> temp copy starting from m.datastart, i.e. the parent data rather than only the ROI.
143 openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3);
144 convert_C3C4(temp, *this);
145 openCLSafeCall(clReleaseMemObject(temp));
// Direct host -> device copy (presumably the branch for channel counts other than 3).
148 openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice);
// Byte offset of the ROI origin inside the device buffer.
152 offset = ofs.y * step + ofs.x * elemSize();
// Conversion to _InputArray: wraps a pointer to this object, tagged with the OCL_MAT kind.
155 cv::ocl::oclMat::operator cv::_InputArray()
157 return _InputArray(cv::_InputArray::OCL_MAT, this);
// Conversion to _OutputArray: wraps a pointer to this object, tagged with the
// OCL_MAT kind (the kind constant is taken from _InputArray).
160 cv::ocl::oclMat::operator cv::_OutputArray()
162 return _OutputArray(cv::_InputArray::OCL_MAT, this);
// Returns the oclMat wrapped by `src`.
// Asserts that src.kind() is OCL_MAT before casting the object pointer.
165 cv::ocl::oclMat& cv::ocl::getOclMatRef(InputArray src)
167 CV_Assert(src.kind() == cv::_InputArray::OCL_MAT);
168 return *(oclMat*)src.getObj();
// OutputArray overload: returns the oclMat wrapped by `src`.
// Asserts that src.kind() is OCL_MAT before casting the object pointer.
171 cv::ocl::oclMat& cv::ocl::getOclMatRef(OutputArray src)
173 CV_Assert(src.kind() == cv::_InputArray::OCL_MAT);
174 return *(oclMat*)src.getObj();
// Copies this device matrix into the host matrix `m`.
// `m` is (re)created at the whole size (wholerows x wholecols) and filled from
// the device; its ROI is then adjusted using this matrix's offset within the
// whole buffer.
177 void cv::ocl::oclMat::download(cv::Mat &m) const
// Debug-only check: an empty device matrix is not expected here.
179 CV_DbgAssert(!this->empty());
180 m.create(wholerows, wholecols, type());
// 3-channel data is first converted by convert_C4C3 into a tightly packed
// temporary device buffer, which is then copied to the host.
182 if(m.channels() == 3)
// Row size in bytes of the packed 3-channel data.
184 int pitch = wholecols * 3 * m.elemSize1();
// The temporary buffer size is rounded up to a multiple of this value.
185 int tail_padding = m.elemSize1() * 3072;
// NOTE(review): pitch * wholerows is evaluated in int and could overflow for
// very large matrices - confirm whether that is a concern.
187 cl_mem temp = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE,
188 (pitch * wholerows + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
189 openCLVerifyCall(err);
191 convert_C4C3(*this, temp);
192 openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3);
193 openCLSafeCall(clReleaseMemObject(temp));
// Direct device -> host copy (presumably the branch for channel counts other than 3).
197 openCLMemcpy2D(clCxt, m.data, m.step, data, step, wholecols * elemSize(), wholerows, clMemcpyDeviceToHost);
// Narrow m from the whole size back to this matrix's ROI: the four deltas are
// derived from the ROI origin (ofs) and the rows/cols vs. wholerows/wholecols
// differences.
202 locateROI(wholesize, ofs);
203 m.adjustROI(-ofs.y, ofs.y + rows - wholerows, -ofs.x, ofs.x + cols - wholecols);
206 ///////////////////////////////////////////////////////////////////////////
207 ////////////////////////////////// CopyTo /////////////////////////////////
208 ///////////////////////////////////////////////////////////////////////////
// Launches the kernel named `kernelName` from the operator_copyToM program
// with src, dst and mask as buffer arguments. The program is built with
// GENTYPE set to the OpenCL vector type matching dst's channel count and depth.
// In debug builds, src, dst and mask must have equal sizes and mask must be CV_8UC1.
209 static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask, String kernelName)
211 CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols &&
212 src.rows == dst.rows && src.cols == dst.cols
213 && mask.type() == CV_8UC1);
215 std::vector<std::pair<size_t , const void *> > args;
// OpenCL type names indexed as [oclchannels - 1][depth]; 4 x 7 entries, so
// oclchannels() must be in [1, 4] and depth() in [0, 6].
217 String string_types[4][7] = {{"uchar", "char", "ushort", "short", "int", "float", "double"},
218 {"uchar2", "char2", "ushort2", "short2", "int2", "float2", "double2"},
219 {"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"},
220 {"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"}
// "-D GENTYPE=" plus the longest table entry (7 characters) fits in 32 bytes.
// NOTE(review): sprintf is unbounded; snprintf would guard against future table changes.
223 char compile_option[32];
224 sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str());
225 size_t localThreads[3] = {16, 16, 1};
// One work-item per destination pixel.
226 size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
// Steps and offsets are converted from bytes to pixels by dividing by elemSize().
228 int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
229 int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
// Kernel arguments, in order: src, dst, mask buffers; src cols and rows; src
// step and offset (pixels); dst step and offset (pixels); mask step and offset
// (passed as stored, not divided).
231 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
232 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
233 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&mask.data ));
234 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
235 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
236 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
237 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
238 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
239 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
240 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.step ));
241 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.offset ));
243 openCLExecuteKernel(dst.clCxt , &operator_copyToM, kernelName, globalThreads,
244 localThreads, args, -1, -1, compile_option);
// Copies this matrix into `mat`, which is (re)created with this matrix's size
// and type. Two paths are visible: a plain 2D buffer copy and a masked copy
// through copy_to_with_mask.
// NOTE(review): the condition choosing between the two paths is not visible
// here; presumably it tests whether `mask` is empty - confirm.
247 void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
// Debug-only check: copying from an empty matrix is not expected.
251 CV_DbgAssert(!this->empty());
252 mat.create(size(), type());
// Unmasked path: copy `rows` rows of cols * elemSize() bytes each.
253 openCLCopyBuffer2D(clCxt, mat.data, mat.step, mat.offset,
254 data, step, cols * elemSize(), rows, offset);
// Masked path.
258 mat.create(size(), type());
259 copy_to_with_mask(*this, mat, mask, "copy_to_with_mask");
263 ///////////////////////////////////////////////////////////////////////////
264 //////////////////////////////// ConvertTo ////////////////////////////////
265 ///////////////////////////////////////////////////////////////////////////
// Launches the "convert_to" kernel from the operator_convertTo program to
// convert `src` into `dst`, passing alpha and beta as scale parameters.
// The program is built with srcT/dstT set to the element type names and
// convertToDstType set to a saturating round-to-even conversion, or left
// empty when the destination depth is CV_32F or CV_64F.
267 static void convert_run(const oclMat &src, oclMat &dst, double alpha, double beta)
269 String kernelName = "convert_to";
// alpha and beta are narrowed to float because the kernel receives them as cl_float.
270 float alpha_f = alpha, beta_f = beta;
271 int sdepth = src.depth(), ddepth = dst.depth();
272 int sstep1 = (int)src.step1(), dstep1 = (int)dst.step1();
// Row width in channel elements; the kernel works per element, not per pixel.
273 int cols1 = src.cols * src.oclchannels();
// NOTE(review): both buffers are filled with unbounded sprintf; the current
// type names fit, but snprintf would be safer.
275 char buildOptions[150], convertString[50];
// Type names indexed by depth code; 7 entries, so depths must be in [0, 6].
276 const char * typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
277 sprintf(convertString, "convert_%s_sat_rte", typeMap[ddepth]);
278 sprintf(buildOptions, "-D srcT=%s -D dstT=%s -D convertToDstType=%s", typeMap[sdepth],
279 typeMap[ddepth], CV_32F == ddepth || ddepth == CV_64F ? "" : convertString);
// Debug-only check: src and dst must have the same size.
281 CV_DbgAssert(src.rows == dst.rows && src.cols == dst.cols);
282 std::vector<std::pair<size_t , const void *> > args;
// Global size is cols1 x dst.rows, rounded up to multiples of the 16x16 local size.
284 size_t localThreads[3] = { 16, 16, 1 };
285 size_t globalThreads[3] = { divUp(cols1, localThreads[0]) * localThreads[0],
286 divUp(dst.rows, localThreads[1]) * localThreads[1], 1 };
// Offsets converted from bytes to channel elements.
288 int doffset1 = dst.offset / dst.elemSize1();
289 int soffset1 = src.offset / src.elemSize1();
// Kernel arguments, in order: src, dst buffers; width in elements; rows; src
// step and offset; dst step and offset; alpha; beta.
291 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
292 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
293 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols1 ));
294 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
295 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sstep1 ));
296 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&soffset1 ));
297 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstep1 ));
298 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&doffset1 ));
299 args.push_back( std::make_pair( sizeof(cl_float) , (void *)&alpha_f ));
300 args.push_back( std::make_pair( sizeof(cl_float) , (void *)&beta_f ));
302 openCLExecuteKernel(dst.clCxt , &operator_convertTo, kernelName, globalThreads,
303 localThreads, args, -1, -1, buildOptions);
// Converts this matrix into `dst` with depth taken from `rtype` (the channel
// count is always this matrix's), passing alpha and beta to convert_run.
// Raises Error::OpenCLDoubleNotSupported when CV_64F is involved and the
// context does not support doubles.
306 void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const
// NOTE(review): this tests dst.depth() before dst has been created with
// rtype, so it reflects whatever dst held on entry rather than the requested
// destination depth; CV_MAT_DEPTH(rtype) looks like the intended check - confirm.
308 if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) &&
309 (depth() == CV_64F || dst.depth() == CV_64F))
311 CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
// True when alpha is within epsilon of 1 and beta within epsilon of 0.
315 bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon()
316 && fabs(beta) < std::numeric_limits<double>::epsilon();
// Keep only the depth from rtype and force this matrix's channel count.
321 rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());
323 int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype);
// Same depth and no scaling: special-cased (the handling itself is not visible here).
324 if( sdepth == ddepth && noScale )
// When converting in place with a depth change, read from `temp` (assigned
// from *this) instead, since dst.create() below replaces dst's contents.
331 const oclMat *psrc = this;
332 if( sdepth != ddepth && psrc == &dst )
333 psrc = &(temp = *this);
335 dst.create( size(), rtype );
336 convert_run(*psrc, dst, alpha, beta);
339 ///////////////////////////////////////////////////////////////////////////
340 //////////////////////////////// setTo ////////////////////////////////////
341 ///////////////////////////////////////////////////////////////////////////
// Assignment from a Scalar; returns a reference to an oclMat.
// NOTE(review): the body is not visible here; given the setTo section banner
// above, it presumably forwards to setTo(s) - confirm.
343 oclMat &cv::ocl::oclMat::operator = (const Scalar &s)
349 #ifdef CL_VERSION_1_2
// Packs s[0], saturated to PT, into a byte buffer of sizeof(CLT) bytes.
// CLT is the OpenCL scalar type and PT the matching host type.
351 template <typename CLT, typename PT>
352 static std::vector<uchar> cvt1(const cv::Scalar & s)
354 std::vector<uchar> _buf(sizeof(CLT));
// View the byte storage as a CLT so the value can be written in place.
355 CLT * const buf = reinterpret_cast<CLT *>(&_buf[0]);
356 buf[0] = saturate_cast<PT>(s[0]);
// Packs s[0] and s[1], each saturated to PT, into a byte buffer of
// sizeof(CLT) bytes. CLT is a 2-component OpenCL vector type with an `s` array member.
360 template <typename CLT, typename PT>
361 static std::vector<uchar> cvt2(const cv::Scalar & s)
363 std::vector<uchar> _buf(sizeof(CLT));
// View the byte storage as a CLT so the components can be written in place.
364 CLT * const buf = reinterpret_cast<CLT *>(&_buf[0]);
365 buf->s[0] = saturate_cast<PT>(s[0]);
366 buf->s[1] = saturate_cast<PT>(s[1]);
// Packs s[0]..s[3], each saturated to PT, into a byte buffer of sizeof(CLT)
// bytes. CLT is a 4-component OpenCL vector type with an `s` array member.
370 template <typename CLT, typename PT>
371 static std::vector<uchar> cvt4(const cv::Scalar & s)
373 std::vector<uchar> _buf(sizeof(CLT));
// View the byte storage as a CLT so the components can be written in place.
374 CLT * const buf = reinterpret_cast<CLT *>(&_buf[0]);
375 buf->s[0] = saturate_cast<PT>(s[0]);
376 buf->s[1] = saturate_cast<PT>(s[1]);
377 buf->s[2] = saturate_cast<PT>(s[2]);
378 buf->s[3] = saturate_cast<PT>(s[3]);
// Signature shared by the cvt1/cvt2/cvt4 packers.
382 typedef std::vector<uchar> (*ConvertFunc)(const cv::Scalar & s);
// Selects the packer for the depth and channel count encoded in `type`.
// The table is indexed as [channels - 1][depth].
384 static std::vector<uchar> scalarToCLVector(const cv::Scalar & s, int type)
386 const int depth = CV_MAT_DEPTH(type);
387 const int channels = CV_MAT_CN(type);
389 static const ConvertFunc funcs[4][7] =
// 1 channel
391 { cvt1<cl_uchar, uchar>, cvt1<cl_char, char>, cvt1<cl_ushort, ushort>, cvt1<cl_short, short>,
392 cvt1<cl_int, int>, cvt1<cl_float, float>, cvt1<cl_double, double> },
// 2 channels
394 { cvt2<cl_uchar2, uchar>, cvt2<cl_char2, char>, cvt2<cl_ushort2, ushort>, cvt2<cl_short2, short>,
395 cvt2<cl_int2, int>, cvt2<cl_float2, float>, cvt2<cl_double2, double> },
// 3 channels: no packer; every entry is a null pointer.
397 { 0, 0, 0, 0, 0, 0, 0 },
// 4 channels
399 { cvt4<cl_uchar4, uchar>, cvt4<cl_char4, char>, cvt4<cl_ushort4, ushort>, cvt4<cl_short4, short>,
400 cvt4<cl_int4, int>, cvt4<cl_float4, float>, cvt4<cl_double4, double> }
// NOTE(review): func is null for 3-channel types and the index is out of
// range for channels outside [1, 4] or depth outside [0, 6]; confirm that the
// code following this line guards the call.
403 ConvertFunc func = funcs[channels - 1][depth];
// Fills every element of `dst` with `scalar`.
// When compiled against OpenCL 1.2 headers and the runtime context reports
// 1.2 support and dst is continuous, clEnqueueFillBuffer is used; the other
// visible path launches `kernelName` from the operator_setTo program.
409 static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, String kernelName)
411 std::vector<std::pair<size_t , const void *> > args;
413 size_t localThreads[3] = {16, 16, 1};
414 size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
// Step and offset converted from bytes to pixels.
415 int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
// For CV_8UC1 the X global size is (cols + 4) / 4 rounded up to the local
// size; presumably the C1_D0 kernel handles 4 pixels per work-item - confirm.
417 if (dst.type() == CV_8UC1)
418 globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
// Type names indexed by depth code; 7 entries, so depth() must be in [0, 6].
420 const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
// Suffix indexed by dst.channels(): 1 -> ' ', 2 -> '2', 3 and 4 -> '4'.
421 const char channelMap[] = { ' ', ' ', '2', '4', '4' };
422 std::string buildOptions = format("-D GENTYPE=%s%c", typeMap[dst.depth()], channelMap[dst.channels()]);
// 1x1 host matrix of dst's type initialised with the scalar.
424 Mat mat(1, 1, dst.type(), scalar);
426 #ifdef CL_VERSION_1_2
427 // this enables backwards portability to
428 // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
429 if (Context::getContext()->supportsFeature(FEATURE_CL_VER_1_2) && dst.isContinuous())
// The fill pattern is the scalar packed as an OpenCL vector of dst's depth and oclchannels().
431 std::vector<uchar> p = ::scalarToCLVector(scalar, CV_MAKE_TYPE(dst.depth(), dst.oclchannels()));
// NOTE(review): the return code of clEnqueueFillBuffer is not checked, and the
// fill covers dst.step * dst.rows bytes from byte offset 0, ignoring
// dst.offset - confirm this is correct for continuous matrices with a non-zero offset.
432 clEnqueueFillBuffer(getClCommandQueue(dst.clCxt),
433 (cl_mem)dst.data, (void*)&p[0], p.size(),
434 0, dst.step * dst.rows, 0, NULL, NULL);
// Kernel path. Arguments, in order: scalar buffer (`m`, presumably a device
// copy of `mat`), dst buffer, cols, rows, step and offset in pixels.
440 args.push_back( std::make_pair( sizeof(cl_mem) , (void*)&m.data ));
441 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
442 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
443 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
444 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
445 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel ));
447 openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads,
448 localThreads, args, -1, -1, buildOptions.c_str());
// Launches `kernelName` from the operator_setToM program to write `scalar`
// into `dst`, passing `mask` to the kernel.
// In debug builds, dst and mask must have equal sizes.
452 static void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &mask, String kernelName)
454 CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols);
455 std::vector<std::pair<size_t , const void *> > args;
456 size_t localThreads[3] = { 16, 16, 1 };
// One work-item per destination pixel.
457 size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
// Step and offset converted from bytes to pixels.
458 int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
// Type names indexed by depth code; 7 entries, so depth() must be in [0, 6].
460 const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
// Suffix indexed by dst.channels(): 1 -> ' ', 2 -> '2', 3 and 4 -> '4'.
461 const char channelMap[] = { ' ', ' ', '2', '4', '4' };
462 std::string buildOptions = format("-D GENTYPE=%s%c", typeMap[dst.depth()], channelMap[dst.channels()]);
// The scalar is passed to the kernel as a 1x1 device matrix of dst's type.
464 oclMat m(Mat(1, 1, dst.type(), scalar));
// Kernel arguments, in order: scalar buffer, dst buffer, cols, rows, dst step
// and offset in pixels, mask buffer, mask step and offset (as stored).
465 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&m.data ));
466 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
467 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
468 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
469 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
470 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel ));
471 args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&mask.data ));
472 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.step ));
473 args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.offset ));
474 openCLExecuteKernel(dst.clCxt , &operator_setToM, kernelName, globalThreads,
475 localThreads, args, -1, -1, buildOptions.c_str());
// Sets the elements of this matrix to `scalar`, optionally under `mask`.
// Requires mask.type() == CV_8UC1 and a depth in [0, 6]; in debug builds the
// matrix must also be non-empty.
// NOTE(review): the condition choosing between the two calls below is not
// visible here; presumably it tests whether `mask` is empty - confirm.
478 oclMat &cv::ocl::oclMat::setTo(const Scalar &scalar, const oclMat &mask)
480 CV_Assert(mask.type() == CV_8UC1);
481 CV_Assert( this->depth() >= 0 && this->depth() <= 6 );
482 CV_DbgAssert( !this->empty());
// Unmasked path: CV_8UC1 uses a dedicated kernel name.
485 set_to_withoutmask_run(*this, scalar, type() == CV_8UC1 ?
486 "set_to_without_mask_C1_D0" : "set_to_without_mask");
// Masked path.
489 set_to_withmask_run(*this, scalar, mask, "set_to_with_mask");
// Returns a header with `new_cn` channels, keeping the same total row width in
// channel elements. Changing the number of rows is rejected up front with
// Error::StsBadFunc.
// NOTE(review): `hdr` is declared on a line not visible here; presumably it is
// a copy of *this that is returned - confirm.
494 oclMat cv::ocl::oclMat::reshape(int new_cn, int new_rows) const
496 if( new_rows != 0 && new_rows != rows)
498 CV_Error( Error::StsBadFunc, "oclMat's number of rows can not be changed for current version" );
503 int cn = oclchannels();
// Row width measured in channel elements.
507 int total_width = cols * cn;
// If the width cannot be split into new_cn-channel pixels and no row count
// was requested, derive a row count from the total element count.
508 if ((new_cn > total_width || total_width % new_cn != 0) && new_rows == 0)
509 new_rows = rows * total_width / new_cn;
// Row-count change: validate the requested count against the total number of
// elements and recompute the row width and step.
// NOTE(review): given the StsBadFunc check at the top, new_rows here can only
// differ from rows when it was derived just above - confirm this path is intended.
511 if (new_rows != 0 && new_rows != rows)
513 int total_size = total_width * rows;
516 CV_Error(Error::BadStep, "The matrix is not continuous, thus its number of rows can not be changed");
518 if ((unsigned)new_rows > (unsigned)total_size)
519 CV_Error(Error::StsOutOfRange, "Bad new number of rows");
521 total_width = total_size / new_rows;
522 if (total_width * new_rows != total_size)
523 CV_Error(Error::StsBadArg, "The total number of matrix elements is not divisible by the new number of rows");
526 hdr.step = total_width * elemSize1();
// The (possibly updated) row width must split evenly into new_cn channels.
529 int new_width = total_width / new_cn;
530 if (new_width * new_cn != total_width)
531 CV_Error(Error::BadNumChannels, "The total width is not divisible by the new number of channels");
533 hdr.cols = new_width;
534 hdr.wholecols = new_width;
// Replace the channel-count bits in flags with new_cn.
535 hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);
// Size overload: forwards to createEx(rows, cols, ...) with size.height as
// rows and size.width as cols.
540 void cv::ocl::oclMat::createEx(Size size, int type,
541 DevMemRW rw_type, DevMemType mem_type)
543 createEx(size.height, size.width, type, rw_type, mem_type);
// Allocates the matrix using the global device-memory settings
// gDeviceMemRW and gDeviceMemType.
546 void cv::ocl::oclMat::create(int _rows, int _cols, int _type)
548 createEx(_rows, _cols, _type, gDeviceMemRW, gDeviceMemType);
// Allocates device memory for a _rows x _cols matrix of type _type using the
// given read/write and memory-type settings, and sets up the header fields
// (flags, step, data pointers, reference counter).
551 void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type,
552 DevMemRW rw_type, DevMemType mem_type)
554 clCxt = Context::getContext();
// Keep only the type bits of _type.
556 _type &= Mat::TYPE_MASK;
// Same shape and type with data already present: presumably nothing to do
// (the statement guarded by this condition is not visible here).
557 if( rows == _rows && cols == _cols && type() == _type && data )
561 CV_DbgAssert( _rows >= 0 && _cols >= 0 );
// Allocation happens only for non-empty dimensions.
562 if( _rows > 0 && _cols > 0 )
564 flags = Mat::MAGIC_VAL + _type;
569 size_t esz = elemSize();
// Request a pitched allocation whose row size is esz * cols rounded up by
// GPU_MATRIX_MALLOC_STEP; the actual step is returned through &step.
572 openCLMallocPitchEx(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols), rows, rw_type, mem_type);
// Rows are contiguous only when the step equals the packed row size.
574 if (esz * cols == step)
575 flags |= Mat::CONTINUOUS_FLAG;
// Total allocation size in bytes, computed in 64 bits before narrowing to size_t.
577 int64 _nettosize = (int64)step * rows;
578 size_t nettosize = (size_t)_nettosize;
580 datastart = data = (uchar *)dev_ptr;
581 dataend = data + nettosize;
// The reference counter is allocated on the host with fastMalloc.
583 refcount = (int *)fastMalloc(sizeof(*refcount));
// Drops this header's reference and resets the header fields to zero.
// The device memory is freed when the reference counter was 1 before the
// decrement.
588 void cv::ocl::oclMat::release()
// The condition compares the value CV_XADD returns with 1; the free below is
// reached only when a reference counter exists.
590 if( refcount && CV_XADD(refcount, -1) == 1 )
593 openCLFree(datastart);
595 data = datastart = dataend = 0;
596 step = rows = cols = 0;
597 offset = wholerows = wholecols = 0;
// In-place addition: calls add() with *this as first operand and as destination.
601 oclMat& cv::ocl::oclMat::operator+=( const oclMat& m )
603 add(*this, m, *this);
// In-place subtraction: calls subtract() with *this as first operand and as destination.
607 oclMat& cv::ocl::oclMat::operator-=( const oclMat& m )
609 subtract(*this, m, *this);
// In-place multiplication: calls multiply() with *this as first operand and as destination.
613 oclMat& cv::ocl::oclMat::operator*=( const oclMat& m )
615 multiply(*this, m, *this);
// In-place division: calls divide() with *this as first operand and as destination.
619 oclMat& cv::ocl::oclMat::operator/=( const oclMat& m )
621 divide(*this, m, *this);