1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
16 // Third party copyrights are property of their respective owners.
19 // Niko Li, newlife20080214@gmail.com
20 // Yao Wang, bitwangyaoyao@gmail.com
22 // Redistribution and use in source and binary forms, with or without modification,
23 // are permitted provided that the following conditions are met:
25 // * Redistribution's of source code must retain the above copyright notice,
26 // this list of conditions and the following disclaimer.
28 // * Redistribution's in binary form must reproduce the above copyright notice,
29 // this list of conditions and the following disclaimer in the documentation
30 // and/or other materials provided with the distribution.
32 // * The name of the copyright holders may not be used to endorse or promote products
33 // derived from this software without specific prior written permission.
35 // This software is provided by the copyright holders and contributors "as is" and
36 // any express or implied warranties, including, but not limited to, the implied
37 // warranties of merchantability and fitness for a particular purpose are disclaimed.
38 // In no event shall the Intel Corporation or contributors be liable for any direct,
39 // indirect, incidental, special, exemplary, or consequential damages
40 // (including, but not limited to, procurement of substitute goods or services;
41 // loss of use, data, or profits; or business interruption) however caused
42 // and on any theory of liability, whether in contract, strict liability,
43 // or tort (including negligence or otherwise) arising in any way out of
44 // the use of this software, even if advised of the possibility of such damage.
48 #include "precomp.hpp"
// Rounds `step` up to the next multiple of ALIGN (ALIGN must be defined where the macro
// is used). The complete expansion is parenthesized so that the macro can be embedded
// in a larger expression, e.g. `x / GPU_MATRIX_MALLOC_STEP(s)`, without the trailing
// `* ALIGN` binding to the surrounding operators.
#define GPU_MATRIX_MALLOC_STEP(step) ((((step) + ALIGN - 1) / ALIGN) * ALIGN)
54 using namespace cv::ocl;
57 ////////////////////////////////////////////////////////////////////////
58 //////////////////////////////// oclMat ////////////////////////////////
59 ////////////////////////////////////////////////////////////////////////
66 ///////////////////////////OpenCL kernel strings///////////////////////////
// OpenCL program sources and device-memory defaults that are defined outside this file.
// Each string is passed to openCLExecuteKernel() as the program source by the helper
// named next to it.
// Program source used by copy_to_with_mask().
67 extern const char *operator_copyToM;
// Program source used by convert_run().
68 extern const char *operator_convertTo;
// Program source used by set_to_withoutmask_run().
69 extern const char *operator_setTo;
// Program source used by set_to_withmask_run().
70 extern const char *operator_setToM;
// Program source used by both convert_C3C4() and convert_C4C3().
71 extern const char *convertC3C4;
// Defaults that oclMat::create() forwards to createEx() as mem_type / rw_type.
72 extern DevMemType gDeviceMemType;
73 extern DevMemRW gDeviceMemRW;
78 ////////////////////////////////////////////////////////////////////////
// Launches the "convertC3C4" kernel with `src` and `dst.data` as its two buffer
// arguments, followed by dst's whole-matrix dimensions, its row step in pixels and the
// index of the last pixel. oclMat::upload() calls this for 3-channel host matrices and
// passes the temporary buffer that holds the packed 3-channel rows as `src`.
80 static void convert_C3C4(const cl_mem &src, oclMat &dst)
82 int dstStep_in_pixel = dst.step1() / dst.oclchannels();
83 int pixel_end = dst.wholecols * dst.wholerows - 1;
84 Context *clCxt = dst.clCxt;
85 string kernelName = "convertC3C4";
// Holds the "-D GENTYPE4=..." build option; every literal written below fits in 32 bytes.
86 char compile_option[32];
// NOTE(review): the lines that select among the following sprintf calls are not
// visible in this listing - presumably a switch on dst.depth() with one case per
// sprintf and CV_Error as the default; confirm against the full file.
90 sprintf(compile_option, "-D GENTYPE4=uchar4");
93 sprintf(compile_option, "-D GENTYPE4=char4");
96 sprintf(compile_option, "-D GENTYPE4=ushort4");
99 sprintf(compile_option, "-D GENTYPE4=short4");
102 sprintf(compile_option, "-D GENTYPE4=int4");
105 sprintf(compile_option, "-D GENTYPE4=float4");
108 sprintf(compile_option, "-D GENTYPE4=double4");
111 CV_Error(CV_StsUnsupportedFormat, "unknown depth");
// Kernel arguments, in the order they are pushed.
113 vector< pair<size_t, const void *> > args;
114 args.push_back( make_pair( sizeof(cl_mem), (void *)&src));
115 args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
116 args.push_back( make_pair( sizeof(cl_int), (void *)&dst.wholecols));
117 args.push_back( make_pair( sizeof(cl_int), (void *)&dst.wholerows));
118 args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep_in_pixel));
119 args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end));
// One-dimensional launch: ceil(pixel count / 4), rounded up to a multiple of the
// 256-wide local size (presumably four pixels per work-item - confirm in the kernel).
121 size_t globalThreads[3] = {((dst.wholecols * dst.wholerows + 3) / 4 + 255) / 256 * 256, 1, 1};
122 size_t localThreads[3] = {256, 1, 1};
124 openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
126 ////////////////////////////////////////////////////////////////////////
// Launches the "convertC4C3" kernel with `src.data` and `dst` as its two buffer
// arguments, followed by src's whole-matrix dimensions, its row step in pixels and the
// index of the last pixel. oclMat::download() calls this for 3-channel matrices and
// passes the temporary buffer that receives the packed 3-channel rows as `dst`.
128 static void convert_C4C3(const oclMat &src, cl_mem &dst)
130 int srcStep_in_pixel = src.step1() / src.oclchannels();
131 int pixel_end = src.wholecols * src.wholerows - 1;
132 Context *clCxt = src.clCxt;
133 string kernelName = "convertC4C3";
// Holds the "-D GENTYPE4=..." build option; every literal written below fits in 32 bytes.
134 char compile_option[32];
// NOTE(review): the lines that select among the following sprintf calls are not
// visible in this listing - presumably a switch on src.depth() with one case per
// sprintf and CV_Error as the default; confirm against the full file.
138 sprintf(compile_option, "-D GENTYPE4=uchar4");
141 sprintf(compile_option, "-D GENTYPE4=char4");
144 sprintf(compile_option, "-D GENTYPE4=ushort4");
147 sprintf(compile_option, "-D GENTYPE4=short4");
150 sprintf(compile_option, "-D GENTYPE4=int4");
153 sprintf(compile_option, "-D GENTYPE4=float4");
156 sprintf(compile_option, "-D GENTYPE4=double4");
159 CV_Error(CV_StsUnsupportedFormat, "unknown depth");
// Kernel arguments, in the order they are pushed.
162 vector< pair<size_t, const void *> > args;
163 args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
164 args.push_back( make_pair( sizeof(cl_mem), (void *)&dst));
165 args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholecols));
166 args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholerows));
167 args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep_in_pixel));
168 args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end));
// One-dimensional launch: ceil(pixel count / 4), rounded up to a multiple of the
// 256-wide local size (presumably four pixels per work-item - confirm in the kernel).
170 size_t globalThreads[3] = {((src.wholecols * src.wholerows + 3) / 4 + 255) / 256 * 256, 1, 1};
171 size_t localThreads[3] = {256, 1, 1};
// The "convertC4C3" kernel is looked up in the same program source string
// (convertC3C4) that convert_C3C4() uses.
173 openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
// Copies the host matrix `m` into this oclMat. The whole parent matrix reported by
// m.locateROI() is transferred starting at m.datastart, and `offset` is then derived
// from the ROI origin `ofs`.
// NOTE(review): the declarations of `wholeSize`, `ofs` and `err`, the else/brace lines
// and the statements between the copy and the `offset` assignment are not visible in
// this listing.
176 void cv::ocl::oclMat::upload(const Mat &m)
178 CV_DbgAssert(!m.empty());
181 m.locateROI(wholeSize, ofs);
182 // int type = m.type();
183 // if(m.oclchannels() == 3)
185 // type = CV_MAKETYPE(m.depth(), 4);
187 create(wholeSize, m.type());
// 3-channel path: the host rows are first copied into a temporary buffer whose rows
// are `pitch` bytes apart, then convert_C3C4() writes them into this matrix's storage.
189 if(m.channels() == 3)
191 int pitch = wholeSize.width * 3 * m.elemSize1();
192 int tail_padding = m.elemSize1() * 3072;
// The temporary buffer size is rounded up to a multiple of tail_padding.
194 cl_mem temp = clCreateBuffer((cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE,
195 (pitch * wholeSize.height + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
196 openCLVerifyCall(err);
// NOTE(review): if openCLMemcpy2D or convert_C3C4 raises (convert_C3C4 can call
// CV_Error), the clReleaseMemObject(temp) below is skipped and `temp` is not released.
198 openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3);
199 convert_C3C4(temp, *this);
200 //int* cputemp=new int[wholeSize.height*wholeSize.width * 3];
201 //int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
202 //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE,
203 // 0, wholeSize.height*wholeSize.width * 3* sizeof(int), cputemp, 0, NULL, NULL));
204 //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE,
205 // 0, this->step*this->wholerows, cpudata, 0, NULL, NULL));
206 //for(int i=0;i<wholeSize.height;i++)
208 // int *a = cputemp+i*wholeSize.width * 3,*b = cpudata + i*this->step/sizeof(int);
209 // for(int j=0;j<wholeSize.width;j++)
211 // if((a[3*j] != b[4*j])||(a[3*j+1] != b[4*j+1])||(a[3*j+2] != b[4*j+2]))
212 // printf("rows=%d,cols=%d,cputtemp=%d,%d,%d;cpudata=%d,%d,%d\n",
213 // i,j,a[3*j],a[3*j+1],a[3*j+2],b[4*j],b[4*j+1],b[4*j+2]);
218 openCLSafeCall(clReleaseMemObject(temp));
// Other channel counts: direct 2-D copy from the host rows into `data`.
222 openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice);
// Position of the ROI origin inside the uploaded parent matrix.
227 offset = ofs.y * step + ofs.x * elemSize();
228 //download_channels = m.channels();
// Wraps this oclMat in an _InputArray: the wrapper is tagged with the OCL_MAT flag and
// its `obj` field stores a raw pointer to this object (no matrix data is copied here).
231 cv::ocl::oclMat::operator cv::_InputArray()
233 _InputArray newInputArray;
234 newInputArray.flags = cv::_InputArray::OCL_MAT;
235 newInputArray.obj = reinterpret_cast<void *>(this);
236 return newInputArray;
// Wraps this oclMat in an _OutputArray whose `obj` field stores a raw pointer to this
// object. The flag value is taken from _InputArray::OCL_MAT, which is the same constant
// that getOclMatRef(OutputArray) tests for.
239 cv::ocl::oclMat::operator cv::_OutputArray()
241 _OutputArray newOutputArray;
242 newOutputArray.flags = cv::_InputArray::OCL_MAT;
243 newOutputArray.obj = reinterpret_cast<void *>(this);
244 return newOutputArray;
// Returns the oclMat that an InputArray wrapper points at. Asserts that the wrapper
// carries the OCL_MAT flag before reinterpreting `src.obj` as an oclMat pointer.
247 cv::ocl::oclMat& cv::ocl::getOclMatRef(InputArray src)
249 CV_Assert(src.flags & cv::_InputArray::OCL_MAT);
250 return *reinterpret_cast<oclMat*>(src.obj);
// Returns the oclMat that an OutputArray wrapper points at. Asserts that the wrapper
// carries the OCL_MAT flag before reinterpreting `src.obj` as an oclMat pointer.
253 cv::ocl::oclMat& cv::ocl::getOclMatRef(OutputArray src)
255 CV_Assert(src.flags & cv::_InputArray::OCL_MAT);
256 return *reinterpret_cast<oclMat*>(src.obj);
// Copies this oclMat to the host matrix `m`. `m` is allocated with the whole-matrix
// dimensions (wholerows x wholecols) and this matrix's type, filled from the device,
// and finally narrowed with adjustROI() using this matrix's ROI origin and size.
// NOTE(review): the declarations of `err`, `wholesize` and `ofs` and the else/brace
// lines are not visible in this listing.
259 void cv::ocl::oclMat::download(cv::Mat &m) const
261 CV_DbgAssert(!this->empty());
263 // if(download_channels == 3)
265 // t = CV_MAKETYPE(depth(), 3);
267 m.create(wholerows, wholecols, type());
// 3-channel path: convert_C4C3() writes into a temporary buffer whose rows are `pitch`
// bytes apart, and that buffer is then copied to the host.
269 if(m.channels() == 3)
271 int pitch = wholecols * 3 * m.elemSize1();
272 int tail_padding = m.elemSize1() * 3072;
// The temporary buffer size is rounded up to a multiple of tail_padding.
274 cl_mem temp = clCreateBuffer((cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE,
275 (pitch * wholerows + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
276 openCLVerifyCall(err);
// NOTE(review): if convert_C4C3 or openCLMemcpy2D raises (convert_C4C3 can call
// CV_Error), the clReleaseMemObject(temp) below is skipped and `temp` is not released.
278 convert_C4C3(*this, temp);
279 openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3);
280 //int* cputemp=new int[wholecols*wholerows * 3];
281 //int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
282 //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE,
283 // 0, wholecols*wholerows * 3* sizeof(int), cputemp, 0, NULL, NULL));
284 //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE,
285 // 0, this->step*this->wholerows, cpudata, 0, NULL, NULL));
286 //for(int i=0;i<wholerows;i++)
288 // int *a = cputemp+i*wholecols * 3,*b = cpudata + i*this->step/sizeof(int);
289 // for(int j=0;j<wholecols;j++)
291 // if((a[3*j] != b[4*j])||(a[3*j+1] != b[4*j+1])||(a[3*j+2] != b[4*j+2]))
292 // printf("rows=%d,cols=%d,cputtemp=%d,%d,%d;cpudata=%d,%d,%d\n",
293 // i,j,a[3*j],a[3*j+1],a[3*j+2],b[4*j],b[4*j+1],b[4*j+2]);
298 openCLSafeCall(clReleaseMemObject(temp));
// Other channel counts: direct 2-D copy from `data` into the host rows.
302 openCLMemcpy2D(clCxt, m.data, m.step, data, step, wholecols * elemSize(), wholerows, clMemcpyDeviceToHost);
// Shrink `m` from the whole matrix to the same ROI (origin `ofs`, rows x cols) as this oclMat.
306 locateROI(wholesize, ofs);
307 m.adjustROI(-ofs.y, ofs.y + rows - wholerows, -ofs.x, ofs.x + cols - wholecols);
310 /////////////////////common//////////////////////////////////////
// Integer ceiling division: the number of `grain`-sized chunks needed to cover `total`.
// The intermediate sum is formed in 64-bit arithmetic so that `total + grain - 1`
// cannot overflow `int` when `total` is close to INT_MAX. For every input on which the
// plain `int` expression did not overflow, the result is the same as before.
// `grain` must be non-zero.
inline int divUp(int total, int grain)
{
    const long long padded = static_cast<long long>(total) + grain - 1;
    return static_cast<int>(padded / grain);
}
315 ///////////////////////////////////////////////////////////////////////////
316 ////////////////////////////////// CopyTo /////////////////////////////////
317 ///////////////////////////////////////////////////////////////////////////
// Runs the kernel `kernelName` from the operator_copyToM program source with `src`,
// `dst` and `mask` as buffer arguments. Debug builds assert that src, dst and mask have
// equal dimensions and that the mask is CV_8UC1.
318 static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask, string kernelName)
320 CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols &&
321 src.rows == dst.rows && src.cols == dst.cols
322 && mask.type() == CV_8UC1);
324 vector<pair<size_t , const void *> > args;
// OpenCL type names indexed as [channels - 1][depth]; the selected name becomes the
// GENTYPE build definition below.
326 std::string string_types[4][7] = {{"uchar", "char", "ushort", "short", "int", "float", "double"},
327 {"uchar2", "char2", "ushort2", "short2", "int2", "float2", "double2"},
328 {"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"},
329 {"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"}
// "-D GENTYPE=" plus the longest table entry ("ushort4" / "double4") fits in 32 bytes.
331 char compile_option[32];
332 sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str());
// 16x16 work-groups; the global size is dst.cols x dst.rows rounded up to whole groups.
333 size_t localThreads[3] = {16, 16, 1};
334 size_t globalThreads[3];
336 globalThreads[0] = divUp(dst.cols, localThreads[0]) * localThreads[0];
337 globalThreads[1] = divUp(dst.rows, localThreads[1]) * localThreads[1];
338 globalThreads[2] = 1;
// Steps and offsets of src and dst are passed in elements (divided by elemSize()).
340 int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
341 int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
// Kernel arguments, in the order they are pushed.
343 args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
344 args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
345 args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data ));
346 args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
347 args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
348 args.push_back( make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
349 args.push_back( make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
350 args.push_back( make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
351 args.push_back( make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
// The mask's step and offset are passed unchanged (not divided by an element size).
352 args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.step ));
353 args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset ));
355 openCLExecuteKernel(dst.clCxt , &operator_copyToM, kernelName, globalThreads,
356 localThreads, args, -1, -1, compile_option);
// Copies this matrix into `m`: `m` is (re)created with this matrix's size and type,
// then `rows` rows of `cols * elemSize()` bytes are copied with openCLCopyBuffer2D,
// using each matrix's own step and offset.
359 void cv::ocl::oclMat::copyTo( oclMat &m ) const
361 CV_DbgAssert(!this->empty());
362 m.create(size(), type());
363 openCLCopyBuffer2D(clCxt, m.data, m.step, m.offset,
364 data, step, cols * elemSize(), rows, offset);
// Masked copy: `mat` is (re)created with this matrix's size and type and the
// "copy_to_with_mask" kernel is run through copy_to_with_mask().
// NOTE(review): the lines between the signature and mat.create() are not visible in
// this listing - presumably they handle an empty mask; confirm against the full file.
367 void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
375 mat.create(size(), type());
376 copy_to_with_mask(*this, mat, mask, "copy_to_with_mask");
380 ///////////////////////////////////////////////////////////////////////////
381 //////////////////////////////// ConvertTo ////////////////////////////////
382 ///////////////////////////////////////////////////////////////////////////
// Runs a conversion kernel from the operator_convertTo program source on src -> dst.
// The kernel name is "convert_to_S" followed by the source depth; dst's channel count
// and depth are passed to openCLExecuteKernel. alpha and beta are narrowed to float
// before being handed to the kernel.
// NOTE(review): the declaration of `idxStr` is not visible in this listing.
383 static void convert_run(const oclMat &src, oclMat &dst, double alpha, double beta)
385 string kernelName = "convert_to_S";
387 idxStr << src.depth();
388 kernelName += idxStr.str();
389 float alpha_f = alpha, beta_f = beta;
390 CV_DbgAssert(src.rows == dst.rows && src.cols == dst.cols);
391 vector<pair<size_t , const void *> > args;
// 16x16 work-groups; the global size is dst.cols x dst.rows rounded up to whole groups.
392 size_t localThreads[3] = {16, 16, 1};
393 size_t globalThreads[3];
394 globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
395 globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
396 globalThreads[2] = 1;
// Steps and offsets are passed in elements (divided by elemSize()).
397 int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
398 int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
// For CV_8UC1 output the x-dimension is sized from (cols + 4) / 4 instead of cols.
// NOTE(review): this adds a full localThreads[0] before dividing, whereas
// set_to_withoutmask_run adds localThreads[0] - 1; when the quotient is already a
// multiple of 16 this launches one extra work-group in x. Presumably harmless if the
// kernel bounds-checks - confirm.
399 if(dst.type() == CV_8UC1)
401 globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0]) / localThreads[0] * localThreads[0];
// Kernel arguments, in the order they are pushed.
403 args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
404 args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
405 args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
406 args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
407 args.push_back( make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
408 args.push_back( make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
409 args.push_back( make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
410 args.push_back( make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
411 args.push_back( make_pair( sizeof(cl_float) , (void *)&alpha_f ));
412 args.push_back( make_pair( sizeof(cl_float) , (void *)&beta_f ));
413 openCLExecuteKernel(dst.clCxt , &operator_convertTo, kernelName, globalThreads,
414 localThreads, args, dst.oclchannels(), dst.depth());
// Converts this matrix to depth CV_MAT_DEPTH(rtype) with scale `alpha` and shift
// `beta`, writing the result to `dst` through convert_run(). The channel count of the
// result is always this matrix's channel count.
// NOTE(review): several lines are not visible in this listing (the condition guarding
// the rtype normalisation, the body of the `sdepth == ddepth && noScale` branch, and
// the declaration of `temp`); the comments below describe only what is shown.
416 void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const
418 //cout << "cv::ocl::oclMat::convertTo()" << endl;
// True when alpha is 1 and beta is 0 to within double epsilon.
420 bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon()
421 && fabs(beta) < std::numeric_limits<double>::epsilon();
426 rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());
428 //int scn = channels();
429 int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype);
// Same depth and no scaling: presumably handled as a plain copy in the branch body
// that is not visible here - confirm.
430 if( sdepth == ddepth && noScale )
// In-place conversion to a different depth: the source header is first copied into
// `temp` and read through that copy, since dst.create() below re-creates `dst`.
437 const oclMat *psrc = this;
438 if( sdepth != ddepth && psrc == &dst )
439 psrc = &(temp = *this);
441 dst.create( size(), rtype );
442 convert_run(*psrc, dst, alpha, beta);
445 ///////////////////////////////////////////////////////////////////////////
446 //////////////////////////////// setTo ////////////////////////////////////
447 ///////////////////////////////////////////////////////////////////////////
// Assignment from a Scalar.
// NOTE(review): the body is not visible in this listing - presumably it forwards to
// setTo(s) and returns *this; confirm against the full file.
448 oclMat &cv::ocl::oclMat::operator = (const Scalar &s)
450 //cout << "cv::ocl::oclMat::=" << endl;
// Fills `dst` with `scalar` (no mask). The scalar is converted to dst's element type
// and pushed as the first kernel argument together with a matching "-D GENTYPE=..."
// build option. The fill is then done either with clEnqueueFillBuffer (OpenCL 1.2
// builds, when the conditions below hold) or with the `kernelName` kernel from the
// operator_setTo program source.
// NOTE(review): the depth switch, its case labels / breaks, the else/brace lines and
// the declarations of `val` (apparently a union of OpenCL 4-vectors) and `i2val` are
// not visible in this listing.
454 static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kernelName)
456 vector<pair<size_t , const void *> > args;
// 16x16 work-groups; the global size is dst.cols x dst.rows rounded up to whole groups.
458 size_t localThreads[3] = {16, 16, 1};
459 size_t globalThreads[3];
460 globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
461 globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
462 globalThreads[2] = 1;
// Step and offset are passed to the kernel in elements (divided by elemSize()).
463 int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
// For CV_8UC1 the x-dimension is sized from (cols + 4) / 4 instead of cols
// (presumably that kernel handles several bytes per work-item - confirm).
464 if(dst.type() == CV_8UC1)
466 globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
// Every "-D GENTYPE=..." literal written below fits in 32 bytes.
468 char compile_option[32];
// uchar: saturate the four scalar components, then pass one element or a 4-vector.
482 val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
483 val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
484 val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
485 val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]);
486 switch(dst.oclchannels())
489 sprintf(compile_option, "-D GENTYPE=uchar");
490 args.push_back( make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] ));
493 sprintf(compile_option, "-D GENTYPE=uchar4");
494 args.push_back( make_pair( sizeof(cl_uchar4) , (void *)&val.uval ));
497 CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
// char
501 val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
502 val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
503 val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
504 val.cval.s[3] = saturate_cast<char>(scalar.val[3]);
505 switch(dst.oclchannels())
508 sprintf(compile_option, "-D GENTYPE=char");
509 args.push_back( make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] ));
512 sprintf(compile_option, "-D GENTYPE=char4");
513 args.push_back( make_pair( sizeof(cl_char4) , (void *)&val.cval ));
516 CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
// ushort
520 val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
521 val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
522 val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
523 val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]);
524 switch(dst.oclchannels())
527 sprintf(compile_option, "-D GENTYPE=ushort");
528 args.push_back( make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] ));
531 sprintf(compile_option, "-D GENTYPE=ushort4");
532 args.push_back( make_pair( sizeof(cl_ushort4) , (void *)&val.usval ));
535 CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
// short
539 val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
540 val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
541 val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
542 val.shval.s[3] = saturate_cast<short>(scalar.val[3]);
543 switch(dst.oclchannels())
546 sprintf(compile_option, "-D GENTYPE=short");
547 args.push_back( make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] ));
550 sprintf(compile_option, "-D GENTYPE=short4");
551 args.push_back( make_pair( sizeof(cl_short4) , (void *)&val.shval ));
554 CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
// int: in addition to the scalar and 4-vector forms there is an int2 form, built
// from the first two components (set_to_withmask_run has no int2 form).
558 val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
559 val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
560 val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
561 val.ival.s[3] = saturate_cast<int>(scalar.val[3]);
562 switch(dst.oclchannels())
565 sprintf(compile_option, "-D GENTYPE=int");
566 args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
569 sprintf(compile_option, "-D GENTYPE=int2");
571 i2val.s[0] = val.ival.s[0];
572 i2val.s[1] = val.ival.s[1];
573 args.push_back( make_pair( sizeof(cl_int2) , (void *)&i2val ));
576 sprintf(compile_option, "-D GENTYPE=int4");
577 args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival ));
580 CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
// float: components are assigned directly (implicit double -> float narrowing).
584 val.fval.s[0] = scalar.val[0];
585 val.fval.s[1] = scalar.val[1];
586 val.fval.s[2] = scalar.val[2];
587 val.fval.s[3] = scalar.val[3];
588 switch(dst.oclchannels())
591 sprintf(compile_option, "-D GENTYPE=float");
592 args.push_back( make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] ));
595 sprintf(compile_option, "-D GENTYPE=float4");
596 args.push_back( make_pair( sizeof(cl_float4) , (void *)&val.fval ));
599 CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
// double
603 val.dval.s[0] = scalar.val[0];
604 val.dval.s[1] = scalar.val[1];
605 val.dval.s[2] = scalar.val[2];
606 val.dval.s[3] = scalar.val[3];
607 switch(dst.oclchannels())
610 sprintf(compile_option, "-D GENTYPE=double");
611 args.push_back( make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] ));
614 sprintf(compile_option, "-D GENTYPE=double4");
615 args.push_back( make_pair( sizeof(cl_double4) , (void *)&val.dval ));
618 CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
622 CV_Error(CV_StsUnsupportedFormat, "unknown depth");
624 #ifdef CL_VERSION_1_2
625 //this enables backwards portability to
626 //run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
// Fill-buffer path: taken only when the context reports OpenCL 1.2 support, the ROI
// starts at offset 0 and spans full rows (cols == wholecols). args[0] is the scalar
// pushed above and serves as the fill pattern; dst.step * dst.rows bytes are filled.
// NOTE(review): the return code of clEnqueueFillBuffer is discarded here, unlike other
// OpenCL calls in this file that go through openCLSafeCall / openCLVerifyCall.
627 if(Context::getContext()->supportsFeature(Context::CL_VER_1_2) &&
628 dst.offset == 0 && dst.cols == dst.wholecols)
630 clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(),
631 (cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL);
// Kernel path: append the destination buffer and geometry after the scalar argument.
636 args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
637 args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols ));
638 args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows ));
639 args.push_back( make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
640 args.push_back( make_pair( sizeof(cl_int) , (void *)&offset_in_pixel));
641 openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads,
642 localThreads, args, -1, -1, compile_option);
// Masked fill: runs the `kernelName` kernel from the operator_setToM program source.
// The scalar is converted to dst's element type and pushed as the first kernel
// argument together with a matching "-D GENTYPE=..." build option; the destination
// buffer and geometry follow, then the mask buffer with its step and offset.
// NOTE(review): the depth switch, its case labels / breaks and the declaration of
// `val` (apparently a union of OpenCL 4-vectors) are not visible in this listing.
646 static void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &mask, string kernelName)
648 CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols);
649 vector<pair<size_t , const void *> > args;
// 16x16 work-groups; the global size is dst.cols x dst.rows rounded up to whole groups.
650 size_t localThreads[3] = {16, 16, 1};
651 size_t globalThreads[3];
652 globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
653 globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
654 globalThreads[2] = 1;
// Step and offset of dst are passed to the kernel in elements (divided by elemSize()).
655 int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
// Every "-D GENTYPE=..." literal written below fits in 32 bytes.
656 char compile_option[32];
// uchar: saturate the four scalar components, then pass one element or a 4-vector.
670 val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
671 val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
672 val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
673 val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]);
674 switch(dst.oclchannels())
677 sprintf(compile_option, "-D GENTYPE=uchar");
678 args.push_back( make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] ));
681 sprintf(compile_option, "-D GENTYPE=uchar4");
682 args.push_back( make_pair( sizeof(cl_uchar4) , (void *)&val.uval ));
685 CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
// char
689 val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
690 val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
691 val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
692 val.cval.s[3] = saturate_cast<char>(scalar.val[3]);
693 switch(dst.oclchannels())
696 sprintf(compile_option, "-D GENTYPE=char");
697 args.push_back( make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] ));
700 sprintf(compile_option, "-D GENTYPE=char4");
701 args.push_back( make_pair( sizeof(cl_char4) , (void *)&val.cval ));
704 CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
// ushort
708 val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
709 val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
710 val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
711 val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]);
712 switch(dst.oclchannels())
715 sprintf(compile_option, "-D GENTYPE=ushort");
716 args.push_back( make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] ));
719 sprintf(compile_option, "-D GENTYPE=ushort4");
720 args.push_back( make_pair( sizeof(cl_ushort4) , (void *)&val.usval ));
723 CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
// short
727 val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
728 val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
729 val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
730 val.shval.s[3] = saturate_cast<short>(scalar.val[3]);
731 switch(dst.oclchannels())
734 sprintf(compile_option, "-D GENTYPE=short");
735 args.push_back( make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] ));
738 sprintf(compile_option, "-D GENTYPE=short4");
739 args.push_back( make_pair( sizeof(cl_short4) , (void *)&val.shval ));
742 CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
// int (only the scalar and 4-vector forms appear here; there is no int2 form).
746 val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
747 val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
748 val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
749 val.ival.s[3] = saturate_cast<int>(scalar.val[3]);
750 switch(dst.oclchannels())
753 sprintf(compile_option, "-D GENTYPE=int");
754 args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
757 sprintf(compile_option, "-D GENTYPE=int4");
758 args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival ));
761 CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
// float: components are assigned directly (implicit double -> float narrowing).
765 val.fval.s[0] = scalar.val[0];
766 val.fval.s[1] = scalar.val[1];
767 val.fval.s[2] = scalar.val[2];
768 val.fval.s[3] = scalar.val[3];
769 switch(dst.oclchannels())
772 sprintf(compile_option, "-D GENTYPE=float");
773 args.push_back( make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] ));
776 sprintf(compile_option, "-D GENTYPE=float4");
777 args.push_back( make_pair( sizeof(cl_float4) , (void *)&val.fval ));
780 CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
// double
784 val.dval.s[0] = scalar.val[0];
785 val.dval.s[1] = scalar.val[1];
786 val.dval.s[2] = scalar.val[2];
787 val.dval.s[3] = scalar.val[3];
788 switch(dst.oclchannels())
791 sprintf(compile_option, "-D GENTYPE=double");
792 args.push_back( make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] ));
795 sprintf(compile_option, "-D GENTYPE=double4");
796 args.push_back( make_pair( sizeof(cl_double4) , (void *)&val.dval ));
799 CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
803 CV_Error(CV_StsUnsupportedFormat, "unknown depth");
// Remaining kernel arguments: destination buffer and geometry, then the mask buffer
// with its step and offset passed unchanged.
805 args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
806 args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols ));
807 args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows ));
808 args.push_back( make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
809 args.push_back( make_pair( sizeof(cl_int) , (void *)&offset_in_pixel ));
810 args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data ));
811 args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.step ));
812 args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset ));
813 openCLExecuteKernel(dst.clCxt , &operator_setToM, kernelName, globalThreads,
814 localThreads, args, -1, -1, compile_option);
// Sets matrix elements to `scalar`, optionally under `mask`. Requires the mask type to
// be CV_8UC1 and this matrix's depth to be in [0, 6]. Dispatches to
// set_to_withoutmask_run() (with a dedicated kernel name for CV_8UC1 matrices) or to
// set_to_withmask_run().
// NOTE(review): the condition that chooses between the unmasked and masked paths, the
// else/brace lines and the return statement are not visible in this listing -
// presumably the unmasked path is taken when the mask has no data; confirm.
817 oclMat &cv::ocl::oclMat::setTo(const Scalar &scalar, const oclMat &mask)
819 //cout << "cv::ocl::oclMat::setTo()" << endl;
820 CV_Assert(mask.type() == CV_8UC1);
821 CV_Assert( this->depth() >= 0 && this->depth() <= 6 );
822 CV_DbgAssert( !this->empty());
825 //mem = clCreateBuffer(this->clCxt->clContext,CL_MEM_READ_WRITE,
826 // sizeof(double)*4,NULL,&status);
827 //openCLVerifyCall(status);
828 //double* s = (double *)scalar.val;
829 //openCLSafeCall(clEnqueueWriteBuffer(this->clCxt->clCmdQueue,
830 // (cl_mem)mem,1,0,sizeof(double)*4,s,0,0,0));
// Unmasked fill: CV_8UC1 uses the "set_to_without_mask_C1_D0" kernel, every other
// type the generic "set_to_without_mask" kernel.
833 if(type() == CV_8UC1)
835 set_to_withoutmask_run(*this, scalar, "set_to_without_mask_C1_D0");
839 set_to_withoutmask_run(*this, scalar, "set_to_without_mask");
// Masked fill.
844 set_to_withmask_run(*this, scalar, mask, "set_to_with_mask");
// Builds a matrix header `hdr` that reinterprets this matrix with `new_cn` channels.
// An explicit request to change the number of rows (new_rows non-zero and different
// from rows) is rejected immediately with CV_StsBadFunc.
// NOTE(review): the declaration of `hdr`, the handling of new_cn == 0, the continuity
// check guarding CV_BadStep, the assignment of hdr.rows and the return statement are
// not visible in this listing.
850 oclMat cv::ocl::oclMat::reshape(int new_cn, int new_rows) const
852 if( new_rows != 0 && new_rows != rows)
856 CV_Error( CV_StsBadFunc,
858 "oclMat's number of rows can not be changed for current version" );
864 int cn = oclchannels();
// Row width counted in single-channel elements.
872 int total_width = cols * cn;
// When the row width cannot be split into new_cn-channel pixels, a row count is
// derived so that each new row holds a single new_cn-channel pixel.
876 if ((new_cn > total_width || total_width % new_cn != 0) && new_rows == 0)
878 new_rows = rows * total_width / new_cn;
// Because of the check at the top, this branch is reached only with a new_rows value
// computed just above.
882 if (new_rows != 0 && new_rows != rows)
886 int total_size = total_width * rows;
892 CV_Error(CV_BadStep, "The matrix is not continuous, thus its number of rows can not be changed");
896 if ((unsigned)new_rows > (unsigned)total_size)
898 CV_Error(CV_StsOutOfRange, "Bad new number of rows");
902 total_width = total_size / new_rows;
906 if (total_width * new_rows != total_size)
908 CV_Error(CV_StsBadArg, "The total number of matrix elements is not divisible by the new number of rows");
914 hdr.step = total_width * elemSize1();
920 int new_width = total_width / new_cn;
924 if (new_width * new_cn != total_width)
926 CV_Error(CV_BadNumChannels, "The total width is not divisible by the new number of channels");
930 hdr.cols = new_width;
932 hdr.wholecols = new_width;
// Replace the channel-count bits of the type flags with new_cn.
934 hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);
// Size-based overload: forwards to createEx(rows, cols, ...) with size.height as the
// row count and size.width as the column count.
942 void cv::ocl::oclMat::createEx(Size size, int type, DevMemRW rw_type, DevMemType mem_type)
944 createEx(size.height, size.width, type, rw_type, mem_type);
// Allocates the matrix through createEx() using the global defaults gDeviceMemRW and
// gDeviceMemType for the access mode and memory type.
947 void cv::ocl::oclMat::create(int _rows, int _cols, int _type)
949 createEx(_rows, _cols, _type, gDeviceMemRW, gDeviceMemType);
// Allocates device storage for a _rows x _cols matrix of type _type with the given
// access mode and memory type, and initialises the header fields that describe it.
// NOTE(review): several lines are not visible in this listing (the body of the
// "already matches" check, any release of previous storage, the assignments of
// rows/cols/wholerows/wholecols, the declaration of `dev_ptr`, and the initialisation
// of *refcount); the comments below describe only what is shown.
952 void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type, DevMemRW rw_type, DevMemType mem_type)
954 clCxt = Context::getContext();
957 //download_channels = CV_MAT_CN(_type);
958 //if(download_channels==3)
960 // _type = CV_MAKE_TYPE((CV_MAT_DEPTH(_type)),4);
// Shape and type already match and storage exists - presumably returns early here
// without reallocating; confirm.
962 if( rows == _rows && cols == _cols && type() == _type && data )
966 CV_DbgAssert( _rows >= 0 && _cols >= 0 );
967 if( _rows > 0 && _cols > 0 )
969 flags = Mat::MAGIC_VAL + _type;
974 size_t esz = elemSize();
// The requested row size is esz * cols rounded up by GPU_MATRIX_MALLOC_STEP; the
// allocator reports the row step it actually used in `step`.
977 openCLMallocPitchEx(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols), rows, rw_type, mem_type);
978 //openCLMallocPitch(clCxt,&dev_ptr, &step, esz * cols, rows);
// Rows are contiguous only when the step equals the unpadded row size.
980 if (esz * cols == step)
981 flags |= Mat::CONTINUOUS_FLAG;
// Total allocation size, computed in 64-bit before narrowing to size_t.
983 int64 _nettosize = (int64)step * rows;
984 size_t nettosize = (size_t)_nettosize;
986 datastart = data = (uchar *)dev_ptr;
987 dataend = data + nettosize;
989 refcount = (int *)fastMalloc(sizeof(*refcount));
// Drops this header's reference to the device buffer and resets the header fields.
// The buffer is freed with openCLFree() when the atomic decrement indicates this was
// the last reference (CV_XADD presumably returns the value before the decrement).
// NOTE(review): the lines around openCLFree (e.g. freeing / clearing `refcount`) are
// not visible in this listing.
994 void cv::ocl::oclMat::release()
996 //cout << "cv::ocl::oclMat::release()" << endl;
997 if( refcount && CV_XADD(refcount, -1) == 1 )
1000 openCLFree(datastart);
1002 data = datastart = dataend = 0;
1003 step = rows = cols = 0;
1004 offset = wholerows = wholecols = 0;
// In-place addition: calls add() with this matrix as both first operand and destination.
// NOTE(review): the return statement is not visible in this listing - presumably
// `return *this;`.
1008 oclMat& cv::ocl::oclMat::operator+=( const oclMat& m )
1010 add(*this, m, *this);
// In-place subtraction: calls subtract() with this matrix as both first operand and
// destination.
// NOTE(review): the return statement is not visible in this listing - presumably
// `return *this;`.
1014 oclMat& cv::ocl::oclMat::operator-=( const oclMat& m )
1016 subtract(*this, m, *this);
// In-place multiplication: calls multiply() with this matrix as both first operand and
// destination.
// NOTE(review): the return statement is not visible in this listing - presumably
// `return *this;`.
1020 oclMat& cv::ocl::oclMat::operator*=( const oclMat& m )
1022 multiply(*this, m, *this);
1026 oclMat& cv::ocl::oclMat::operator/=( const oclMat& m )
1028 divide(*this, m, *this);