modules/ocl/src/matrix_operations.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  16 // Third party copyrights are property of their respective owners.
  17 //
  18 // @Authors
  19 //    Niko Li, newlife20080214@gmail.com
  20 //    Yao Wang, bitwangyaoyao@gmail.com
  21 //
  22 // Redistribution and use in source and binary forms, with or without modification,
  23 // are permitted provided that the following conditions are met:
  24 //
  25 //   * Redistribution's of source code must retain the above copyright notice,
  26 //     this list of conditions and the following disclaimer.
  27 //
  28 //   * Redistribution's in binary form must reproduce the above copyright notice,
  29 //     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
  31 //
  32 //   * The name of the copyright holders may not be used to endorse or promote products
  33 //     derived from this software without specific prior written permission.
  34 //
  35 // This software is provided by the copyright holders and contributors "as is" and
  36 // any express or implied warranties, including, but not limited to, the implied
  37 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  38 // In no event shall the Intel Corporation or contributors be liable for any direct,
  39 // indirect, incidental, special, exemplary, or consequential damages
  40 // (including, but not limited to, procurement of substitute goods or services;
  41 // loss of use, data, or profits; or business interruption) however caused
  42 // and on any theory of liability, whether in contract, strict liability,
  43 // or tort (including negligence or otherwise) arising in any way out of
  44 // the use of this software, even if advised of the possibility of such damage.
  45 //
  46 //M*/
  47
  48 #include "precomp.hpp"
  49
  50 #define ALIGN 32
  51 #define GPU_MATRIX_MALLOC_STEP(step) (((step) + ALIGN - 1) / ALIGN) * ALIGN
  52
  53 using namespace cv;
  54 using namespace cv::ocl;
  55 using namespace std;
  56
  57 ////////////////////////////////////////////////////////////////////////
  58 //////////////////////////////// oclMat ////////////////////////////////
  59 ////////////////////////////////////////////////////////////////////////
  60
  61 //helper routines
// Declarations only: every symbol below is declared `extern` here and is not
// defined in this part of the file.
namespace cv
{
    namespace ocl
    {
        ///////////////////////////OpenCL kernel strings///////////////////////////
        // Each of these pointers is passed by address to openCLExecuteKernel()
        // as the program source for the kernels launched from this file.
        extern const char *operator_copyToM;   // used by copy_to_with_mask()
        extern const char *operator_convertTo; // used by convert_run()
        extern const char *operator_setTo;     // used by set_to_withoutmask_run()
        extern const char *operator_setToM;    // presumably used by the masked setTo path -- TODO confirm
        extern const char *convertC3C4;        // used by convert_C3C4() and convert_C4C3()
        // NOTE(review): the two globals below are not referenced in the portion
        // of the file reviewed here; their purpose should be confirmed at the
        // point of definition.
        extern DevMemType gDeviceMemType;
        extern DevMemRW gDeviceMemRW;
    }
}
  76
  77
  78 ////////////////////////////////////////////////////////////////////////
  79 // convert_C3C4
  80 static void convert_C3C4(const cl_mem &src, oclMat &dst)
  81 {
  82     int dstStep_in_pixel = dst.step1() / dst.oclchannels();
  83     int pixel_end = dst.wholecols * dst.wholerows - 1;
  84     Context *clCxt = dst.clCxt;
  85     string kernelName = "convertC3C4";
  86     char compile_option[32];
  87     switch(dst.depth())
  88     {
  89     case 0:
  90         sprintf(compile_option, "-D GENTYPE4=uchar4");
  91         break;
  92     case 1:
  93         sprintf(compile_option, "-D GENTYPE4=char4");
  94         break;
  95     case 2:
  96         sprintf(compile_option, "-D GENTYPE4=ushort4");
  97         break;
  98     case 3:
  99         sprintf(compile_option, "-D GENTYPE4=short4");
 100         break;
 101     case 4:
 102         sprintf(compile_option, "-D GENTYPE4=int4");
 103         break;
 104     case 5:
 105         sprintf(compile_option, "-D GENTYPE4=float4");
 106         break;
 107     case 6:
 108         sprintf(compile_option, "-D GENTYPE4=double4");
 109         break;
 110     default:
 111         CV_Error(CV_StsUnsupportedFormat, "unknown depth");
 112     }
 113     vector< pair<size_t, const void *> > args;
 114     args.push_back( make_pair( sizeof(cl_mem), (void *)&src));
 115     args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
 116     args.push_back( make_pair( sizeof(cl_int), (void *)&dst.wholecols));
 117     args.push_back( make_pair( sizeof(cl_int), (void *)&dst.wholerows));
 118     args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep_in_pixel));
 119     args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end));
 120
 121     size_t globalThreads[3] = {((dst.wholecols * dst.wholerows + 3) / 4 + 255) / 256 * 256, 1, 1};
 122     size_t localThreads[3] = {256, 1, 1};
 123
 124     openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
 125 }
 126 ////////////////////////////////////////////////////////////////////////
 127 // convert_C4C3
 128 static void convert_C4C3(const oclMat &src, cl_mem &dst)
 129 {
 130     int srcStep_in_pixel = src.step1() / src.oclchannels();
 131     int pixel_end = src.wholecols * src.wholerows - 1;
 132     Context *clCxt = src.clCxt;
 133     string kernelName = "convertC4C3";
 134     char compile_option[32];
 135     switch(src.depth())
 136     {
 137     case 0:
 138         sprintf(compile_option, "-D GENTYPE4=uchar4");
 139         break;
 140     case 1:
 141         sprintf(compile_option, "-D GENTYPE4=char4");
 142         break;
 143     case 2:
 144         sprintf(compile_option, "-D GENTYPE4=ushort4");
 145         break;
 146     case 3:
 147         sprintf(compile_option, "-D GENTYPE4=short4");
 148         break;
 149     case 4:
 150         sprintf(compile_option, "-D GENTYPE4=int4");
 151         break;
 152     case 5:
 153         sprintf(compile_option, "-D GENTYPE4=float4");
 154         break;
 155     case 6:
 156         sprintf(compile_option, "-D GENTYPE4=double4");
 157         break;
 158     default:
 159         CV_Error(CV_StsUnsupportedFormat, "unknown depth");
 160     }
 161
 162     vector< pair<size_t, const void *> > args;
 163     args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
 164     args.push_back( make_pair( sizeof(cl_mem), (void *)&dst));
 165     args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholecols));
 166     args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholerows));
 167     args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep_in_pixel));
 168     args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end));
 169
 170     size_t globalThreads[3] = {((src.wholecols * src.wholerows + 3) / 4 + 255) / 256 * 256, 1, 1};
 171     size_t localThreads[3] = {256, 1, 1};
 172
 173     openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
 174 }
 175
 176 void cv::ocl::oclMat::upload(const Mat &m)
 177 {
 178     CV_DbgAssert(!m.empty());
 179     Size wholeSize;
 180     Point ofs;
 181     m.locateROI(wholeSize, ofs);
 182     //   int type = m.type();
 183     //   if(m.oclchannels() == 3)
 184     //{
 185     //  type = CV_MAKETYPE(m.depth(), 4);
 186     //}
 187     create(wholeSize, m.type());
 188
 189     if(m.channels() == 3)
 190     {
 191         int pitch = wholeSize.width * 3 * m.elemSize1();
 192         int tail_padding = m.elemSize1() * 3072;
 193         int err;
 194         cl_mem temp = clCreateBuffer((cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE,
 195                                      (pitch * wholeSize.height + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
 196         openCLVerifyCall(err);
 197
 198         openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3);
 199         convert_C3C4(temp, *this);
 200         //int* cputemp=new int[wholeSize.height*wholeSize.width * 3];
 201         //int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
 202         //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE,
 203         //                                              0, wholeSize.height*wholeSize.width * 3* sizeof(int), cputemp, 0, NULL, NULL));
 204         //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE,
 205         //                                              0, this->step*this->wholerows, cpudata, 0, NULL, NULL));
 206         //for(int i=0;i<wholeSize.height;i++)
 207         //{
 208         //      int *a = cputemp+i*wholeSize.width * 3,*b = cpudata + i*this->step/sizeof(int);
 209         //      for(int j=0;j<wholeSize.width;j++)
 210         //      {
 211         //              if((a[3*j] != b[4*j])||(a[3*j+1] != b[4*j+1])||(a[3*j+2] != b[4*j+2]))
 212         //                      printf("rows=%d,cols=%d,cputtemp=%d,%d,%d;cpudata=%d,%d,%d\n",
 213         //                      i,j,a[3*j],a[3*j+1],a[3*j+2],b[4*j],b[4*j+1],b[4*j+2]);
 214         //      }
 215         //}
 216         //delete []cputemp;
 217         //delete []cpudata;
 218         openCLSafeCall(clReleaseMemObject(temp));
 219     }
 220     else
 221     {
 222         openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice);
 223     }
 224
 225     rows = m.rows;
 226     cols = m.cols;
 227     offset = ofs.y * step + ofs.x * elemSize();
 228     //download_channels = m.channels();
 229 }
 230
 231 cv::ocl::oclMat::operator cv::_InputArray()
 232 {
 233     _InputArray newInputArray;
 234     newInputArray.flags = cv::_InputArray::OCL_MAT;
 235     newInputArray.obj   = reinterpret_cast<void *>(this);
 236     return newInputArray;
 237 }
 238
 239 cv::ocl::oclMat::operator cv::_OutputArray()
 240 {
 241     _OutputArray newOutputArray;
 242     newOutputArray.flags = cv::_InputArray::OCL_MAT;
 243     newOutputArray.obj   = reinterpret_cast<void *>(this);
 244     return newOutputArray;
 245 }
 246
 247 cv::ocl::oclMat& cv::ocl::getOclMatRef(InputArray src)
 248 {
 249     CV_Assert(src.flags & cv::_InputArray::OCL_MAT);
 250     return *reinterpret_cast<oclMat*>(src.obj);
 251 }
 252
 253 cv::ocl::oclMat& cv::ocl::getOclMatRef(OutputArray src)
 254 {
 255     CV_Assert(src.flags & cv::_InputArray::OCL_MAT);
 256     return *reinterpret_cast<oclMat*>(src.obj);
 257 }
 258
 259 void cv::ocl::oclMat::download(cv::Mat &m) const
 260 {
 261     CV_DbgAssert(!this->empty());
 262     //   int t = type();
 263     //   if(download_channels == 3)
 264     //{
 265     //  t = CV_MAKETYPE(depth(), 3);
 266     //}
 267     m.create(wholerows, wholecols, type());
 268
 269     if(m.channels() == 3)
 270     {
 271         int pitch = wholecols * 3 * m.elemSize1();
 272         int tail_padding = m.elemSize1() * 3072;
 273         int err;
 274         cl_mem temp = clCreateBuffer((cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE,
 275                                      (pitch * wholerows + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
 276         openCLVerifyCall(err);
 277
 278         convert_C4C3(*this, temp);
 279         openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3);
 280         //int* cputemp=new int[wholecols*wholerows * 3];
 281         //int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
 282         //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE,
 283         //                                              0, wholecols*wholerows * 3* sizeof(int), cputemp, 0, NULL, NULL));
 284         //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE,
 285         //                                              0, this->step*this->wholerows, cpudata, 0, NULL, NULL));
 286         //for(int i=0;i<wholerows;i++)
 287         //{
 288         //      int *a = cputemp+i*wholecols * 3,*b = cpudata + i*this->step/sizeof(int);
 289         //      for(int j=0;j<wholecols;j++)
 290         //      {
 291         //              if((a[3*j] != b[4*j])||(a[3*j+1] != b[4*j+1])||(a[3*j+2] != b[4*j+2]))
 292         //                      printf("rows=%d,cols=%d,cputtemp=%d,%d,%d;cpudata=%d,%d,%d\n",
 293         //                      i,j,a[3*j],a[3*j+1],a[3*j+2],b[4*j],b[4*j+1],b[4*j+2]);
 294         //      }
 295         //}
 296         //delete []cputemp;
 297         //delete []cpudata;
 298         openCLSafeCall(clReleaseMemObject(temp));
 299     }
 300     else
 301     {
 302         openCLMemcpy2D(clCxt, m.data, m.step, data, step, wholecols * elemSize(), wholerows, clMemcpyDeviceToHost);
 303     }
 304     Size wholesize;
 305     Point ofs;
 306     locateROI(wholesize, ofs);
 307     m.adjustROI(-ofs.y, ofs.y + rows - wholerows, -ofs.x, ofs.x + cols - wholecols);
 308 }
 309
 310 /////////////////////common//////////////////////////////////////
 311 inline int divUp(int total, int grain)
 312 {
 313     return (total + grain - 1) / grain;
 314 }
 315 ///////////////////////////////////////////////////////////////////////////
 316 ////////////////////////////////// CopyTo /////////////////////////////////
 317 ///////////////////////////////////////////////////////////////////////////
 318 static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask, string kernelName)
 319 {
 320     CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols &&
 321                   src.rows == dst.rows && src.cols == dst.cols
 322                   && mask.type() == CV_8UC1);
 323
 324     vector<pair<size_t , const void *> > args;
 325
 326     std::string string_types[4][7] = {{"uchar", "char", "ushort", "short", "int", "float", "double"},
 327         {"uchar2", "char2", "ushort2", "short2", "int2", "float2", "double2"},
 328         {"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"},
 329         {"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"}
 330     };
 331     char compile_option[32];
 332     sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str());
 333     size_t localThreads[3] = {16, 16, 1};
 334     size_t globalThreads[3];
 335
 336     globalThreads[0] = divUp(dst.cols, localThreads[0]) * localThreads[0];
 337     globalThreads[1] = divUp(dst.rows, localThreads[1]) * localThreads[1];
 338     globalThreads[2] = 1;
 339
 340     int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
 341     int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
 342
 343     args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
 344     args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
 345     args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data ));
 346     args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
 347     args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
 348     args.push_back( make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
 349     args.push_back( make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
 350     args.push_back( make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
 351     args.push_back( make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
 352     args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.step ));
 353     args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset ));
 354
 355     openCLExecuteKernel(dst.clCxt , &operator_copyToM, kernelName, globalThreads,
 356                         localThreads, args, -1, -1, compile_option);
 357 }
 358
 359 void cv::ocl::oclMat::copyTo( oclMat &m ) const
 360 {
 361     CV_DbgAssert(!this->empty());
 362     m.create(size(), type());
 363     openCLCopyBuffer2D(clCxt, m.data, m.step, m.offset,
 364                        data, step, cols * elemSize(), rows, offset);
 365 }
 366
 367 void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
 368 {
 369     if (mask.empty())
 370     {
 371         copyTo(mat);
 372     }
 373     else
 374     {
 375         mat.create(size(), type());
 376         copy_to_with_mask(*this, mat, mask, "copy_to_with_mask");
 377     }
 378 }
 379
 380 ///////////////////////////////////////////////////////////////////////////
 381 //////////////////////////////// ConvertTo ////////////////////////////////
 382 ///////////////////////////////////////////////////////////////////////////
 383 static void convert_run(const oclMat &src, oclMat &dst, double alpha, double beta)
 384 {
 385     string kernelName = "convert_to_S";
 386     stringstream idxStr;
 387     idxStr << src.depth();
 388     kernelName += idxStr.str();
 389     float alpha_f = alpha, beta_f = beta;
 390     CV_DbgAssert(src.rows == dst.rows && src.cols == dst.cols);
 391     vector<pair<size_t , const void *> > args;
 392     size_t localThreads[3] = {16, 16, 1};
 393     size_t globalThreads[3];
 394     globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
 395     globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
 396     globalThreads[2] = 1;
 397     int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
 398     int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
 399     if(dst.type() == CV_8UC1)
 400     {
 401         globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0]) / localThreads[0] * localThreads[0];
 402     }
 403     args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
 404     args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
 405     args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
 406     args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
 407     args.push_back( make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
 408     args.push_back( make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
 409     args.push_back( make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
 410     args.push_back( make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
 411     args.push_back( make_pair( sizeof(cl_float) , (void *)&alpha_f ));
 412     args.push_back( make_pair( sizeof(cl_float) , (void *)&beta_f ));
 413     openCLExecuteKernel(dst.clCxt , &operator_convertTo, kernelName, globalThreads,
 414                         localThreads, args, dst.oclchannels(), dst.depth());
 415 }
 416 void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const
 417 {
 418     //cout << "cv::ocl::oclMat::convertTo()" << endl;
 419
 420     bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon()
 421                    && fabs(beta) < std::numeric_limits<double>::epsilon();
 422
 423     if( rtype < 0 )
 424         rtype = type();
 425     else
 426         rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());
 427
 428     //int scn = channels();
 429     int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype);
 430     if( sdepth == ddepth && noScale )
 431     {
 432         copyTo(dst);
 433         return;
 434     }
 435
 436     oclMat temp;
 437     const oclMat *psrc = this;
 438     if( sdepth != ddepth && psrc == &dst )
 439         psrc = &(temp = *this);
 440
 441     dst.create( size(), rtype );
 442     convert_run(*psrc, dst, alpha, beta);
 443 }
 444
 445 ///////////////////////////////////////////////////////////////////////////
 446 //////////////////////////////// setTo ////////////////////////////////////
 447 ///////////////////////////////////////////////////////////////////////////
 448 oclMat &cv::ocl::oclMat::operator = (const Scalar &s)
 449 {
 450     //cout << "cv::ocl::oclMat::=" << endl;
 451     setTo(s);
 452     return *this;
 453 }
 454 static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kernelName)
 455 {
 456     vector<pair<size_t , const void *> > args;
 457
 458     size_t localThreads[3] = {16, 16, 1};
 459     size_t globalThreads[3];
 460     globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
 461     globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
 462     globalThreads[2] = 1;
 463     int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
 464     if(dst.type() == CV_8UC1)
 465     {
 466         globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
 467     }
 468     char compile_option[32];
 469     union sc
 470     {
 471         cl_uchar4 uval;
 472         cl_char4  cval;
 473         cl_ushort4 usval;
 474         cl_short4 shval;
 475         cl_int4 ival;
 476         cl_float4 fval;
 477         cl_double4 dval;
 478     } val;
 479     switch(dst.depth())
 480     {
 481     case CV_8U:
 482         val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
 483         val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
 484         val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
 485         val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]);
 486         switch(dst.oclchannels())
 487         {
 488         case 1:
 489             sprintf(compile_option, "-D GENTYPE=uchar");
 490             args.push_back( make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] ));
 491             break;
 492         case 4:
 493             sprintf(compile_option, "-D GENTYPE=uchar4");
 494             args.push_back( make_pair( sizeof(cl_uchar4) , (void *)&val.uval ));
 495             break;
 496         default:
 497             CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 498         }
 499         break;
 500     case CV_8S:
 501         val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
 502         val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
 503         val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
 504         val.cval.s[3] = saturate_cast<char>(scalar.val[3]);
 505         switch(dst.oclchannels())
 506         {
 507         case 1:
 508             sprintf(compile_option, "-D GENTYPE=char");
 509             args.push_back( make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] ));
 510             break;
 511         case 4:
 512             sprintf(compile_option, "-D GENTYPE=char4");
 513             args.push_back( make_pair( sizeof(cl_char4) , (void *)&val.cval ));
 514             break;
 515         default:
 516             CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 517         }
 518         break;
 519     case CV_16U:
 520         val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
 521         val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
 522         val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
 523         val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]);
 524         switch(dst.oclchannels())
 525         {
 526         case 1:
 527             sprintf(compile_option, "-D GENTYPE=ushort");
 528             args.push_back( make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] ));
 529             break;
 530         case 4:
 531             sprintf(compile_option, "-D GENTYPE=ushort4");
 532             args.push_back( make_pair( sizeof(cl_ushort4) , (void *)&val.usval ));
 533             break;
 534         default:
 535             CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 536         }
 537         break;
 538     case CV_16S:
 539         val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
 540         val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
 541         val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
 542         val.shval.s[3] = saturate_cast<short>(scalar.val[3]);
 543         switch(dst.oclchannels())
 544         {
 545         case 1:
 546             sprintf(compile_option, "-D GENTYPE=short");
 547             args.push_back( make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] ));
 548             break;
 549         case 4:
 550             sprintf(compile_option, "-D GENTYPE=short4");
 551             args.push_back( make_pair( sizeof(cl_short4) , (void *)&val.shval ));
 552             break;
 553         default:
 554             CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 555         }
 556         break;
 557     case CV_32S:
 558         val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
 559         val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
 560         val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
 561         val.ival.s[3] = saturate_cast<int>(scalar.val[3]);
 562         switch(dst.oclchannels())
 563         {
 564         case 1:
 565             sprintf(compile_option, "-D GENTYPE=int");
 566             args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
 567             break;
 568         case 2:
 569             sprintf(compile_option, "-D GENTYPE=int2");
 570             cl_int2 i2val;
 571             i2val.s[0] = val.ival.s[0];
 572             i2val.s[1] = val.ival.s[1];
 573             args.push_back( make_pair( sizeof(cl_int2) , (void *)&i2val ));
 574             break;
 575         case 4:
 576             sprintf(compile_option, "-D GENTYPE=int4");
 577             args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival ));
 578             break;
 579         default:
 580             CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 581         }
 582         break;
 583     case CV_32F:
 584         val.fval.s[0] = scalar.val[0];
 585         val.fval.s[1] = scalar.val[1];
 586         val.fval.s[2] = scalar.val[2];
 587         val.fval.s[3] = scalar.val[3];
 588         switch(dst.oclchannels())
 589         {
 590         case 1:
 591             sprintf(compile_option, "-D GENTYPE=float");
 592             args.push_back( make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] ));
 593             break;
 594         case 4:
 595             sprintf(compile_option, "-D GENTYPE=float4");
 596             args.push_back( make_pair( sizeof(cl_float4) , (void *)&val.fval ));
 597             break;
 598         default:
 599             CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 600         }
 601         break;
 602     case CV_64F:
 603         val.dval.s[0] = scalar.val[0];
 604         val.dval.s[1] = scalar.val[1];
 605         val.dval.s[2] = scalar.val[2];
 606         val.dval.s[3] = scalar.val[3];
 607         switch(dst.oclchannels())
 608         {
 609         case 1:
 610             sprintf(compile_option, "-D GENTYPE=double");
 611             args.push_back( make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] ));
 612             break;
 613         case 4:
 614             sprintf(compile_option, "-D GENTYPE=double4");
 615             args.push_back( make_pair( sizeof(cl_double4) , (void *)&val.dval ));
 616             break;
 617         default:
 618             CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 619         }
 620         break;
 621     default:
 622         CV_Error(CV_StsUnsupportedFormat, "unknown depth");
 623     }
 624 #ifdef CL_VERSION_1_2
 625     //this enables backwards portability to
 626     //run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
 627     if(Context::getContext()->supportsFeature(Context::CL_VER_1_2) &&
 628         dst.offset == 0 && dst.cols == dst.wholecols)
 629     {
 630         clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(),
 631             (cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL);
 632     }
 633     else
 634 #endif
 635     {
 636         args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
 637         args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols ));
 638         args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows ));
 639         args.push_back( make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
 640         args.push_back( make_pair( sizeof(cl_int) , (void *)&offset_in_pixel));
 641         openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads,
 642             localThreads, args, -1, -1, compile_option);
 643     }
 644 }
 645
 646 static void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &mask, string kernelName)
 647 {
 648     CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols);
 649     vector<pair<size_t , const void *> > args;
 650     size_t localThreads[3] = {16, 16, 1};
 651     size_t globalThreads[3];
 652     globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
 653     globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
 654     globalThreads[2] = 1;
 655     int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
 656     char compile_option[32];
 657     union sc
 658     {
 659         cl_uchar4 uval;
 660         cl_char4  cval;
 661         cl_ushort4 usval;
 662         cl_short4 shval;
 663         cl_int4 ival;
 664         cl_float4 fval;
 665         cl_double4 dval;
 666     } val;
 667     switch(dst.depth())
 668     {
 669     case CV_8U:
 670         val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
 671         val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
 672         val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
 673         val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]);
 674         switch(dst.oclchannels())
 675         {
 676         case 1:
 677             sprintf(compile_option, "-D GENTYPE=uchar");
 678             args.push_back( make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] ));
 679             break;
 680         case 4:
 681             sprintf(compile_option, "-D GENTYPE=uchar4");
 682             args.push_back( make_pair( sizeof(cl_uchar4) , (void *)&val.uval ));
 683             break;
 684         default:
 685             CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 686         }
 687         break;
 688     case CV_8S:
 689         val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
 690         val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
 691         val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
 692         val.cval.s[3] = saturate_cast<char>(scalar.val[3]);
 693         switch(dst.oclchannels())
 694         {
 695         case 1:
 696             sprintf(compile_option, "-D GENTYPE=char");
 697             args.push_back( make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] ));
 698             break;
 699         case 4:
 700             sprintf(compile_option, "-D GENTYPE=char4");
 701             args.push_back( make_pair( sizeof(cl_char4) , (void *)&val.cval ));
 702             break;
 703         default:
 704             CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 705         }
 706         break;
 707     case CV_16U:
 708         val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
 709         val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
 710         val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
 711         val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]);
 712         switch(dst.oclchannels())
 713         {
 714         case 1:
 715             sprintf(compile_option, "-D GENTYPE=ushort");
 716             args.push_back( make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] ));
 717             break;
 718         case 4:
 719             sprintf(compile_option, "-D GENTYPE=ushort4");
 720             args.push_back( make_pair( sizeof(cl_ushort4) , (void *)&val.usval ));
 721             break;
 722         default:
 723             CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 724         }
 725         break;
 726     case CV_16S:
 727         val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
 728         val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
 729         val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
 730         val.shval.s[3] = saturate_cast<short>(scalar.val[3]);
 731         switch(dst.oclchannels())
 732         {
 733         case 1:
 734             sprintf(compile_option, "-D GENTYPE=short");
 735             args.push_back( make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] ));
 736             break;
 737         case 4:
 738             sprintf(compile_option, "-D GENTYPE=short4");
 739             args.push_back( make_pair( sizeof(cl_short4) , (void *)&val.shval ));
 740             break;
 741         default:
 742             CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 743         }
 744         break;
 745     case CV_32S:
 746         val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
 747         val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
 748         val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
 749         val.ival.s[3] = saturate_cast<int>(scalar.val[3]);
 750         switch(dst.oclchannels())
 751         {
 752         case 1:
 753             sprintf(compile_option, "-D GENTYPE=int");
 754             args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
 755             break;
 756         case 4:
 757             sprintf(compile_option, "-D GENTYPE=int4");
 758             args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival ));
 759             break;
 760         default:
 761             CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 762         }
 763         break;
 764     case CV_32F:
 765         val.fval.s[0] = scalar.val[0];
 766         val.fval.s[1] = scalar.val[1];
 767         val.fval.s[2] = scalar.val[2];
 768         val.fval.s[3] = scalar.val[3];
 769         switch(dst.oclchannels())
 770         {
 771         case 1:
 772             sprintf(compile_option, "-D GENTYPE=float");
 773             args.push_back( make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] ));
 774             break;
 775         case 4:
 776             sprintf(compile_option, "-D GENTYPE=float4");
 777             args.push_back( make_pair( sizeof(cl_float4) , (void *)&val.fval ));
 778             break;
 779         default:
 780             CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 781         }
 782         break;
 783     case CV_64F:
 784         val.dval.s[0] = scalar.val[0];
 785         val.dval.s[1] = scalar.val[1];
 786         val.dval.s[2] = scalar.val[2];
 787         val.dval.s[3] = scalar.val[3];
 788         switch(dst.oclchannels())
 789         {
 790         case 1:
 791             sprintf(compile_option, "-D GENTYPE=double");
 792             args.push_back( make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] ));
 793             break;
 794         case 4:
 795             sprintf(compile_option, "-D GENTYPE=double4");
 796             args.push_back( make_pair( sizeof(cl_double4) , (void *)&val.dval ));
 797             break;
 798         default:
 799             CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 800         }
 801         break;
 802     default:
 803         CV_Error(CV_StsUnsupportedFormat, "unknown depth");
 804     }
 805     args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
 806     args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols ));
 807     args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows ));
 808     args.push_back( make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
 809     args.push_back( make_pair( sizeof(cl_int) , (void *)&offset_in_pixel ));
 810     args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data ));
 811     args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.step ));
 812     args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset ));
 813     openCLExecuteKernel(dst.clCxt , &operator_setToM, kernelName, globalThreads,
 814                         localThreads, args, -1, -1, compile_option);
 815 }
 816
 817 oclMat &cv::ocl::oclMat::setTo(const Scalar &scalar, const oclMat &mask)
 818 {
 819     //cout << "cv::ocl::oclMat::setTo()" << endl;
 820     CV_Assert(mask.type() == CV_8UC1);
 821     CV_Assert( this->depth() >= 0 && this->depth() <= 6 );
 822     CV_DbgAssert( !this->empty());
 823     //cl_int status;
 824     //cl_mem mem;
 825     //mem = clCreateBuffer(this->clCxt->clContext,CL_MEM_READ_WRITE,
 826     //                   sizeof(double)*4,NULL,&status);
 827     //openCLVerifyCall(status);
 828     //double* s =  (double *)scalar.val;
 829     //openCLSafeCall(clEnqueueWriteBuffer(this->clCxt->clCmdQueue,
 830     //                   (cl_mem)mem,1,0,sizeof(double)*4,s,0,0,0));
 831     if (mask.empty())
 832     {
 833         if(type() == CV_8UC1)
 834         {
 835             set_to_withoutmask_run(*this, scalar, "set_to_without_mask_C1_D0");
 836         }
 837         else
 838         {
 839             set_to_withoutmask_run(*this, scalar, "set_to_without_mask");
 840         }
 841     }
 842     else
 843     {
 844         set_to_withmask_run(*this, scalar, mask, "set_to_with_mask");
 845     }
 846
 847     return *this;
 848 }
 849
 850 oclMat cv::ocl::oclMat::reshape(int new_cn, int new_rows) const
 851 {
 852     if( new_rows != 0 && new_rows != rows)
 853
 854     {
 855
 856         CV_Error( CV_StsBadFunc,
 857
 858                   "oclMat's number of rows can not be changed for current version" );
 859
 860     }
 861
 862     oclMat hdr = *this;
 863
 864     int cn = oclchannels();
 865
 866     if (new_cn == 0)
 867
 868         new_cn = cn;
 869
 870
 871
 872     int total_width = cols * cn;
 873
 874
 875
 876     if ((new_cn > total_width || total_width % new_cn != 0) && new_rows == 0)
 877
 878         new_rows = rows * total_width / new_cn;
 879
 880
 881
 882     if (new_rows != 0 && new_rows != rows)
 883
 884     {
 885
 886         int total_size = total_width * rows;
 887
 888
 889
 890         if (!isContinuous())
 891
 892             CV_Error(CV_BadStep, "The matrix is not continuous, thus its number of rows can not be changed");
 893
 894
 895
 896         if ((unsigned)new_rows > (unsigned)total_size)
 897
 898             CV_Error(CV_StsOutOfRange, "Bad new number of rows");
 899
 900
 901
 902         total_width = total_size / new_rows;
 903
 904
 905
 906         if (total_width * new_rows != total_size)
 907
 908             CV_Error(CV_StsBadArg, "The total number of matrix elements is not divisible by the new number of rows");
 909
 910
 911
 912         hdr.rows = new_rows;
 913
 914         hdr.step = total_width * elemSize1();
 915
 916     }
 917
 918
 919
 920     int new_width = total_width / new_cn;
 921
 922
 923
 924     if (new_width * new_cn != total_width)
 925
 926         CV_Error(CV_BadNumChannels, "The total width is not divisible by the new number of channels");
 927
 928
 929
 930     hdr.cols = new_width;
 931
 932     hdr.wholecols = new_width;
 933
 934     hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);
 935
 936
 937
 938     return hdr;
 939
 940 }
 941
 942 void cv::ocl::oclMat::createEx(Size size, int type, DevMemRW rw_type, DevMemType mem_type)
 943 {
 944     createEx(size.height, size.width, type, rw_type, mem_type);
 945 }
 946
 947 void cv::ocl::oclMat::create(int _rows, int _cols, int _type)
 948 {
 949     createEx(_rows, _cols, _type, gDeviceMemRW, gDeviceMemType);
 950 }
 951
 952 void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type, DevMemRW rw_type, DevMemType mem_type)
 953 {
 954     clCxt = Context::getContext();
 955     /* core logic */
 956     _type &= TYPE_MASK;
 957     //download_channels = CV_MAT_CN(_type);
 958     //if(download_channels==3)
 959     //{
 960     //  _type = CV_MAKE_TYPE((CV_MAT_DEPTH(_type)),4);
 961     //}
 962     if( rows == _rows && cols == _cols && type() == _type && data )
 963         return;
 964     if( data )
 965         release();
 966     CV_DbgAssert( _rows >= 0 && _cols >= 0 );
 967     if( _rows > 0 && _cols > 0 )
 968     {
 969         flags = Mat::MAGIC_VAL + _type;
 970         rows = _rows;
 971         cols = _cols;
 972         wholerows = _rows;
 973         wholecols = _cols;
 974         size_t esz = elemSize();
 975
 976         void *dev_ptr;
 977         openCLMallocPitchEx(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols), rows, rw_type, mem_type);
 978         //openCLMallocPitch(clCxt,&dev_ptr, &step, esz * cols, rows);
 979
 980         if (esz * cols == step)
 981             flags |= Mat::CONTINUOUS_FLAG;
 982
 983         int64 _nettosize = (int64)step * rows;
 984         size_t nettosize = (size_t)_nettosize;
 985
 986         datastart = data = (uchar *)dev_ptr;
 987         dataend = data + nettosize;
 988
 989         refcount = (int *)fastMalloc(sizeof(*refcount));
 990         *refcount = 1;
 991     }
 992 }
 993
 994 void cv::ocl::oclMat::release()
 995 {
 996     //cout << "cv::ocl::oclMat::release()" << endl;
 997     if( refcount && CV_XADD(refcount, -1) == 1 )
 998     {
 999         fastFree(refcount);
1000         openCLFree(datastart);
1001     }
1002     data = datastart = dataend = 0;
1003     step = rows = cols = 0;
1004     offset = wholerows = wholecols = 0;
1005     refcount = 0;
1006 }
1007
1008 oclMat& cv::ocl::oclMat::operator+=( const oclMat& m )
1009 {
1010     add(*this, m, *this);
1011     return *this;
1012 }
1013
1014 oclMat& cv::ocl::oclMat::operator-=( const oclMat& m )
1015 {
1016     subtract(*this, m, *this);
1017     return *this;
1018 }
1019
1020 oclMat& cv::ocl::oclMat::operator*=( const oclMat& m )
1021 {
1022     multiply(*this, m, *this);
1023     return *this;
1024 }
1025
1026 oclMat& cv::ocl::oclMat::operator/=( const oclMat& m )
1027 {
1028     divide(*this, m, *this);
1029     return *this;
1030 }