modules/ocl/src/matrix_operations.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  16 // Third party copyrights are property of their respective owners.
  17 //
  18 // @Authors
  19 //    Niko Li, newlife20080214@gmail.com
  20 //    Yao Wang, bitwangyaoyao@gmail.com
  21 //
  22 // Redistribution and use in source and binary forms, with or without modification,
  23 // are permitted provided that the following conditions are met:
  24 //
  25 //   * Redistribution's of source code must retain the above copyright notice,
  26 //     this list of conditions and the following disclaimer.
  27 //
  28 //   * Redistribution's in binary form must reproduce the above copyright notice,
  29 //     this list of conditions and the following disclaimer in the documentation
  30 //     and/or other materials provided with the distribution.
  31 //
  32 //   * The name of the copyright holders may not be used to endorse or promote products
  33 //     derived from this software without specific prior written permission.
  34 //
  35 // This software is provided by the copyright holders and contributors "as is" and
  36 // any express or implied warranties, including, but not limited to, the implied
  37 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  38 // In no event shall the Intel Corporation or contributors be liable for any direct,
  39 // indirect, incidental, special, exemplary, or consequential damages
  40 // (including, but not limited to, procurement of substitute goods or services;
  41 // loss of use, data, or profits; or business interruption) however caused
  42 // and on any theory of liability, whether in contract, strict liability,
  43 // or tort (including negligence or otherwise) arising in any way out of
  44 // the use of this software, even if advised of the possibility of such damage.
  45 //
  46 //M*/
  47
  48 #include "precomp.hpp"
  49 #include "opencl_kernels.hpp"
  50
  51 using namespace cv;
  52 using namespace cv::ocl;
  53
  // Alignment unit, in bytes, for the row step requested from the device allocator.
  #define ALIGN 32
  // Rounds `step` up to the nearest multiple of ALIGN. The expansion is fully
  // parenthesized so it keeps its value when embedded in a larger expression
  // (for example as the right-hand operand of `/` or `%`).
  #define GPU_MATRIX_MALLOC_STEP(step) ((((step) + ALIGN - 1) / ALIGN) * ALIGN)
  56
  57 // helper routines
  58 namespace cv
  59 {
  60     namespace ocl
  61     {
  62         extern DevMemType gDeviceMemType;
  63         extern DevMemRW gDeviceMemRW;
  64     }
  65 }
  66
  67 ////////////////////////////////////////////////////////////////////////
  68 // convert_C3C4
  69
  70 static void convert_C3C4(const cl_mem &src, oclMat &dst)
  71 {
  72     Context *clCxt = dst.clCxt;
  73     int pixel_end = dst.wholecols * dst.wholerows - 1;
  74     int dstStep_in_pixel = dst.step1() / dst.oclchannels();
  75
  76     const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
  77     std::string buildOptions = format("-D GENTYPE4=%s4", typeMap[dst.depth()]);
  78
  79     std::vector< std::pair<size_t, const void *> > args;
  80     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src));
  81     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data));
  82     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.wholecols));
  83     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.wholerows));
  84     args.push_back( std::make_pair( sizeof(cl_int), (void *)&dstStep_in_pixel));
  85     args.push_back( std::make_pair( sizeof(cl_int), (void *)&pixel_end));
  86
  87     size_t globalThreads[3] = { divUp(dst.wholecols * dst.wholerows, 4), 1, 1 };
  88     size_t localThreads[3] = { 256, 1, 1 };
  89
  90     openCLExecuteKernel(clCxt, &convertC3C4, "convertC3C4", globalThreads, localThreads,
  91                         args, -1, -1, buildOptions.c_str());
  92 }
  93
  94 ////////////////////////////////////////////////////////////////////////
  95 // convert_C4C3
  96
  97 static void convert_C4C3(const oclMat &src, cl_mem &dst)
  98 {
  99     int srcStep_in_pixel = src.step1() / src.oclchannels();
 100     int pixel_end = src.wholecols * src.wholerows - 1;
 101     Context *clCxt = src.clCxt;
 102
 103     const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
 104     std::string buildOptions = format("-D GENTYPE4=%s4", typeMap[src.depth()]);
 105
 106     std::vector< std::pair<size_t, const void *> > args;
 107     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data));
 108     args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst));
 109     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.wholecols));
 110     args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.wholerows));
 111     args.push_back( std::make_pair( sizeof(cl_int), (void *)&srcStep_in_pixel));
 112     args.push_back( std::make_pair( sizeof(cl_int), (void *)&pixel_end));
 113
 114     size_t globalThreads[3] = { divUp(src.wholecols * src.wholerows, 4), 1, 1};
 115     size_t localThreads[3] = { 256, 1, 1 };
 116
 117     openCLExecuteKernel(clCxt, &convertC3C4, "convertC4C3", globalThreads, localThreads, args, -1, -1, buildOptions.c_str());
 118 }
 119
 120 void cv::ocl::oclMat::upload(const Mat &m)
 121 {
 122     if (!Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && m.depth() == CV_64F)
 123     {
 124         CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
 125         return;
 126     }
 127
 128     CV_DbgAssert(!m.empty());
 129     Size wholeSize;
 130     Point ofs;
 131     m.locateROI(wholeSize, ofs);
 132     create(wholeSize, m.type());
 133
 134     if (m.channels() == 3)
 135     {
 136         int pitch = wholeSize.width * 3 * m.elemSize1();
 137         int tail_padding = m.elemSize1() * 3072;
 138         int err;
 139         cl_mem temp = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE,
 140                                      (pitch * wholeSize.height + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
 141         openCLVerifyCall(err);
 142
 143         openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3);
 144         convert_C3C4(temp, *this);
 145         openCLSafeCall(clReleaseMemObject(temp));
 146     }
 147     else
 148         openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice);
 149
 150     rows = m.rows;
 151     cols = m.cols;
 152     offset = ofs.y * step + ofs.x * elemSize();
 153 }
 154
 155 cv::ocl::oclMat::operator cv::_InputArray()
 156 {
 157     return _InputArray(cv::_InputArray::OCL_MAT, this);
 158 }
 159
 160 cv::ocl::oclMat::operator cv::_OutputArray()
 161 {
 162     return _OutputArray(cv::_InputArray::OCL_MAT, this);
 163 }
 164
 165 cv::ocl::oclMat& cv::ocl::getOclMatRef(InputArray src)
 166 {
 167     CV_Assert(src.kind() == cv::_InputArray::OCL_MAT);
 168     return *(oclMat*)src.getObj();
 169 }
 170
 171 cv::ocl::oclMat& cv::ocl::getOclMatRef(OutputArray src)
 172 {
 173     CV_Assert(src.kind() == cv::_InputArray::OCL_MAT);
 174     return *(oclMat*)src.getObj();
 175 }
 176
 177 void cv::ocl::oclMat::download(cv::Mat &m) const
 178 {
 179     CV_DbgAssert(!this->empty());
 180     m.create(wholerows, wholecols, type());
 181
 182     if(m.channels() == 3)
 183     {
 184         int pitch = wholecols * 3 * m.elemSize1();
 185         int tail_padding = m.elemSize1() * 3072;
 186         int err;
 187         cl_mem temp = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE,
 188                                      (pitch * wholerows + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
 189         openCLVerifyCall(err);
 190
 191         convert_C4C3(*this, temp);
 192         openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3);
 193         openCLSafeCall(clReleaseMemObject(temp));
 194     }
 195     else
 196     {
 197         openCLMemcpy2D(clCxt, m.data, m.step, data, step, wholecols * elemSize(), wholerows, clMemcpyDeviceToHost);
 198     }
 199
 200     Size wholesize;
 201     Point ofs;
 202     locateROI(wholesize, ofs);
 203     m.adjustROI(-ofs.y, ofs.y + rows - wholerows, -ofs.x, ofs.x + cols - wholecols);
 204 }
 205
 206 ///////////////////////////////////////////////////////////////////////////
 207 ////////////////////////////////// CopyTo /////////////////////////////////
 208 ///////////////////////////////////////////////////////////////////////////
 209 static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask, String kernelName)
 210 {
 211     CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols &&
 212                   src.rows == dst.rows && src.cols == dst.cols
 213                   && mask.type() == CV_8UC1);
 214
 215     std::vector<std::pair<size_t , const void *> > args;
 216
 217     String string_types[4][7] = {{"uchar", "char", "ushort", "short", "int", "float", "double"},
 218         {"uchar2", "char2", "ushort2", "short2", "int2", "float2", "double2"},
 219         {"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"},
 220         {"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"}
 221     };
 222
 223     char compile_option[32];
 224     sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str());
 225     size_t localThreads[3] = {16, 16, 1};
 226     size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
 227
 228     int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
 229     int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
 230
 231     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
 232     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
 233     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&mask.data ));
 234     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
 235     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
 236     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
 237     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
 238     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
 239     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
 240     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.step ));
 241     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.offset ));
 242
 243     openCLExecuteKernel(dst.clCxt , &operator_copyToM, kernelName, globalThreads,
 244                         localThreads, args, -1, -1, compile_option);
 245 }
 246
 247 void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
 248 {
 249     if (mask.empty())
 250     {
 251         CV_DbgAssert(!this->empty());
 252         mat.create(size(), type());
 253         openCLCopyBuffer2D(clCxt, mat.data, mat.step, mat.offset,
 254                            data, step, cols * elemSize(), rows, offset);
 255     }
 256     else
 257     {
 258         mat.create(size(), type());
 259         copy_to_with_mask(*this, mat, mask, "copy_to_with_mask");
 260     }
 261 }
 262
 263 ///////////////////////////////////////////////////////////////////////////
 264 //////////////////////////////// ConvertTo ////////////////////////////////
 265 ///////////////////////////////////////////////////////////////////////////
 266
 267 static void convert_run(const oclMat &src, oclMat &dst, double alpha, double beta)
 268 {
 269     String kernelName = "convert_to";
 270     float alpha_f = alpha, beta_f = beta;
 271     int sdepth = src.depth(), ddepth = dst.depth();
 272     int sstep1 = (int)src.step1(), dstep1 = (int)dst.step1();
 273     int cols1 = src.cols * src.oclchannels();
 274
 275     char buildOptions[150], convertString[50];
 276     const char * typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
 277     sprintf(convertString, "convert_%s_sat_rte", typeMap[ddepth]);
 278     sprintf(buildOptions, "-D srcT=%s -D dstT=%s -D convertToDstType=%s", typeMap[sdepth],
 279             typeMap[ddepth], CV_32F == ddepth || ddepth == CV_64F ? "" : convertString);
 280
 281     CV_DbgAssert(src.rows == dst.rows && src.cols == dst.cols);
 282     std::vector<std::pair<size_t , const void *> > args;
 283
 284     size_t localThreads[3] = { 16, 16, 1 };
 285     size_t globalThreads[3] = { divUp(cols1, localThreads[0]) * localThreads[0],
 286                                 divUp(dst.rows, localThreads[1]) * localThreads[1], 1 };
 287
 288     int doffset1 = dst.offset / dst.elemSize1();
 289     int soffset1 = src.offset / src.elemSize1();
 290
 291     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
 292     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
 293     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols1 ));
 294     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
 295     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sstep1 ));
 296     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&soffset1 ));
 297     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstep1 ));
 298     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&doffset1 ));
 299     args.push_back( std::make_pair( sizeof(cl_float) , (void *)&alpha_f ));
 300     args.push_back( std::make_pair( sizeof(cl_float) , (void *)&beta_f ));
 301
 302     openCLExecuteKernel(dst.clCxt , &operator_convertTo, kernelName, globalThreads,
 303                         localThreads, args, -1, -1, buildOptions);
 304 }
 305
 306 void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const
 307 {
 308     if (!clCxt->supportsFeature(FEATURE_CL_DOUBLE) &&
 309             (depth() == CV_64F || dst.depth() == CV_64F))
 310     {
 311         CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double");
 312         return;
 313     }
 314
 315     bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon()
 316                    && fabs(beta) < std::numeric_limits<double>::epsilon();
 317
 318     if( rtype < 0 )
 319         rtype = type();
 320     else
 321         rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());
 322
 323     int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype);
 324     if( sdepth == ddepth && noScale )
 325     {
 326         copyTo(dst);
 327         return;
 328     }
 329
 330     oclMat temp;
 331     const oclMat *psrc = this;
 332     if( sdepth != ddepth && psrc == &dst )
 333         psrc = &(temp = *this);
 334
 335     dst.create( size(), rtype );
 336     convert_run(*psrc, dst, alpha, beta);
 337 }
 338
 339 ///////////////////////////////////////////////////////////////////////////
 340 //////////////////////////////// setTo ////////////////////////////////////
 341 ///////////////////////////////////////////////////////////////////////////
 342
 343 oclMat &cv::ocl::oclMat::operator = (const Scalar &s)
 344 {
 345     setTo(s);
 346     return *this;
 347 }
 348
 349 #ifdef CL_VERSION_1_2
 350
 351 template <typename CLT, typename PT>
 352 static std::vector<uchar> cvt1(const cv::Scalar & s)
 353 {
 354     std::vector<uchar> _buf(sizeof(CLT));
 355     CLT * const buf = reinterpret_cast<CLT *>(&_buf[0]);
 356     buf[0] = saturate_cast<PT>(s[0]);
 357     return _buf;
 358 }
 359
 360 template <typename CLT, typename PT>
 361 static std::vector<uchar> cvt2(const cv::Scalar & s)
 362 {
 363     std::vector<uchar> _buf(sizeof(CLT));
 364     CLT * const buf = reinterpret_cast<CLT *>(&_buf[0]);
 365     buf->s[0] = saturate_cast<PT>(s[0]);
 366     buf->s[1] = saturate_cast<PT>(s[1]);
 367     return _buf;
 368 }
 369
 370 template <typename CLT, typename PT>
 371 static std::vector<uchar> cvt4(const cv::Scalar & s)
 372 {
 373     std::vector<uchar> _buf(sizeof(CLT));
 374     CLT * const buf = reinterpret_cast<CLT *>(&_buf[0]);
 375     buf->s[0] = saturate_cast<PT>(s[0]);
 376     buf->s[1] = saturate_cast<PT>(s[1]);
 377     buf->s[2] = saturate_cast<PT>(s[2]);
 378     buf->s[3] = saturate_cast<PT>(s[3]);
 379     return _buf;
 380 }
 381
 382 typedef std::vector<uchar> (*ConvertFunc)(const cv::Scalar & s);
 383
 384 static std::vector<uchar> scalarToCLVector(const cv::Scalar & s, int type)
 385 {
 386     const int depth = CV_MAT_DEPTH(type);
 387     const int channels = CV_MAT_CN(type);
 388
 389     static const ConvertFunc funcs[4][7] =
 390     {
 391         { cvt1<cl_uchar, uchar>, cvt1<cl_char, char>, cvt1<cl_ushort, ushort>, cvt1<cl_short, short>,
 392           cvt1<cl_int, int>, cvt1<cl_float, float>, cvt1<cl_double, double> },
 393
 394         { cvt2<cl_uchar2, uchar>, cvt2<cl_char2, char>, cvt2<cl_ushort2, ushort>, cvt2<cl_short2, short>,
 395           cvt2<cl_int2, int>, cvt2<cl_float2, float>, cvt2<cl_double2, double> },
 396
 397         { 0, 0, 0, 0, 0, 0, 0 },
 398
 399         { cvt4<cl_uchar4, uchar>, cvt4<cl_char4, char>, cvt4<cl_ushort4, ushort>, cvt4<cl_short4, short>,
 400           cvt4<cl_int4, int>, cvt4<cl_float4, float>, cvt4<cl_double4, double> }
 401     };
 402
 403     ConvertFunc func = funcs[channels - 1][depth];
 404     return func(s);
 405 }
 406
 407 #endif
 408
 409 static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, String kernelName)
 410 {
 411     std::vector<std::pair<size_t , const void *> > args;
 412
 413     size_t localThreads[3] = {16, 16, 1};
 414     size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
 415     int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
 416
 417     if (dst.type() == CV_8UC1)
 418         globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
 419
 420     const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
 421     const char channelMap[] = { ' ', ' ', '2', '4', '4' };
 422     std::string buildOptions = format("-D GENTYPE=%s%c", typeMap[dst.depth()], channelMap[dst.channels()]);
 423
 424     Mat mat(1, 1, dst.type(), scalar);
 425
 426 #ifdef CL_VERSION_1_2
 427     // this enables backwards portability to
 428     // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
 429     if (Context::getContext()->supportsFeature(FEATURE_CL_VER_1_2) && dst.isContinuous())
 430     {
 431         std::vector<uchar> p = ::scalarToCLVector(scalar, CV_MAKE_TYPE(dst.depth(), dst.oclchannels()));
 432         clEnqueueFillBuffer(getClCommandQueue(dst.clCxt),
 433                 (cl_mem)dst.data, (void*)&p[0], p.size(),
 434                 0, dst.step * dst.rows, 0, NULL, NULL);
 435     }
 436     else
 437 #endif
 438     {
 439         oclMat m(mat);
 440         args.push_back( std::make_pair( sizeof(cl_mem) , (void*)&m.data ));
 441         args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
 442         args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
 443         args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
 444         args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
 445         args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel ));
 446
 447         openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads,
 448             localThreads, args, -1, -1, buildOptions.c_str());
 449     }
 450 }
 451
 452 static void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &mask, String kernelName)
 453 {
 454     CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols);
 455     std::vector<std::pair<size_t , const void *> > args;
 456     size_t localThreads[3] = { 16, 16, 1 };
 457     size_t globalThreads[3] = { dst.cols, dst.rows, 1 };
 458     int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
 459
 460     const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
 461     const char channelMap[] = { ' ', ' ', '2', '4', '4' };
 462     std::string buildOptions = format("-D GENTYPE=%s%c", typeMap[dst.depth()], channelMap[dst.channels()]);
 463
 464     oclMat m(Mat(1, 1, dst.type(), scalar));
 465     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&m.data ));
 466     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
 467     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
 468     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
 469     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
 470     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel ));
 471     args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&mask.data ));
 472     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.step ));
 473     args.push_back( std::make_pair( sizeof(cl_int) , (void *)&mask.offset ));
 474     openCLExecuteKernel(dst.clCxt , &operator_setToM, kernelName, globalThreads,
 475                         localThreads, args, -1, -1, buildOptions.c_str());
 476 }
 477
 478 oclMat &cv::ocl::oclMat::setTo(const Scalar &scalar, const oclMat &mask)
 479 {
 480     CV_Assert(mask.type() == CV_8UC1);
 481     CV_Assert( this->depth() >= 0 && this->depth() <= 6 );
 482     CV_DbgAssert( !this->empty());
 483     if (mask.empty())
 484     {
 485         set_to_withoutmask_run(*this, scalar, type() == CV_8UC1 ?
 486                                    "set_to_without_mask_C1_D0" : "set_to_without_mask");
 487     }
 488     else
 489         set_to_withmask_run(*this, scalar, mask, "set_to_with_mask");
 490
 491     return *this;
 492 }
 493
 494 oclMat cv::ocl::oclMat::reshape(int new_cn, int new_rows) const
 495 {
 496     if( new_rows != 0 && new_rows != rows)
 497     {
 498         CV_Error( Error::StsBadFunc, "oclMat's number of rows can not be changed for current version" );
 499     }
 500
 501     oclMat hdr = *this;
 502
 503     int cn = oclchannels();
 504     if (new_cn == 0)
 505         new_cn = cn;
 506
 507     int total_width = cols * cn;
 508     if ((new_cn > total_width || total_width % new_cn != 0) && new_rows == 0)
 509         new_rows = rows * total_width / new_cn;
 510
 511     if (new_rows != 0 && new_rows != rows)
 512     {
 513         int total_size = total_width * rows;
 514
 515         if (!isContinuous())
 516             CV_Error(Error::BadStep, "The matrix is not continuous, thus its number of rows can not be changed");
 517
 518         if ((unsigned)new_rows > (unsigned)total_size)
 519             CV_Error(Error::StsOutOfRange, "Bad new number of rows");
 520
 521         total_width = total_size / new_rows;
 522         if (total_width * new_rows != total_size)
 523             CV_Error(Error::StsBadArg, "The total number of matrix elements is not divisible by the new number of rows");
 524
 525         hdr.rows = new_rows;
 526         hdr.step = total_width * elemSize1();
 527     }
 528
 529     int new_width = total_width / new_cn;
 530     if (new_width * new_cn != total_width)
 531         CV_Error(Error::BadNumChannels, "The total width is not divisible by the new number of channels");
 532
 533     hdr.cols = new_width;
 534     hdr.wholecols = new_width;
 535     hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);
 536     return hdr;
 537
 538 }
 539
 540 void cv::ocl::oclMat::createEx(Size size, int type,
 541                                DevMemRW rw_type, DevMemType mem_type)
 542 {
 543     createEx(size.height, size.width, type, rw_type, mem_type);
 544 }
 545
 546 void cv::ocl::oclMat::create(int _rows, int _cols, int _type)
 547 {
 548     createEx(_rows, _cols, _type, gDeviceMemRW, gDeviceMemType);
 549 }
 550
 551 void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type,
 552                                DevMemRW rw_type, DevMemType mem_type)
 553 {
 554     clCxt = Context::getContext();
 555     /* core logic */
 556     _type &= Mat::TYPE_MASK;
 557     if( rows == _rows && cols == _cols && type() == _type && data )
 558         return;
 559     if( data )
 560         release();
 561     CV_DbgAssert( _rows >= 0 && _cols >= 0 );
 562     if( _rows > 0 && _cols > 0 )
 563     {
 564         flags = Mat::MAGIC_VAL + _type;
 565         rows = _rows;
 566         cols = _cols;
 567         wholerows = _rows;
 568         wholecols = _cols;
 569         size_t esz = elemSize();
 570
 571         void *dev_ptr;
 572         openCLMallocPitchEx(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols), rows, rw_type, mem_type);
 573
 574         if (esz * cols == step)
 575             flags |= Mat::CONTINUOUS_FLAG;
 576
 577         int64 _nettosize = (int64)step * rows;
 578         size_t nettosize = (size_t)_nettosize;
 579
 580         datastart = data = (uchar *)dev_ptr;
 581         dataend = data + nettosize;
 582
 583         refcount = (int *)fastMalloc(sizeof(*refcount));
 584         *refcount = 1;
 585     }
 586 }
 587
 588 void cv::ocl::oclMat::release()
 589 {
 590     if( refcount && CV_XADD(refcount, -1) == 1 )
 591     {
 592         fastFree(refcount);
 593         openCLFree(datastart);
 594     }
 595     data = datastart = dataend = 0;
 596     step = rows = cols = 0;
 597     offset = wholerows = wholecols = 0;
 598     refcount = 0;
 599 }
 600
 601 oclMat& cv::ocl::oclMat::operator+=( const oclMat& m )
 602 {
 603     add(*this, m, *this);
 604     return *this;
 605 }
 606
 607 oclMat& cv::ocl::oclMat::operator-=( const oclMat& m )
 608 {
 609     subtract(*this, m, *this);
 610     return *this;
 611 }
 612
 613 oclMat& cv::ocl::oclMat::operator*=( const oclMat& m )
 614 {
 615     multiply(*this, m, *this);
 616     return *this;
 617 }
 618
 619 oclMat& cv::ocl::oclMat::operator/=( const oclMat& m )
 620 {
 621     divide(*this, m, *this);
 622     return *this;
 623 }