modules/ocl/src/imgproc.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  16 // Third party copyrights are property of their respective owners.
  17 //
  18 // @Authors
  19 //    Niko Li, newlife20080214@gmail.com
  20 //    Jia Haipeng, jiahaipeng95@gmail.com
  21 //    Shengen Yan, yanshengen@gmail.com
  22 //    Rock Li, Rock.Li@amd.com
  23 //    Zero Lin, Zero.Lin@amd.com
  24 //    Zhang Ying, zhangying913@gmail.com
  25 //    Xu Pang, pangxu010@163.com
  26 //    Wu Zailong, bullet@yeah.net
  27 //    Wenju He, wenju@multicorewareinc.com
  28 //    Sen Liu, swjtuls1987@126.com
  29 //
  30 // Redistribution and use in source and binary forms, with or without modification,
  31 // are permitted provided that the following conditions are met:
  32 //
  33 //   * Redistribution's of source code must retain the above copyright notice,
  34 //     this list of conditions and the following disclaimer.
  35 //
  36 //   * Redistribution's in binary form must reproduce the above copyright notice,
  37 //     this list of conditions and the following disclaimer in the documentation
  38 //     and/or other oclMaterials provided with the distribution.
  39 //
  40 //   * The name of the copyright holders may not be used to endorse or promote products
  41 //     derived from this software without specific prior written permission.
  42 //
  43 // This software is provided by the copyright holders and contributors "as is" and
  44 // any express or implied warranties, including, but not limited to, the implied
  45 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  46 // In no event shall the Intel Corporation or contributors be liable for any direct,
  47 // indirect, incidental, special, exemplary, or consequential damages
  48 // (including, but not limited to, procurement of substitute goods or services;
  49 // loss of use, data, or profits; or business interruption) however caused
  50 // and on any theory of liability, whether in contract, strict liability,
  51 // or tort (including negligence or otherwise) arising in any way out of
  52 // the use of this software, even if advised of the possibility of such damage.
  53 //
  54 //M*/
  55
  56 #include "precomp.hpp"
  57 #include "opencl_kernels.hpp"
  58
  59 using namespace cv;
  60 using namespace cv::ocl;
  61
  62 namespace cv
  63 {
  64     namespace ocl
  65     {
  66         ////////////////////////////////////OpenCL call wrappers////////////////////////////
  67
  68         template <typename T> struct index_and_sizeof;
  69         template <> struct index_and_sizeof<char>
  70         {
  71             enum { index = 1 };
  72         };
  73         template <> struct index_and_sizeof<unsigned char>
  74         {
  75             enum { index = 2 };
  76         };
  77         template <> struct index_and_sizeof<short>
  78         {
  79             enum { index = 3 };
  80         };
  81         template <> struct index_and_sizeof<unsigned short>
  82         {
  83             enum { index = 4 };
  84         };
  85         template <> struct index_and_sizeof<int>
  86         {
  87             enum { index = 5 };
  88         };
  89         template <> struct index_and_sizeof<float>
  90         {
  91             enum { index = 6 };
  92         };
  93         template <> struct index_and_sizeof<double>
  94         {
  95             enum { index = 7 };
  96         };
  97
  98         /////////////////////////////////////////////////////////////////////////////////////
  99         // threshold
 100
 101         typedef void (*gpuThresh_t)(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type);
 102
 103         static void threshold_8u(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
 104         {
 105             CV_Assert( (src.cols == dst.cols) && (src.rows == dst.rows) );
 106             Context *clCxt = src.clCxt;
 107
 108             uchar thresh_uchar = cvFloor(thresh);
 109             uchar max_val = cvRound(maxVal);
 110             string kernelName = "threshold";
 111
 112             size_t cols = (dst.cols + (dst.offset % 16) + 15) / 16;
 113             size_t bSizeX = 16, bSizeY = 16;
 114             size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX;
 115             size_t gSizeY = dst.rows;
 116             size_t globalThreads[3] = {gSizeX, gSizeY, 1};
 117             size_t localThreads[3] = {bSizeX, bSizeY, 1};
 118
 119             vector< pair<size_t, const void *> > args;
 120             args.push_back( make_pair(sizeof(cl_mem), &src.data));
 121             args.push_back( make_pair(sizeof(cl_mem), &dst.data));
 122             args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
 123             args.push_back( make_pair(sizeof(cl_int), (void *)&src.step));
 124             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
 125             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
 126             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
 127             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
 128             args.push_back( make_pair(sizeof(cl_uchar), (void *)&thresh_uchar));
 129             args.push_back( make_pair(sizeof(cl_uchar), (void *)&max_val));
 130             args.push_back( make_pair(sizeof(cl_int), (void *)&type));
 131             openCLExecuteKernel(clCxt, &imgproc_threshold, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
 132         }
 133
 134         static void threshold_32f(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
 135         {
 136             CV_Assert( (src.cols == dst.cols) && (src.rows == dst.rows) );
 137             Context *clCxt = src.clCxt;
 138
 139             float thresh_f = thresh;
 140             float max_val = maxVal;
 141             int dst_offset = (dst.offset >> 2);
 142             int dst_step = (dst.step >> 2);
 143             int src_offset = (src.offset >> 2);
 144             int src_step = (src.step >> 2);
 145
 146             string kernelName = "threshold";
 147
 148             size_t cols = (dst.cols + (dst_offset & 3) + 3) / 4;
 149             //size_t cols = dst.cols;
 150             size_t bSizeX = 16, bSizeY = 16;
 151             size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX;
 152             size_t gSizeY = dst.rows;
 153             size_t globalThreads[3] = {gSizeX, gSizeY, 1};
 154             size_t localThreads[3] = {bSizeX, bSizeY, 1};
 155
 156             vector< pair<size_t, const void *> > args;
 157             args.push_back( make_pair(sizeof(cl_mem), &src.data));
 158             args.push_back( make_pair(sizeof(cl_mem), &dst.data));
 159             args.push_back( make_pair(sizeof(cl_int), (void *)&src_offset));
 160             args.push_back( make_pair(sizeof(cl_int), (void *)&src_step));
 161             args.push_back( make_pair(sizeof(cl_int), (void *)&dst_offset));
 162             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
 163             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
 164             args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step));
 165             args.push_back( make_pair(sizeof(cl_float), (void *)&thresh_f));
 166             args.push_back( make_pair(sizeof(cl_float), (void *)&max_val));
 167             args.push_back( make_pair(sizeof(cl_int), (void *)&type));
 168             openCLExecuteKernel(clCxt, &imgproc_threshold, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
 169
 170         }
 171
 172         //threshold: support 8UC1 and 32FC1 data type and five threshold type
 173         double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
 174         {
 175             //TODO: These limitations shall be removed later.
 176             CV_Assert(src.type() == CV_8UC1 || src.type() == CV_32FC1);
 177             CV_Assert(type == THRESH_BINARY || type == THRESH_BINARY_INV || type == THRESH_TRUNC
 178                       || type == THRESH_TOZERO || type == THRESH_TOZERO_INV );
 179
 180             static const gpuThresh_t gpuThresh_callers[2] = {threshold_8u, threshold_32f};
 181
 182             dst.create( src.size(), src.type() );
 183             gpuThresh_callers[(src.type() == CV_32FC1)](src, dst, thresh, maxVal, type);
 184
 185             return thresh;
 186         }
 187         ////////////////////////////////////////////////////////////////////////////////////////////
 188         ///////////////////////////////   remap   //////////////////////////////////////////////////
 189         ////////////////////////////////////////////////////////////////////////////////////////////
 190
 191         void remap( const oclMat &src, oclMat &dst, oclMat &map1, oclMat &map2, int interpolation, int borderType, const Scalar &borderValue )
 192         {
 193             Context *clCxt = src.clCxt;
 194             CV_Assert(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST
 195                       || interpolation == INTER_CUBIC || interpolation == INTER_LANCZOS4);
 196             CV_Assert((map1.type() == CV_16SC2 && !map2.data) || (map1.type() == CV_32FC2 && !map2.data) || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1));
 197             CV_Assert(!map2.data || map2.size() == map1.size());
 198             CV_Assert(dst.size() == map1.size());
 199
 200             dst.create(map1.size(), src.type());
 201
 202
 203             string kernelName;
 204
 205             if( map1.type() == CV_32FC2 && !map2.data )
 206             {
 207                 if(interpolation == INTER_LINEAR && borderType == BORDER_CONSTANT)
 208                     kernelName = "remapLNFConstant";
 209                 else if(interpolation == INTER_NEAREST && borderType == BORDER_CONSTANT)
 210                     kernelName = "remapNNFConstant";
 211             }
 212             else if(map1.type() == CV_16SC2 && !map2.data)
 213             {
 214                 if(interpolation == INTER_LINEAR && borderType == BORDER_CONSTANT)
 215                     kernelName = "remapLNSConstant";
 216                 else if(interpolation == INTER_NEAREST && borderType == BORDER_CONSTANT)
 217                     kernelName = "remapNNSConstant";
 218
 219             }
 220             else if(map1.type() == CV_32FC1 && map2.type() == CV_32FC1)
 221             {
 222                 if(interpolation == INTER_LINEAR && borderType == BORDER_CONSTANT)
 223                     kernelName = "remapLNF1Constant";
 224                 else if (interpolation == INTER_NEAREST && borderType == BORDER_CONSTANT)
 225                     kernelName = "remapNNF1Constant";
 226             }
 227
 228             size_t blkSizeX = 16, blkSizeY = 16;
 229             size_t glbSizeX;
 230             int cols = dst.cols;
 231             if(src.type() == CV_8UC1)
 232             {
 233                 cols = (dst.cols + dst.offset % 4 + 3) / 4;
 234                 glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
 235
 236             }
 237             else if(src.type() == CV_32FC1 && interpolation == INTER_LINEAR)
 238             {
 239                 cols = (dst.cols + (dst.offset >> 2) % 4 + 3) / 4;
 240                 glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
 241             }
 242             else
 243             {
 244                 glbSizeX = dst.cols % blkSizeX == 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
 245
 246             }
 247
 248             size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
 249             size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
 250             size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
 251
 252             float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
 253             vector< pair<size_t, const void *> > args;
 254             if(map1.channels() == 2)
 255             {
 256                 args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
 257                 args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
 258                 args.push_back( make_pair(sizeof(cl_mem), (void *)&map1.data));
 259                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
 260                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
 261                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.offset));
 262                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
 263                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.step));
 264                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.step));
 265                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
 266                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
 267                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
 268                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
 269                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.cols));
 270                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows));
 271                 args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
 272
 273                 if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
 274                 {
 275                     args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue));
 276                 }
 277                 else
 278                 {
 279                     args.push_back( make_pair(sizeof(cl_float4), (void *)&borderFloat));
 280                 }
 281             }
 282             if(map1.channels() == 1)
 283             {
 284                 args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
 285                 args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
 286                 args.push_back( make_pair(sizeof(cl_mem), (void *)&map1.data));
 287                 args.push_back( make_pair(sizeof(cl_mem), (void *)&map2.data));
 288                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
 289                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
 290                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.offset));
 291                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
 292                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.step));
 293                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.step));
 294                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
 295                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
 296                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
 297                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
 298                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.cols));
 299                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows));
 300                 args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
 301                 if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
 302                 {
 303                     args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue));
 304                 }
 305                 else
 306                 {
 307                     args.push_back( make_pair(sizeof(cl_float4), (void *)&borderFloat));
 308                 }
 309             }
 310             openCLExecuteKernel(clCxt, &imgproc_remap, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
 311         }
 312
 313         ////////////////////////////////////////////////////////////////////////////////////////////
 314         // resize
 315
 316         static void resize_gpu( const oclMat &src, oclMat &dst, double fx, double fy, int interpolation)
 317         {
 318             CV_Assert( (src.channels() == dst.channels()) );
 319             Context *clCxt = src.clCxt;
 320             float ifx = 1. / fx;
 321             float ify = 1. / fy;
 322             double ifx_d = 1. / fx;
 323             double ify_d = 1. / fy;
 324             int srcStep_in_pixel = src.step1() / src.oclchannels();
 325             int srcoffset_in_pixel = src.offset / src.elemSize();
 326             int dstStep_in_pixel = dst.step1() / dst.oclchannels();
 327             int dstoffset_in_pixel = dst.offset / dst.elemSize();
 328             //printf("%d %d\n",src.step1() , dst.elemSize());
 329             string kernelName;
 330             if(interpolation == INTER_LINEAR)
 331                 kernelName = "resizeLN";
 332             else if(interpolation == INTER_NEAREST)
 333                 kernelName = "resizeNN";
 334
 335             //TODO: improve this kernel
 336             size_t blkSizeX = 16, blkSizeY = 16;
 337             size_t glbSizeX;
 338             if(src.type() == CV_8UC1)
 339             {
 340                 size_t cols = (dst.cols + dst.offset % 4 + 3) / 4;
 341                 glbSizeX = cols % blkSizeX == 0 && cols != 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
 342             }
 343             else
 344             {
 345                 glbSizeX = dst.cols % blkSizeX == 0 && dst.cols != 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
 346             }
 347             size_t glbSizeY = dst.rows % blkSizeY == 0 && dst.rows != 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
 348             size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
 349             size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
 350
 351             vector< pair<size_t, const void *> > args;
 352             if(interpolation == INTER_NEAREST)
 353             {
 354                 args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
 355                 args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
 356                 args.push_back( make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel));
 357                 args.push_back( make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel));
 358                 args.push_back( make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel));
 359                 args.push_back( make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel));
 360                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
 361                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
 362                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
 363                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
 364                 if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
 365                 {
 366                     args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d));
 367                     args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d));
 368                 }
 369                 else
 370                 {
 371                     args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
 372                     args.push_back( make_pair(sizeof(cl_float), (void *)&ify));
 373                 }
 374             }
 375             else
 376             {
 377                 args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
 378                 args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
 379                 args.push_back( make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel));
 380                 args.push_back( make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel));
 381                 args.push_back( make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel));
 382                 args.push_back( make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel));
 383                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
 384                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
 385                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
 386                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
 387                 args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
 388                 args.push_back( make_pair(sizeof(cl_float), (void *)&ify));
 389             }
 390
 391             openCLExecuteKernel(clCxt, &imgproc_resize, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
 392         }
 393
 394
 395         void resize(const oclMat &src, oclMat &dst, Size dsize,
 396                     double fx, double fy, int interpolation)
 397         {
 398             CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3 || src.type() == CV_8UC4
 399                       || src.type() == CV_32FC1 || src.type() == CV_32FC3 || src.type() == CV_32FC4);
 400             CV_Assert(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST);
 401             CV_Assert( src.size().area() > 0 );
 402             CV_Assert( !(dsize == Size()) || (fx > 0 && fy > 0) );
 403
 404             if(!(dsize == Size()) && (fx > 0 && fy > 0))
 405             {
 406                 if(dsize.width != (int)(src.cols * fx) || dsize.height != (int)(src.rows * fy))
 407                 {
 408                     CV_Error(CV_StsUnmatchedSizes, "invalid dsize and fx, fy!");
 409                 }
 410             }
 411             if( dsize == Size() )
 412             {
 413                 dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
 414             }
 415             else
 416             {
 417                 fx = (double)dsize.width / src.cols;
 418                 fy = (double)dsize.height / src.rows;
 419             }
 420
 421             dst.create(dsize, src.type());
 422
 423             if( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR )
 424             {
 425                 resize_gpu( src, dst, fx, fy, interpolation);
 426                 return;
 427             }
 428             CV_Error(CV_StsUnsupportedFormat, "Non-supported interpolation method");
 429         }
 430
 431
 432         ////////////////////////////////////////////////////////////////////////
 433         // medianFilter
 434         void medianFilter(const oclMat &src, oclMat &dst, int m)
 435         {
 436             CV_Assert( m % 2 == 1 && m > 1 );
 437             CV_Assert( m <= 5 || src.depth() == CV_8U );
 438             CV_Assert( src.cols <= dst.cols && src.rows <= dst.rows );
 439
 440             if(src.data == dst.data)
 441             {
 442                 oclMat src1;
 443                 src.copyTo(src1);
 444                 return medianFilter(src1, dst, m);
 445             }
 446
 447             int srcStep = src.step1() / src.oclchannels();
 448             int dstStep = dst.step1() / dst.oclchannels();
 449             int srcOffset = src.offset / src.oclchannels() / src.elemSize1();
 450             int dstOffset = dst.offset / dst.oclchannels() / dst.elemSize1();
 451
 452             Context *clCxt = src.clCxt;
 453             string kernelName = "medianFilter";
 454
 455
 456             vector< pair<size_t, const void *> > args;
 457             args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
 458             args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
 459             args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset));
 460             args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset));
 461             args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
 462             args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
 463             args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep));
 464             args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep));
 465
 466             size_t globalThreads[3] = {(src.cols + 18) / 16 * 16, (src.rows + 15) / 16 * 16, 1};
 467             size_t localThreads[3] = {16, 16, 1};
 468
 469             if(m == 3)
 470             {
 471                 string kernelName = "medianFilter3";
 472                 openCLExecuteKernel(clCxt, &imgproc_median, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
 473             }
 474             else if(m == 5)
 475             {
 476                 string kernelName = "medianFilter5";
 477                 openCLExecuteKernel(clCxt, &imgproc_median, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
 478             }
 479             else
 480                 CV_Error(CV_StsUnsupportedFormat, "Non-supported filter length");
 481         }
 482
 483         ////////////////////////////////////////////////////////////////////////
 484         // copyMakeBorder
 485         void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int bordertype, const Scalar &scalar)
 486         {
 487             CV_Assert(top >= 0 && bottom >= 0 && left >= 0 && right >= 0);
 488             if((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi
 489             {
 490                 if(((bordertype & cv::BORDER_ISOLATED) == 0) &&
 491                         (bordertype != cv::BORDER_CONSTANT) &&
 492                         (bordertype != cv::BORDER_REPLICATE))
 493                 {
 494                     CV_Error(CV_StsBadArg, "unsupported border type");
 495                 }
 496             }
 497             bordertype &= ~cv::BORDER_ISOLATED;
 498             if((bordertype == cv::BORDER_REFLECT) || (bordertype == cv::BORDER_WRAP))
 499             {
 500                 CV_Assert((src.cols >= left) && (src.cols >= right) && (src.rows >= top) && (src.rows >= bottom));
 501             }
 502
 503             if(bordertype == cv::BORDER_REFLECT_101)
 504             {
 505                 CV_Assert((src.cols > left) && (src.cols > right) && (src.rows > top) && (src.rows > bottom));
 506             }
 507
 508             dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
 509             int srcStep = src.step1() / src.oclchannels();
 510             int dstStep = dst.step1() / dst.oclchannels();
 511             int srcOffset = src.offset / src.elemSize();
 512             int dstOffset = dst.offset / dst.elemSize();
 513             int __bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE, BORDER_REFLECT, BORDER_WRAP, BORDER_REFLECT_101};
 514             const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101"};
 515             size_t bordertype_index;
 516             for(bordertype_index = 0; bordertype_index < sizeof(__bordertype) / sizeof(int); bordertype_index++)
 517             {
 518                 if(__bordertype[bordertype_index] == bordertype)
 519                     break;
 520             }
 521             if(bordertype_index == sizeof(__bordertype) / sizeof(int))
 522             {
 523                 CV_Error(CV_StsBadArg, "unsupported border type");
 524             }
 525             string kernelName = "copymakeborder";
 526             size_t localThreads[3] = {16, 16, 1};
 527             size_t globalThreads[3] = {(dst.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0],
 528                                        (dst.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1], 1
 529                                       };
 530
 531             vector< pair<size_t, const void *> > args;
 532             args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
 533             args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
 534             args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols));
 535             args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows));
 536             args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
 537             args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
 538             args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep));
 539             args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset));
 540             args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep));
 541             args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset));
 542             args.push_back( make_pair( sizeof(cl_int), (void *)&top));
 543             args.push_back( make_pair( sizeof(cl_int), (void *)&left));
 544             char compile_option[64];
 545             union sc
 546             {
 547                 cl_uchar4 uval;
 548                 cl_char4  cval;
 549                 cl_ushort4 usval;
 550                 cl_short4 shval;
 551                 cl_int4 ival;
 552                 cl_float4 fval;
 553                 cl_double4 dval;
 554             } val;
 555             switch(dst.depth())
 556             {
 557             case CV_8U:
 558                 val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
 559                 val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
 560                 val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
 561                 val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]);
 562                 switch(dst.oclchannels())
 563                 {
 564                 case 1:
 565                     sprintf(compile_option, "-D GENTYPE=uchar -D %s", borderstr[bordertype_index]);
 566                     args.push_back( make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] ));
 567                     if(((dst.offset & 3) == 0) && ((dst.cols & 3) == 0))
 568                     {
 569                         kernelName = "copymakeborder_C1_D0";
 570                         globalThreads[0] = (dst.cols / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
 571                     }
 572                     break;
 573                 case 4:
 574                     sprintf(compile_option, "-D GENTYPE=uchar4 -D %s", borderstr[bordertype_index]);
 575                     args.push_back( make_pair( sizeof(cl_uchar4) , (void *)&val.uval ));
 576                     break;
 577                 default:
 578                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 579                 }
 580                 break;
 581             case CV_8S:
 582                 val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
 583                 val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
 584                 val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
 585                 val.cval.s[3] = saturate_cast<char>(scalar.val[3]);
 586                 switch(dst.oclchannels())
 587                 {
 588                 case 1:
 589                     sprintf(compile_option, "-D GENTYPE=char -D %s", borderstr[bordertype_index]);
 590                     args.push_back( make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] ));
 591                     break;
 592                 case 4:
 593                     sprintf(compile_option, "-D GENTYPE=char4 -D %s", borderstr[bordertype_index]);
 594                     args.push_back( make_pair( sizeof(cl_char4) , (void *)&val.cval ));
 595                     break;
 596                 default:
 597                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 598                 }
 599                 break;
 600             case CV_16U:
 601                 val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
 602                 val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
 603                 val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
 604                 val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]);
 605                 switch(dst.oclchannels())
 606                 {
 607                 case 1:
 608                     sprintf(compile_option, "-D GENTYPE=ushort -D %s", borderstr[bordertype_index]);
 609                     args.push_back( make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] ));
 610                     break;
 611                 case 4:
 612                     sprintf(compile_option, "-D GENTYPE=ushort4 -D %s", borderstr[bordertype_index]);
 613                     args.push_back( make_pair( sizeof(cl_ushort4) , (void *)&val.usval ));
 614                     break;
 615                 default:
 616                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 617                 }
 618                 break;
 619             case CV_16S:
 620                 val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
 621                 val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
 622                 val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
 623                 val.shval.s[3] = saturate_cast<short>(scalar.val[3]);
 624                 switch(dst.oclchannels())
 625                 {
 626                 case 1:
 627                     sprintf(compile_option, "-D GENTYPE=short -D %s", borderstr[bordertype_index]);
 628                     args.push_back( make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] ));
 629                     break;
 630                 case 4:
 631                     sprintf(compile_option, "-D GENTYPE=short4 -D %s", borderstr[bordertype_index]);
 632                     args.push_back( make_pair( sizeof(cl_short4) , (void *)&val.shval ));
 633                     break;
 634                 default:
 635                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 636                 }
 637                 break;
 638             case CV_32S:
 639                 val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
 640                 val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
 641                 val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
 642                 val.ival.s[3] = saturate_cast<int>(scalar.val[3]);
 643                 switch(dst.oclchannels())
 644                 {
 645                 case 1:
 646                     sprintf(compile_option, "-D GENTYPE=int -D %s", borderstr[bordertype_index]);
 647                     args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
 648                     break;
 649                 case 2:
 650                     sprintf(compile_option, "-D GENTYPE=int2 -D %s", borderstr[bordertype_index]);
 651                     cl_int2 i2val;
 652                     i2val.s[0] = val.ival.s[0];
 653                     i2val.s[1] = val.ival.s[1];
 654                     args.push_back( make_pair( sizeof(cl_int2) , (void *)&i2val ));
 655                     break;
 656                 case 4:
 657                     sprintf(compile_option, "-D GENTYPE=int4 -D %s", borderstr[bordertype_index]);
 658                     args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival ));
 659                     break;
 660                 default:
 661                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 662                 }
 663                 break;
 664             case CV_32F:
 665                 val.fval.s[0] = scalar.val[0];
 666                 val.fval.s[1] = scalar.val[1];
 667                 val.fval.s[2] = scalar.val[2];
 668                 val.fval.s[3] = scalar.val[3];
 669                 switch(dst.oclchannels())
 670                 {
 671                 case 1:
 672                     sprintf(compile_option, "-D GENTYPE=float -D %s", borderstr[bordertype_index]);
 673                     args.push_back( make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] ));
 674                     break;
 675                 case 4:
 676                     sprintf(compile_option, "-D GENTYPE=float4 -D %s", borderstr[bordertype_index]);
 677                     args.push_back( make_pair( sizeof(cl_float4) , (void *)&val.fval ));
 678                     break;
 679                 default:
 680                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 681                 }
 682                 break;
 683             case CV_64F:
 684                 val.dval.s[0] = scalar.val[0];
 685                 val.dval.s[1] = scalar.val[1];
 686                 val.dval.s[2] = scalar.val[2];
 687                 val.dval.s[3] = scalar.val[3];
 688                 switch(dst.oclchannels())
 689                 {
 690                 case 1:
 691                     sprintf(compile_option, "-D GENTYPE=double -D %s", borderstr[bordertype_index]);
 692                     args.push_back( make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] ));
 693                     break;
 694                 case 4:
 695                     sprintf(compile_option, "-D GENTYPE=double4 -D %s", borderstr[bordertype_index]);
 696                     args.push_back( make_pair( sizeof(cl_double4) , (void *)&val.dval ));
 697                     break;
 698                 default:
 699                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 700                 }
 701                 break;
 702             default:
 703                 CV_Error(CV_StsUnsupportedFormat, "unknown depth");
 704             }
 705
 706             openCLExecuteKernel(src.clCxt, &imgproc_copymakeboder, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
 707         }
 708
 709         ////////////////////////////////////////////////////////////////////////
 710         // warp
 711
 712         namespace
 713         {
 714 #define F double
 715
 716             void convert_coeffs(F *M)
 717             {
 718                 double D = M[0] * M[4] - M[1] * M[3];
 719                 D = D != 0 ? 1. / D : 0;
 720                 double A11 = M[4] * D, A22 = M[0] * D;
 721                 M[0] = A11;
 722                 M[1] *= -D;
 723                 M[3] *= -D;
 724                 M[4] = A22;
 725                 double b1 = -M[0] * M[2] - M[1] * M[5];
 726                 double b2 = -M[3] * M[2] - M[4] * M[5];
 727                 M[2] = b1;
 728                 M[5] = b2;
 729             }
 730
 731             double invert(double *M)
 732             {
 733 #define Sd(y,x) (Sd[y*3+x])
 734 #define Dd(y,x) (Dd[y*3+x])
 735 #define det3(m)    (m(0,0)*(m(1,1)*m(2,2) - m(1,2)*m(2,1)) -  \
 736                     m(0,1)*(m(1,0)*m(2,2) - m(1,2)*m(2,0)) +  \
 737                     m(0,2)*(m(1,0)*m(2,1) - m(1,1)*m(2,0)))
 738                 double *Sd = M;
 739                 double *Dd = M;
 740                 double d = det3(Sd);
 741                 double result = 0;
 742                 if( d != 0)
 743                 {
 744                     double t[9];
 745                     result = d;
 746                     d = 1. / d;
 747
 748                     t[0] = (Sd(1, 1) * Sd(2, 2) - Sd(1, 2) * Sd(2, 1)) * d;
 749                     t[1] = (Sd(0, 2) * Sd(2, 1) - Sd(0, 1) * Sd(2, 2)) * d;
 750                     t[2] = (Sd(0, 1) * Sd(1, 2) - Sd(0, 2) * Sd(1, 1)) * d;
 751
 752                     t[3] = (Sd(1, 2) * Sd(2, 0) - Sd(1, 0) * Sd(2, 2)) * d;
 753                     t[4] = (Sd(0, 0) * Sd(2, 2) - Sd(0, 2) * Sd(2, 0)) * d;
 754                     t[5] = (Sd(0, 2) * Sd(1, 0) - Sd(0, 0) * Sd(1, 2)) * d;
 755
 756                     t[6] = (Sd(1, 0) * Sd(2, 1) - Sd(1, 1) * Sd(2, 0)) * d;
 757                     t[7] = (Sd(0, 1) * Sd(2, 0) - Sd(0, 0) * Sd(2, 1)) * d;
 758                     t[8] = (Sd(0, 0) * Sd(1, 1) - Sd(0, 1) * Sd(1, 0)) * d;
 759
 760                     Dd(0, 0) = t[0];
 761                     Dd(0, 1) = t[1];
 762                     Dd(0, 2) = t[2];
 763                     Dd(1, 0) = t[3];
 764                     Dd(1, 1) = t[4];
 765                     Dd(1, 2) = t[5];
 766                     Dd(2, 0) = t[6];
 767                     Dd(2, 1) = t[7];
 768                     Dd(2, 2) = t[8];
 769                 }
 770                 return result;
 771             }
 772
 773             void warpAffine_gpu(const oclMat &src, oclMat &dst, F coeffs[2][3], int interpolation)
 774             {
 775                 CV_Assert( (src.oclchannels() == dst.oclchannels()) );
 776                 int srcStep = src.step1();
 777                 int dstStep = dst.step1();
 778                 float float_coeffs[2][3];
 779                 cl_mem coeffs_cm;
 780
 781                 Context *clCxt = src.clCxt;
 782                 string s[3] = {"NN", "Linear", "Cubic"};
 783                 string kernelName = "warpAffine" + s[interpolation];
 784
 785
 786                 if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
 787                 {
 788                     cl_int st;
 789                     coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(F) * 2 * 3, NULL, &st );
 790                     openCLVerifyCall(st);
 791                     openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(F) * 2 * 3, coeffs, 0, 0, 0));
 792                 }
 793                 else
 794                 {
 795                     cl_int st;
 796                     for(int m = 0; m < 2; m++)
 797                         for(int n = 0; n < 3; n++)
 798                         {
 799                             float_coeffs[m][n] = coeffs[m][n];
 800                         }
 801                         coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(float) * 2 * 3, NULL, &st );
 802                         openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 2 * 3, float_coeffs, 0, 0, 0));
 803
 804                 }
 805                 //TODO: improve this kernel
 806                 size_t blkSizeX = 16, blkSizeY = 16;
 807                 size_t glbSizeX;
 808                 size_t cols;
 809                 //if(src.type() == CV_8UC1 && interpolation != 2)
 810                 if(src.type() == CV_8UC1 && interpolation != 2)
 811                 {
 812                     cols = (dst.cols + dst.offset % 4 + 3) / 4;
 813                     glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
 814                 }
 815                 else
 816                 {
 817                     cols = dst.cols;
 818                     glbSizeX = dst.cols % blkSizeX == 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
 819                 }
 820                 size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
 821                 size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
 822                 size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
 823
 824                 vector< pair<size_t, const void *> > args;
 825
 826                 args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
 827                 args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
 828                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols));
 829                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows));
 830                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.cols));
 831                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.rows));
 832                 args.push_back(make_pair(sizeof(cl_int), (void *)&srcStep));
 833                 args.push_back(make_pair(sizeof(cl_int), (void *)&dstStep));
 834                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.offset));
 835                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.offset));
 836                 args.push_back(make_pair(sizeof(cl_mem), (void *)&coeffs_cm));
 837                 args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
 838
 839                 openCLExecuteKernel(clCxt, &imgproc_warpAffine, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
 840                 openCLSafeCall(clReleaseMemObject(coeffs_cm));
 841             }
 842
 843
 844             void warpPerspective_gpu(const oclMat &src, oclMat &dst, double coeffs[3][3], int interpolation)
 845             {
 846                 CV_Assert( (src.oclchannels() == dst.oclchannels()) );
 847                 int srcStep = src.step1();
 848                 int dstStep = dst.step1();
 849                 float float_coeffs[3][3];
 850                 cl_mem coeffs_cm;
 851
 852                 Context *clCxt = src.clCxt;
 853                 string s[3] = {"NN", "Linear", "Cubic"};
 854                 string kernelName = "warpPerspective" + s[interpolation];
 855
 856                 if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
 857                 {
 858                     cl_int st;
 859                     coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(double) * 3 * 3, NULL, &st );
 860                     openCLVerifyCall(st);
 861                     openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(double) * 3 * 3, coeffs, 0, 0, 0));
 862                 }
 863                 else
 864                 {
 865                     cl_int st;
 866                     for(int m = 0; m < 3; m++)
 867                         for(int n = 0; n < 3; n++)
 868                             float_coeffs[m][n] = coeffs[m][n];
 869
 870                     coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(float) * 3 * 3, NULL, &st );
 871                     openCLVerifyCall(st);
 872                     openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 3 * 3, float_coeffs, 0, 0, 0));
 873                 }
 874                 //TODO: improve this kernel
 875                 size_t blkSizeX = 16, blkSizeY = 16;
 876                 size_t glbSizeX;
 877                 size_t cols;
 878                 if(src.type() == CV_8UC1 && interpolation == 0)
 879                 {
 880                     cols = (dst.cols + dst.offset % 4 + 3) / 4;
 881                     glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
 882                 }
 883                 else
 884                     /*
 885                     */
 886                 {
 887                     cols = dst.cols;
 888                     glbSizeX = dst.cols % blkSizeX == 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
 889                 }
 890                 size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
 891                 size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
 892                 size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
 893
 894                 vector< pair<size_t, const void *> > args;
 895
 896                 args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
 897                 args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
 898                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols));
 899                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows));
 900                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.cols));
 901                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.rows));
 902                 args.push_back(make_pair(sizeof(cl_int), (void *)&srcStep));
 903                 args.push_back(make_pair(sizeof(cl_int), (void *)&dstStep));
 904                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.offset));
 905                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.offset));
 906                 args.push_back(make_pair(sizeof(cl_mem), (void *)&coeffs_cm));
 907                 args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
 908
 909                 openCLExecuteKernel(clCxt, &imgproc_warpPerspective, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
 910                 openCLSafeCall(clReleaseMemObject(coeffs_cm));
 911             }
 912         }
 913
 914         void warpAffine(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags)
 915         {
 916             int interpolation = flags & INTER_MAX;
 917
 918             CV_Assert((src.depth() == CV_8U  || src.depth() == CV_32F) && src.oclchannels() != 2 && src.oclchannels() != 3);
 919             CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
 920
 921             dst.create(dsize, src.type());
 922
 923             CV_Assert(M.rows == 2 && M.cols == 3);
 924
 925             int warpInd = (flags & WARP_INVERSE_MAP) >> 4;
 926             F coeffs[2][3];
 927
 928             double coeffsM[2*3];
 929             Mat coeffsMat(2, 3, CV_64F, (void *)coeffsM);
 930             M.convertTo(coeffsMat, coeffsMat.type());
 931             if(!warpInd)
 932             {
 933                 convert_coeffs(coeffsM);
 934             }
 935
 936             for(int i = 0; i < 2; ++i)
 937                 for(int j = 0; j < 3; ++j)
 938                     coeffs[i][j] = coeffsM[i*3+j];
 939
 940             warpAffine_gpu(src, dst, coeffs, interpolation);
 941         }
 942
 943         void warpPerspective(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags)
 944         {
 945             int interpolation = flags & INTER_MAX;
 946
 947             CV_Assert((src.depth() == CV_8U  || src.depth() == CV_32F) && src.oclchannels() != 2 && src.oclchannels() != 3);
 948             CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
 949
 950             dst.create(dsize, src.type());
 951
 952
 953             CV_Assert(M.rows == 3 && M.cols == 3);
 954
 955             int warpInd = (flags & WARP_INVERSE_MAP) >> 4;
 956             double coeffs[3][3];
 957
 958             double coeffsM[3*3];
 959             Mat coeffsMat(3, 3, CV_64F, (void *)coeffsM);
 960             M.convertTo(coeffsMat, coeffsMat.type());
 961             if(!warpInd)
 962             {
 963                 invert(coeffsM);
 964             }
 965
 966             for(int i = 0; i < 3; ++i)
 967                 for(int j = 0; j < 3; ++j)
 968                     coeffs[i][j] = coeffsM[i*3+j];
 969
 970             warpPerspective_gpu(src, dst, coeffs, interpolation);
 971         }
 972
 973         ////////////////////////////////////////////////////////////////////////
 974         // integral
 975         void integral(const oclMat &src, oclMat &sum, oclMat &sqsum)
 976         {
 977             CV_Assert(src.type() == CV_8UC1);
 978             if(!src.clCxt->supportsFeature(ocl::FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
 979             {
 980                 CV_Error(CV_OpenCLDoubleNotSupported, "select device don't support double");
 981                 return;
 982             }
 983
 984             int vlen = 4;
 985             int offset = src.offset / vlen;
 986             int pre_invalid = src.offset % vlen;
 987             int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
 988
 989             oclMat t_sum , t_sqsum;
 990             int w = src.cols + 1, h = src.rows + 1;
 991             int depth = src.depth() == CV_8U ? CV_32S : CV_64F;
 992             int type = CV_MAKE_TYPE(depth, 1);
 993
 994             t_sum.create(src.cols, src.rows, type);
 995             sum.create(h, w, type);
 996
 997             t_sqsum.create(src.cols, src.rows, CV_32FC1);
 998             sqsum.create(h, w, CV_32FC1);
 999
1000             int sum_offset = sum.offset / vlen;
1001             int sqsum_offset = sqsum.offset / vlen;
1002
1003             vector<pair<size_t , const void *> > args;
1004             args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
1005             args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
1006             args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sqsum.data ));
1007             args.push_back( make_pair( sizeof(cl_int) , (void *)&offset ));
1008             args.push_back( make_pair( sizeof(cl_int) , (void *)&pre_invalid ));
1009             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
1010             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
1011             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
1012             args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step));
1013             size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1};
1014             openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_cols", gt, lt, args, -1, depth);
1015
1016             args.clear();
1017             args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
1018             args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sqsum.data ));
1019             args.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data ));
1020             args.push_back( make_pair( sizeof(cl_mem) , (void *)&sqsum.data ));
1021             args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.rows ));
1022             args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.cols ));
1023             args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step ));
1024             args.push_back( make_pair( sizeof(cl_int) , (void *)&sum.step));
1025             args.push_back( make_pair( sizeof(cl_int) , (void *)&sqsum.step));
1026             args.push_back( make_pair( sizeof(cl_int) , (void *)&sum_offset));
1027             args.push_back( make_pair( sizeof(cl_int) , (void *)&sqsum_offset));
1028             size_t gt2[3] = {t_sum.cols  * 32, 1, 1}, lt2[3] = {256, 1, 1};
1029             openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_rows", gt2, lt2, args, -1, depth);
1030         }
1031
1032         void integral(const oclMat &src, oclMat &sum)
1033         {
1034             CV_Assert(src.type() == CV_8UC1);
1035             int vlen = 4;
1036             int offset = src.offset / vlen;
1037             int pre_invalid = src.offset % vlen;
1038             int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
1039
1040             oclMat t_sum;
1041             int w = src.cols + 1, h = src.rows + 1;
1042             int depth = src.depth() == CV_8U ? CV_32S : CV_32F;
1043             int type = CV_MAKE_TYPE(depth, 1);
1044
1045             t_sum.create(src.cols, src.rows, type);
1046             sum.create(h, w, type);
1047
1048             int sum_offset = sum.offset / vlen;
1049             vector<pair<size_t , const void *> > args;
1050             args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
1051             args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
1052             args.push_back( make_pair( sizeof(cl_int) , (void *)&offset ));
1053             args.push_back( make_pair( sizeof(cl_int) , (void *)&pre_invalid ));
1054             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
1055             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
1056             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
1057             args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step));
1058             size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1};
1059             openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_cols", gt, lt, args, -1, depth);
1060
1061             args.clear();
1062             args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
1063             args.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data ));
1064             args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.rows ));
1065             args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.cols ));
1066             args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step ));
1067             args.push_back( make_pair( sizeof(cl_int) , (void *)&sum.step));
1068             args.push_back( make_pair( sizeof(cl_int) , (void *)&sum_offset));
1069             size_t gt2[3] = {t_sum.cols  * 32, 1, 1}, lt2[3] = {256, 1, 1};
1070             openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_rows", gt2, lt2, args, -1, depth);
1071         }
1072
1073         /////////////////////// corner //////////////////////////////
1074         static void extractCovData(const oclMat &src, oclMat &Dx, oclMat &Dy,
1075                             int blockSize, int ksize, int borderType)
1076         {
1077             CV_Assert(src.type() == CV_8UC1 || src.type() == CV_32FC1);
1078             double scale = static_cast<double>(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize;
1079             if (ksize < 0)
1080                 scale *= 2.;
1081
1082             if (src.depth() == CV_8U)
1083             {
1084                 scale *= 255.;
1085                 scale = 1. / scale;
1086             }
1087             else
1088             {
1089                 scale = 1. / scale;
1090             }
1091             if (ksize > 0)
1092             {
1093                 Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType);
1094                 Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType);
1095             }
1096             else
1097             {
1098                 Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType);
1099                 Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType);
1100             }
1101             CV_Assert(Dx.offset == 0 && Dy.offset == 0);
1102         }
1103
1104         static void corner_ocl(const cv::ocl::ProgramEntry* source, string kernelName, int block_size, float k, oclMat &Dx, oclMat &Dy,
1105                         oclMat &dst, int border_type)
1106         {
1107             char borderType[30];
1108             switch (border_type)
1109             {
1110             case cv::BORDER_CONSTANT:
1111                 sprintf(borderType, "BORDER_CONSTANT");
1112                 break;
1113             case cv::BORDER_REFLECT101:
1114                 sprintf(borderType, "BORDER_REFLECT101");
1115                 break;
1116             case cv::BORDER_REFLECT:
1117                 sprintf(borderType, "BORDER_REFLECT");
1118                 break;
1119             case cv::BORDER_REPLICATE:
1120                 sprintf(borderType, "BORDER_REPLICATE");
1121                 break;
1122             default:
1123                 cout << "BORDER type is not supported!" << endl;
1124             }
1125             char build_options[150];
1126             sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s",
1127                     block_size / 2, block_size / 2, block_size, block_size, borderType);
1128
1129             size_t blockSizeX = 256, blockSizeY = 1;
1130             size_t gSize = blockSizeX - block_size / 2 * 2;
1131             size_t globalSizeX = (Dx.cols) % gSize == 0 ? Dx.cols / gSize * blockSizeX : (Dx.cols / gSize + 1) * blockSizeX;
1132             size_t rows_per_thread = 2;
1133             size_t globalSizeY = ((Dx.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ?
1134                                  ((Dx.rows + rows_per_thread - 1) / rows_per_thread) :
1135                                  (((Dx.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY;
1136
1137             size_t gt[3] = { globalSizeX, globalSizeY, 1 };
1138             size_t lt[3]  = { blockSizeX, blockSizeY, 1 };
1139             vector<pair<size_t , const void *> > args;
1140             args.push_back( make_pair( sizeof(cl_mem) , (void *)&Dx.data ));
1141             args.push_back( make_pair( sizeof(cl_mem) , (void *)&Dy.data));
1142             args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
1143             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dx.offset ));
1144             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dx.wholerows ));
1145             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dx.wholecols ));
1146             args.push_back( make_pair(sizeof(cl_int), (void *)&Dx.step));
1147             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dy.offset ));
1148             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dy.wholerows ));
1149             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dy.wholecols ));
1150             args.push_back( make_pair(sizeof(cl_int), (void *)&Dy.step));
1151             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
1152             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
1153             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
1154             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
1155             args.push_back( make_pair( sizeof(cl_float) , (void *)&k));
1156             openCLExecuteKernel(dst.clCxt, source, kernelName, gt, lt, args, -1, -1, build_options);
1157         }
1158
1159         void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize,
1160                           double k, int borderType)
1161         {
1162             oclMat dx, dy;
1163             cornerHarris_dxdy(src, dst, dx, dy, blockSize, ksize, k, borderType);
1164         }
1165
1166         void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize,
1167                           double k, int borderType)
1168         {
1169             if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
1170             {
1171                 CV_Error(CV_OpenCLDoubleNotSupported, "select device don't support double");
1172             }
1173             CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
1174             CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
1175             extractCovData(src, dx, dy, blockSize, ksize, borderType);
1176             dst.create(src.size(), CV_32F);
1177             corner_ocl(&imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), dx, dy, dst, borderType);
1178         }
1179
1180         void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
1181         {
1182             oclMat dx, dy;
1183             cornerMinEigenVal_dxdy(src, dst, dx, dy, blockSize, ksize, borderType);
1184         }
1185
1186         void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize, int borderType)
1187         {
1188             if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
1189             {
1190                 CV_Error(CV_OpenCLDoubleNotSupported, "select device don't support double");
1191             }
1192             CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
1193             CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
1194             extractCovData(src, dx, dy, blockSize, ksize, borderType);
1195             dst.create(src.size(), CV_32F);
1196             corner_ocl(&imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, dx, dy, dst, borderType);
1197         }
1198         /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
1199         static void meanShiftFiltering_gpu(const oclMat &src, oclMat dst, int sp, int sr, int maxIter, float eps)
1200         {
1201             CV_Assert( (src.cols == dst.cols) && (src.rows == dst.rows) );
1202             CV_Assert( !(dst.step & 0x3) );
1203             Context *clCxt = src.clCxt;
1204
1205             //Arrange the NDRange
1206             int col = src.cols, row = src.rows;
1207             int ltx = 16, lty = 8;
1208             if(src.cols % ltx != 0)
1209                 col = (col / ltx + 1) * ltx;
1210             if(src.rows % lty != 0)
1211                 row = (row / lty + 1) * lty;
1212
1213             size_t globalThreads[3] = {col, row, 1};
1214             size_t localThreads[3]  = {ltx, lty, 1};
1215
1216             //set args
1217             vector<pair<size_t , const void *> > args;
1218             args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
1219             args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.step ));
1220             args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
1221             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
1222             args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.offset ));
1223             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.offset ));
1224             args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols ));
1225             args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows ));
1226             args.push_back( make_pair( sizeof(cl_int) , (void *)&sp ));
1227             args.push_back( make_pair( sizeof(cl_int) , (void *)&sr ));
1228             args.push_back( make_pair( sizeof(cl_int) , (void *)&maxIter ));
1229             args.push_back( make_pair( sizeof(cl_float) , (void *)&eps ));
1230             openCLExecuteKernel(clCxt, &meanShift, "meanshift_kernel", globalThreads, localThreads, args, -1, -1);
1231         }
1232
1233         void meanShiftFiltering(const oclMat &src, oclMat &dst, int sp, int sr, TermCriteria criteria)
1234         {
1235             if( src.empty() )
1236                 CV_Error( CV_StsBadArg, "The input image is empty" );
1237
1238             if( src.depth() != CV_8U || src.oclchannels() != 4 )
1239                 CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
1240
1241             dst.create( src.size(), CV_8UC4 );
1242
1243             if( !(criteria.type & TermCriteria::MAX_ITER) )
1244                 criteria.maxCount = 5;
1245
1246             int maxIter = std::min(std::max(criteria.maxCount, 1), 100);
1247
1248             float eps;
1249             if( !(criteria.type & TermCriteria::EPS) )
1250                 eps = 1.f;
1251             eps = (float)std::max(criteria.epsilon, 0.0);
1252
1253             meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps);
1254
1255         }
1256
1257         static void meanShiftProc_gpu(const oclMat &src, oclMat dstr, oclMat dstsp, int sp, int sr, int maxIter, float eps)
1258         {
1259             //sanity checks
1260             CV_Assert( (src.cols == dstr.cols) && (src.rows == dstr.rows) &&
1261                        (src.rows == dstsp.rows) && (src.cols == dstsp.cols));
1262             CV_Assert( !(dstsp.step & 0x3) );
1263             Context *clCxt = src.clCxt;
1264
1265             //Arrange the NDRange
1266             int col = src.cols, row = src.rows;
1267             int ltx = 16, lty = 8;
1268             if(src.cols % ltx != 0)
1269                 col = (col / ltx + 1) * ltx;
1270             if(src.rows % lty != 0)
1271                 row = (row / lty + 1) * lty;
1272
1273             size_t globalThreads[3] = {col, row, 1};
1274             size_t localThreads[3]  = {ltx, lty, 1};
1275
1276             //set args
1277             vector<pair<size_t , const void *> > args;
1278             args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
1279             args.push_back( make_pair( sizeof(cl_mem) , (void *)&dstr.data ));
1280             args.push_back( make_pair( sizeof(cl_mem) , (void *)&dstsp.data ));
1281             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
1282             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstr.step ));
1283             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstsp.step ));
1284             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.offset ));
1285             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstr.offset ));
1286             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstsp.offset ));
1287             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstr.cols ));
1288             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstr.rows ));
1289             args.push_back( make_pair( sizeof(cl_int) , (void *)&sp ));
1290             args.push_back( make_pair( sizeof(cl_int) , (void *)&sr ));
1291             args.push_back( make_pair( sizeof(cl_int) , (void *)&maxIter ));
1292             args.push_back( make_pair( sizeof(cl_float) , (void *)&eps ));
1293             openCLExecuteKernel(clCxt, &meanShift, "meanshiftproc_kernel", globalThreads, localThreads, args, -1, -1);
1294         }
1295
1296         void meanShiftProc(const oclMat &src, oclMat &dstr, oclMat &dstsp, int sp, int sr, TermCriteria criteria)
1297         {
1298             if( src.empty() )
1299                 CV_Error( CV_StsBadArg, "The input image is empty" );
1300
1301             if( src.depth() != CV_8U || src.oclchannels() != 4 )
1302                 CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
1303
1304 //            if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
1305 //            {
1306 //                CV_Error( CV_OpenCLDoubleNotSupportedNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
1307 //                return;
1308 //            }
1309
1310             dstr.create( src.size(), CV_8UC4 );
1311             dstsp.create( src.size(), CV_16SC2 );
1312
1313             if( !(criteria.type & TermCriteria::MAX_ITER) )
1314                 criteria.maxCount = 5;
1315
1316             int maxIter = std::min(std::max(criteria.maxCount, 1), 100);
1317
1318             float eps;
1319             if( !(criteria.type & TermCriteria::EPS) )
1320                 eps = 1.f;
1321             eps = (float)std::max(criteria.epsilon, 0.0);
1322
1323             meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps);
1324         }
1325
1326         ///////////////////////////////////////////////////////////////////////////////////////////////////
1327         ////////////////////////////////////////////////////hist///////////////////////////////////////////////
1328         /////////////////////////////////////////////////////////////////////////////////////////////////////
// Sizes shared by the 256-bin histogram helpers below.
namespace histograms
{
    // Bins per histogram; calc_sub_hist also uses it as its work-group width.
    const int HISTOGRAM256_BIN_COUNT = 256;
    // Work-groups launched by calc_sub_hist; calcHist allocates one buffer row per group.
    const int PARTIAL_HISTOGRAM256_COUNT = 256;
}
1334         ///////////////////////////////calcHist/////////////////////////////////////////////////////////////////
1335         static void calc_sub_hist(const oclMat &mat_src, const oclMat &mat_sub_hist)
1336         {
1337             using namespace histograms;
1338
1339             Context  *clCxt = mat_src.clCxt;
1340             int depth = mat_src.depth();
1341
1342             string kernelName = "calc_sub_hist";
1343
1344             size_t localThreads[3]  = { HISTOGRAM256_BIN_COUNT, 1, 1 };
1345             size_t globalThreads[3] = { PARTIAL_HISTOGRAM256_COUNT *localThreads[0], 1, 1};
1346
1347             int dataWidth = 16;
1348             int dataWidth_bits = 4;
1349             int mask = dataWidth - 1;
1350
1351             int cols = mat_src.cols * mat_src.oclchannels();
1352             int src_offset = mat_src.offset;
1353             int hist_step = mat_sub_hist.step >> 2;
1354             int left_col = 0, right_col = 0;
1355
1356             if(cols >= dataWidth * 2 - 1)
1357             {
1358                 left_col = dataWidth - (src_offset & mask);
1359                 left_col &= mask;
1360                 src_offset += left_col;
1361                 cols -= left_col;
1362                 right_col = cols & mask;
1363                 cols -= right_col;
1364             }
1365             else
1366             {
1367                 left_col = cols;
1368                 right_col = 0;
1369                 cols = 0;
1370                 globalThreads[0] = 0;
1371             }
1372
1373             vector<pair<size_t , const void *> > args;
1374             if(globalThreads[0] != 0)
1375             {
1376                 int tempcols = cols >> dataWidth_bits;
1377                 int inc_x = globalThreads[0] % tempcols;
1378                 int inc_y = globalThreads[0] / tempcols;
1379                 src_offset >>= dataWidth_bits;
1380                 int src_step = mat_src.step >> dataWidth_bits;
1381                 int datacount = tempcols * mat_src.rows;
1382                 args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src.data));
1383                 args.push_back( make_pair( sizeof(cl_int), (void *)&src_step));
1384                 args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset));
1385                 args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_sub_hist.data));
1386                 args.push_back( make_pair( sizeof(cl_int), (void *)&datacount));
1387                 args.push_back( make_pair( sizeof(cl_int), (void *)&tempcols));
1388                 args.push_back( make_pair( sizeof(cl_int), (void *)&inc_x));
1389                 args.push_back( make_pair( sizeof(cl_int), (void *)&inc_y));
1390                 args.push_back( make_pair( sizeof(cl_int), (void *)&hist_step));
1391                 openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, depth);
1392             }
1393             if(left_col != 0 || right_col != 0)
1394             {
1395                 kernelName = "calc_sub_hist_border";
1396                 src_offset = mat_src.offset;
1397                 localThreads[0] = 1;
1398                 localThreads[1] = 256;
1399                 globalThreads[0] = left_col + right_col;
1400                 globalThreads[1] = (mat_src.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
1401
1402                 args.clear();
1403                 args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src.data));
1404                 args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.step));
1405                 args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset));
1406                 args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_sub_hist.data));
1407                 args.push_back( make_pair( sizeof(cl_int), (void *)&left_col));
1408                 args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
1409                 args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.rows));
1410                 args.push_back( make_pair( sizeof(cl_int), (void *)&hist_step));
1411                 openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, depth);
1412             }
1413         }
1414         static void merge_sub_hist(const oclMat &sub_hist, oclMat &mat_hist)
1415         {
1416             using namespace histograms;
1417
1418             Context  *clCxt = sub_hist.clCxt;
1419             string kernelName = "merge_hist";
1420
1421             size_t localThreads[3]  = { 256, 1, 1 };
1422             size_t globalThreads[3] = { HISTOGRAM256_BIN_COUNT *localThreads[0], 1, 1};
1423             int src_step = sub_hist.step >> 2;
1424             vector<pair<size_t , const void *> > args;
1425             args.push_back( make_pair( sizeof(cl_mem), (void *)&sub_hist.data));
1426             args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_hist.data));
1427             args.push_back( make_pair( sizeof(cl_int), (void *)&src_step));
1428             openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, -1);
1429         }
1430         void calcHist(const oclMat &mat_src, oclMat &mat_hist)
1431         {
1432             using namespace histograms;
1433             CV_Assert(mat_src.type() == CV_8UC1);
1434             mat_hist.create(1, 256, CV_32SC1);
1435
1436             oclMat buf(PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_BIN_COUNT, CV_32SC1);
1437             buf.setTo(0);
1438
1439             calc_sub_hist(mat_src, buf);
1440             merge_sub_hist(buf, mat_hist);
1441         }
1442         ///////////////////////////////////equalizeHist/////////////////////////////////////////////////////
1443         void equalizeHist(const oclMat &mat_src, oclMat &mat_dst)
1444         {
1445             mat_dst.create(mat_src.rows, mat_src.cols, CV_8UC1);
1446
1447             oclMat mat_hist(1, 256, CV_32SC1);
1448
1449             calcHist(mat_src, mat_hist);
1450
1451             Context *clCxt = mat_src.clCxt;
1452             string kernelName = "calLUT";
1453             size_t localThreads[3] = { 256, 1, 1};
1454             size_t globalThreads[3] = { 256, 1, 1};
1455             oclMat lut(1, 256, CV_8UC1);
1456             vector<pair<size_t , const void *> > args;
1457             int total = mat_src.rows * mat_src.cols;
1458             args.push_back( make_pair( sizeof(cl_mem), (void *)&lut.data));
1459             args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_hist.data));
1460             args.push_back( make_pair( sizeof(int), (void *)&total));
1461             openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, -1);
1462             LUT(mat_src, lut, mat_dst);
1463         }
1464
1465         ////////////////////////////////////////////////////////////////////////
1466         // CLAHE
1467         namespace clahe
1468         {
1469             static void calcLut(const oclMat &src, oclMat &dst,
1470                 const int tilesX, const int tilesY, const cv::Size tileSize,
1471                 const int clipLimit, const float lutScale)
1472             {
1473                 cl_int2 tile_size;
1474                 tile_size.s[0] = tileSize.width;
1475                 tile_size.s[1] = tileSize.height;
1476
1477                 std::vector<pair<size_t , const void *> > args;
1478                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
1479                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
1480                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
1481                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
1482                 args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size ));
1483                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX ));
1484                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&clipLimit ));
1485                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&lutScale ));
1486
1487                 String kernelName = "calcLut";
1488                 size_t localThreads[3]  = { 32, 8, 1 };
1489                 size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 };
1490                 bool is_cpu = isCpuDevice();
1491                 if (is_cpu)
1492                     openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, (char*)" -D CPU");
1493                 else
1494                 {
1495                     cl_kernel kernel = openCLGetKernelFromSource(Context::getContext(), &imgproc_clahe, kernelName);
1496                     size_t wave_size = queryWaveFrontSize(kernel);
1497                     openCLSafeCall(clReleaseKernel(kernel));
1498
1499                     static char opt[20] = {0};
1500                     sprintf(opt, " -D WAVE_SIZE=%d", (int)wave_size);
1501                     openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, opt);
1502                 }
1503             }
1504
1505             static void transform(const oclMat &src, oclMat &dst, const oclMat &lut,
1506                 const int tilesX, const int tilesY, const cv::Size tileSize)
1507             {
1508                 cl_int2 tile_size;
1509                 tile_size.s[0] = tileSize.width;
1510                 tile_size.s[1] = tileSize.height;
1511
1512                 std::vector<pair<size_t , const void *> > args;
1513                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
1514                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
1515                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&lut.data ));
1516                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
1517                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
1518                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&lut.step ));
1519                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols ));
1520                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
1521                 args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size ));
1522                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX ));
1523                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesY ));
1524
1525                 String kernelName = "transform";
1526                 size_t localThreads[3]  = { 32, 8, 1 };
1527                 size_t globalThreads[3] = { src.cols, src.rows, 1 };
1528
1529                 openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1);
1530             }
1531         }
1532
1533         namespace
1534         {
1535             class CLAHE_Impl : public cv::CLAHE
1536             {
1537             public:
1538                 CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8);
1539
1540                 cv::AlgorithmInfo* info() const;
1541
1542                 void apply(cv::InputArray src, cv::OutputArray dst);
1543
1544                 void setClipLimit(double clipLimit);
1545                 double getClipLimit() const;
1546
1547                 void setTilesGridSize(cv::Size tileGridSize);
1548                 cv::Size getTilesGridSize() const;
1549
1550                 void collectGarbage();
1551
1552             private:
1553                 double clipLimit_;
1554                 int tilesX_;
1555                 int tilesY_;
1556
1557                 oclMat srcExt_;
1558                 oclMat lut_;
1559             };
1560             CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
1561             clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY)
1562             {
1563             }
1564
1565             CV_INIT_ALGORITHM(CLAHE_Impl, "CLAHE_OCL",
1566                 obj.info()->addParam(obj, "clipLimit", obj.clipLimit_);
1567                 obj.info()->addParam(obj, "tilesX", obj.tilesX_);
1568                 obj.info()->addParam(obj, "tilesY", obj.tilesY_))
1569             void CLAHE_Impl::apply(cv::InputArray src_raw, cv::OutputArray dst_raw)
1570             {
1571                 oclMat& src = getOclMatRef(src_raw);
1572                 oclMat& dst = getOclMatRef(dst_raw);
1573                 CV_Assert( src.type() == CV_8UC1 );
1574
1575                 dst.create( src.size(), src.type() );
1576
1577                 const int histSize = 256;
1578
1579                 ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_8UC1, lut_);
1580
1581                 cv::Size tileSize;
1582                 oclMat srcForLut;
1583
1584                 if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
1585                 {
1586                     tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
1587                     srcForLut = src;
1588                 }
1589                 else
1590                 {
1591                     cv::ocl::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101, cv::Scalar());
1592
1593                     tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
1594                     srcForLut = srcExt_;
1595                 }
1596
1597                 const int tileSizeTotal = tileSize.area();
1598                 const float lutScale = static_cast<float>(histSize - 1) / tileSizeTotal;
1599
1600                 int clipLimit = 0;
1601                 if (clipLimit_ > 0.0)
1602                 {
1603                     clipLimit = static_cast<int>(clipLimit_ * tileSizeTotal / histSize);
1604                     clipLimit = std::max(clipLimit, 1);
1605                 }
1606
1607                 clahe::calcLut(srcForLut, lut_, tilesX_, tilesY_, tileSize, clipLimit, lutScale);
1608                 //finish();
1609                 clahe::transform(src, dst, lut_, tilesX_, tilesY_, tileSize);
1610             }
1611
1612             void CLAHE_Impl::setClipLimit(double clipLimit)
1613             {
1614                 clipLimit_ = clipLimit;
1615             }
1616
1617             double CLAHE_Impl::getClipLimit() const
1618             {
1619                 return clipLimit_;
1620             }
1621
1622             void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize)
1623             {
1624                 tilesX_ = tileGridSize.width;
1625                 tilesY_ = tileGridSize.height;
1626             }
1627
1628             cv::Size CLAHE_Impl::getTilesGridSize() const
1629             {
1630                 return cv::Size(tilesX_, tilesY_);
1631             }
1632
1633             void CLAHE_Impl::collectGarbage()
1634             {
1635                 srcExt_.release();
1636                 lut_.release();
1637             }
1638         }
1639
1640         cv::Ptr<cv::CLAHE> createCLAHE(double clipLimit, cv::Size tileGridSize)
1641         {
1642             return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
1643         }
1644
1645         //////////////////////////////////bilateralFilter////////////////////////////////////////////////////
1646         static void
1647         oclbilateralFilter_8u( const oclMat &src, oclMat &dst, int d,
1648                                double sigma_color, double sigma_space,
1649                                int borderType )
1650         {
1651             int cn = src.channels();
1652             int i, j, maxk, radius;
1653
1654             CV_Assert( (src.channels() == 1 || src.channels() == 3) &&
1655                        src.type() == dst.type() && src.size() == dst.size() &&
1656                        src.data != dst.data );
1657
1658             if( sigma_color <= 0 )
1659                 sigma_color = 1;
1660             if( sigma_space <= 0 )
1661                 sigma_space = 1;
1662
1663             double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
1664             double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);
1665
1666             if( d <= 0 )
1667                 radius = cvRound(sigma_space * 1.5);
1668             else
1669                 radius = d / 2;
1670             radius = MAX(radius, 1);
1671             d = radius * 2 + 1;
1672
1673             oclMat temp;
1674             copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
1675
1676             vector<float> _color_weight(cn * 256);
1677             vector<float> _space_weight(d * d);
1678             vector<int> _space_ofs(d * d);
1679             float *color_weight = &_color_weight[0];
1680             float *space_weight = &_space_weight[0];
1681             int *space_ofs = &_space_ofs[0];
1682             int dst_step_in_pixel = dst.step / dst.elemSize();
1683             int dst_offset_in_pixel = dst.offset / dst.elemSize();
1684             int temp_step_in_pixel = temp.step / temp.elemSize();
1685             // initialize color-related bilateral filter coefficients
1686             for( i = 0; i < 256 * cn; i++ )
1687                 color_weight[i] = (float)std::exp(i * i * gauss_color_coeff);
1688
1689             // initialize space-related bilateral filter coefficients
1690             for( i = -radius, maxk = 0; i <= radius; i++ )
1691                 for( j = -radius; j <= radius; j++ )
1692                 {
1693                     double r = std::sqrt((double)i * i + (double)j * j);
1694                     if( r > radius )
1695                         continue;
1696                     space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
1697                     space_ofs[maxk++] = (int)(i * temp_step_in_pixel + j);
1698                 }
1699             oclMat oclcolor_weight(1, cn * 256, CV_32FC1, color_weight);
1700             oclMat oclspace_weight(1, d * d, CV_32FC1, space_weight);
1701             oclMat oclspace_ofs(1, d * d, CV_32SC1, space_ofs);
1702
1703             string kernelName = "bilateral";
1704             size_t localThreads[3]  = { 16, 16, 1 };
1705             size_t globalThreads[3] = { (dst.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0],
1706                                         (dst.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1],
1707                                         1
1708                                       };
1709             if((dst.type() == CV_8UC1) && ((dst.offset & 3) == 0) && ((dst.cols & 3) == 0))
1710             {
1711                 kernelName = "bilateral2";
1712                 globalThreads[0] = (dst.cols / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
1713             }
1714             vector<pair<size_t , const void *> > args;
1715             args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
1716             args.push_back( make_pair( sizeof(cl_mem), (void *)&temp.data ));
1717             args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows ));
1718             args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols ));
1719             args.push_back( make_pair( sizeof(cl_int), (void *)&maxk ));
1720             args.push_back( make_pair( sizeof(cl_int), (void *)&radius ));
1721             args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step_in_pixel ));
1722             args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset_in_pixel ));
1723             args.push_back( make_pair( sizeof(cl_int), (void *)&temp_step_in_pixel ));
1724             args.push_back( make_pair( sizeof(cl_int), (void *)&temp.rows ));
1725             args.push_back( make_pair( sizeof(cl_int), (void *)&temp.cols ));
1726             args.push_back( make_pair( sizeof(cl_mem), (void *)&oclcolor_weight.data ));
1727             args.push_back( make_pair( sizeof(cl_mem), (void *)&oclspace_weight.data ));
1728             args.push_back( make_pair( sizeof(cl_mem), (void *)&oclspace_ofs.data ));
1729             openCLExecuteKernel(src.clCxt, &imgproc_bilateral, kernelName, globalThreads, localThreads, args, dst.oclchannels(), dst.depth());
1730         }
1731         void bilateralFilter(const oclMat &src, oclMat &dst, int radius, double sigmaclr, double sigmaspc, int borderType)
1732         {
1733
1734             dst.create( src.size(), src.type() );
1735             if( src.depth() == CV_8U )
1736                 oclbilateralFilter_8u( src, dst, radius, sigmaclr, sigmaspc, borderType );
1737             else
1738                 CV_Error( CV_StsUnsupportedFormat,
1739                           "Bilateral filtering is only implemented for 8uimages" );
1740         }
1741
1742     }
1743 }
1744 //////////////////////////////////convolve////////////////////////////////////////////////////
1745
1746 static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, string kernelName, const cv::ocl::ProgramEntry* source)
1747 {
1748     CV_Assert(src.depth() == CV_32FC1);
1749     CV_Assert(temp1.depth() == CV_32F);
1750     CV_Assert(temp1.cols <= 17 && temp1.rows <= 17);
1751
1752     dst.create(src.size(), src.type());
1753
1754     CV_Assert(src.cols == dst.cols && src.rows == dst.rows);
1755     CV_Assert(src.type() == dst.type());
1756
1757     Context  *clCxt = src.clCxt;
1758     int channels = dst.oclchannels();
1759     int depth = dst.depth();
1760
1761     size_t vector_length = 1;
1762     int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1);
1763     int cols = divUp(dst.cols * channels + offset_cols, vector_length);
1764     int rows = dst.rows;
1765
1766     size_t localThreads[3]  = { 16, 16, 1 };
1767     size_t globalThreads[3] = { cols, rows, 1 };
1768
1769     vector<pair<size_t , const void *> > args;
1770     args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data ));
1771     args.push_back( make_pair( sizeof(cl_mem), (void *)&temp1.data ));
1772     args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
1773     args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows ));
1774     args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
1775     args.push_back( make_pair( sizeof(cl_int), (void *)&src.step ));
1776     args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
1777     args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.step ));
1778     args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.rows ));
1779     args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.cols ));
1780
1781     openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth);
1782 }
1783 void cv::ocl::convolve(const oclMat &x, const oclMat &t, oclMat &y)
1784 {
1785     CV_Assert(x.depth() == CV_32F);
1786     CV_Assert(t.depth() == CV_32F);
1787     CV_Assert(x.type() == y.type() && x.size() == y.size());
1788     y.create(x.size(), x.type());
1789     string kernelName = "convolve";
1790
1791     convolve_run(x, t, y, kernelName, &imgproc_convolve);
1792 }