modules/ocl/src/imgproc.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
  16 // Third party copyrights are property of their respective owners.
  17 //
  18 // @Authors
  19 //    Niko Li, newlife20080214@gmail.com
  20 //    Jia Haipeng, jiahaipeng95@gmail.com
  21 //    Shengen Yan, yanshengen@gmail.com
  22 //    Rock Li, Rock.Li@amd.com
  23 //    Zero Lin, Zero.Lin@amd.com
  24 //    Zhang Ying, zhangying913@gmail.com
  25 //    Xu Pang, pangxu010@163.com
  26 //    Wu Zailong, bullet@yeah.net
  27 //    Wenju He, wenju@multicorewareinc.com
  28 //    Sen Liu, swjtuls1987@126.com
  29 //
  30 // Redistribution and use in source and binary forms, with or without modification,
  31 // are permitted provided that the following conditions are met:
  32 //
  33 //   * Redistribution's of source code must retain the above copyright notice,
  34 //     this list of conditions and the following disclaimer.
  35 //
  36 //   * Redistribution's in binary form must reproduce the above copyright notice,
  37 //     this list of conditions and the following disclaimer in the documentation
   38 //     and/or other materials provided with the distribution.
  39 //
  40 //   * The name of the copyright holders may not be used to endorse or promote products
  41 //     derived from this software without specific prior written permission.
  42 //
  43 // This software is provided by the copyright holders and contributors "as is" and
  44 // any express or implied warranties, including, but not limited to, the implied
  45 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  46 // In no event shall the Intel Corporation or contributors be liable for any direct,
  47 // indirect, incidental, special, exemplary, or consequential damages
  48 // (including, but not limited to, procurement of substitute goods or services;
  49 // loss of use, data, or profits; or business interruption) however caused
  50 // and on any theory of liability, whether in contract, strict liability,
  51 // or tort (including negligence or otherwise) arising in any way out of
  52 // the use of this software, even if advised of the possibility of such damage.
  53 //
  54 //M*/
  55
  56 #include "precomp.hpp"
  57 #include <iomanip>
  58
  59 using namespace cv;
  60 using namespace cv::ocl;
  61 using namespace std;
  62
  63 namespace cv
  64 {
  65     namespace ocl
  66     {
  67
  68         ////////////////////////////////////OpenCL kernel strings//////////////////////////
  69         extern const char *meanShift;
  70         extern const char *imgproc_copymakeboder;
  71         extern const char *imgproc_median;
  72         extern const char *imgproc_threshold;
  73         extern const char *imgproc_resize;
  74         extern const char *imgproc_remap;
  75         extern const char *imgproc_warpAffine;
  76         extern const char *imgproc_warpPerspective;
  77         extern const char *imgproc_integral_sum;
  78         extern const char *imgproc_integral;
  79         extern const char *imgproc_histogram;
  80         extern const char *imgproc_bilateral;
  81         extern const char *imgproc_calcHarris;
  82         extern const char *imgproc_calcMinEigenVal;
  83         extern const char *imgproc_convolve;
  84         extern const char *imgproc_clahe;
  85         ////////////////////////////////////OpenCL call wrappers////////////////////////////
  86
  87         template <typename T> struct index_and_sizeof;
  88         template <> struct index_and_sizeof<char>
  89         {
  90             enum { index = 1 };
  91         };
  92         template <> struct index_and_sizeof<unsigned char>
  93         {
  94             enum { index = 2 };
  95         };
  96         template <> struct index_and_sizeof<short>
  97         {
  98             enum { index = 3 };
  99         };
 100         template <> struct index_and_sizeof<unsigned short>
 101         {
 102             enum { index = 4 };
 103         };
 104         template <> struct index_and_sizeof<int>
 105         {
 106             enum { index = 5 };
 107         };
 108         template <> struct index_and_sizeof<float>
 109         {
 110             enum { index = 6 };
 111         };
 112         template <> struct index_and_sizeof<double>
 113         {
 114             enum { index = 7 };
 115         };
 116
 117         /////////////////////////////////////////////////////////////////////////////////////
 118         // threshold
 119
 120         typedef void (*gpuThresh_t)(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type);
 121
 122         static void threshold_8u(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
 123         {
 124             CV_Assert( (src.cols == dst.cols) && (src.rows == dst.rows) );
 125             Context *clCxt = src.clCxt;
 126
 127             uchar thresh_uchar = cvFloor(thresh);
 128             uchar max_val = cvRound(maxVal);
 129             string kernelName = "threshold";
 130
 131             size_t cols = (dst.cols + (dst.offset % 16) + 15) / 16;
 132             size_t bSizeX = 16, bSizeY = 16;
 133             size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX;
 134             size_t gSizeY = dst.rows;
 135             size_t globalThreads[3] = {gSizeX, gSizeY, 1};
 136             size_t localThreads[3] = {bSizeX, bSizeY, 1};
 137
 138             vector< pair<size_t, const void *> > args;
 139             args.push_back( make_pair(sizeof(cl_mem), &src.data));
 140             args.push_back( make_pair(sizeof(cl_mem), &dst.data));
 141             args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
 142             args.push_back( make_pair(sizeof(cl_int), (void *)&src.step));
 143             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
 144             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
 145             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
 146             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
 147             args.push_back( make_pair(sizeof(cl_uchar), (void *)&thresh_uchar));
 148             args.push_back( make_pair(sizeof(cl_uchar), (void *)&max_val));
 149             args.push_back( make_pair(sizeof(cl_int), (void *)&type));
 150             openCLExecuteKernel(clCxt, &imgproc_threshold, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
 151         }
 152
 153         static void threshold_32f(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
 154         {
 155             CV_Assert( (src.cols == dst.cols) && (src.rows == dst.rows) );
 156             Context *clCxt = src.clCxt;
 157
 158             float thresh_f = thresh;
 159             float max_val = maxVal;
 160             int dst_offset = (dst.offset >> 2);
 161             int dst_step = (dst.step >> 2);
 162             int src_offset = (src.offset >> 2);
 163             int src_step = (src.step >> 2);
 164
 165             string kernelName = "threshold";
 166
 167             size_t cols = (dst.cols + (dst_offset & 3) + 3) / 4;
 168             //size_t cols = dst.cols;
 169             size_t bSizeX = 16, bSizeY = 16;
 170             size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX;
 171             size_t gSizeY = dst.rows;
 172             size_t globalThreads[3] = {gSizeX, gSizeY, 1};
 173             size_t localThreads[3] = {bSizeX, bSizeY, 1};
 174
 175             vector< pair<size_t, const void *> > args;
 176             args.push_back( make_pair(sizeof(cl_mem), &src.data));
 177             args.push_back( make_pair(sizeof(cl_mem), &dst.data));
 178             args.push_back( make_pair(sizeof(cl_int), (void *)&src_offset));
 179             args.push_back( make_pair(sizeof(cl_int), (void *)&src_step));
 180             args.push_back( make_pair(sizeof(cl_int), (void *)&dst_offset));
 181             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
 182             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
 183             args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step));
 184             args.push_back( make_pair(sizeof(cl_float), (void *)&thresh_f));
 185             args.push_back( make_pair(sizeof(cl_float), (void *)&max_val));
 186             args.push_back( make_pair(sizeof(cl_int), (void *)&type));
 187             openCLExecuteKernel(clCxt, &imgproc_threshold, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
 188
 189         }
 190
 191         //threshold: support 8UC1 and 32FC1 data type and five threshold type
 192         double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
 193         {
 194             //TODO: These limitations shall be removed later.
 195             CV_Assert(src.type() == CV_8UC1 || src.type() == CV_32FC1);
 196             CV_Assert(type == THRESH_BINARY || type == THRESH_BINARY_INV || type == THRESH_TRUNC
 197                       || type == THRESH_TOZERO || type == THRESH_TOZERO_INV );
 198
 199             static const gpuThresh_t gpuThresh_callers[2] = {threshold_8u, threshold_32f};
 200
 201             dst.create( src.size(), src.type() );
 202             gpuThresh_callers[(src.type() == CV_32FC1)](src, dst, thresh, maxVal, type);
 203
 204             return thresh;
 205         }
 206         ////////////////////////////////////////////////////////////////////////////////////////////
 207         ///////////////////////////////   remap   //////////////////////////////////////////////////
 208         ////////////////////////////////////////////////////////////////////////////////////////////
 209
 210         void remap( const oclMat &src, oclMat &dst, oclMat &map1, oclMat &map2, int interpolation, int borderType, const Scalar &borderValue )
 211         {
 212             Context *clCxt = src.clCxt;
 213             CV_Assert(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST
 214                       || interpolation == INTER_CUBIC || interpolation == INTER_LANCZOS4);
 215             CV_Assert((map1.type() == CV_16SC2 && !map2.data) || (map1.type() == CV_32FC2 && !map2.data) || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1));
 216             CV_Assert(!map2.data || map2.size() == map1.size());
 217             CV_Assert(dst.size() == map1.size());
 218
 219             dst.create(map1.size(), src.type());
 220
 221
 222             string kernelName;
 223
 224             if( map1.type() == CV_32FC2 && !map2.data )
 225             {
 226                 if(interpolation == INTER_LINEAR && borderType == BORDER_CONSTANT)
 227                     kernelName = "remapLNFConstant";
 228                 else if(interpolation == INTER_NEAREST && borderType == BORDER_CONSTANT)
 229                     kernelName = "remapNNFConstant";
 230             }
 231             else if(map1.type() == CV_16SC2 && !map2.data)
 232             {
 233                 if(interpolation == INTER_LINEAR && borderType == BORDER_CONSTANT)
 234                     kernelName = "remapLNSConstant";
 235                 else if(interpolation == INTER_NEAREST && borderType == BORDER_CONSTANT)
 236                     kernelName = "remapNNSConstant";
 237
 238             }
 239             else if(map1.type() == CV_32FC1 && map2.type() == CV_32FC1)
 240             {
 241                 if(interpolation == INTER_LINEAR && borderType == BORDER_CONSTANT)
 242                     kernelName = "remapLNF1Constant";
 243                 else if (interpolation == INTER_NEAREST && borderType == BORDER_CONSTANT)
 244                     kernelName = "remapNNF1Constant";
 245             }
 246
 247             //int channels = dst.oclchannels();
 248             //int depth = dst.depth();
 249             //int type = src.type();
 250             size_t blkSizeX = 16, blkSizeY = 16;
 251             size_t glbSizeX;
 252             int cols = dst.cols;
 253             if(src.type() == CV_8UC1)
 254             {
 255                 cols = (dst.cols + dst.offset % 4 + 3) / 4;
 256                 glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
 257
 258             }
 259             else if(src.type() == CV_32FC1 && interpolation == INTER_LINEAR)
 260             {
 261                 cols = (dst.cols + (dst.offset >> 2) % 4 + 3) / 4;
 262                 glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
 263             }
 264             else
 265             {
 266                 glbSizeX = dst.cols % blkSizeX == 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
 267
 268             }
 269
 270             size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
 271             size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
 272             size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
 273
 274             float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
 275             vector< pair<size_t, const void *> > args;
 276             if(map1.channels() == 2)
 277             {
 278                 args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
 279                 args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
 280                 args.push_back( make_pair(sizeof(cl_mem), (void *)&map1.data));
 281                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
 282                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
 283                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.offset));
 284                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
 285                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.step));
 286                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.step));
 287                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
 288                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
 289                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
 290                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
 291                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.cols));
 292                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows));
 293                 args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
 294
 295                 if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
 296                 {
 297                     args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue));
 298                 }
 299                 else
 300                 {
 301                     args.push_back( make_pair(sizeof(cl_float4), (void *)&borderFloat));
 302                 }
 303             }
 304             if(map1.channels() == 1)
 305             {
 306                 args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
 307                 args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
 308                 args.push_back( make_pair(sizeof(cl_mem), (void *)&map1.data));
 309                 args.push_back( make_pair(sizeof(cl_mem), (void *)&map2.data));
 310                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
 311                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
 312                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.offset));
 313                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
 314                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.step));
 315                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.step));
 316                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
 317                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
 318                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
 319                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
 320                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.cols));
 321                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows));
 322                 args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
 323                 if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
 324                 {
 325                     args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue));
 326                 }
 327                 else
 328                 {
 329                     args.push_back( make_pair(sizeof(cl_float4), (void *)&borderFloat));
 330                 }
 331             }
 332             openCLExecuteKernel(clCxt, &imgproc_remap, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
 333         }
 334
 335         ////////////////////////////////////////////////////////////////////////////////////////////
 336         // resize
 337
 338         static void resize_gpu( const oclMat &src, oclMat &dst, double fx, double fy, int interpolation)
 339         {
 340             CV_Assert( (src.channels() == dst.channels()) );
 341             Context *clCxt = src.clCxt;
 342             float ifx = 1. / fx;
 343             float ify = 1. / fy;
 344             double ifx_d = 1. / fx;
 345             double ify_d = 1. / fy;
 346             int srcStep_in_pixel = src.step1() / src.oclchannels();
 347             int srcoffset_in_pixel = src.offset / src.elemSize();
 348             int dstStep_in_pixel = dst.step1() / dst.oclchannels();
 349             int dstoffset_in_pixel = dst.offset / dst.elemSize();
 350             //printf("%d %d\n",src.step1() , dst.elemSize());
 351             string kernelName;
 352             if(interpolation == INTER_LINEAR)
 353                 kernelName = "resizeLN";
 354             else if(interpolation == INTER_NEAREST)
 355                 kernelName = "resizeNN";
 356
 357             //TODO: improve this kernel
 358             size_t blkSizeX = 16, blkSizeY = 16;
 359             size_t glbSizeX;
 360             if(src.type() == CV_8UC1)
 361             {
 362                 size_t cols = (dst.cols + dst.offset % 4 + 3) / 4;
 363                 glbSizeX = cols % blkSizeX == 0 && cols != 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
 364             }
 365             else
 366             {
 367                 glbSizeX = dst.cols % blkSizeX == 0 && dst.cols != 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
 368             }
 369             size_t glbSizeY = dst.rows % blkSizeY == 0 && dst.rows != 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
 370             size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
 371             size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
 372
 373             vector< pair<size_t, const void *> > args;
 374             if(interpolation == INTER_NEAREST)
 375             {
 376                 args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
 377                 args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
 378                 args.push_back( make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel));
 379                 args.push_back( make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel));
 380                 args.push_back( make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel));
 381                 args.push_back( make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel));
 382                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
 383                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
 384                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
 385                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
 386                 if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
 387                 {
 388                     args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d));
 389                     args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d));
 390                 }
 391                 else
 392                 {
 393                     args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
 394                     args.push_back( make_pair(sizeof(cl_float), (void *)&ify));
 395                 }
 396             }
 397             else
 398             {
 399                 args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
 400                 args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
 401                 args.push_back( make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel));
 402                 args.push_back( make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel));
 403                 args.push_back( make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel));
 404                 args.push_back( make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel));
 405                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
 406                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
 407                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
 408                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
 409                 args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
 410                 args.push_back( make_pair(sizeof(cl_float), (void *)&ify));
 411             }
 412
 413             openCLExecuteKernel(clCxt, &imgproc_resize, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
 414         }
 415
 416
 417         void resize(const oclMat &src, oclMat &dst, Size dsize,
 418                     double fx, double fy, int interpolation)
 419         {
 420             CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3 || src.type() == CV_8UC4
 421                       || src.type() == CV_32FC1 || src.type() == CV_32FC3 || src.type() == CV_32FC4);
 422             CV_Assert(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST);
 423             CV_Assert( src.size().area() > 0 );
 424             CV_Assert( !(dsize == Size()) || (fx > 0 && fy > 0) );
 425
 426             if(!(dsize == Size()) && (fx > 0 && fy > 0))
 427             {
 428                 if(dsize.width != (int)(src.cols * fx) || dsize.height != (int)(src.rows * fy))
 429                 {
 430                     CV_Error(CV_StsUnmatchedSizes, "invalid dsize and fx, fy!");
 431                 }
 432             }
 433             if( dsize == Size() )
 434             {
 435                 dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
 436             }
 437             else
 438             {
 439                 fx = (double)dsize.width / src.cols;
 440                 fy = (double)dsize.height / src.rows;
 441             }
 442
 443             dst.create(dsize, src.type());
 444
 445             if( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR )
 446             {
 447                 resize_gpu( src, dst, fx, fy, interpolation);
 448                 return;
 449             }
 450             CV_Error(CV_StsUnsupportedFormat, "Non-supported interpolation method");
 451         }
 452
 453
 454         ////////////////////////////////////////////////////////////////////////
 455         // medianFilter
 456         void medianFilter(const oclMat &src, oclMat &dst, int m)
 457         {
 458             CV_Assert( m % 2 == 1 && m > 1 );
 459             CV_Assert( m <= 5 || src.depth() == CV_8U );
 460             CV_Assert( src.cols <= dst.cols && src.rows <= dst.rows );
 461
 462             if(src.data == dst.data)
 463             {
 464                 oclMat src1;
 465                 src.copyTo(src1);
 466                 return medianFilter(src1, dst, m);
 467             }
 468
 469             int srcStep = src.step1() / src.oclchannels();
 470             int dstStep = dst.step1() / dst.oclchannels();
 471             int srcOffset = src.offset / src.oclchannels() / src.elemSize1();
 472             int dstOffset = dst.offset / dst.oclchannels() / dst.elemSize1();
 473
 474             Context *clCxt = src.clCxt;
 475             string kernelName = "medianFilter";
 476
 477
 478             vector< pair<size_t, const void *> > args;
 479             args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
 480             args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
 481             args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset));
 482             args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset));
 483             args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
 484             args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
 485             args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep));
 486             args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep));
 487
 488             size_t globalThreads[3] = {(src.cols + 18) / 16 * 16, (src.rows + 15) / 16 * 16, 1};
 489             size_t localThreads[3] = {16, 16, 1};
 490
 491             if(m == 3)
 492             {
 493                 string kernelName = "medianFilter3";
 494                 openCLExecuteKernel(clCxt, &imgproc_median, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
 495             }
 496             else if(m == 5)
 497             {
 498                 string kernelName = "medianFilter5";
 499                 openCLExecuteKernel(clCxt, &imgproc_median, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
 500             }
 501             else
 502             {
 503                 CV_Error(CV_StsUnsupportedFormat, "Non-supported filter length");
 504                 //string kernelName = "medianFilter";
 505                 //args.push_back( make_pair( sizeof(cl_int),(void*)&m));
 506
 507                 //openCLExecuteKernel(clCxt,&imgproc_median,kernelName,globalThreads,localThreads,args,src.oclchannels(),-1);
 508             }
 509
 510         }
 511
 512         ////////////////////////////////////////////////////////////////////////
 513         // copyMakeBorder
 514         void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int bordertype, const Scalar &scalar)
 515         {
 516             //CV_Assert(src.oclchannels() != 2);
 517             CV_Assert(top >= 0 && bottom >= 0 && left >= 0 && right >= 0);
 518             if((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi
 519             {
 520                 if(((bordertype & cv::BORDER_ISOLATED) == 0) &&
 521                         (bordertype != cv::BORDER_CONSTANT) &&
 522                         (bordertype != cv::BORDER_REPLICATE))
 523                 {
 524                     CV_Error(CV_StsBadArg, "unsupported border type");
 525                 }
 526             }
 527             bordertype &= ~cv::BORDER_ISOLATED;
 528             if((bordertype == cv::BORDER_REFLECT) || (bordertype == cv::BORDER_WRAP))
 529             {
 530                 CV_Assert((src.cols >= left) && (src.cols >= right) && (src.rows >= top) && (src.rows >= bottom));
 531             }
 532             if(bordertype == cv::BORDER_REFLECT_101)
 533             {
 534                 CV_Assert((src.cols > left) && (src.cols > right) && (src.rows > top) && (src.rows > bottom));
 535             }
 536             dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
 537             int srcStep = src.step1() / src.oclchannels();
 538             int dstStep = dst.step1() / dst.oclchannels();
 539             int srcOffset = src.offset / src.elemSize();
 540             int dstOffset = dst.offset / dst.elemSize();
 541             int __bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE, BORDER_REFLECT, BORDER_WRAP, BORDER_REFLECT_101};
 542             const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101"};
 543             size_t bordertype_index;
 544             for(bordertype_index = 0; bordertype_index < sizeof(__bordertype) / sizeof(int); bordertype_index++)
 545             {
 546                 if(__bordertype[bordertype_index] == bordertype)
 547                     break;
 548             }
 549             if(bordertype_index == sizeof(__bordertype) / sizeof(int))
 550             {
 551                 CV_Error(CV_StsBadArg, "unsupported border type");
 552             }
 553             string kernelName = "copymakeborder";
 554             size_t localThreads[3] = {16, 16, 1};
 555             size_t globalThreads[3] = {(dst.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0],
 556                                        (dst.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1], 1
 557                                       };
 558
 559             vector< pair<size_t, const void *> > args;
 560             args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
 561             args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
 562             args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols));
 563             args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows));
 564             args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
 565             args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
 566             args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep));
 567             args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset));
 568             args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep));
 569             args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset));
 570             args.push_back( make_pair( sizeof(cl_int), (void *)&top));
 571             args.push_back( make_pair( sizeof(cl_int), (void *)&left));
 572             char compile_option[64];
 573             union sc
 574             {
 575                 cl_uchar4 uval;
 576                 cl_char4  cval;
 577                 cl_ushort4 usval;
 578                 cl_short4 shval;
 579                 cl_int4 ival;
 580                 cl_float4 fval;
 581                 cl_double4 dval;
 582             } val;
 583             switch(dst.depth())
 584             {
 585             case CV_8U:
 586                 val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
 587                 val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
 588                 val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
 589                 val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]);
 590                 switch(dst.oclchannels())
 591                 {
 592                 case 1:
 593                     sprintf(compile_option, "-D GENTYPE=uchar -D %s", borderstr[bordertype_index]);
 594                     args.push_back( make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] ));
 595                     if(((dst.offset & 3) == 0) && ((dst.cols & 3) == 0))
 596                     {
 597                         kernelName = "copymakeborder_C1_D0";
 598                         globalThreads[0] = (dst.cols / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
 599                     }
 600                     break;
 601                 case 4:
 602                     sprintf(compile_option, "-D GENTYPE=uchar4 -D %s", borderstr[bordertype_index]);
 603                     args.push_back( make_pair( sizeof(cl_uchar4) , (void *)&val.uval ));
 604                     break;
 605                 default:
 606                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 607                 }
 608                 break;
 609             case CV_8S:
 610                 val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
 611                 val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
 612                 val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
 613                 val.cval.s[3] = saturate_cast<char>(scalar.val[3]);
 614                 switch(dst.oclchannels())
 615                 {
 616                 case 1:
 617                     sprintf(compile_option, "-D GENTYPE=char -D %s", borderstr[bordertype_index]);
 618                     args.push_back( make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] ));
 619                     break;
 620                 case 4:
 621                     sprintf(compile_option, "-D GENTYPE=char4 -D %s", borderstr[bordertype_index]);
 622                     args.push_back( make_pair( sizeof(cl_char4) , (void *)&val.cval ));
 623                     break;
 624                 default:
 625                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 626                 }
 627                 break;
 628             case CV_16U:
 629                 val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
 630                 val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
 631                 val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
 632                 val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]);
 633                 switch(dst.oclchannels())
 634                 {
 635                 case 1:
 636                     sprintf(compile_option, "-D GENTYPE=ushort -D %s", borderstr[bordertype_index]);
 637                     args.push_back( make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] ));
 638                     break;
 639                 case 4:
 640                     sprintf(compile_option, "-D GENTYPE=ushort4 -D %s", borderstr[bordertype_index]);
 641                     args.push_back( make_pair( sizeof(cl_ushort4) , (void *)&val.usval ));
 642                     break;
 643                 default:
 644                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 645                 }
 646                 break;
 647             case CV_16S:
 648                 val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
 649                 val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
 650                 val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
 651                 val.shval.s[3] = saturate_cast<short>(scalar.val[3]);
 652                 switch(dst.oclchannels())
 653                 {
 654                 case 1:
 655                     sprintf(compile_option, "-D GENTYPE=short -D %s", borderstr[bordertype_index]);
 656                     args.push_back( make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] ));
 657                     break;
 658                 case 4:
 659                     sprintf(compile_option, "-D GENTYPE=short4 -D %s", borderstr[bordertype_index]);
 660                     args.push_back( make_pair( sizeof(cl_short4) , (void *)&val.shval ));
 661                     break;
 662                 default:
 663                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 664                 }
 665                 break;
 666             case CV_32S:
 667                 val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
 668                 val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
 669                 val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
 670                 val.ival.s[3] = saturate_cast<int>(scalar.val[3]);
 671                 switch(dst.oclchannels())
 672                 {
 673                 case 1:
 674                     sprintf(compile_option, "-D GENTYPE=int -D %s", borderstr[bordertype_index]);
 675                     args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
 676                     break;
 677                 case 2:
 678                     sprintf(compile_option, "-D GENTYPE=int2 -D %s", borderstr[bordertype_index]);
 679                     cl_int2 i2val;
 680                     i2val.s[0] = val.ival.s[0];
 681                     i2val.s[1] = val.ival.s[1];
 682                     args.push_back( make_pair( sizeof(cl_int2) , (void *)&i2val ));
 683                     break;
 684                 case 4:
 685                     sprintf(compile_option, "-D GENTYPE=int4 -D %s", borderstr[bordertype_index]);
 686                     args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival ));
 687                     break;
 688                 default:
 689                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 690                 }
 691                 break;
 692             case CV_32F:
 693                 val.fval.s[0] = scalar.val[0];
 694                 val.fval.s[1] = scalar.val[1];
 695                 val.fval.s[2] = scalar.val[2];
 696                 val.fval.s[3] = scalar.val[3];
 697                 switch(dst.oclchannels())
 698                 {
 699                 case 1:
 700                     sprintf(compile_option, "-D GENTYPE=float -D %s", borderstr[bordertype_index]);
 701                     args.push_back( make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] ));
 702                     break;
 703                 case 4:
 704                     sprintf(compile_option, "-D GENTYPE=float4 -D %s", borderstr[bordertype_index]);
 705                     args.push_back( make_pair( sizeof(cl_float4) , (void *)&val.fval ));
 706                     break;
 707                 default:
 708                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 709                 }
 710                 break;
 711             case CV_64F:
 712                 val.dval.s[0] = scalar.val[0];
 713                 val.dval.s[1] = scalar.val[1];
 714                 val.dval.s[2] = scalar.val[2];
 715                 val.dval.s[3] = scalar.val[3];
 716                 switch(dst.oclchannels())
 717                 {
 718                 case 1:
 719                     sprintf(compile_option, "-D GENTYPE=double -D %s", borderstr[bordertype_index]);
 720                     args.push_back( make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] ));
 721                     break;
 722                 case 4:
 723                     sprintf(compile_option, "-D GENTYPE=double4 -D %s", borderstr[bordertype_index]);
 724                     args.push_back( make_pair( sizeof(cl_double4) , (void *)&val.dval ));
 725                     break;
 726                 default:
 727                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
 728                 }
 729                 break;
 730             default:
 731                 CV_Error(CV_StsUnsupportedFormat, "unknown depth");
 732             }
 733
 734             openCLExecuteKernel(src.clCxt, &imgproc_copymakeboder, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
 735             //uchar* cputemp=new uchar[32*dst.wholerows];
 736             ////int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
 737             //openCLSafeCall(clEnqueueReadBuffer(src.clCxt->impl->clCmdQueue, (cl_mem)dst.data, CL_TRUE,
 738             //                                          0, 32*dst.wholerows, cputemp, 0, NULL, NULL));
 739             //for(int i=0;i<dst.wholerows;i++)
 740             //{
 741             //  for(int j=0;j<dst.wholecols;j++)
 742             //  {
 743             //          cout<< (int)cputemp[i*32+j]<<" ";
 744             //  }
 745             //  cout<<endl;
 746             //}
 747             //delete []cputemp;
 748         }
 749
 750         ////////////////////////////////////////////////////////////////////////
 751         // warp
 752
 753         namespace
 754         {
 755 #define F double
 756
 757             void convert_coeffs(F *M)
 758             {
 759                 double D = M[0] * M[4] - M[1] * M[3];
 760                 D = D != 0 ? 1. / D : 0;
 761                 double A11 = M[4] * D, A22 = M[0] * D;
 762                 M[0] = A11;
 763                 M[1] *= -D;
 764                 M[3] *= -D;
 765                 M[4] = A22;
 766                 double b1 = -M[0] * M[2] - M[1] * M[5];
 767                 double b2 = -M[3] * M[2] - M[4] * M[5];
 768                 M[2] = b1;
 769                 M[5] = b2;
 770             }
 771
 772             double invert(double *M)
 773             {
 774 #define Sd(y,x) (Sd[y*3+x])
 775 #define Dd(y,x) (Dd[y*3+x])
 776 #define det3(m)    (m(0,0)*(m(1,1)*m(2,2) - m(1,2)*m(2,1)) -  \
 777                     m(0,1)*(m(1,0)*m(2,2) - m(1,2)*m(2,0)) +  \
 778                     m(0,2)*(m(1,0)*m(2,1) - m(1,1)*m(2,0)))
 779                 double *Sd = M;
 780                 double *Dd = M;
 781                 double d = det3(Sd);
 782                 double result = 0;
 783                 if( d != 0)
 784                 {
 785                     double t[9];
 786                     result = d;
 787                     d = 1. / d;
 788
 789                     t[0] = (Sd(1, 1) * Sd(2, 2) - Sd(1, 2) * Sd(2, 1)) * d;
 790                     t[1] = (Sd(0, 2) * Sd(2, 1) - Sd(0, 1) * Sd(2, 2)) * d;
 791                     t[2] = (Sd(0, 1) * Sd(1, 2) - Sd(0, 2) * Sd(1, 1)) * d;
 792
 793                     t[3] = (Sd(1, 2) * Sd(2, 0) - Sd(1, 0) * Sd(2, 2)) * d;
 794                     t[4] = (Sd(0, 0) * Sd(2, 2) - Sd(0, 2) * Sd(2, 0)) * d;
 795                     t[5] = (Sd(0, 2) * Sd(1, 0) - Sd(0, 0) * Sd(1, 2)) * d;
 796
 797                     t[6] = (Sd(1, 0) * Sd(2, 1) - Sd(1, 1) * Sd(2, 0)) * d;
 798                     t[7] = (Sd(0, 1) * Sd(2, 0) - Sd(0, 0) * Sd(2, 1)) * d;
 799                     t[8] = (Sd(0, 0) * Sd(1, 1) - Sd(0, 1) * Sd(1, 0)) * d;
 800
 801                     Dd(0, 0) = t[0];
 802                     Dd(0, 1) = t[1];
 803                     Dd(0, 2) = t[2];
 804                     Dd(1, 0) = t[3];
 805                     Dd(1, 1) = t[4];
 806                     Dd(1, 2) = t[5];
 807                     Dd(2, 0) = t[6];
 808                     Dd(2, 1) = t[7];
 809                     Dd(2, 2) = t[8];
 810                 }
 811                 return result;
 812             }
 813
 814             void warpAffine_gpu(const oclMat &src, oclMat &dst, F coeffs[2][3], int interpolation)
 815             {
 816                 CV_Assert( (src.oclchannels() == dst.oclchannels()) );
 817                 int srcStep = src.step1();
 818                 int dstStep = dst.step1();
 819                 float float_coeffs[2][3];
 820                 cl_mem coeffs_cm;
 821
 822                 Context *clCxt = src.clCxt;
 823                 string s[3] = {"NN", "Linear", "Cubic"};
 824                 string kernelName = "warpAffine" + s[interpolation];
 825
 826
 827                 if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
 828                 {
 829                     cl_int st;
 830                     coeffs_cm = clCreateBuffer( (cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(F) * 2 * 3, NULL, &st );
 831                     openCLVerifyCall(st);
 832                     openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(F) * 2 * 3, coeffs, 0, 0, 0));
 833                 }
 834                 else
 835                 {
 836                     cl_int st;
 837                     for(int m = 0; m < 2; m++)
 838                         for(int n = 0; n < 3; n++)
 839                         {
 840                             float_coeffs[m][n] = coeffs[m][n];
 841                         }
 842                         coeffs_cm = clCreateBuffer( (cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(float) * 2 * 3, NULL, &st );
 843                         openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 2 * 3, float_coeffs, 0, 0, 0));
 844
 845                 }
 846                 //TODO: improve this kernel
 847                 size_t blkSizeX = 16, blkSizeY = 16;
 848                 size_t glbSizeX;
 849                 size_t cols;
 850                 //if(src.type() == CV_8UC1 && interpolation != 2)
 851                 if(src.type() == CV_8UC1 && interpolation != 2)
 852                 {
 853                     cols = (dst.cols + dst.offset % 4 + 3) / 4;
 854                     glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
 855                 }
 856                 else
 857                 {
 858                     cols = dst.cols;
 859                     glbSizeX = dst.cols % blkSizeX == 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
 860                 }
 861                 size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
 862                 size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
 863                 size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
 864
 865                 vector< pair<size_t, const void *> > args;
 866
 867                 args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
 868                 args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
 869                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols));
 870                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows));
 871                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.cols));
 872                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.rows));
 873                 args.push_back(make_pair(sizeof(cl_int), (void *)&srcStep));
 874                 args.push_back(make_pair(sizeof(cl_int), (void *)&dstStep));
 875                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.offset));
 876                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.offset));
 877                 args.push_back(make_pair(sizeof(cl_mem), (void *)&coeffs_cm));
 878                 args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
 879
 880                 openCLExecuteKernel(clCxt, &imgproc_warpAffine, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
 881                 openCLSafeCall(clReleaseMemObject(coeffs_cm));
 882             }
 883
 884
 885             void warpPerspective_gpu(const oclMat &src, oclMat &dst, double coeffs[3][3], int interpolation)
 886             {
 887                 CV_Assert( (src.oclchannels() == dst.oclchannels()) );
 888                 int srcStep = src.step1();
 889                 int dstStep = dst.step1();
 890                 float float_coeffs[3][3];
 891                 cl_mem coeffs_cm;
 892
 893                 Context *clCxt = src.clCxt;
 894                 string s[3] = {"NN", "Linear", "Cubic"};
 895                 string kernelName = "warpPerspective" + s[interpolation];
 896
 897                 if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
 898                 {
 899                     cl_int st;
 900                     coeffs_cm = clCreateBuffer((cl_context) clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(double) * 3 * 3, NULL, &st );
 901                     openCLVerifyCall(st);
 902                     openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(double) * 3 * 3, coeffs, 0, 0, 0));
 903                 }
 904                 else
 905                 {
 906                     cl_int st;
 907                     for(int m = 0; m < 3; m++)
 908                         for(int n = 0; n < 3; n++)
 909                             float_coeffs[m][n] = coeffs[m][n];
 910
 911                     coeffs_cm = clCreateBuffer((cl_context) clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(float) * 3 * 3, NULL, &st );
 912                     openCLVerifyCall(st);
 913                     openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 3 * 3, float_coeffs, 0, 0, 0));
 914                 }
 915                 //TODO: improve this kernel
 916                 size_t blkSizeX = 16, blkSizeY = 16;
 917                 size_t glbSizeX;
 918                 size_t cols;
 919                 if(src.type() == CV_8UC1 && interpolation == 0)
 920                 {
 921                     cols = (dst.cols + dst.offset % 4 + 3) / 4;
 922                     glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
 923                 }
 924                 else
 925                     /*
 926                     */
 927                 {
 928                     cols = dst.cols;
 929                     glbSizeX = dst.cols % blkSizeX == 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
 930                 }
 931                 size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
 932                 size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
 933                 size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
 934
 935                 vector< pair<size_t, const void *> > args;
 936
 937                 args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
 938                 args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
 939                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols));
 940                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows));
 941                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.cols));
 942                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.rows));
 943                 args.push_back(make_pair(sizeof(cl_int), (void *)&srcStep));
 944                 args.push_back(make_pair(sizeof(cl_int), (void *)&dstStep));
 945                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.offset));
 946                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.offset));
 947                 args.push_back(make_pair(sizeof(cl_mem), (void *)&coeffs_cm));
 948                 args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
 949
 950                 openCLExecuteKernel(clCxt, &imgproc_warpPerspective, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
 951                 openCLSafeCall(clReleaseMemObject(coeffs_cm));
 952             }
 953         }
 954
 955         void warpAffine(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags)
 956         {
 957             int interpolation = flags & INTER_MAX;
 958
 959             CV_Assert((src.depth() == CV_8U  || src.depth() == CV_32F) && src.oclchannels() != 2 && src.oclchannels() != 3);
 960             CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
 961
 962             dst.create(dsize, src.type());
 963
 964             CV_Assert(M.rows == 2 && M.cols == 3);
 965
 966             int warpInd = (flags & WARP_INVERSE_MAP) >> 4;
 967             F coeffs[2][3];
 968
 969             double coeffsM[2*3];
 970             Mat coeffsMat(2, 3, CV_64F, (void *)coeffsM);
 971             M.convertTo(coeffsMat, coeffsMat.type());
 972             if(!warpInd)
 973             {
 974                 convert_coeffs(coeffsM);
 975             }
 976
 977             for(int i = 0; i < 2; ++i)
 978                 for(int j = 0; j < 3; ++j)
 979                     coeffs[i][j] = coeffsM[i*3+j];
 980
 981             warpAffine_gpu(src, dst, coeffs, interpolation);
 982         }
 983
 984         void warpPerspective(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags)
 985         {
 986             int interpolation = flags & INTER_MAX;
 987
 988             CV_Assert((src.depth() == CV_8U  || src.depth() == CV_32F) && src.oclchannels() != 2 && src.oclchannels() != 3);
 989             CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
 990
 991             dst.create(dsize, src.type());
 992
 993
 994             CV_Assert(M.rows == 3 && M.cols == 3);
 995
 996             int warpInd = (flags & WARP_INVERSE_MAP) >> 4;
 997             double coeffs[3][3];
 998
 999             double coeffsM[3*3];
1000             Mat coeffsMat(3, 3, CV_64F, (void *)coeffsM);
1001             M.convertTo(coeffsMat, coeffsMat.type());
1002             if(!warpInd)
1003             {
1004                 invert(coeffsM);
1005             }
1006
1007             for(int i = 0; i < 3; ++i)
1008                 for(int j = 0; j < 3; ++j)
1009                     coeffs[i][j] = coeffsM[i*3+j];
1010
1011             warpPerspective_gpu(src, dst, coeffs, interpolation);
1012         }
1013
1014         ////////////////////////////////////////////////////////////////////////
1015         // integral
1016         void integral(const oclMat &src, oclMat &sum, oclMat &sqsum)
1017         {
1018             CV_Assert(src.type() == CV_8UC1);
1019             if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
1020             {
1021                 CV_Error(CV_GpuNotSupported, "select device don't support double");
1022             }
1023             int vlen = 4;
1024             int offset = src.offset / vlen;
1025             int pre_invalid = src.offset % vlen;
1026             int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
1027
1028             oclMat t_sum , t_sqsum;
1029             int w = src.cols + 1, h = src.rows + 1;
1030             int depth;
1031             if( src.cols * src.rows <= 2901 * 2901 ) //2901 is the maximum size for int when all values are 255
1032             {
1033                 t_sum.create(src.cols, src.rows, CV_32SC1);
1034                 sum.create(h, w, CV_32SC1);
1035             }
1036             else
1037             {
1038                  //Use float to prevent overflow
1039                 t_sum.create(src.cols, src.rows, CV_32FC1);
1040                 sum.create(h, w, CV_32FC1);
1041              }
1042              t_sqsum.create(src.cols, src.rows, CV_32FC1);
1043              sqsum.create(h, w, CV_32FC1);
1044              depth = sum.depth();
1045              int sum_offset = sum.offset / vlen;
1046              int sqsum_offset = sqsum.offset / vlen;
1047
1048              vector<pair<size_t , const void *> > args;
1049              args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
1050              args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
1051              args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sqsum.data ));
1052              args.push_back( make_pair( sizeof(cl_int) , (void *)&offset ));
1053              args.push_back( make_pair( sizeof(cl_int) , (void *)&pre_invalid ));
1054              args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
1055              args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
1056              args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
1057              args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step));
1058              size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1};
1059              openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_cols", gt, lt, args, -1, depth);
1060              args.clear();
1061              args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
1062              args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sqsum.data ));
1063              args.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data ));
1064              args.push_back( make_pair( sizeof(cl_mem) , (void *)&sqsum.data ));
1065              args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.rows ));
1066              args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.cols ));
1067              args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step ));
1068              args.push_back( make_pair( sizeof(cl_int) , (void *)&sum.step));
1069              args.push_back( make_pair( sizeof(cl_int) , (void *)&sqsum.step));
1070              args.push_back( make_pair( sizeof(cl_int) , (void *)&sum_offset));
1071              args.push_back( make_pair( sizeof(cl_int) , (void *)&sqsum_offset));
1072              size_t gt2[3] = {t_sum.cols  * 32, 1, 1}, lt2[3] = {256, 1, 1};
1073              openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_rows", gt2, lt2, args, -1, depth);
1074         }
1075
1076         void integral(const oclMat &src, oclMat &sum)
1077         {
1078             CV_Assert(src.type() == CV_8UC1);
1079             int vlen = 4;
1080             int offset = src.offset / vlen;
1081             int pre_invalid = src.offset % vlen;
1082             int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
1083
1084             oclMat t_sum;
1085             int w = src.cols + 1, h = src.rows + 1;
1086             int depth;
1087             if(src.cols * src.rows <= 2901 * 2901)
1088             {
1089                 t_sum.create(src.cols, src.rows, CV_32SC1);
1090                 sum.create(h, w, CV_32SC1);
1091             }else
1092             {
1093                  t_sum.create(src.cols, src.rows, CV_32FC1);
1094                  sum.create(h, w, CV_32FC1);
1095              }
1096              depth = sum.depth();
1097              int sum_offset = sum.offset / vlen;
1098              vector<pair<size_t , const void *> > args;
1099              args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
1100              args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
1101              args.push_back( make_pair( sizeof(cl_int) , (void *)&offset ));
1102              args.push_back( make_pair( sizeof(cl_int) , (void *)&pre_invalid ));
1103              args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
1104              args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
1105              args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
1106              args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step));
1107              size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1};
1108              openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_cols", gt, lt, args, -1, depth);
1109              args.clear();
1110              args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
1111              args.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data ));
1112              args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.rows ));
1113              args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.cols ));
1114              args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step ));
1115              args.push_back( make_pair( sizeof(cl_int) , (void *)&sum.step));
1116              args.push_back( make_pair( sizeof(cl_int) , (void *)&sum_offset));
1117              size_t gt2[3] = {t_sum.cols  * 32, 1, 1}, lt2[3] = {256, 1, 1};
1118              openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_rows", gt2, lt2, args, -1, depth);
1119         }
1120
1121         /////////////////////// corner //////////////////////////////
1122         static void extractCovData(const oclMat &src, oclMat &Dx, oclMat &Dy,
1123                             int blockSize, int ksize, int borderType)
1124         {
1125             CV_Assert(src.type() == CV_8UC1 || src.type() == CV_32FC1);
1126             double scale = static_cast<double>(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize;
1127             if (ksize < 0)
1128                 scale *= 2.;
1129
1130             if (src.depth() == CV_8U)
1131             {
1132                 scale *= 255.;
1133                 scale = 1. / scale;
1134             }
1135             else
1136             {
1137                 scale = 1. / scale;
1138             }
1139             if (ksize > 0)
1140             {
1141                 Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType);
1142                 Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType);
1143             }
1144             else
1145             {
1146                 Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType);
1147                 Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType);
1148             }
1149             CV_Assert(Dx.offset == 0 && Dy.offset == 0);
1150         }
1151
1152         static void corner_ocl(const char *src_str, string kernelName, int block_size, float k, oclMat &Dx, oclMat &Dy,
1153                         oclMat &dst, int border_type)
1154         {
1155             char borderType[30];
1156             switch (border_type)
1157             {
1158             case cv::BORDER_CONSTANT:
1159                 sprintf(borderType, "BORDER_CONSTANT");
1160                 break;
1161             case cv::BORDER_REFLECT101:
1162                 sprintf(borderType, "BORDER_REFLECT101");
1163                 break;
1164             case cv::BORDER_REFLECT:
1165                 sprintf(borderType, "BORDER_REFLECT");
1166                 break;
1167             case cv::BORDER_REPLICATE:
1168                 sprintf(borderType, "BORDER_REPLICATE");
1169                 break;
1170             default:
1171                 cout << "BORDER type is not supported!" << endl;
1172             }
1173             char build_options[150];
1174             sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s",
1175                     block_size / 2, block_size / 2, block_size, block_size, borderType);
1176
1177             size_t blockSizeX = 256, blockSizeY = 1;
1178             size_t gSize = blockSizeX - block_size / 2 * 2;
1179             size_t globalSizeX = (Dx.cols) % gSize == 0 ? Dx.cols / gSize * blockSizeX : (Dx.cols / gSize + 1) * blockSizeX;
1180             size_t rows_per_thread = 2;
1181             size_t globalSizeY = ((Dx.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ?
1182                                  ((Dx.rows + rows_per_thread - 1) / rows_per_thread) :
1183                                  (((Dx.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY;
1184
1185             size_t gt[3] = { globalSizeX, globalSizeY, 1 };
1186             size_t lt[3]  = { blockSizeX, blockSizeY, 1 };
1187             vector<pair<size_t , const void *> > args;
1188             args.push_back( make_pair( sizeof(cl_mem) , (void *)&Dx.data ));
1189             args.push_back( make_pair( sizeof(cl_mem) , (void *)&Dy.data));
1190             args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
1191             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dx.offset ));
1192             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dx.wholerows ));
1193             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dx.wholecols ));
1194             args.push_back( make_pair(sizeof(cl_int), (void *)&Dx.step));
1195             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dy.offset ));
1196             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dy.wholerows ));
1197             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dy.wholecols ));
1198             args.push_back( make_pair(sizeof(cl_int), (void *)&Dy.step));
1199             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
1200             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
1201             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
1202             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
1203             args.push_back( make_pair( sizeof(cl_float) , (void *)&k));
1204             openCLExecuteKernel(dst.clCxt, &src_str, kernelName, gt, lt, args, -1, -1, build_options);
1205         }
1206
1207         void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize,
1208                           double k, int borderType)
1209         {
1210             oclMat dx, dy;
1211             cornerHarris_dxdy(src, dst, dx, dy, blockSize, ksize, k, borderType);
1212         }
1213
1214         void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize,
1215                           double k, int borderType)
1216         {
1217             if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
1218             {
1219                 CV_Error(CV_GpuNotSupported, "select device don't support double");
1220             }
1221             CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
1222             CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
1223             extractCovData(src, dx, dy, blockSize, ksize, borderType);
1224             dst.create(src.size(), CV_32F);
1225             corner_ocl(imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), dx, dy, dst, borderType);
1226         }
1227
1228         void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
1229         {
1230             oclMat dx, dy;
1231             cornerMinEigenVal_dxdy(src, dst, dx, dy, blockSize, ksize, borderType);
1232         }
1233
1234         void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize, int borderType)
1235         {
1236             if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
1237             {
1238                 CV_Error(CV_GpuNotSupported, "select device don't support double");
1239             }
1240             CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
1241             CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
1242             extractCovData(src, dx, dy, blockSize, ksize, borderType);
1243             dst.create(src.size(), CV_32F);
1244             corner_ocl(imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, dx, dy, dst, borderType);
1245         }
1246         /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
1247         static void meanShiftFiltering_gpu(const oclMat &src, oclMat dst, int sp, int sr, int maxIter, float eps)
1248         {
1249             CV_Assert( (src.cols == dst.cols) && (src.rows == dst.rows) );
1250             CV_Assert( !(dst.step & 0x3) );
1251             Context *clCxt = src.clCxt;
1252
1253             //Arrange the NDRange
1254             int col = src.cols, row = src.rows;
1255             int ltx = 16, lty = 8;
1256             if(src.cols % ltx != 0)
1257                 col = (col / ltx + 1) * ltx;
1258             if(src.rows % lty != 0)
1259                 row = (row / lty + 1) * lty;
1260
1261             size_t globalThreads[3] = {col, row, 1};
1262             size_t localThreads[3]  = {ltx, lty, 1};
1263
1264             //set args
1265             vector<pair<size_t , const void *> > args;
1266             args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
1267             args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.step ));
1268             args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
1269             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
1270             args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.offset ));
1271             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.offset ));
1272             args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols ));
1273             args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows ));
1274             args.push_back( make_pair( sizeof(cl_int) , (void *)&sp ));
1275             args.push_back( make_pair( sizeof(cl_int) , (void *)&sr ));
1276             args.push_back( make_pair( sizeof(cl_int) , (void *)&maxIter ));
1277             args.push_back( make_pair( sizeof(cl_float) , (void *)&eps ));
1278             openCLExecuteKernel(clCxt, &meanShift, "meanshift_kernel", globalThreads, localThreads, args, -1, -1);
1279         }
1280
1281         void meanShiftFiltering(const oclMat &src, oclMat &dst, int sp, int sr, TermCriteria criteria)
1282         {
1283             if( src.empty() )
1284                 CV_Error( CV_StsBadArg, "The input image is empty" );
1285
1286             if( src.depth() != CV_8U || src.oclchannels() != 4 )
1287                 CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
1288
1289             //            if(!src.clCxt->supportsFeature(Context::CL_DOUBLE))
1290             //            {
1291             //                CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
1292             //            }
1293
1294             dst.create( src.size(), CV_8UC4 );
1295
1296             if( !(criteria.type & TermCriteria::MAX_ITER) )
1297                 criteria.maxCount = 5;
1298
1299             int maxIter = std::min(std::max(criteria.maxCount, 1), 100);
1300
1301             float eps;
1302             if( !(criteria.type & TermCriteria::EPS) )
1303                 eps = 1.f;
1304             eps = (float)std::max(criteria.epsilon, 0.0);
1305
1306             meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps);
1307
1308         }
1309
1310         static void meanShiftProc_gpu(const oclMat &src, oclMat dstr, oclMat dstsp, int sp, int sr, int maxIter, float eps)
1311         {
1312             //sanity checks
1313             CV_Assert( (src.cols == dstr.cols) && (src.rows == dstr.rows) &&
1314                        (src.rows == dstsp.rows) && (src.cols == dstsp.cols));
1315             CV_Assert( !(dstsp.step & 0x3) );
1316             Context *clCxt = src.clCxt;
1317
1318             //Arrange the NDRange
1319             int col = src.cols, row = src.rows;
1320             int ltx = 16, lty = 8;
1321             if(src.cols % ltx != 0)
1322                 col = (col / ltx + 1) * ltx;
1323             if(src.rows % lty != 0)
1324                 row = (row / lty + 1) * lty;
1325
1326             size_t globalThreads[3] = {col, row, 1};
1327             size_t localThreads[3]  = {ltx, lty, 1};
1328
1329             //set args
1330             vector<pair<size_t , const void *> > args;
1331             args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
1332             args.push_back( make_pair( sizeof(cl_mem) , (void *)&dstr.data ));
1333             args.push_back( make_pair( sizeof(cl_mem) , (void *)&dstsp.data ));
1334             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
1335             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstr.step ));
1336             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstsp.step ));
1337             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.offset ));
1338             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstr.offset ));
1339             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstsp.offset ));
1340             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstr.cols ));
1341             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstr.rows ));
1342             args.push_back( make_pair( sizeof(cl_int) , (void *)&sp ));
1343             args.push_back( make_pair( sizeof(cl_int) , (void *)&sr ));
1344             args.push_back( make_pair( sizeof(cl_int) , (void *)&maxIter ));
1345             args.push_back( make_pair( sizeof(cl_float) , (void *)&eps ));
1346             openCLExecuteKernel(clCxt, &meanShift, "meanshiftproc_kernel", globalThreads, localThreads, args, -1, -1);
1347         }
1348
1349         void meanShiftProc(const oclMat &src, oclMat &dstr, oclMat &dstsp, int sp, int sr, TermCriteria criteria)
1350         {
1351             if( src.empty() )
1352                 CV_Error( CV_StsBadArg, "The input image is empty" );
1353
1354             if( src.depth() != CV_8U || src.oclchannels() != 4 )
1355                 CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
1356
1357             //            if(!src.clCxt->supportsFeature(Context::CL_DOUBLE))
1358             //            {
1359             //                CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
1360             //            }
1361
1362             dstr.create( src.size(), CV_8UC4 );
1363             dstsp.create( src.size(), CV_16SC2 );
1364
1365             if( !(criteria.type & TermCriteria::MAX_ITER) )
1366                 criteria.maxCount = 5;
1367
1368             int maxIter = std::min(std::max(criteria.maxCount, 1), 100);
1369
1370             float eps;
1371             if( !(criteria.type & TermCriteria::EPS) )
1372                 eps = 1.f;
1373             eps = (float)std::max(criteria.epsilon, 0.0);
1374
1375             meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps);
1376         }
1377
1378         ///////////////////////////////////////////////////////////////////////////////////////////////////
1379         ////////////////////////////////////////////////////hist///////////////////////////////////////////////
1380         /////////////////////////////////////////////////////////////////////////////////////////////////////
1381         namespace histograms
1382         {
1383             const int PARTIAL_HISTOGRAM256_COUNT = 256;
1384             const int HISTOGRAM256_BIN_COUNT = 256;
1385         }
1386         ///////////////////////////////calcHist/////////////////////////////////////////////////////////////////
1387         static void calc_sub_hist(const oclMat &mat_src, const oclMat &mat_sub_hist)
1388         {
1389             using namespace histograms;
1390
1391             Context  *clCxt = mat_src.clCxt;
1392             int depth = mat_src.depth();
1393
1394             string kernelName = "calc_sub_hist";
1395
1396             size_t localThreads[3]  = { HISTOGRAM256_BIN_COUNT, 1, 1 };
1397             size_t globalThreads[3] = { PARTIAL_HISTOGRAM256_COUNT *localThreads[0], 1, 1};
1398
1399             int dataWidth = 16;
1400             int dataWidth_bits = 4;
1401             int mask = dataWidth - 1;
1402
1403             int cols = mat_src.cols * mat_src.oclchannels();
1404             int src_offset = mat_src.offset;
1405             int hist_step = mat_sub_hist.step >> 2;
1406             int left_col = 0, right_col = 0;
1407
1408             if(cols >= dataWidth * 2 - 1)
1409             {
1410                 left_col = dataWidth - (src_offset & mask);
1411                 left_col &= mask;
1412                 src_offset += left_col;
1413                 cols -= left_col;
1414                 right_col = cols & mask;
1415                 cols -= right_col;
1416             }
1417             else
1418             {
1419                 left_col = cols;
1420                 right_col = 0;
1421                 cols = 0;
1422                 globalThreads[0] = 0;
1423             }
1424
1425             vector<pair<size_t , const void *> > args;
1426             if(globalThreads[0] != 0)
1427             {
1428                 int tempcols = cols >> dataWidth_bits;
1429                 int inc_x = globalThreads[0] % tempcols;
1430                 int inc_y = globalThreads[0] / tempcols;
1431                 src_offset >>= dataWidth_bits;
1432                 int src_step = mat_src.step >> dataWidth_bits;
1433                 int datacount = tempcols * mat_src.rows;
1434                 args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src.data));
1435                 args.push_back( make_pair( sizeof(cl_int), (void *)&src_step));
1436                 args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset));
1437                 args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_sub_hist.data));
1438                 args.push_back( make_pair( sizeof(cl_int), (void *)&datacount));
1439                 args.push_back( make_pair( sizeof(cl_int), (void *)&tempcols));
1440                 args.push_back( make_pair( sizeof(cl_int), (void *)&inc_x));
1441                 args.push_back( make_pair( sizeof(cl_int), (void *)&inc_y));
1442                 args.push_back( make_pair( sizeof(cl_int), (void *)&hist_step));
1443                 openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, depth);
1444             }
1445             if(left_col != 0 || right_col != 0)
1446             {
1447                 kernelName = "calc_sub_hist_border";
1448                 src_offset = mat_src.offset;
1449                 localThreads[0] = 1;
1450                 localThreads[1] = 256;
1451                 globalThreads[0] = left_col + right_col;
1452                 globalThreads[1] = (mat_src.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
1453
1454                 args.clear();
1455                 args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src.data));
1456                 args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.step));
1457                 args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset));
1458                 args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_sub_hist.data));
1459                 args.push_back( make_pair( sizeof(cl_int), (void *)&left_col));
1460                 args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
1461                 args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.rows));
1462                 args.push_back( make_pair( sizeof(cl_int), (void *)&hist_step));
1463                 openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, depth);
1464             }
1465         }
1466         static void merge_sub_hist(const oclMat &sub_hist, oclMat &mat_hist)
1467         {
1468             using namespace histograms;
1469
1470             Context  *clCxt = sub_hist.clCxt;
1471             string kernelName = "merge_hist";
1472
1473             size_t localThreads[3]  = { 256, 1, 1 };
1474             size_t globalThreads[3] = { HISTOGRAM256_BIN_COUNT *localThreads[0], 1, 1};
1475             int src_step = sub_hist.step >> 2;
1476             vector<pair<size_t , const void *> > args;
1477             args.push_back( make_pair( sizeof(cl_mem), (void *)&sub_hist.data));
1478             args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_hist.data));
1479             args.push_back( make_pair( sizeof(cl_int), (void *)&src_step));
1480             openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, -1);
1481         }
1482         void calcHist(const oclMat &mat_src, oclMat &mat_hist)
1483         {
1484             using namespace histograms;
1485             CV_Assert(mat_src.type() == CV_8UC1);
1486             mat_hist.create(1, 256, CV_32SC1);
1487
1488             oclMat buf(PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_BIN_COUNT, CV_32SC1);
1489             buf.setTo(0);
1490
1491             calc_sub_hist(mat_src, buf);
1492             merge_sub_hist(buf, mat_hist);
1493         }
1494         ///////////////////////////////////equalizeHist/////////////////////////////////////////////////////
1495         void equalizeHist(const oclMat &mat_src, oclMat &mat_dst)
1496         {
1497             mat_dst.create(mat_src.rows, mat_src.cols, CV_8UC1);
1498
1499             oclMat mat_hist(1, 256, CV_32SC1);
1500
1501             calcHist(mat_src, mat_hist);
1502
1503             Context *clCxt = mat_src.clCxt;
1504             string kernelName = "calLUT";
1505             size_t localThreads[3] = { 256, 1, 1};
1506             size_t globalThreads[3] = { 256, 1, 1};
1507             oclMat lut(1, 256, CV_8UC1);
1508             vector<pair<size_t , const void *> > args;
1509             int total = mat_src.rows * mat_src.cols;
1510             args.push_back( make_pair( sizeof(cl_mem), (void *)&lut.data));
1511             args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_hist.data));
1512             args.push_back( make_pair( sizeof(int), (void *)&total));
1513             openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, -1);
1514             LUT(mat_src, lut, mat_dst);
1515         }
1516
1517         ////////////////////////////////////////////////////////////////////////
1518         // CLAHE
1519         namespace clahe
1520         {
1521             inline int divUp(int total, int grain)
1522             {
1523                 return (total + grain - 1) / grain * grain;
1524             }
1525
1526             static void calcLut(const oclMat &src, oclMat &dst,
1527                 const int tilesX, const int tilesY, const cv::Size tileSize,
1528                 const int clipLimit, const float lutScale)
1529             {
1530                 cl_int2 tile_size;
1531                 tile_size.s[0] = tileSize.width;
1532                 tile_size.s[1] = tileSize.height;
1533
1534                 std::vector<pair<size_t , const void *> > args;
1535                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
1536                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
1537                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
1538                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
1539                 args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size ));
1540                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX ));
1541                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&clipLimit ));
1542                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&lutScale ));
1543
1544                 String kernelName = "calcLut";
1545                 size_t localThreads[3]  = { 32, 8, 1 };
1546                 size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 };
1547                 bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
1548                 if (is_cpu)
1549                 {
1550                     openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, (char*)" -D CPU");
1551                 }
1552                 else
1553                 {
1554                     cl_kernel kernel = openCLGetKernelFromSource(Context::getContext(), &imgproc_clahe, kernelName);
1555                     int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
1556                     openCLSafeCall(clReleaseKernel(kernel));
1557
1558                     static char opt[20] = {0};
1559                     sprintf(opt, " -D WAVE_SIZE=%d", wave_size);
1560                     openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, opt);
1561                 }
1562             }
1563
1564             static void transform(const oclMat &src, oclMat &dst, const oclMat &lut,
1565                 const int tilesX, const int tilesY, const cv::Size tileSize)
1566             {
1567                 cl_int2 tile_size;
1568                 tile_size.s[0] = tileSize.width;
1569                 tile_size.s[1] = tileSize.height;
1570
1571                 std::vector<pair<size_t , const void *> > args;
1572                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
1573                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
1574                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&lut.data ));
1575                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
1576                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
1577                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&lut.step ));
1578                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols ));
1579                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
1580                 args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size ));
1581                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX ));
1582                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesY ));
1583
1584                 String kernelName = "transform";
1585                 size_t localThreads[3]  = { 32, 8, 1 };
1586                 size_t globalThreads[3] = { divUp(src.cols, localThreads[0]), divUp(src.rows, localThreads[1]), 1 };
1587
1588                 openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1);
1589             }
1590         }
1591
1592         namespace
1593         {
1594             class CLAHE_Impl : public cv::ocl::CLAHE
1595             {
1596             public:
1597                 CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8);
1598
1599                 cv::AlgorithmInfo* info() const;
1600
1601                 void apply(const oclMat &src, oclMat &dst);
1602
1603                 void setClipLimit(double clipLimit);
1604                 double getClipLimit() const;
1605
1606                 void setTilesGridSize(cv::Size tileGridSize);
1607                 cv::Size getTilesGridSize() const;
1608
1609                 void collectGarbage();
1610
1611             private:
1612                 double clipLimit_;
1613                 int tilesX_;
1614                 int tilesY_;
1615
1616                 oclMat srcExt_;
1617                 oclMat lut_;
1618             };
1619
1620             CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
1621             clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY)
1622             {
1623             }
1624
1625             void CLAHE_Impl::apply(const oclMat &src, oclMat &dst)
1626             {
1627                 CV_Assert( src.type() == CV_8UC1 );
1628
1629                 dst.create( src.size(), src.type() );
1630
1631                 const int histSize = 256;
1632
1633                 ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_8UC1, lut_);
1634
1635                 cv::Size tileSize;
1636                 oclMat srcForLut;
1637
1638                 if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
1639                 {
1640                     tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
1641                     srcForLut = src;
1642                 }
1643                 else
1644                 {
1645                     cv::ocl::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101, cv::Scalar());
1646
1647                     tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
1648                     srcForLut = srcExt_;
1649                 }
1650
1651                 const int tileSizeTotal = tileSize.area();
1652                 const float lutScale = static_cast<float>(histSize - 1) / tileSizeTotal;
1653
1654                 int clipLimit = 0;
1655                 if (clipLimit_ > 0.0)
1656                 {
1657                     clipLimit = static_cast<int>(clipLimit_ * tileSizeTotal / histSize);
1658                     clipLimit = std::max(clipLimit, 1);
1659                 }
1660
1661                 clahe::calcLut(srcForLut, lut_, tilesX_, tilesY_, tileSize, clipLimit, lutScale);
1662                 //finish();
1663                 clahe::transform(src, dst, lut_, tilesX_, tilesY_, tileSize);
1664             }
1665
1666             void CLAHE_Impl::setClipLimit(double clipLimit)
1667             {
1668                 clipLimit_ = clipLimit;
1669             }
1670
1671             double CLAHE_Impl::getClipLimit() const
1672             {
1673                 return clipLimit_;
1674             }
1675
1676             void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize)
1677             {
1678                 tilesX_ = tileGridSize.width;
1679                 tilesY_ = tileGridSize.height;
1680             }
1681
1682             cv::Size CLAHE_Impl::getTilesGridSize() const
1683             {
1684                 return cv::Size(tilesX_, tilesY_);
1685             }
1686
1687             void CLAHE_Impl::collectGarbage()
1688             {
1689                 srcExt_.release();
1690                 lut_.release();
1691             }
1692         }
1693
1694         cv::Ptr<cv::ocl::CLAHE> createCLAHE(double clipLimit, cv::Size tileGridSize)
1695         {
1696             return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
1697         }
1698
1699         //////////////////////////////////bilateralFilter////////////////////////////////////////////////////
1700         static void
1701         oclbilateralFilter_8u( const oclMat &src, oclMat &dst, int d,
1702                                double sigma_color, double sigma_space,
1703                                int borderType )
1704         {
1705             int cn = src.channels();
1706             int i, j, maxk, radius;
1707
1708             CV_Assert( (src.channels() == 1 || src.channels() == 3) &&
1709                        src.type() == dst.type() && src.size() == dst.size() &&
1710                        src.data != dst.data );
1711
1712             if( sigma_color <= 0 )
1713                 sigma_color = 1;
1714             if( sigma_space <= 0 )
1715                 sigma_space = 1;
1716
1717             double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
1718             double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);
1719
1720             if( d <= 0 )
1721                 radius = cvRound(sigma_space * 1.5);
1722             else
1723                 radius = d / 2;
1724             radius = MAX(radius, 1);
1725             d = radius * 2 + 1;
1726
1727             oclMat temp;
1728             copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
1729
1730             vector<float> _color_weight(cn * 256);
1731             vector<float> _space_weight(d * d);
1732             vector<int> _space_ofs(d * d);
1733             float *color_weight = &_color_weight[0];
1734             float *space_weight = &_space_weight[0];
1735             int *space_ofs = &_space_ofs[0];
1736             int dst_step_in_pixel = dst.step / dst.elemSize();
1737             int dst_offset_in_pixel = dst.offset / dst.elemSize();
1738             int temp_step_in_pixel = temp.step / temp.elemSize();
1739             // initialize color-related bilateral filter coefficients
1740             for( i = 0; i < 256 * cn; i++ )
1741                 color_weight[i] = (float)std::exp(i * i * gauss_color_coeff);
1742
1743             // initialize space-related bilateral filter coefficients
1744             for( i = -radius, maxk = 0; i <= radius; i++ )
1745                 for( j = -radius; j <= radius; j++ )
1746                 {
1747                     double r = std::sqrt((double)i * i + (double)j * j);
1748                     if( r > radius )
1749                         continue;
1750                     space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
1751                     space_ofs[maxk++] = (int)(i * temp_step_in_pixel + j);
1752                 }
1753             oclMat oclcolor_weight(1, cn * 256, CV_32FC1, color_weight);
1754             oclMat oclspace_weight(1, d * d, CV_32FC1, space_weight);
1755             oclMat oclspace_ofs(1, d * d, CV_32SC1, space_ofs);
1756
1757             string kernelName = "bilateral";
1758             size_t localThreads[3]  = { 16, 16, 1 };
1759             size_t globalThreads[3] = { (dst.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0],
1760                                         (dst.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1],
1761                                         1
1762                                       };
1763             if((dst.type() == CV_8UC1) && ((dst.offset & 3) == 0) && ((dst.cols & 3) == 0))
1764             {
1765                 kernelName = "bilateral2";
1766                 globalThreads[0] = (dst.cols / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
1767             }
1768             vector<pair<size_t , const void *> > args;
1769             args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
1770             args.push_back( make_pair( sizeof(cl_mem), (void *)&temp.data ));
1771             args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows ));
1772             args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols ));
1773             args.push_back( make_pair( sizeof(cl_int), (void *)&maxk ));
1774             args.push_back( make_pair( sizeof(cl_int), (void *)&radius ));
1775             args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step_in_pixel ));
1776             args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset_in_pixel ));
1777             args.push_back( make_pair( sizeof(cl_int), (void *)&temp_step_in_pixel ));
1778             args.push_back( make_pair( sizeof(cl_int), (void *)&temp.rows ));
1779             args.push_back( make_pair( sizeof(cl_int), (void *)&temp.cols ));
1780             args.push_back( make_pair( sizeof(cl_mem), (void *)&oclcolor_weight.data ));
1781             args.push_back( make_pair( sizeof(cl_mem), (void *)&oclspace_weight.data ));
1782             args.push_back( make_pair( sizeof(cl_mem), (void *)&oclspace_ofs.data ));
1783             openCLExecuteKernel(src.clCxt, &imgproc_bilateral, kernelName, globalThreads, localThreads, args, dst.oclchannels(), dst.depth());
1784         }
1785         void bilateralFilter(const oclMat &src, oclMat &dst, int radius, double sigmaclr, double sigmaspc, int borderType)
1786         {
1787
1788             dst.create( src.size(), src.type() );
1789             if( src.depth() == CV_8U )
1790                 oclbilateralFilter_8u( src, dst, radius, sigmaclr, sigmaspc, borderType );
1791             else
1792                 CV_Error( CV_StsUnsupportedFormat,
1793                           "Bilateral filtering is only implemented for 8uimages" );
1794         }
1795
1796     }
1797 }
1798 //////////////////////////////////convolve////////////////////////////////////////////////////
/**
 * Number of blocks of size @p grain needed to cover @p total
 * (integer division rounded up for non-negative inputs).
 */
inline int divUp(int total, int grain)
{
    const int padded = total + grain - 1;
    return padded / grain;
}
1803 static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, string kernelName, const char **kernelString)
1804 {
1805     CV_Assert(src.depth() == CV_32FC1);
1806     CV_Assert(temp1.depth() == CV_32F);
1807     CV_Assert(temp1.cols <= 17 && temp1.rows <= 17);
1808
1809     dst.create(src.size(), src.type());
1810
1811     CV_Assert(src.cols == dst.cols && src.rows == dst.rows);
1812     CV_Assert(src.type() == dst.type());
1813
1814     Context  *clCxt = src.clCxt;
1815     int channels = dst.oclchannels();
1816     int depth = dst.depth();
1817
1818     size_t vector_length = 1;
1819     int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1);
1820     int cols = divUp(dst.cols * channels + offset_cols, vector_length);
1821     int rows = dst.rows;
1822
1823     size_t localThreads[3]  = { 16, 16, 1 };
1824     size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
1825                                 divUp(rows, localThreads[1]) *localThreads[1],
1826                                 1
1827                               };
1828
1829     vector<pair<size_t , const void *> > args;
1830     args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data ));
1831     args.push_back( make_pair( sizeof(cl_mem), (void *)&temp1.data ));
1832     args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
1833     args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows ));
1834     args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
1835     args.push_back( make_pair( sizeof(cl_int), (void *)&src.step ));
1836     args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
1837     args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.step ));
1838     args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.rows ));
1839     args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.cols ));
1840
1841     openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
1842 }
1843 void cv::ocl::convolve(const oclMat &x, const oclMat &t, oclMat &y)
1844 {
1845     CV_Assert(x.depth() == CV_32F);
1846     CV_Assert(t.depth() == CV_32F);
1847     CV_Assert(x.type() == y.type() && x.size() == y.size());
1848     y.create(x.size(), x.type());
1849     string kernelName = "convolve";
1850
1851     convolve_run(x, t, y, kernelName, &imgproc_convolve);
1852 }