catching OpenCL double not supported exceptions
[profile/ivi/opencv.git] / modules / ocl / src / imgproc.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // @Authors
19 //    Niko Li, newlife20080214@gmail.com
20 //    Jia Haipeng, jiahaipeng95@gmail.com
21 //    Shengen Yan, yanshengen@gmail.com
22 //    Rock Li, Rock.Li@amd.com
23 //    Zero Lin, Zero.Lin@amd.com
24 //    Zhang Ying, zhangying913@gmail.com
25 //    Xu Pang, pangxu010@163.com
26 //    Wu Zailong, bullet@yeah.net
27 //    Wenju He, wenju@multicorewareinc.com
28 //    Sen Liu, swjtuls1987@126.com
29 //
30 // Redistribution and use in source and binary forms, with or without modification,
31 // are permitted provided that the following conditions are met:
32 //
33 //   * Redistribution's of source code must retain the above copyright notice,
34 //     this list of conditions and the following disclaimer.
35 //
36 //   * Redistribution's in binary form must reproduce the above copyright notice,
37 //     this list of conditions and the following disclaimer in the documentation
38 //     and/or other oclMaterials provided with the distribution.
39 //
40 //   * The name of the copyright holders may not be used to endorse or promote products
41 //     derived from this software without specific prior written permission.
42 //
43 // This software is provided by the copyright holders and contributors "as is" and
44 // any express or implied warranties, including, but not limited to, the implied
45 // warranties of merchantability and fitness for a particular purpose are disclaimed.
46 // In no event shall the Intel Corporation or contributors be liable for any direct,
47 // indirect, incidental, special, exemplary, or consequential damages
48 // (including, but not limited to, procurement of substitute goods or services;
49 // loss of use, data, or profits; or business interruption) however caused
50 // and on any theory of liability, whether in contract, strict liability,
51 // or tort (including negligence or otherwise) arising in any way out of
52 // the use of this software, even if advised of the possibility of such damage.
53 //
54 //M*/
55
56 #include "precomp.hpp"
57 #include "opencl_kernels.hpp"
58
59 using namespace cv;
60 using namespace cv::ocl;
61
62 namespace cv
63 {
64     namespace ocl
65     {
66         ////////////////////////////////////OpenCL call wrappers////////////////////////////
67
68         template <typename T> struct index_and_sizeof;
69         template <> struct index_and_sizeof<char>
70         {
71             enum { index = 1 };
72         };
73         template <> struct index_and_sizeof<unsigned char>
74         {
75             enum { index = 2 };
76         };
77         template <> struct index_and_sizeof<short>
78         {
79             enum { index = 3 };
80         };
81         template <> struct index_and_sizeof<unsigned short>
82         {
83             enum { index = 4 };
84         };
85         template <> struct index_and_sizeof<int>
86         {
87             enum { index = 5 };
88         };
89         template <> struct index_and_sizeof<float>
90         {
91             enum { index = 6 };
92         };
93         template <> struct index_and_sizeof<double>
94         {
95             enum { index = 7 };
96         };
97
98         /////////////////////////////////////////////////////////////////////////////////////
99         // threshold
100
101         typedef void (*gpuThresh_t)(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type);
102
103         static void threshold_8u(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
104         {
105             CV_Assert( (src.cols == dst.cols) && (src.rows == dst.rows) );
106             Context *clCxt = src.clCxt;
107
108             uchar thresh_uchar = cvFloor(thresh);
109             uchar max_val = cvRound(maxVal);
110             string kernelName = "threshold";
111
112             size_t cols = (dst.cols + (dst.offset % 16) + 15) / 16;
113             size_t bSizeX = 16, bSizeY = 16;
114             size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX;
115             size_t gSizeY = dst.rows;
116             size_t globalThreads[3] = {gSizeX, gSizeY, 1};
117             size_t localThreads[3] = {bSizeX, bSizeY, 1};
118
119             vector< pair<size_t, const void *> > args;
120             args.push_back( make_pair(sizeof(cl_mem), &src.data));
121             args.push_back( make_pair(sizeof(cl_mem), &dst.data));
122             args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
123             args.push_back( make_pair(sizeof(cl_int), (void *)&src.step));
124             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
125             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
126             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
127             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
128             args.push_back( make_pair(sizeof(cl_uchar), (void *)&thresh_uchar));
129             args.push_back( make_pair(sizeof(cl_uchar), (void *)&max_val));
130             args.push_back( make_pair(sizeof(cl_int), (void *)&type));
131             openCLExecuteKernel(clCxt, &imgproc_threshold, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
132         }
133
134         static void threshold_32f(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
135         {
136             CV_Assert( (src.cols == dst.cols) && (src.rows == dst.rows) );
137             Context *clCxt = src.clCxt;
138
139             float thresh_f = thresh;
140             float max_val = maxVal;
141             int dst_offset = (dst.offset >> 2);
142             int dst_step = (dst.step >> 2);
143             int src_offset = (src.offset >> 2);
144             int src_step = (src.step >> 2);
145
146             string kernelName = "threshold";
147
148             size_t cols = (dst.cols + (dst_offset & 3) + 3) / 4;
149             //size_t cols = dst.cols;
150             size_t bSizeX = 16, bSizeY = 16;
151             size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX;
152             size_t gSizeY = dst.rows;
153             size_t globalThreads[3] = {gSizeX, gSizeY, 1};
154             size_t localThreads[3] = {bSizeX, bSizeY, 1};
155
156             vector< pair<size_t, const void *> > args;
157             args.push_back( make_pair(sizeof(cl_mem), &src.data));
158             args.push_back( make_pair(sizeof(cl_mem), &dst.data));
159             args.push_back( make_pair(sizeof(cl_int), (void *)&src_offset));
160             args.push_back( make_pair(sizeof(cl_int), (void *)&src_step));
161             args.push_back( make_pair(sizeof(cl_int), (void *)&dst_offset));
162             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
163             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
164             args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step));
165             args.push_back( make_pair(sizeof(cl_float), (void *)&thresh_f));
166             args.push_back( make_pair(sizeof(cl_float), (void *)&max_val));
167             args.push_back( make_pair(sizeof(cl_int), (void *)&type));
168             openCLExecuteKernel(clCxt, &imgproc_threshold, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
169
170         }
171
172         //threshold: support 8UC1 and 32FC1 data type and five threshold type
173         double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
174         {
175             //TODO: These limitations shall be removed later.
176             CV_Assert(src.type() == CV_8UC1 || src.type() == CV_32FC1);
177             CV_Assert(type == THRESH_BINARY || type == THRESH_BINARY_INV || type == THRESH_TRUNC
178                       || type == THRESH_TOZERO || type == THRESH_TOZERO_INV );
179
180             static const gpuThresh_t gpuThresh_callers[2] = {threshold_8u, threshold_32f};
181
182             dst.create( src.size(), src.type() );
183             gpuThresh_callers[(src.type() == CV_32FC1)](src, dst, thresh, maxVal, type);
184
185             return thresh;
186         }
187         ////////////////////////////////////////////////////////////////////////////////////////////
188         ///////////////////////////////   remap   //////////////////////////////////////////////////
189         ////////////////////////////////////////////////////////////////////////////////////////////
190
191         void remap( const oclMat &src, oclMat &dst, oclMat &map1, oclMat &map2, int interpolation, int borderType, const Scalar &borderValue )
192         {
193             Context *clCxt = src.clCxt;
194             CV_Assert(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST
195                       || interpolation == INTER_CUBIC || interpolation == INTER_LANCZOS4);
196             CV_Assert((map1.type() == CV_16SC2 && !map2.data) || (map1.type() == CV_32FC2 && !map2.data) || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1));
197             CV_Assert(!map2.data || map2.size() == map1.size());
198             CV_Assert(dst.size() == map1.size());
199
200             dst.create(map1.size(), src.type());
201
202
203             string kernelName;
204
205             if( map1.type() == CV_32FC2 && !map2.data )
206             {
207                 if(interpolation == INTER_LINEAR && borderType == BORDER_CONSTANT)
208                     kernelName = "remapLNFConstant";
209                 else if(interpolation == INTER_NEAREST && borderType == BORDER_CONSTANT)
210                     kernelName = "remapNNFConstant";
211             }
212             else if(map1.type() == CV_16SC2 && !map2.data)
213             {
214                 if(interpolation == INTER_LINEAR && borderType == BORDER_CONSTANT)
215                     kernelName = "remapLNSConstant";
216                 else if(interpolation == INTER_NEAREST && borderType == BORDER_CONSTANT)
217                     kernelName = "remapNNSConstant";
218
219             }
220             else if(map1.type() == CV_32FC1 && map2.type() == CV_32FC1)
221             {
222                 if(interpolation == INTER_LINEAR && borderType == BORDER_CONSTANT)
223                     kernelName = "remapLNF1Constant";
224                 else if (interpolation == INTER_NEAREST && borderType == BORDER_CONSTANT)
225                     kernelName = "remapNNF1Constant";
226             }
227
228             size_t blkSizeX = 16, blkSizeY = 16;
229             size_t glbSizeX;
230             int cols = dst.cols;
231             if(src.type() == CV_8UC1)
232             {
233                 cols = (dst.cols + dst.offset % 4 + 3) / 4;
234                 glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
235
236             }
237             else if(src.type() == CV_32FC1 && interpolation == INTER_LINEAR)
238             {
239                 cols = (dst.cols + (dst.offset >> 2) % 4 + 3) / 4;
240                 glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
241             }
242             else
243             {
244                 glbSizeX = dst.cols % blkSizeX == 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
245
246             }
247
248             size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
249             size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
250             size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
251
252             float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
253             vector< pair<size_t, const void *> > args;
254             if(map1.channels() == 2)
255             {
256                 args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
257                 args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
258                 args.push_back( make_pair(sizeof(cl_mem), (void *)&map1.data));
259                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
260                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
261                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.offset));
262                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
263                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.step));
264                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.step));
265                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
266                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
267                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
268                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
269                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.cols));
270                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows));
271                 args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
272
273                 if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
274                 {
275                     args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue));
276                 }
277                 else
278                 {
279                     args.push_back( make_pair(sizeof(cl_float4), (void *)&borderFloat));
280                 }
281             }
282             if(map1.channels() == 1)
283             {
284                 args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
285                 args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
286                 args.push_back( make_pair(sizeof(cl_mem), (void *)&map1.data));
287                 args.push_back( make_pair(sizeof(cl_mem), (void *)&map2.data));
288                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
289                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
290                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.offset));
291                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
292                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.step));
293                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.step));
294                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
295                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
296                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
297                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
298                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.cols));
299                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows));
300                 args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
301                 if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
302                 {
303                     args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue));
304                 }
305                 else
306                 {
307                     args.push_back( make_pair(sizeof(cl_float4), (void *)&borderFloat));
308                 }
309             }
310             openCLExecuteKernel(clCxt, &imgproc_remap, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
311         }
312
313         ////////////////////////////////////////////////////////////////////////////////////////////
314         // resize
315
316         static void resize_gpu( const oclMat &src, oclMat &dst, double fx, double fy, int interpolation)
317         {
318             CV_Assert( (src.channels() == dst.channels()) );
319             Context *clCxt = src.clCxt;
320             float ifx = 1. / fx;
321             float ify = 1. / fy;
322             double ifx_d = 1. / fx;
323             double ify_d = 1. / fy;
324             int srcStep_in_pixel = src.step1() / src.oclchannels();
325             int srcoffset_in_pixel = src.offset / src.elemSize();
326             int dstStep_in_pixel = dst.step1() / dst.oclchannels();
327             int dstoffset_in_pixel = dst.offset / dst.elemSize();
328             //printf("%d %d\n",src.step1() , dst.elemSize());
329             string kernelName;
330             if(interpolation == INTER_LINEAR)
331                 kernelName = "resizeLN";
332             else if(interpolation == INTER_NEAREST)
333                 kernelName = "resizeNN";
334
335             //TODO: improve this kernel
336             size_t blkSizeX = 16, blkSizeY = 16;
337             size_t glbSizeX;
338             if(src.type() == CV_8UC1)
339             {
340                 size_t cols = (dst.cols + dst.offset % 4 + 3) / 4;
341                 glbSizeX = cols % blkSizeX == 0 && cols != 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
342             }
343             else
344             {
345                 glbSizeX = dst.cols % blkSizeX == 0 && dst.cols != 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
346             }
347             size_t glbSizeY = dst.rows % blkSizeY == 0 && dst.rows != 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
348             size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
349             size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
350
351             vector< pair<size_t, const void *> > args;
352             if(interpolation == INTER_NEAREST)
353             {
354                 args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
355                 args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
356                 args.push_back( make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel));
357                 args.push_back( make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel));
358                 args.push_back( make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel));
359                 args.push_back( make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel));
360                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
361                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
362                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
363                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
364                 if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
365                 {
366                     args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d));
367                     args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d));
368                 }
369                 else
370                 {
371                     args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
372                     args.push_back( make_pair(sizeof(cl_float), (void *)&ify));
373                 }
374             }
375             else
376             {
377                 args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
378                 args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
379                 args.push_back( make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel));
380                 args.push_back( make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel));
381                 args.push_back( make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel));
382                 args.push_back( make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel));
383                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
384                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
385                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
386                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
387                 args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
388                 args.push_back( make_pair(sizeof(cl_float), (void *)&ify));
389             }
390
391             openCLExecuteKernel(clCxt, &imgproc_resize, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
392         }
393
394
395         void resize(const oclMat &src, oclMat &dst, Size dsize,
396                     double fx, double fy, int interpolation)
397         {
398             CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3 || src.type() == CV_8UC4
399                       || src.type() == CV_32FC1 || src.type() == CV_32FC3 || src.type() == CV_32FC4);
400             CV_Assert(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST);
401             CV_Assert( src.size().area() > 0 );
402             CV_Assert( !(dsize == Size()) || (fx > 0 && fy > 0) );
403
404             if(!(dsize == Size()) && (fx > 0 && fy > 0))
405             {
406                 if(dsize.width != (int)(src.cols * fx) || dsize.height != (int)(src.rows * fy))
407                 {
408                     CV_Error(CV_StsUnmatchedSizes, "invalid dsize and fx, fy!");
409                 }
410             }
411             if( dsize == Size() )
412             {
413                 dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
414             }
415             else
416             {
417                 fx = (double)dsize.width / src.cols;
418                 fy = (double)dsize.height / src.rows;
419             }
420
421             dst.create(dsize, src.type());
422
423             if( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR )
424             {
425                 resize_gpu( src, dst, fx, fy, interpolation);
426                 return;
427             }
428             CV_Error(CV_StsUnsupportedFormat, "Non-supported interpolation method");
429         }
430
431
432         ////////////////////////////////////////////////////////////////////////
433         // medianFilter
434         void medianFilter(const oclMat &src, oclMat &dst, int m)
435         {
436             CV_Assert( m % 2 == 1 && m > 1 );
437             CV_Assert( m <= 5 || src.depth() == CV_8U );
438             CV_Assert( src.cols <= dst.cols && src.rows <= dst.rows );
439
440             if(src.data == dst.data)
441             {
442                 oclMat src1;
443                 src.copyTo(src1);
444                 return medianFilter(src1, dst, m);
445             }
446
447             int srcStep = src.step1() / src.oclchannels();
448             int dstStep = dst.step1() / dst.oclchannels();
449             int srcOffset = src.offset / src.oclchannels() / src.elemSize1();
450             int dstOffset = dst.offset / dst.oclchannels() / dst.elemSize1();
451
452             Context *clCxt = src.clCxt;
453             string kernelName = "medianFilter";
454
455
456             vector< pair<size_t, const void *> > args;
457             args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
458             args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
459             args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset));
460             args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset));
461             args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
462             args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
463             args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep));
464             args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep));
465
466             size_t globalThreads[3] = {(src.cols + 18) / 16 * 16, (src.rows + 15) / 16 * 16, 1};
467             size_t localThreads[3] = {16, 16, 1};
468
469             if(m == 3)
470             {
471                 string kernelName = "medianFilter3";
472                 openCLExecuteKernel(clCxt, &imgproc_median, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
473             }
474             else if(m == 5)
475             {
476                 string kernelName = "medianFilter5";
477                 openCLExecuteKernel(clCxt, &imgproc_median, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
478             }
479             else
480                 CV_Error(CV_StsUnsupportedFormat, "Non-supported filter length");
481         }
482
483         ////////////////////////////////////////////////////////////////////////
484         // copyMakeBorder
485         void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int bordertype, const Scalar &scalar)
486         {
487             CV_Assert(top >= 0 && bottom >= 0 && left >= 0 && right >= 0);
488             if((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi
489             {
490                 if(((bordertype & cv::BORDER_ISOLATED) == 0) &&
491                         (bordertype != cv::BORDER_CONSTANT) &&
492                         (bordertype != cv::BORDER_REPLICATE))
493                 {
494                     CV_Error(CV_StsBadArg, "unsupported border type");
495                 }
496             }
497             bordertype &= ~cv::BORDER_ISOLATED;
498             if((bordertype == cv::BORDER_REFLECT) || (bordertype == cv::BORDER_WRAP))
499             {
500                 CV_Assert((src.cols >= left) && (src.cols >= right) && (src.rows >= top) && (src.rows >= bottom));
501             }
502
503             if(bordertype == cv::BORDER_REFLECT_101)
504             {
505                 CV_Assert((src.cols > left) && (src.cols > right) && (src.rows > top) && (src.rows > bottom));
506             }
507
508             dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
509             int srcStep = src.step1() / src.oclchannels();
510             int dstStep = dst.step1() / dst.oclchannels();
511             int srcOffset = src.offset / src.elemSize();
512             int dstOffset = dst.offset / dst.elemSize();
513             int __bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE, BORDER_REFLECT, BORDER_WRAP, BORDER_REFLECT_101};
514             const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101"};
515             size_t bordertype_index;
516             for(bordertype_index = 0; bordertype_index < sizeof(__bordertype) / sizeof(int); bordertype_index++)
517             {
518                 if(__bordertype[bordertype_index] == bordertype)
519                     break;
520             }
521             if(bordertype_index == sizeof(__bordertype) / sizeof(int))
522             {
523                 CV_Error(CV_StsBadArg, "unsupported border type");
524             }
525             string kernelName = "copymakeborder";
526             size_t localThreads[3] = {16, 16, 1};
527             size_t globalThreads[3] = {(dst.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0],
528                                        (dst.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1], 1
529                                       };
530
531             vector< pair<size_t, const void *> > args;
532             args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
533             args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
534             args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols));
535             args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows));
536             args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
537             args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
538             args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep));
539             args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset));
540             args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep));
541             args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset));
542             args.push_back( make_pair( sizeof(cl_int), (void *)&top));
543             args.push_back( make_pair( sizeof(cl_int), (void *)&left));
544             char compile_option[64];
545             union sc
546             {
547                 cl_uchar4 uval;
548                 cl_char4  cval;
549                 cl_ushort4 usval;
550                 cl_short4 shval;
551                 cl_int4 ival;
552                 cl_float4 fval;
553                 cl_double4 dval;
554             } val;
555             switch(dst.depth())
556             {
557             case CV_8U:
558                 val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
559                 val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
560                 val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
561                 val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]);
562                 switch(dst.oclchannels())
563                 {
564                 case 1:
565                     sprintf(compile_option, "-D GENTYPE=uchar -D %s", borderstr[bordertype_index]);
566                     args.push_back( make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] ));
567                     if(((dst.offset & 3) == 0) && ((dst.cols & 3) == 0))
568                     {
569                         kernelName = "copymakeborder_C1_D0";
570                         globalThreads[0] = (dst.cols / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
571                     }
572                     break;
573                 case 4:
574                     sprintf(compile_option, "-D GENTYPE=uchar4 -D %s", borderstr[bordertype_index]);
575                     args.push_back( make_pair( sizeof(cl_uchar4) , (void *)&val.uval ));
576                     break;
577                 default:
578                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
579                 }
580                 break;
581             case CV_8S:
582                 val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
583                 val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
584                 val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
585                 val.cval.s[3] = saturate_cast<char>(scalar.val[3]);
586                 switch(dst.oclchannels())
587                 {
588                 case 1:
589                     sprintf(compile_option, "-D GENTYPE=char -D %s", borderstr[bordertype_index]);
590                     args.push_back( make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] ));
591                     break;
592                 case 4:
593                     sprintf(compile_option, "-D GENTYPE=char4 -D %s", borderstr[bordertype_index]);
594                     args.push_back( make_pair( sizeof(cl_char4) , (void *)&val.cval ));
595                     break;
596                 default:
597                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
598                 }
599                 break;
600             case CV_16U:
601                 val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
602                 val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
603                 val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
604                 val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]);
605                 switch(dst.oclchannels())
606                 {
607                 case 1:
608                     sprintf(compile_option, "-D GENTYPE=ushort -D %s", borderstr[bordertype_index]);
609                     args.push_back( make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] ));
610                     break;
611                 case 4:
612                     sprintf(compile_option, "-D GENTYPE=ushort4 -D %s", borderstr[bordertype_index]);
613                     args.push_back( make_pair( sizeof(cl_ushort4) , (void *)&val.usval ));
614                     break;
615                 default:
616                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
617                 }
618                 break;
619             case CV_16S:
620                 val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
621                 val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
622                 val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
623                 val.shval.s[3] = saturate_cast<short>(scalar.val[3]);
624                 switch(dst.oclchannels())
625                 {
626                 case 1:
627                     sprintf(compile_option, "-D GENTYPE=short -D %s", borderstr[bordertype_index]);
628                     args.push_back( make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] ));
629                     break;
630                 case 4:
631                     sprintf(compile_option, "-D GENTYPE=short4 -D %s", borderstr[bordertype_index]);
632                     args.push_back( make_pair( sizeof(cl_short4) , (void *)&val.shval ));
633                     break;
634                 default:
635                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
636                 }
637                 break;
638             case CV_32S:
639                 val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
640                 val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
641                 val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
642                 val.ival.s[3] = saturate_cast<int>(scalar.val[3]);
643                 switch(dst.oclchannels())
644                 {
645                 case 1:
646                     sprintf(compile_option, "-D GENTYPE=int -D %s", borderstr[bordertype_index]);
647                     args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
648                     break;
649                 case 2:
650                     sprintf(compile_option, "-D GENTYPE=int2 -D %s", borderstr[bordertype_index]);
651                     cl_int2 i2val;
652                     i2val.s[0] = val.ival.s[0];
653                     i2val.s[1] = val.ival.s[1];
654                     args.push_back( make_pair( sizeof(cl_int2) , (void *)&i2val ));
655                     break;
656                 case 4:
657                     sprintf(compile_option, "-D GENTYPE=int4 -D %s", borderstr[bordertype_index]);
658                     args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival ));
659                     break;
660                 default:
661                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
662                 }
663                 break;
664             case CV_32F:
665                 val.fval.s[0] = scalar.val[0];
666                 val.fval.s[1] = scalar.val[1];
667                 val.fval.s[2] = scalar.val[2];
668                 val.fval.s[3] = scalar.val[3];
669                 switch(dst.oclchannels())
670                 {
671                 case 1:
672                     sprintf(compile_option, "-D GENTYPE=float -D %s", borderstr[bordertype_index]);
673                     args.push_back( make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] ));
674                     break;
675                 case 4:
676                     sprintf(compile_option, "-D GENTYPE=float4 -D %s", borderstr[bordertype_index]);
677                     args.push_back( make_pair( sizeof(cl_float4) , (void *)&val.fval ));
678                     break;
679                 default:
680                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
681                 }
682                 break;
683             case CV_64F:
684                 val.dval.s[0] = scalar.val[0];
685                 val.dval.s[1] = scalar.val[1];
686                 val.dval.s[2] = scalar.val[2];
687                 val.dval.s[3] = scalar.val[3];
688                 switch(dst.oclchannels())
689                 {
690                 case 1:
691                     sprintf(compile_option, "-D GENTYPE=double -D %s", borderstr[bordertype_index]);
692                     args.push_back( make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] ));
693                     break;
694                 case 4:
695                     sprintf(compile_option, "-D GENTYPE=double4 -D %s", borderstr[bordertype_index]);
696                     args.push_back( make_pair( sizeof(cl_double4) , (void *)&val.dval ));
697                     break;
698                 default:
699                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
700                 }
701                 break;
702             default:
703                 CV_Error(CV_StsUnsupportedFormat, "unknown depth");
704             }
705
706             openCLExecuteKernel(src.clCxt, &imgproc_copymakeboder, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
707         }
708
709         ////////////////////////////////////////////////////////////////////////
710         // warp
711
712         namespace
713         {
714 #define F double
715
716             void convert_coeffs(F *M)
717             {
718                 double D = M[0] * M[4] - M[1] * M[3];
719                 D = D != 0 ? 1. / D : 0;
720                 double A11 = M[4] * D, A22 = M[0] * D;
721                 M[0] = A11;
722                 M[1] *= -D;
723                 M[3] *= -D;
724                 M[4] = A22;
725                 double b1 = -M[0] * M[2] - M[1] * M[5];
726                 double b2 = -M[3] * M[2] - M[4] * M[5];
727                 M[2] = b1;
728                 M[5] = b2;
729             }
730
731             double invert(double *M)
732             {
733 #define Sd(y,x) (Sd[y*3+x])
734 #define Dd(y,x) (Dd[y*3+x])
735 #define det3(m)    (m(0,0)*(m(1,1)*m(2,2) - m(1,2)*m(2,1)) -  \
736                     m(0,1)*(m(1,0)*m(2,2) - m(1,2)*m(2,0)) +  \
737                     m(0,2)*(m(1,0)*m(2,1) - m(1,1)*m(2,0)))
738                 double *Sd = M;
739                 double *Dd = M;
740                 double d = det3(Sd);
741                 double result = 0;
742                 if( d != 0)
743                 {
744                     double t[9];
745                     result = d;
746                     d = 1. / d;
747
748                     t[0] = (Sd(1, 1) * Sd(2, 2) - Sd(1, 2) * Sd(2, 1)) * d;
749                     t[1] = (Sd(0, 2) * Sd(2, 1) - Sd(0, 1) * Sd(2, 2)) * d;
750                     t[2] = (Sd(0, 1) * Sd(1, 2) - Sd(0, 2) * Sd(1, 1)) * d;
751
752                     t[3] = (Sd(1, 2) * Sd(2, 0) - Sd(1, 0) * Sd(2, 2)) * d;
753                     t[4] = (Sd(0, 0) * Sd(2, 2) - Sd(0, 2) * Sd(2, 0)) * d;
754                     t[5] = (Sd(0, 2) * Sd(1, 0) - Sd(0, 0) * Sd(1, 2)) * d;
755
756                     t[6] = (Sd(1, 0) * Sd(2, 1) - Sd(1, 1) * Sd(2, 0)) * d;
757                     t[7] = (Sd(0, 1) * Sd(2, 0) - Sd(0, 0) * Sd(2, 1)) * d;
758                     t[8] = (Sd(0, 0) * Sd(1, 1) - Sd(0, 1) * Sd(1, 0)) * d;
759
760                     Dd(0, 0) = t[0];
761                     Dd(0, 1) = t[1];
762                     Dd(0, 2) = t[2];
763                     Dd(1, 0) = t[3];
764                     Dd(1, 1) = t[4];
765                     Dd(1, 2) = t[5];
766                     Dd(2, 0) = t[6];
767                     Dd(2, 1) = t[7];
768                     Dd(2, 2) = t[8];
769                 }
770                 return result;
771             }
772
773             void warpAffine_gpu(const oclMat &src, oclMat &dst, F coeffs[2][3], int interpolation)
774             {
775                 CV_Assert( (src.oclchannels() == dst.oclchannels()) );
776                 int srcStep = src.step1();
777                 int dstStep = dst.step1();
778                 float float_coeffs[2][3];
779                 cl_mem coeffs_cm;
780
781                 Context *clCxt = src.clCxt;
782                 string s[3] = {"NN", "Linear", "Cubic"};
783                 string kernelName = "warpAffine" + s[interpolation];
784
785
786                 if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
787                 {
788                     cl_int st;
789                     coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(F) * 2 * 3, NULL, &st );
790                     openCLVerifyCall(st);
791                     openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(F) * 2 * 3, coeffs, 0, 0, 0));
792                 }
793                 else
794                 {
795                     cl_int st;
796                     for(int m = 0; m < 2; m++)
797                         for(int n = 0; n < 3; n++)
798                         {
799                             float_coeffs[m][n] = coeffs[m][n];
800                         }
801                         coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(float) * 2 * 3, NULL, &st );
802                         openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 2 * 3, float_coeffs, 0, 0, 0));
803
804                 }
805                 //TODO: improve this kernel
806                 size_t blkSizeX = 16, blkSizeY = 16;
807                 size_t glbSizeX;
808                 size_t cols;
809                 //if(src.type() == CV_8UC1 && interpolation != 2)
810                 if(src.type() == CV_8UC1 && interpolation != 2)
811                 {
812                     cols = (dst.cols + dst.offset % 4 + 3) / 4;
813                     glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
814                 }
815                 else
816                 {
817                     cols = dst.cols;
818                     glbSizeX = dst.cols % blkSizeX == 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
819                 }
820                 size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
821                 size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
822                 size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
823
824                 vector< pair<size_t, const void *> > args;
825
826                 args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
827                 args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
828                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols));
829                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows));
830                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.cols));
831                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.rows));
832                 args.push_back(make_pair(sizeof(cl_int), (void *)&srcStep));
833                 args.push_back(make_pair(sizeof(cl_int), (void *)&dstStep));
834                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.offset));
835                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.offset));
836                 args.push_back(make_pair(sizeof(cl_mem), (void *)&coeffs_cm));
837                 args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
838
839                 openCLExecuteKernel(clCxt, &imgproc_warpAffine, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
840                 openCLSafeCall(clReleaseMemObject(coeffs_cm));
841             }
842
843
844             void warpPerspective_gpu(const oclMat &src, oclMat &dst, double coeffs[3][3], int interpolation)
845             {
846                 CV_Assert( (src.oclchannels() == dst.oclchannels()) );
847                 int srcStep = src.step1();
848                 int dstStep = dst.step1();
849                 float float_coeffs[3][3];
850                 cl_mem coeffs_cm;
851
852                 Context *clCxt = src.clCxt;
853                 string s[3] = {"NN", "Linear", "Cubic"};
854                 string kernelName = "warpPerspective" + s[interpolation];
855
856                 if(src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
857                 {
858                     cl_int st;
859                     coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(double) * 3 * 3, NULL, &st );
860                     openCLVerifyCall(st);
861                     openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(double) * 3 * 3, coeffs, 0, 0, 0));
862                 }
863                 else
864                 {
865                     cl_int st;
866                     for(int m = 0; m < 3; m++)
867                         for(int n = 0; n < 3; n++)
868                             float_coeffs[m][n] = coeffs[m][n];
869
870                     coeffs_cm = clCreateBuffer(*(cl_context*)clCxt->getOpenCLContextPtr(), CL_MEM_READ_WRITE, sizeof(float) * 3 * 3, NULL, &st );
871                     openCLVerifyCall(st);
872                     openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)clCxt->getOpenCLCommandQueuePtr(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 3 * 3, float_coeffs, 0, 0, 0));
873                 }
874                 //TODO: improve this kernel
875                 size_t blkSizeX = 16, blkSizeY = 16;
876                 size_t glbSizeX;
877                 size_t cols;
878                 if(src.type() == CV_8UC1 && interpolation == 0)
879                 {
880                     cols = (dst.cols + dst.offset % 4 + 3) / 4;
881                     glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
882                 }
883                 else
884                     /*
885                     */
886                 {
887                     cols = dst.cols;
888                     glbSizeX = dst.cols % blkSizeX == 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
889                 }
890                 size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
891                 size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
892                 size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
893
894                 vector< pair<size_t, const void *> > args;
895
896                 args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
897                 args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
898                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols));
899                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows));
900                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.cols));
901                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.rows));
902                 args.push_back(make_pair(sizeof(cl_int), (void *)&srcStep));
903                 args.push_back(make_pair(sizeof(cl_int), (void *)&dstStep));
904                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.offset));
905                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.offset));
906                 args.push_back(make_pair(sizeof(cl_mem), (void *)&coeffs_cm));
907                 args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
908
909                 openCLExecuteKernel(clCxt, &imgproc_warpPerspective, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
910                 openCLSafeCall(clReleaseMemObject(coeffs_cm));
911             }
912         }
913
914         void warpAffine(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags)
915         {
916             int interpolation = flags & INTER_MAX;
917
918             CV_Assert((src.depth() == CV_8U  || src.depth() == CV_32F) && src.oclchannels() != 2 && src.oclchannels() != 3);
919             CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
920
921             dst.create(dsize, src.type());
922
923             CV_Assert(M.rows == 2 && M.cols == 3);
924
925             int warpInd = (flags & WARP_INVERSE_MAP) >> 4;
926             F coeffs[2][3];
927
928             double coeffsM[2*3];
929             Mat coeffsMat(2, 3, CV_64F, (void *)coeffsM);
930             M.convertTo(coeffsMat, coeffsMat.type());
931             if(!warpInd)
932             {
933                 convert_coeffs(coeffsM);
934             }
935
936             for(int i = 0; i < 2; ++i)
937                 for(int j = 0; j < 3; ++j)
938                     coeffs[i][j] = coeffsM[i*3+j];
939
940             warpAffine_gpu(src, dst, coeffs, interpolation);
941         }
942
943         void warpPerspective(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags)
944         {
945             int interpolation = flags & INTER_MAX;
946
947             CV_Assert((src.depth() == CV_8U  || src.depth() == CV_32F) && src.oclchannels() != 2 && src.oclchannels() != 3);
948             CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
949
950             dst.create(dsize, src.type());
951
952
953             CV_Assert(M.rows == 3 && M.cols == 3);
954
955             int warpInd = (flags & WARP_INVERSE_MAP) >> 4;
956             double coeffs[3][3];
957
958             double coeffsM[3*3];
959             Mat coeffsMat(3, 3, CV_64F, (void *)coeffsM);
960             M.convertTo(coeffsMat, coeffsMat.type());
961             if(!warpInd)
962             {
963                 invert(coeffsM);
964             }
965
966             for(int i = 0; i < 3; ++i)
967                 for(int j = 0; j < 3; ++j)
968                     coeffs[i][j] = coeffsM[i*3+j];
969
970             warpPerspective_gpu(src, dst, coeffs, interpolation);
971         }
972
973         ////////////////////////////////////////////////////////////////////////
974         // integral
975         void integral(const oclMat &src, oclMat &sum, oclMat &sqsum)
976         {
977             CV_Assert(src.type() == CV_8UC1);
978             if(!src.clCxt->supportsFeature(ocl::FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
979             {
980                 CV_Error(CV_OpenCLDoubleNotSupported, "select device don't support double");
981                 return;
982             }
983
984             int vlen = 4;
985             int offset = src.offset / vlen;
986             int pre_invalid = src.offset % vlen;
987             int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
988
989             oclMat t_sum , t_sqsum;
990             int w = src.cols + 1, h = src.rows + 1;
991             int depth = src.depth() == CV_8U ? CV_32S : CV_64F;
992             int type = CV_MAKE_TYPE(depth, 1);
993
994             t_sum.create(src.cols, src.rows, type);
995             sum.create(h, w, type);
996
997             t_sqsum.create(src.cols, src.rows, CV_32FC1);
998             sqsum.create(h, w, CV_32FC1);
999
1000             int sum_offset = sum.offset / vlen;
1001             int sqsum_offset = sqsum.offset / vlen;
1002
1003             vector<pair<size_t , const void *> > args;
1004             args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
1005             args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
1006             args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sqsum.data ));
1007             args.push_back( make_pair( sizeof(cl_int) , (void *)&offset ));
1008             args.push_back( make_pair( sizeof(cl_int) , (void *)&pre_invalid ));
1009             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
1010             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
1011             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
1012             args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step));
1013             size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1};
1014             openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_cols", gt, lt, args, -1, depth);
1015
1016             args.clear();
1017             args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
1018             args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sqsum.data ));
1019             args.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data ));
1020             args.push_back( make_pair( sizeof(cl_mem) , (void *)&sqsum.data ));
1021             args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.rows ));
1022             args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.cols ));
1023             args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step ));
1024             args.push_back( make_pair( sizeof(cl_int) , (void *)&sum.step));
1025             args.push_back( make_pair( sizeof(cl_int) , (void *)&sqsum.step));
1026             args.push_back( make_pair( sizeof(cl_int) , (void *)&sum_offset));
1027             args.push_back( make_pair( sizeof(cl_int) , (void *)&sqsum_offset));
1028             size_t gt2[3] = {t_sum.cols  * 32, 1, 1}, lt2[3] = {256, 1, 1};
1029             openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_rows", gt2, lt2, args, -1, depth);
1030         }
1031
1032         void integral(const oclMat &src, oclMat &sum)
1033         {
1034             CV_Assert(src.type() == CV_8UC1);
1035             int vlen = 4;
1036             int offset = src.offset / vlen;
1037             int pre_invalid = src.offset % vlen;
1038             int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
1039
1040             oclMat t_sum;
1041             int w = src.cols + 1, h = src.rows + 1;
1042             int depth = src.depth() == CV_8U ? CV_32S : CV_32F;
1043             int type = CV_MAKE_TYPE(depth, 1);
1044
1045             t_sum.create(src.cols, src.rows, type);
1046             sum.create(h, w, type);
1047
1048             int sum_offset = sum.offset / vlen;
1049             vector<pair<size_t , const void *> > args;
1050             args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
1051             args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
1052             args.push_back( make_pair( sizeof(cl_int) , (void *)&offset ));
1053             args.push_back( make_pair( sizeof(cl_int) , (void *)&pre_invalid ));
1054             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
1055             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
1056             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
1057             args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step));
1058             size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1};
1059             openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_cols", gt, lt, args, -1, depth);
1060
1061             args.clear();
1062             args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
1063             args.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data ));
1064             args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.rows ));
1065             args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.cols ));
1066             args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step ));
1067             args.push_back( make_pair( sizeof(cl_int) , (void *)&sum.step));
1068             args.push_back( make_pair( sizeof(cl_int) , (void *)&sum_offset));
1069             size_t gt2[3] = {t_sum.cols  * 32, 1, 1}, lt2[3] = {256, 1, 1};
1070             openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_rows", gt2, lt2, args, -1, depth);
1071         }
1072
1073         /////////////////////// corner //////////////////////////////
1074         static void extractCovData(const oclMat &src, oclMat &Dx, oclMat &Dy,
1075                             int blockSize, int ksize, int borderType)
1076         {
1077             CV_Assert(src.type() == CV_8UC1 || src.type() == CV_32FC1);
1078             double scale = static_cast<double>(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize;
1079             if (ksize < 0)
1080                 scale *= 2.;
1081
1082             if (src.depth() == CV_8U)
1083             {
1084                 scale *= 255.;
1085                 scale = 1. / scale;
1086             }
1087             else
1088             {
1089                 scale = 1. / scale;
1090             }
1091             if (ksize > 0)
1092             {
1093                 Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType);
1094                 Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType);
1095             }
1096             else
1097             {
1098                 Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType);
1099                 Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType);
1100             }
1101             CV_Assert(Dx.offset == 0 && Dy.offset == 0);
1102         }
1103
1104         static void corner_ocl(const cv::ocl::ProgramEntry* source, string kernelName, int block_size, float k, oclMat &Dx, oclMat &Dy,
1105                         oclMat &dst, int border_type)
1106         {
1107             char borderType[30];
1108             switch (border_type)
1109             {
1110             case cv::BORDER_CONSTANT:
1111                 sprintf(borderType, "BORDER_CONSTANT");
1112                 break;
1113             case cv::BORDER_REFLECT101:
1114                 sprintf(borderType, "BORDER_REFLECT101");
1115                 break;
1116             case cv::BORDER_REFLECT:
1117                 sprintf(borderType, "BORDER_REFLECT");
1118                 break;
1119             case cv::BORDER_REPLICATE:
1120                 sprintf(borderType, "BORDER_REPLICATE");
1121                 break;
1122             default:
1123                 cout << "BORDER type is not supported!" << endl;
1124             }
1125             char build_options[150];
1126             sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s",
1127                     block_size / 2, block_size / 2, block_size, block_size, borderType);
1128
1129             size_t blockSizeX = 256, blockSizeY = 1;
1130             size_t gSize = blockSizeX - block_size / 2 * 2;
1131             size_t globalSizeX = (Dx.cols) % gSize == 0 ? Dx.cols / gSize * blockSizeX : (Dx.cols / gSize + 1) * blockSizeX;
1132             size_t rows_per_thread = 2;
1133             size_t globalSizeY = ((Dx.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ?
1134                                  ((Dx.rows + rows_per_thread - 1) / rows_per_thread) :
1135                                  (((Dx.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY;
1136
1137             size_t gt[3] = { globalSizeX, globalSizeY, 1 };
1138             size_t lt[3]  = { blockSizeX, blockSizeY, 1 };
1139             vector<pair<size_t , const void *> > args;
1140             args.push_back( make_pair( sizeof(cl_mem) , (void *)&Dx.data ));
1141             args.push_back( make_pair( sizeof(cl_mem) , (void *)&Dy.data));
1142             args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
1143             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dx.offset ));
1144             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dx.wholerows ));
1145             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dx.wholecols ));
1146             args.push_back( make_pair(sizeof(cl_int), (void *)&Dx.step));
1147             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dy.offset ));
1148             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dy.wholerows ));
1149             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dy.wholecols ));
1150             args.push_back( make_pair(sizeof(cl_int), (void *)&Dy.step));
1151             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
1152             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
1153             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
1154             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
1155             args.push_back( make_pair( sizeof(cl_float) , (void *)&k));
1156             openCLExecuteKernel(dst.clCxt, source, kernelName, gt, lt, args, -1, -1, build_options);
1157         }
1158
1159         void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize,
1160                           double k, int borderType)
1161         {
1162             oclMat dx, dy;
1163             cornerHarris_dxdy(src, dst, dx, dy, blockSize, ksize, k, borderType);
1164         }
1165
1166         void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize,
1167                           double k, int borderType)
1168         {
1169             if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
1170             {
1171                 CV_Error(CV_OpenCLDoubleNotSupported, "select device don't support double");
1172             }
1173             CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
1174             CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
1175             extractCovData(src, dx, dy, blockSize, ksize, borderType);
1176             dst.create(src.size(), CV_32F);
1177             corner_ocl(&imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), dx, dy, dst, borderType);
1178         }
1179
1180         void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
1181         {
1182             oclMat dx, dy;
1183             cornerMinEigenVal_dxdy(src, dst, dx, dy, blockSize, ksize, borderType);
1184         }
1185
1186         void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize, int borderType)
1187         {
1188             if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
1189             {
1190                 CV_Error(CV_OpenCLDoubleNotSupported, "select device don't support double");
1191             }
1192             CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
1193             CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
1194             extractCovData(src, dx, dy, blockSize, ksize, borderType);
1195             dst.create(src.size(), CV_32F);
1196             corner_ocl(&imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, dx, dy, dst, borderType);
1197         }
1198         /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
1199         static void meanShiftFiltering_gpu(const oclMat &src, oclMat dst, int sp, int sr, int maxIter, float eps)
1200         {
1201             CV_Assert( (src.cols == dst.cols) && (src.rows == dst.rows) );
1202             CV_Assert( !(dst.step & 0x3) );
1203             Context *clCxt = src.clCxt;
1204
1205             //Arrange the NDRange
1206             int col = src.cols, row = src.rows;
1207             int ltx = 16, lty = 8;
1208             if(src.cols % ltx != 0)
1209                 col = (col / ltx + 1) * ltx;
1210             if(src.rows % lty != 0)
1211                 row = (row / lty + 1) * lty;
1212
1213             size_t globalThreads[3] = {col, row, 1};
1214             size_t localThreads[3]  = {ltx, lty, 1};
1215
1216             //set args
1217             vector<pair<size_t , const void *> > args;
1218             args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
1219             args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.step ));
1220             args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
1221             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
1222             args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.offset ));
1223             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.offset ));
1224             args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols ));
1225             args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows ));
1226             args.push_back( make_pair( sizeof(cl_int) , (void *)&sp ));
1227             args.push_back( make_pair( sizeof(cl_int) , (void *)&sr ));
1228             args.push_back( make_pair( sizeof(cl_int) , (void *)&maxIter ));
1229             args.push_back( make_pair( sizeof(cl_float) , (void *)&eps ));
1230             openCLExecuteKernel(clCxt, &meanShift, "meanshift_kernel", globalThreads, localThreads, args, -1, -1);
1231         }
1232
1233         void meanShiftFiltering(const oclMat &src, oclMat &dst, int sp, int sr, TermCriteria criteria)
1234         {
1235             if( src.empty() )
1236                 CV_Error( CV_StsBadArg, "The input image is empty" );
1237
1238             if( src.depth() != CV_8U || src.oclchannels() != 4 )
1239                 CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
1240
1241             dst.create( src.size(), CV_8UC4 );
1242
1243             if( !(criteria.type & TermCriteria::MAX_ITER) )
1244                 criteria.maxCount = 5;
1245
1246             int maxIter = std::min(std::max(criteria.maxCount, 1), 100);
1247
1248             float eps;
1249             if( !(criteria.type & TermCriteria::EPS) )
1250                 eps = 1.f;
1251             eps = (float)std::max(criteria.epsilon, 0.0);
1252
1253             meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps);
1254
1255         }
1256
1257         static void meanShiftProc_gpu(const oclMat &src, oclMat dstr, oclMat dstsp, int sp, int sr, int maxIter, float eps)
1258         {
1259             //sanity checks
1260             CV_Assert( (src.cols == dstr.cols) && (src.rows == dstr.rows) &&
1261                        (src.rows == dstsp.rows) && (src.cols == dstsp.cols));
1262             CV_Assert( !(dstsp.step & 0x3) );
1263             Context *clCxt = src.clCxt;
1264
1265             //Arrange the NDRange
1266             int col = src.cols, row = src.rows;
1267             int ltx = 16, lty = 8;
1268             if(src.cols % ltx != 0)
1269                 col = (col / ltx + 1) * ltx;
1270             if(src.rows % lty != 0)
1271                 row = (row / lty + 1) * lty;
1272
1273             size_t globalThreads[3] = {col, row, 1};
1274             size_t localThreads[3]  = {ltx, lty, 1};
1275
1276             //set args
1277             vector<pair<size_t , const void *> > args;
1278             args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
1279             args.push_back( make_pair( sizeof(cl_mem) , (void *)&dstr.data ));
1280             args.push_back( make_pair( sizeof(cl_mem) , (void *)&dstsp.data ));
1281             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
1282             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstr.step ));
1283             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstsp.step ));
1284             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.offset ));
1285             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstr.offset ));
1286             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstsp.offset ));
1287             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstr.cols ));
1288             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstr.rows ));
1289             args.push_back( make_pair( sizeof(cl_int) , (void *)&sp ));
1290             args.push_back( make_pair( sizeof(cl_int) , (void *)&sr ));
1291             args.push_back( make_pair( sizeof(cl_int) , (void *)&maxIter ));
1292             args.push_back( make_pair( sizeof(cl_float) , (void *)&eps ));
1293             openCLExecuteKernel(clCxt, &meanShift, "meanshiftproc_kernel", globalThreads, localThreads, args, -1, -1);
1294         }
1295
1296         void meanShiftProc(const oclMat &src, oclMat &dstr, oclMat &dstsp, int sp, int sr, TermCriteria criteria)
1297         {
1298             if( src.empty() )
1299                 CV_Error( CV_StsBadArg, "The input image is empty" );
1300
1301             if( src.depth() != CV_8U || src.oclchannels() != 4 )
1302                 CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
1303
1304 //            if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE))
1305 //            {
1306 //                CV_Error( CV_OpenCLDoubleNotSupportedNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
1307 //                return;
1308 //            }
1309
1310             dstr.create( src.size(), CV_8UC4 );
1311             dstsp.create( src.size(), CV_16SC2 );
1312
1313             if( !(criteria.type & TermCriteria::MAX_ITER) )
1314                 criteria.maxCount = 5;
1315
1316             int maxIter = std::min(std::max(criteria.maxCount, 1), 100);
1317
1318             float eps;
1319             if( !(criteria.type & TermCriteria::EPS) )
1320                 eps = 1.f;
1321             eps = (float)std::max(criteria.epsilon, 0.0);
1322
1323             meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps);
1324         }
1325
1326         ///////////////////////////////////////////////////////////////////////////////////////////////////
1327         ////////////////////////////////////////////////////hist///////////////////////////////////////////////
1328         /////////////////////////////////////////////////////////////////////////////////////////////////////
        // Sizing constants shared by the histogram helpers below.
        namespace histograms
        {
            // Number of partial histograms: row count of the intermediate buffer
            // allocated in calcHist and the work-group multiplier in calc_sub_hist.
            const int PARTIAL_HISTOGRAM256_COUNT = 256;
            // Number of bins per histogram: column count of the intermediate buffer
            // and the work-group size used by calc_sub_hist.
            const int HISTOGRAM256_BIN_COUNT = 256;
        }
1334         ///////////////////////////////calcHist/////////////////////////////////////////////////////////////////
        // Fills mat_sub_hist from mat_src using up to two kernels of the
        // imgproc_histogram program: "calc_sub_hist" for the 16-element-aligned
        // middle part of each row and "calc_sub_hist_border" for the unaligned
        // columns left over at the start and end of each row.
        static void calc_sub_hist(const oclMat &mat_src, const oclMat &mat_sub_hist)
        {
            using namespace histograms;

            Context  *clCxt = mat_src.clCxt;
            int depth = mat_src.depth();

            string kernelName = "calc_sub_hist";

            size_t localThreads[3]  = { HISTOGRAM256_BIN_COUNT, 1, 1 };
            size_t globalThreads[3] = { PARTIAL_HISTOGRAM256_COUNT *localThreads[0], 1, 1};

            // The main kernel addresses the source in units of dataWidth elements
            // (offset and step are shifted right by dataWidth_bits below).
            // NOTE(review): presumably 16-byte vector loads - confirm against the kernel.
            int dataWidth = 16;
            int dataWidth_bits = 4;
            int mask = dataWidth - 1;

            int cols = mat_src.cols * mat_src.oclchannels();
            int src_offset = mat_src.offset;
            // Histogram step in 4-byte units (step is shifted right by 2).
            int hist_step = mat_sub_hist.step >> 2;
            int left_col = 0, right_col = 0;

            if(cols >= dataWidth * 2 - 1)
            {
                // Split each row into: left_col elements up to the next multiple of
                // dataWidth in src_offset, a middle part whose length is a multiple
                // of dataWidth, and right_col trailing elements.
                left_col = dataWidth - (src_offset & mask);
                left_col &= mask;
                src_offset += left_col;
                cols -= left_col;
                right_col = cols & mask;
                cols -= right_col;
            }
            else
            {
                // Row too short for an aligned middle part: treat every column as
                // border and skip the main kernel (global size 0).
                left_col = cols;
                right_col = 0;
                cols = 0;
                globalThreads[0] = 0;
            }

            vector<pair<size_t , const void *> > args;
            if(globalThreads[0] != 0)
            {
                // Aligned part: all quantities are expressed in dataWidth-element units.
                int tempcols = cols >> dataWidth_bits;
                // Total thread count decomposed into a column remainder and a row
                // quotient relative to tempcols; both are handed to the kernel.
                int inc_x = globalThreads[0] % tempcols;
                int inc_y = globalThreads[0] / tempcols;
                src_offset >>= dataWidth_bits;
                int src_step = mat_src.step >> dataWidth_bits;
                int datacount = tempcols * mat_src.rows;
                args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src.data));
                args.push_back( make_pair( sizeof(cl_int), (void *)&src_step));
                args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset));
                args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_sub_hist.data));
                args.push_back( make_pair( sizeof(cl_int), (void *)&datacount));
                args.push_back( make_pair( sizeof(cl_int), (void *)&tempcols));
                args.push_back( make_pair( sizeof(cl_int), (void *)&inc_x));
                args.push_back( make_pair( sizeof(cl_int), (void *)&inc_y));
                args.push_back( make_pair( sizeof(cl_int), (void *)&hist_step));
                openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, depth);
            }
            if(left_col != 0 || right_col != 0)
            {
                // Border part: one work-item column per leftover source column, with
                // the row range rounded up to a multiple of the 256-row work-group.
                // The original (unshifted) offset and step are used here.
                kernelName = "calc_sub_hist_border";
                src_offset = mat_src.offset;
                localThreads[0] = 1;
                localThreads[1] = 256;
                globalThreads[0] = left_col + right_col;
                globalThreads[1] = (mat_src.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];

                args.clear();
                args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src.data));
                args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.step));
                args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset));
                args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_sub_hist.data));
                args.push_back( make_pair( sizeof(cl_int), (void *)&left_col));
                args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
                args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.rows));
                args.push_back( make_pair( sizeof(cl_int), (void *)&hist_step));
                openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, depth);
            }
        }
1414         static void merge_sub_hist(const oclMat &sub_hist, oclMat &mat_hist)
1415         {
1416             using namespace histograms;
1417
1418             Context  *clCxt = sub_hist.clCxt;
1419             string kernelName = "merge_hist";
1420
1421             size_t localThreads[3]  = { 256, 1, 1 };
1422             size_t globalThreads[3] = { HISTOGRAM256_BIN_COUNT *localThreads[0], 1, 1};
1423             int src_step = sub_hist.step >> 2;
1424             vector<pair<size_t , const void *> > args;
1425             args.push_back( make_pair( sizeof(cl_mem), (void *)&sub_hist.data));
1426             args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_hist.data));
1427             args.push_back( make_pair( sizeof(cl_int), (void *)&src_step));
1428             openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, -1);
1429         }
1430         void calcHist(const oclMat &mat_src, oclMat &mat_hist)
1431         {
1432             using namespace histograms;
1433             CV_Assert(mat_src.type() == CV_8UC1);
1434             mat_hist.create(1, 256, CV_32SC1);
1435
1436             oclMat buf(PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_BIN_COUNT, CV_32SC1);
1437             buf.setTo(0);
1438
1439             calc_sub_hist(mat_src, buf);
1440             merge_sub_hist(buf, mat_hist);
1441         }
1442         ///////////////////////////////////equalizeHist/////////////////////////////////////////////////////
1443         void equalizeHist(const oclMat &mat_src, oclMat &mat_dst)
1444         {
1445             mat_dst.create(mat_src.rows, mat_src.cols, CV_8UC1);
1446
1447             oclMat mat_hist(1, 256, CV_32SC1);
1448
1449             calcHist(mat_src, mat_hist);
1450
1451             Context *clCxt = mat_src.clCxt;
1452             string kernelName = "calLUT";
1453             size_t localThreads[3] = { 256, 1, 1};
1454             size_t globalThreads[3] = { 256, 1, 1};
1455             oclMat lut(1, 256, CV_8UC1);
1456             vector<pair<size_t , const void *> > args;
1457             int total = mat_src.rows * mat_src.cols;
1458             args.push_back( make_pair( sizeof(cl_mem), (void *)&lut.data));
1459             args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_hist.data));
1460             args.push_back( make_pair( sizeof(int), (void *)&total));
1461             openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, -1);
1462             LUT(mat_src, lut, mat_dst);
1463         }
1464
1465         ////////////////////////////////////////////////////////////////////////
1466         // CLAHE
1467         namespace clahe
1468         {
1469             static void calcLut(const oclMat &src, oclMat &dst,
1470                 const int tilesX, const int tilesY, const cv::Size tileSize,
1471                 const int clipLimit, const float lutScale)
1472             {
1473                 cl_int2 tile_size;
1474                 tile_size.s[0] = tileSize.width;
1475                 tile_size.s[1] = tileSize.height;
1476
1477                 std::vector<pair<size_t , const void *> > args;
1478                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
1479                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
1480                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
1481                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
1482                 args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size ));
1483                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX ));
1484                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&clipLimit ));
1485                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&lutScale ));
1486
1487                 String kernelName = "calcLut";
1488                 size_t localThreads[3]  = { 32, 8, 1 };
1489                 size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 };
1490                 bool is_cpu = isCpuDevice();
1491                 if (is_cpu)
1492                     openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, (char*)" -D CPU");
1493                 else
1494                 {
1495                     cl_kernel kernel = openCLGetKernelFromSource(Context::getContext(), &imgproc_clahe, kernelName);
1496                     size_t wave_size = queryWaveFrontSize(kernel);
1497                     openCLSafeCall(clReleaseKernel(kernel));
1498
1499                     static char opt[20] = {0};
1500                     sprintf(opt, " -D WAVE_SIZE=%d", (int)wave_size);
1501                     openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, opt);
1502                 }
1503             }
1504
1505             static void transform(const oclMat &src, oclMat &dst, const oclMat &lut,
1506                 const int tilesX, const int tilesY, const cv::Size tileSize)
1507             {
1508                 cl_int2 tile_size;
1509                 tile_size.s[0] = tileSize.width;
1510                 tile_size.s[1] = tileSize.height;
1511
1512                 std::vector<pair<size_t , const void *> > args;
1513                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
1514                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
1515                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&lut.data ));
1516                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
1517                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
1518                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&lut.step ));
1519                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols ));
1520                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
1521                 args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size ));
1522                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX ));
1523                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesY ));
1524
1525                 String kernelName = "transform";
1526                 size_t localThreads[3]  = { 32, 8, 1 };
1527                 size_t globalThreads[3] = { src.cols, src.rows, 1 };
1528
1529                 openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1);
1530             }
1531         }
1532
1533         namespace
1534         {
            // OpenCL implementation of the cv::CLAHE interface. Holds the clip limit
            // and tile grid, plus two device buffers reused between apply() calls.
            class CLAHE_Impl : public cv::CLAHE
            {
            public:
                // Defaults: clip limit 40.0 on an 8x8 tile grid.
                CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8);

                cv::AlgorithmInfo* info() const;

                // Equalizes src into dst; both must refer to oclMat objects and src
                // must be CV_8UC1.
                void apply(cv::InputArray src, cv::OutputArray dst);

                void setClipLimit(double clipLimit);
                double getClipLimit() const;

                // Grid size is (tilesX, tilesY) as width and height.
                void setTilesGridSize(cv::Size tileGridSize);
                cv::Size getTilesGridSize() const;

                // Releases the cached device buffers.
                void collectGarbage();

            private:
                double clipLimit_;
                int tilesX_;
                int tilesY_;

                // Border-extended copy of the source, filled by apply() when the image
                // size is not divisible by the tile grid.
                oclMat srcExt_;
                // Lookup tables: one 256-entry row per tile.
                oclMat lut_;
            };
1560             CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
1561             clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY)
1562             {
1563             }
1564
            // Algorithm registration under the name "CLAHE_OCL", exposing clipLimit,
            // tilesX and tilesY as parameters.
            // NOTE(review): the macro presumably also defines CLAHE_Impl::info() -
            // confirm against its definition, which is outside this file section.
            CV_INIT_ALGORITHM(CLAHE_Impl, "CLAHE_OCL",
                obj.info()->addParam(obj, "clipLimit", obj.clipLimit_);
                obj.info()->addParam(obj, "tilesX", obj.tilesX_);
                obj.info()->addParam(obj, "tilesY", obj.tilesY_))
1569             void CLAHE_Impl::apply(cv::InputArray src_raw, cv::OutputArray dst_raw)
1570             {
1571                 oclMat& src = getOclMatRef(src_raw);
1572                 oclMat& dst = getOclMatRef(dst_raw);
1573                 CV_Assert( src.type() == CV_8UC1 );
1574
1575                 dst.create( src.size(), src.type() );
1576
1577                 const int histSize = 256;
1578
1579                 ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_8UC1, lut_);
1580
1581                 cv::Size tileSize;
1582                 oclMat srcForLut;
1583
1584                 if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
1585                 {
1586                     tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
1587                     srcForLut = src;
1588                 }
1589                 else
1590                 {
1591                     cv::ocl::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101, cv::Scalar());
1592
1593                     tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
1594                     srcForLut = srcExt_;
1595                 }
1596
1597                 const int tileSizeTotal = tileSize.area();
1598                 const float lutScale = static_cast<float>(histSize - 1) / tileSizeTotal;
1599
1600                 int clipLimit = 0;
1601                 if (clipLimit_ > 0.0)
1602                 {
1603                     clipLimit = static_cast<int>(clipLimit_ * tileSizeTotal / histSize);
1604                     clipLimit = std::max(clipLimit, 1);
1605                 }
1606
1607                 clahe::calcLut(srcForLut, lut_, tilesX_, tilesY_, tileSize, clipLimit, lutScale);
1608                 //finish();
1609                 clahe::transform(src, dst, lut_, tilesX_, tilesY_, tileSize);
1610             }
1611
1612             void CLAHE_Impl::setClipLimit(double clipLimit)
1613             {
1614                 clipLimit_ = clipLimit;
1615             }
1616
1617             double CLAHE_Impl::getClipLimit() const
1618             {
1619                 return clipLimit_;
1620             }
1621
1622             void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize)
1623             {
1624                 tilesX_ = tileGridSize.width;
1625                 tilesY_ = tileGridSize.height;
1626             }
1627
1628             cv::Size CLAHE_Impl::getTilesGridSize() const
1629             {
1630                 return cv::Size(tilesX_, tilesY_);
1631             }
1632
1633             void CLAHE_Impl::collectGarbage()
1634             {
1635                 srcExt_.release();
1636                 lut_.release();
1637             }
1638         }
1639
1640         cv::Ptr<cv::CLAHE> createCLAHE(double clipLimit, cv::Size tileGridSize)
1641         {
1642             return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
1643         }
1644
1645         //////////////////////////////////bilateralFilter////////////////////////////////////////////////////
        // Bilateral filter for 8-bit images with 1 or 3 channels.
        //
        // src, dst     must have the same type and size and must not share data.
        // d            filter diameter; when <= 0 the radius is derived from sigma_space.
        // sigma_color  color sigma; values <= 0 are replaced by 1.
        // sigma_space  space sigma; values <= 0 are replaced by 1.
        // borderType   forwarded to copyMakeBorder when padding the source.
        //
        // The color and space weight tables are computed on the host, wrapped in
        // oclMat objects and handed to the "bilateral" (or "bilateral2") kernel
        // together with the padded source.
        static void
        oclbilateralFilter_8u( const oclMat &src, oclMat &dst, int d,
                               double sigma_color, double sigma_space,
                               int borderType )
        {
            int cn = src.channels();
            int i, j, maxk, radius;

            CV_Assert( (src.channels() == 1 || src.channels() == 3) &&
                       src.type() == dst.type() && src.size() == dst.size() &&
                       src.data != dst.data );

            if( sigma_color <= 0 )
                sigma_color = 1;
            if( sigma_space <= 0 )
                sigma_space = 1;

            // Gaussian exponent factors: weight = exp(x^2 * coeff).
            double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
            double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);

            // Radius from the diameter (or from sigma_space), at least 1; d is then
            // normalized to the odd diameter 2 * radius + 1.
            if( d <= 0 )
                radius = cvRound(sigma_space * 1.5);
            else
                radius = d / 2;
            radius = MAX(radius, 1);
            d = radius * 2 + 1;

            // Source padded by radius on every side so the kernel can read neighbours.
            oclMat temp;
            copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );

            vector<float> _color_weight(cn * 256);
            vector<float> _space_weight(d * d);
            vector<int> _space_ofs(d * d);
            float *color_weight = &_color_weight[0];
            float *space_weight = &_space_weight[0];
            int *space_ofs = &_space_ofs[0];
            // Steps and offset converted from bytes to pixels.
            int dst_step_in_pixel = dst.step / dst.elemSize();
            int dst_offset_in_pixel = dst.offset / dst.elemSize();
            int temp_step_in_pixel = temp.step / temp.elemSize();
            // initialize color-related bilateral filter coefficients
            for( i = 0; i < 256 * cn; i++ )
                color_weight[i] = (float)std::exp(i * i * gauss_color_coeff);

            // initialize space-related bilateral filter coefficients
            // Only offsets inside the circle of the given radius are kept; maxk ends
            // up as the number of table entries actually filled.
            for( i = -radius, maxk = 0; i <= radius; i++ )
                for( j = -radius; j <= radius; j++ )
                {
                    double r = std::sqrt((double)i * i + (double)j * j);
                    if( r > radius )
                        continue;
                    space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
                    space_ofs[maxk++] = (int)(i * temp_step_in_pixel + j);
                }
            // Wrap the host tables in oclMat objects for use as kernel arguments.
            oclMat oclcolor_weight(1, cn * 256, CV_32FC1, color_weight);
            oclMat oclspace_weight(1, d * d, CV_32FC1, space_weight);
            oclMat oclspace_ofs(1, d * d, CV_32SC1, space_ofs);

            string kernelName = "bilateral";
            size_t localThreads[3]  = { 16, 16, 1 };
            // Global range: destination size rounded up to whole 16x16 work-groups.
            size_t globalThreads[3] = { (dst.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0],
                                        (dst.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1],
                                        1
                                      };
            // Single-channel output whose offset and width are multiples of 4 uses
            // the "bilateral2" kernel with a quarter of the columns in the X range.
            if((dst.type() == CV_8UC1) && ((dst.offset & 3) == 0) && ((dst.cols & 3) == 0))
            {
                kernelName = "bilateral2";
                globalThreads[0] = (dst.cols / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
            }
            vector<pair<size_t , const void *> > args;
            args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
            args.push_back( make_pair( sizeof(cl_mem), (void *)&temp.data ));
            args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows ));
            args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols ));
            args.push_back( make_pair( sizeof(cl_int), (void *)&maxk ));
            args.push_back( make_pair( sizeof(cl_int), (void *)&radius ));
            args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step_in_pixel ));
            args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset_in_pixel ));
            args.push_back( make_pair( sizeof(cl_int), (void *)&temp_step_in_pixel ));
            args.push_back( make_pair( sizeof(cl_int), (void *)&temp.rows ));
            args.push_back( make_pair( sizeof(cl_int), (void *)&temp.cols ));
            args.push_back( make_pair( sizeof(cl_mem), (void *)&oclcolor_weight.data ));
            args.push_back( make_pair( sizeof(cl_mem), (void *)&oclspace_weight.data ));
            args.push_back( make_pair( sizeof(cl_mem), (void *)&oclspace_ofs.data ));
            openCLExecuteKernel(src.clCxt, &imgproc_bilateral, kernelName, globalThreads, localThreads, args, dst.oclchannels(), dst.depth());
        }
1731         void bilateralFilter(const oclMat &src, oclMat &dst, int radius, double sigmaclr, double sigmaspc, int borderType)
1732         {
1733
1734             dst.create( src.size(), src.type() );
1735             if( src.depth() == CV_8U )
1736                 oclbilateralFilter_8u( src, dst, radius, sigmaclr, sigmaspc, borderType );
1737             else
1738                 CV_Error( CV_StsUnsupportedFormat,
1739                           "Bilateral filtering is only implemented for 8uimages" );
1740         }
1741
1742     }
1743 }
1744 //////////////////////////////////convolve////////////////////////////////////////////////////
1745
1746 static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, string kernelName, const cv::ocl::ProgramEntry* source)
1747 {
1748     CV_Assert(src.depth() == CV_32FC1);
1749     CV_Assert(temp1.depth() == CV_32F);
1750     CV_Assert(temp1.cols <= 17 && temp1.rows <= 17);
1751
1752     dst.create(src.size(), src.type());
1753
1754     CV_Assert(src.cols == dst.cols && src.rows == dst.rows);
1755     CV_Assert(src.type() == dst.type());
1756
1757     Context  *clCxt = src.clCxt;
1758     int channels = dst.oclchannels();
1759     int depth = dst.depth();
1760
1761     size_t vector_length = 1;
1762     int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1);
1763     int cols = divUp(dst.cols * channels + offset_cols, vector_length);
1764     int rows = dst.rows;
1765
1766     size_t localThreads[3]  = { 16, 16, 1 };
1767     size_t globalThreads[3] = { cols, rows, 1 };
1768
1769     vector<pair<size_t , const void *> > args;
1770     args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data ));
1771     args.push_back( make_pair( sizeof(cl_mem), (void *)&temp1.data ));
1772     args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
1773     args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows ));
1774     args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
1775     args.push_back( make_pair( sizeof(cl_int), (void *)&src.step ));
1776     args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
1777     args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.step ));
1778     args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.rows ));
1779     args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.cols ));
1780
1781     openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, -1, depth);
1782 }
1783 void cv::ocl::convolve(const oclMat &x, const oclMat &t, oclMat &y)
1784 {
1785     CV_Assert(x.depth() == CV_32F);
1786     CV_Assert(t.depth() == CV_32F);
1787     CV_Assert(x.type() == y.type() && x.size() == y.size());
1788     y.create(x.size(), x.type());
1789     string kernelName = "convolve";
1790
1791     convolve_run(x, t, y, kernelName, &imgproc_convolve);
1792 }