CLAHE Python bindings
[profile/ivi/opencv.git] / modules / ocl / src / imgproc.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // @Authors
19 //    Niko Li, newlife20080214@gmail.com
20 //    Jia Haipeng, jiahaipeng95@gmail.com
21 //    Shengen Yan, yanshengen@gmail.com
22 //    Rock Li, Rock.Li@amd.com
23 //    Zero Lin, Zero.Lin@amd.com
24 //    Zhang Ying, zhangying913@gmail.com
25 //    Xu Pang, pangxu010@163.com
26 //    Wu Zailong, bullet@yeah.net
27 //    Wenju He, wenju@multicorewareinc.com
28 //    Sen Liu, swjtuls1987@126.com
29 //
30 // Redistribution and use in source and binary forms, with or without modification,
31 // are permitted provided that the following conditions are met:
32 //
33 //   * Redistribution's of source code must retain the above copyright notice,
34 //     this list of conditions and the following disclaimer.
35 //
36 //   * Redistribution's in binary form must reproduce the above copyright notice,
37 //     this list of conditions and the following disclaimer in the documentation
38 //     and/or other materials provided with the distribution.
39 //
40 //   * The name of the copyright holders may not be used to endorse or promote products
41 //     derived from this software without specific prior written permission.
42 //
43 // This software is provided by the copyright holders and contributors "as is" and
44 // any express or implied warranties, including, but not limited to, the implied
45 // warranties of merchantability and fitness for a particular purpose are disclaimed.
46 // In no event shall the Intel Corporation or contributors be liable for any direct,
47 // indirect, incidental, special, exemplary, or consequential damages
48 // (including, but not limited to, procurement of substitute goods or services;
49 // loss of use, data, or profits; or business interruption) however caused
50 // and on any theory of liability, whether in contract, strict liability,
51 // or tort (including negligence or otherwise) arising in any way out of
52 // the use of this software, even if advised of the possibility of such damage.
53 //
54 //M*/
55
56 #include "precomp.hpp"
57 #include <iomanip>
58
59 using namespace cv;
60 using namespace cv::ocl;
61 using namespace std;
62
63 namespace cv
64 {
65     namespace ocl
66     {
67
68         ////////////////////////////////////OpenCL kernel strings//////////////////////////
69         extern const char *meanShift;
70         extern const char *imgproc_copymakeboder;
71         extern const char *imgproc_median;
72         extern const char *imgproc_threshold;
73         extern const char *imgproc_resize;
74         extern const char *imgproc_remap;
75         extern const char *imgproc_warpAffine;
76         extern const char *imgproc_warpPerspective;
77         extern const char *imgproc_integral_sum;
78         extern const char *imgproc_integral;
79         extern const char *imgproc_histogram;
80         extern const char *imgproc_bilateral;
81         extern const char *imgproc_calcHarris;
82         extern const char *imgproc_calcMinEigenVal;
83         extern const char *imgproc_convolve;
84         extern const char *imgproc_clahe;
85         ////////////////////////////////////OpenCL call wrappers////////////////////////////
86
87         template <typename T> struct index_and_sizeof;
88         template <> struct index_and_sizeof<char>
89         {
90             enum { index = 1 };
91         };
92         template <> struct index_and_sizeof<unsigned char>
93         {
94             enum { index = 2 };
95         };
96         template <> struct index_and_sizeof<short>
97         {
98             enum { index = 3 };
99         };
100         template <> struct index_and_sizeof<unsigned short>
101         {
102             enum { index = 4 };
103         };
104         template <> struct index_and_sizeof<int>
105         {
106             enum { index = 5 };
107         };
108         template <> struct index_and_sizeof<float>
109         {
110             enum { index = 6 };
111         };
112         template <> struct index_and_sizeof<double>
113         {
114             enum { index = 7 };
115         };
116
117         /////////////////////////////////////////////////////////////////////////////////////
118         // threshold
119
120         typedef void (*gpuThresh_t)(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type);
121
122         static void threshold_8u(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
123         {
124             CV_Assert( (src.cols == dst.cols) && (src.rows == dst.rows) );
125             Context *clCxt = src.clCxt;
126
127             uchar thresh_uchar = cvFloor(thresh);
128             uchar max_val = cvRound(maxVal);
129             string kernelName = "threshold";
130
131             size_t cols = (dst.cols + (dst.offset % 16) + 15) / 16;
132             size_t bSizeX = 16, bSizeY = 16;
133             size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX;
134             size_t gSizeY = dst.rows;
135             size_t globalThreads[3] = {gSizeX, gSizeY, 1};
136             size_t localThreads[3] = {bSizeX, bSizeY, 1};
137
138             vector< pair<size_t, const void *> > args;
139             args.push_back( make_pair(sizeof(cl_mem), &src.data));
140             args.push_back( make_pair(sizeof(cl_mem), &dst.data));
141             args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
142             args.push_back( make_pair(sizeof(cl_int), (void *)&src.step));
143             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
144             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
145             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
146             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
147             args.push_back( make_pair(sizeof(cl_uchar), (void *)&thresh_uchar));
148             args.push_back( make_pair(sizeof(cl_uchar), (void *)&max_val));
149             args.push_back( make_pair(sizeof(cl_int), (void *)&type));
150             openCLExecuteKernel(clCxt, &imgproc_threshold, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
151         }
152
153         static void threshold_32f(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
154         {
155             CV_Assert( (src.cols == dst.cols) && (src.rows == dst.rows) );
156             Context *clCxt = src.clCxt;
157
158             float thresh_f = thresh;
159             float max_val = maxVal;
160             int dst_offset = (dst.offset >> 2);
161             int dst_step = (dst.step >> 2);
162             int src_offset = (src.offset >> 2);
163             int src_step = (src.step >> 2);
164
165             string kernelName = "threshold";
166
167             size_t cols = (dst.cols + (dst_offset & 3) + 3) / 4;
168             //size_t cols = dst.cols;
169             size_t bSizeX = 16, bSizeY = 16;
170             size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX;
171             size_t gSizeY = dst.rows;
172             size_t globalThreads[3] = {gSizeX, gSizeY, 1};
173             size_t localThreads[3] = {bSizeX, bSizeY, 1};
174
175             vector< pair<size_t, const void *> > args;
176             args.push_back( make_pair(sizeof(cl_mem), &src.data));
177             args.push_back( make_pair(sizeof(cl_mem), &dst.data));
178             args.push_back( make_pair(sizeof(cl_int), (void *)&src_offset));
179             args.push_back( make_pair(sizeof(cl_int), (void *)&src_step));
180             args.push_back( make_pair(sizeof(cl_int), (void *)&dst_offset));
181             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
182             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
183             args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step));
184             args.push_back( make_pair(sizeof(cl_float), (void *)&thresh_f));
185             args.push_back( make_pair(sizeof(cl_float), (void *)&max_val));
186             args.push_back( make_pair(sizeof(cl_int), (void *)&type));
187             openCLExecuteKernel(clCxt, &imgproc_threshold, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
188
189         }
190
191         //threshold: support 8UC1 and 32FC1 data type and five threshold type
192         double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type)
193         {
194             //TODO: These limitations shall be removed later.
195             CV_Assert(src.type() == CV_8UC1 || src.type() == CV_32FC1);
196             CV_Assert(type == THRESH_BINARY || type == THRESH_BINARY_INV || type == THRESH_TRUNC
197                       || type == THRESH_TOZERO || type == THRESH_TOZERO_INV );
198
199             static const gpuThresh_t gpuThresh_callers[2] = {threshold_8u, threshold_32f};
200
201             dst.create( src.size(), src.type() );
202             gpuThresh_callers[(src.type() == CV_32FC1)](src, dst, thresh, maxVal, type);
203
204             return thresh;
205         }
206         ////////////////////////////////////////////////////////////////////////////////////////////
207         ///////////////////////////////   remap   //////////////////////////////////////////////////
208         ////////////////////////////////////////////////////////////////////////////////////////////
209
210         void remap( const oclMat &src, oclMat &dst, oclMat &map1, oclMat &map2, int interpolation, int borderType, const Scalar &borderValue )
211         {
212             Context *clCxt = src.clCxt;
213             CV_Assert(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST
214                       || interpolation == INTER_CUBIC || interpolation == INTER_LANCZOS4);
215             CV_Assert((map1.type() == CV_16SC2 && !map2.data) || (map1.type() == CV_32FC2 && !map2.data) || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1));
216             CV_Assert(!map2.data || map2.size() == map1.size());
217             CV_Assert(dst.size() == map1.size());
218
219             dst.create(map1.size(), src.type());
220
221
222             string kernelName;
223
224             if( map1.type() == CV_32FC2 && !map2.data )
225             {
226                 if(interpolation == INTER_LINEAR && borderType == BORDER_CONSTANT)
227                     kernelName = "remapLNFConstant";
228                 else if(interpolation == INTER_NEAREST && borderType == BORDER_CONSTANT)
229                     kernelName = "remapNNFConstant";
230             }
231             else if(map1.type() == CV_16SC2 && !map2.data)
232             {
233                 if(interpolation == INTER_LINEAR && borderType == BORDER_CONSTANT)
234                     kernelName = "remapLNSConstant";
235                 else if(interpolation == INTER_NEAREST && borderType == BORDER_CONSTANT)
236                     kernelName = "remapNNSConstant";
237
238             }
239             else if(map1.type() == CV_32FC1 && map2.type() == CV_32FC1)
240             {
241                 if(interpolation == INTER_LINEAR && borderType == BORDER_CONSTANT)
242                     kernelName = "remapLNF1Constant";
243                 else if (interpolation == INTER_NEAREST && borderType == BORDER_CONSTANT)
244                     kernelName = "remapNNF1Constant";
245             }
246
247             //int channels = dst.oclchannels();
248             //int depth = dst.depth();
249             //int type = src.type();
250             size_t blkSizeX = 16, blkSizeY = 16;
251             size_t glbSizeX;
252             int cols = dst.cols;
253             if(src.type() == CV_8UC1)
254             {
255                 cols = (dst.cols + dst.offset % 4 + 3) / 4;
256                 glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
257
258             }
259             else if(src.type() == CV_32FC1 && interpolation == INTER_LINEAR)
260             {
261                 cols = (dst.cols + (dst.offset >> 2) % 4 + 3) / 4;
262                 glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
263             }
264             else
265             {
266                 glbSizeX = dst.cols % blkSizeX == 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
267
268             }
269
270             size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
271             size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
272             size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
273
274             float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
275             vector< pair<size_t, const void *> > args;
276             if(map1.channels() == 2)
277             {
278                 args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
279                 args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
280                 args.push_back( make_pair(sizeof(cl_mem), (void *)&map1.data));
281                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
282                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
283                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.offset));
284                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
285                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.step));
286                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.step));
287                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
288                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
289                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
290                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
291                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.cols));
292                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows));
293                 args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
294                 
295                 if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
296                 {
297                     args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue));
298                 }
299                 else
300                 {
301                     args.push_back( make_pair(sizeof(cl_float4), (void *)&borderFloat));
302                 }
303             }
304             if(map1.channels() == 1)
305             {
306                 args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
307                 args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
308                 args.push_back( make_pair(sizeof(cl_mem), (void *)&map1.data));
309                 args.push_back( make_pair(sizeof(cl_mem), (void *)&map2.data));
310                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
311                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
312                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.offset));
313                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
314                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.step));
315                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.step));
316                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
317                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
318                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
319                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
320                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.cols));
321                 args.push_back( make_pair(sizeof(cl_int), (void *)&map1.rows));
322                 args.push_back( make_pair(sizeof(cl_int), (void *)&cols));
323                 if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
324                 {
325                     args.push_back( make_pair(sizeof(cl_double4), (void *)&borderValue));
326                 }
327                 else
328                 {
329                     args.push_back( make_pair(sizeof(cl_float4), (void *)&borderFloat));
330                 }
331             }
332             openCLExecuteKernel(clCxt, &imgproc_remap, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
333         }
334
335         ////////////////////////////////////////////////////////////////////////////////////////////
336         // resize
337
338         static void resize_gpu( const oclMat &src, oclMat &dst, double fx, double fy, int interpolation)
339         {
340             CV_Assert( (src.channels() == dst.channels()) );
341             Context *clCxt = src.clCxt;
342             float ifx = 1. / fx;
343             float ify = 1. / fy;
344             double ifx_d = 1. / fx;
345             double ify_d = 1. / fy;
346             int srcStep_in_pixel = src.step1() / src.oclchannels();
347             int srcoffset_in_pixel = src.offset / src.elemSize();
348             int dstStep_in_pixel = dst.step1() / dst.oclchannels();
349             int dstoffset_in_pixel = dst.offset / dst.elemSize();
350             //printf("%d %d\n",src.step1() , dst.elemSize());
351             string kernelName;
352             if(interpolation == INTER_LINEAR)
353                 kernelName = "resizeLN";
354             else if(interpolation == INTER_NEAREST)
355                 kernelName = "resizeNN";
356
357             //TODO: improve this kernel
358             size_t blkSizeX = 16, blkSizeY = 16;
359             size_t glbSizeX;
360             if(src.type() == CV_8UC1)
361             {
362                 size_t cols = (dst.cols + dst.offset % 4 + 3) / 4;
363                 glbSizeX = cols % blkSizeX == 0 && cols != 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
364             }
365             else
366             {
367                 glbSizeX = dst.cols % blkSizeX == 0 && dst.cols != 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
368             }
369             size_t glbSizeY = dst.rows % blkSizeY == 0 && dst.rows != 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
370             size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
371             size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
372
373             vector< pair<size_t, const void *> > args;
374             if(interpolation == INTER_NEAREST)
375             {
376                 args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
377                 args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
378                 args.push_back( make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel));
379                 args.push_back( make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel));
380                 args.push_back( make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel));
381                 args.push_back( make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel));
382                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
383                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
384                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
385                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
386                 if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
387                 {
388                     args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d));
389                     args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d));
390                 }
391                 else
392                 {
393                     args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
394                     args.push_back( make_pair(sizeof(cl_float), (void *)&ify));
395                 }
396             }
397             else
398             {
399                 args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
400                 args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
401                 args.push_back( make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel));
402                 args.push_back( make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel));
403                 args.push_back( make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel));
404                 args.push_back( make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel));
405                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
406                 args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
407                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
408                 args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
409                 args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
410                 args.push_back( make_pair(sizeof(cl_float), (void *)&ify));
411             }
412
413             openCLExecuteKernel(clCxt, &imgproc_resize, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
414         }
415
416
417         void resize(const oclMat &src, oclMat &dst, Size dsize,
418                     double fx, double fy, int interpolation)
419         {
420             CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC3 || src.type() == CV_8UC4
421                       || src.type() == CV_32FC1 || src.type() == CV_32FC3 || src.type() == CV_32FC4);
422             CV_Assert(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST);
423             CV_Assert( src.size().area() > 0 );
424             CV_Assert( !(dsize == Size()) || (fx > 0 && fy > 0) );
425
426             if(!(dsize == Size()) && (fx > 0 && fy > 0))
427             {
428                 if(dsize.width != (int)(src.cols * fx) || dsize.height != (int)(src.rows * fy))
429                 {
430                     CV_Error(CV_StsUnmatchedSizes, "invalid dsize and fx, fy!");
431                 }
432             }
433             if( dsize == Size() )
434             {
435                 dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
436             }
437             else
438             {
439                 fx = (double)dsize.width / src.cols;
440                 fy = (double)dsize.height / src.rows;
441             }
442
443             dst.create(dsize, src.type());
444
445             if( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR )
446             {
447                 resize_gpu( src, dst, fx, fy, interpolation);
448                 return;
449             }
450             CV_Error(CV_StsUnsupportedFormat, "Non-supported interpolation method");
451         }
452
453
454         ////////////////////////////////////////////////////////////////////////
455         // medianFilter
456         void medianFilter(const oclMat &src, oclMat &dst, int m)
457         {
458             CV_Assert( m % 2 == 1 && m > 1 );
459             CV_Assert( m <= 5 || src.depth() == CV_8U );
460             CV_Assert( src.cols <= dst.cols && src.rows <= dst.rows );
461
462             if(src.data == dst.data)
463             {
464                 oclMat src1;
465                 src.copyTo(src1);
466                 return medianFilter(src1, dst, m);
467             }
468
469             int srcStep = src.step1() / src.oclchannels();
470             int dstStep = dst.step1() / dst.oclchannels();
471             int srcOffset = src.offset / src.oclchannels() / src.elemSize1();
472             int dstOffset = dst.offset / dst.oclchannels() / dst.elemSize1();
473
474             Context *clCxt = src.clCxt;
475             string kernelName = "medianFilter";
476
477
478             vector< pair<size_t, const void *> > args;
479             args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
480             args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
481             args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset));
482             args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset));
483             args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
484             args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
485             args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep));
486             args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep));
487
488             size_t globalThreads[3] = {(src.cols + 18) / 16 * 16, (src.rows + 15) / 16 * 16, 1};
489             size_t localThreads[3] = {16, 16, 1};
490
491             if(m == 3)
492             {
493                 string kernelName = "medianFilter3";
494                 openCLExecuteKernel(clCxt, &imgproc_median, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
495             }
496             else if(m == 5)
497             {
498                 string kernelName = "medianFilter5";
499                 openCLExecuteKernel(clCxt, &imgproc_median, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
500             }
501             else
502             {
503                 CV_Error(CV_StsUnsupportedFormat, "Non-supported filter length");
504                 //string kernelName = "medianFilter";
505                 //args.push_back( make_pair( sizeof(cl_int),(void*)&m));
506
507                 //openCLExecuteKernel(clCxt,&imgproc_median,kernelName,globalThreads,localThreads,args,src.oclchannels(),-1);
508             }
509
510         }
511
512         ////////////////////////////////////////////////////////////////////////
513         // copyMakeBorder
514         void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int bordertype, const Scalar &scalar)
515         {
516             //CV_Assert(src.oclchannels() != 2);
517             CV_Assert(top >= 0 && bottom >= 0 && left >= 0 && right >= 0);
518             if((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi
519             {
520                 if(((bordertype & cv::BORDER_ISOLATED) == 0) &&
521                         (bordertype != cv::BORDER_CONSTANT) &&
522                         (bordertype != cv::BORDER_REPLICATE))
523                 {
524                     CV_Error(CV_StsBadArg, "unsupported border type");
525                 }
526             }
527             bordertype &= ~cv::BORDER_ISOLATED;
528             if((bordertype == cv::BORDER_REFLECT) || (bordertype == cv::BORDER_WRAP))
529             {
530                 CV_Assert((src.cols >= left) && (src.cols >= right) && (src.rows >= top) && (src.rows >= bottom));
531             }
532             if(bordertype == cv::BORDER_REFLECT_101)
533             {
534                 CV_Assert((src.cols > left) && (src.cols > right) && (src.rows > top) && (src.rows > bottom));
535             }
536             dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
537             int srcStep = src.step1() / src.oclchannels();
538             int dstStep = dst.step1() / dst.oclchannels();
539             int srcOffset = src.offset / src.elemSize();
540             int dstOffset = dst.offset / dst.elemSize();
541             int __bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE, BORDER_REFLECT, BORDER_WRAP, BORDER_REFLECT_101};
542             const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101"};
543             size_t bordertype_index;
544             for(bordertype_index = 0; bordertype_index < sizeof(__bordertype) / sizeof(int); bordertype_index++)
545             {
546                 if(__bordertype[bordertype_index] == bordertype)
547                     break;
548             }
549             if(bordertype_index == sizeof(__bordertype) / sizeof(int))
550             {
551                 CV_Error(CV_StsBadArg, "unsupported border type");
552             }
553             string kernelName = "copymakeborder";
554             size_t localThreads[3] = {16, 16, 1};
555             size_t globalThreads[3] = {(dst.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0],
556                                        (dst.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1], 1
557                                       };
558
559             vector< pair<size_t, const void *> > args;
560             args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
561             args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
562             args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols));
563             args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows));
564             args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
565             args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
566             args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep));
567             args.push_back( make_pair( sizeof(cl_int), (void *)&srcOffset));
568             args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep));
569             args.push_back( make_pair( sizeof(cl_int), (void *)&dstOffset));
570             args.push_back( make_pair( sizeof(cl_int), (void *)&top));
571             args.push_back( make_pair( sizeof(cl_int), (void *)&left));
572             char compile_option[64];
573             union sc
574             {
575                 cl_uchar4 uval;
576                 cl_char4  cval;
577                 cl_ushort4 usval;
578                 cl_short4 shval;
579                 cl_int4 ival;
580                 cl_float4 fval;
581                 cl_double4 dval;
582             } val;
583             switch(dst.depth())
584             {
585             case CV_8U:
586                 val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
587                 val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
588                 val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
589                 val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]);
590                 switch(dst.oclchannels())
591                 {
592                 case 1:
593                     sprintf(compile_option, "-D GENTYPE=uchar -D %s", borderstr[bordertype_index]);
594                     args.push_back( make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] ));
595                     if(((dst.offset & 3) == 0) && ((dst.cols & 3) == 0))
596                     {
597                         kernelName = "copymakeborder_C1_D0";
598                         globalThreads[0] = (dst.cols / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
599                     }
600                     break;
601                 case 4:
602                     sprintf(compile_option, "-D GENTYPE=uchar4 -D %s", borderstr[bordertype_index]);
603                     args.push_back( make_pair( sizeof(cl_uchar4) , (void *)&val.uval ));
604                     break;
605                 default:
606                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
607                 }
608                 break;
609             case CV_8S:
610                 val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
611                 val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
612                 val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
613                 val.cval.s[3] = saturate_cast<char>(scalar.val[3]);
614                 switch(dst.oclchannels())
615                 {
616                 case 1:
617                     sprintf(compile_option, "-D GENTYPE=char -D %s", borderstr[bordertype_index]);
618                     args.push_back( make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] ));
619                     break;
620                 case 4:
621                     sprintf(compile_option, "-D GENTYPE=char4 -D %s", borderstr[bordertype_index]);
622                     args.push_back( make_pair( sizeof(cl_char4) , (void *)&val.cval ));
623                     break;
624                 default:
625                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
626                 }
627                 break;
628             case CV_16U:
629                 val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
630                 val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
631                 val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
632                 val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]);
633                 switch(dst.oclchannels())
634                 {
635                 case 1:
636                     sprintf(compile_option, "-D GENTYPE=ushort -D %s", borderstr[bordertype_index]);
637                     args.push_back( make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] ));
638                     break;
639                 case 4:
640                     sprintf(compile_option, "-D GENTYPE=ushort4 -D %s", borderstr[bordertype_index]);
641                     args.push_back( make_pair( sizeof(cl_ushort4) , (void *)&val.usval ));
642                     break;
643                 default:
644                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
645                 }
646                 break;
647             case CV_16S:
648                 val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
649                 val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
650                 val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
651                 val.shval.s[3] = saturate_cast<short>(scalar.val[3]);
652                 switch(dst.oclchannels())
653                 {
654                 case 1:
655                     sprintf(compile_option, "-D GENTYPE=short -D %s", borderstr[bordertype_index]);
656                     args.push_back( make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] ));
657                     break;
658                 case 4:
659                     sprintf(compile_option, "-D GENTYPE=short4 -D %s", borderstr[bordertype_index]);
660                     args.push_back( make_pair( sizeof(cl_short4) , (void *)&val.shval ));
661                     break;
662                 default:
663                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
664                 }
665                 break;
666             case CV_32S:
667                 val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
668                 val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
669                 val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
670                 val.ival.s[3] = saturate_cast<int>(scalar.val[3]);
671                 switch(dst.oclchannels())
672                 {
673                 case 1:
674                     sprintf(compile_option, "-D GENTYPE=int -D %s", borderstr[bordertype_index]);
675                     args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
676                     break;
677                 case 2:
678                     sprintf(compile_option, "-D GENTYPE=int2 -D %s", borderstr[bordertype_index]);
679                     cl_int2 i2val;
680                     i2val.s[0] = val.ival.s[0];
681                     i2val.s[1] = val.ival.s[1];
682                     args.push_back( make_pair( sizeof(cl_int2) , (void *)&i2val ));
683                     break;
684                 case 4:
685                     sprintf(compile_option, "-D GENTYPE=int4 -D %s", borderstr[bordertype_index]);
686                     args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival ));
687                     break;
688                 default:
689                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
690                 }
691                 break;
692             case CV_32F:
693                 val.fval.s[0] = scalar.val[0];
694                 val.fval.s[1] = scalar.val[1];
695                 val.fval.s[2] = scalar.val[2];
696                 val.fval.s[3] = scalar.val[3];
697                 switch(dst.oclchannels())
698                 {
699                 case 1:
700                     sprintf(compile_option, "-D GENTYPE=float -D %s", borderstr[bordertype_index]);
701                     args.push_back( make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] ));
702                     break;
703                 case 4:
704                     sprintf(compile_option, "-D GENTYPE=float4 -D %s", borderstr[bordertype_index]);
705                     args.push_back( make_pair( sizeof(cl_float4) , (void *)&val.fval ));
706                     break;
707                 default:
708                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
709                 }
710                 break;
711             case CV_64F:
712                 val.dval.s[0] = scalar.val[0];
713                 val.dval.s[1] = scalar.val[1];
714                 val.dval.s[2] = scalar.val[2];
715                 val.dval.s[3] = scalar.val[3];
716                 switch(dst.oclchannels())
717                 {
718                 case 1:
719                     sprintf(compile_option, "-D GENTYPE=double -D %s", borderstr[bordertype_index]);
720                     args.push_back( make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] ));
721                     break;
722                 case 4:
723                     sprintf(compile_option, "-D GENTYPE=double4 -D %s", borderstr[bordertype_index]);
724                     args.push_back( make_pair( sizeof(cl_double4) , (void *)&val.dval ));
725                     break;
726                 default:
727                     CV_Error(CV_StsUnsupportedFormat, "unsupported channels");
728                 }
729                 break;
730             default:
731                 CV_Error(CV_StsUnsupportedFormat, "unknown depth");
732             }
733
734             openCLExecuteKernel(src.clCxt, &imgproc_copymakeboder, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
735             //uchar* cputemp=new uchar[32*dst.wholerows];
736             ////int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
737             //openCLSafeCall(clEnqueueReadBuffer(src.clCxt->impl->clCmdQueue, (cl_mem)dst.data, CL_TRUE,
738             //                                          0, 32*dst.wholerows, cputemp, 0, NULL, NULL));
739             //for(int i=0;i<dst.wholerows;i++)
740             //{
741             //  for(int j=0;j<dst.wholecols;j++)
742             //  {
743             //          cout<< (int)cputemp[i*32+j]<<" ";
744             //  }
745             //  cout<<endl;
746             //}
747             //delete []cputemp;
748         }
749
750         ////////////////////////////////////////////////////////////////////////
751         // warp
752
753         namespace
754         {
// Host-side scalar type of the affine coefficient matrix.
#define F double

// Inverts, in place, the 2x3 affine transform stored row-major in M
// (M[0..2] = first row, M[3..5] = second row).
//
// The 2x2 linear part is replaced by its inverse and the translation column
// by the matching inverse translation. When the 2x2 determinant is exactly
// zero the reciprocal is taken as 0, so every coefficient collapses to zero
// instead of dividing by zero.
void convert_coeffs(F *M)
{
    const double det = M[0] * M[4] - M[1] * M[3];

    double invDet = 0;
    if (det != 0)
        invDet = 1. / det;

    // Diagonal terms swap places, so compute both before overwriting either.
    const double inv00 = M[4] * invDet;
    const double inv11 = M[0] * invDet;

    M[0] = inv00;
    M[1] = M[1] * -invDet;
    M[3] = M[3] * -invDet;
    M[4] = inv11;

    // Inverse translation uses the already-inverted linear part and the
    // original translation, hence the temporaries.
    const double tx = -M[0] * M[2] - M[1] * M[5];
    const double ty = -M[3] * M[2] - M[4] * M[5];
    M[2] = tx;
    M[5] = ty;
}
771
// Inverts, in place, the 3x3 matrix stored row-major in M using the
// adjugate / determinant formula.
//
// Returns the determinant of the input matrix. When the determinant is
// exactly zero the matrix is singular: 0 is returned and M is left untouched.
//
// Implemented with plain indexing (element (y, x) lives at M[y * 3 + x]) so
// that no helper macros are left defined for the rest of the translation unit.
double invert(double *M)
{
    const double det =
        M[0] * (M[4] * M[8] - M[5] * M[7]) -
        M[1] * (M[3] * M[8] - M[5] * M[6]) +
        M[2] * (M[3] * M[7] - M[4] * M[6]);

    if (det == 0)
        return 0;

    const double invDet = 1. / det;

    // All nine cofactors are computed from the original values before any
    // element of M is overwritten.
    double inv[9];
    inv[0] = (M[4] * M[8] - M[5] * M[7]) * invDet;
    inv[1] = (M[2] * M[7] - M[1] * M[8]) * invDet;
    inv[2] = (M[1] * M[5] - M[2] * M[4]) * invDet;

    inv[3] = (M[5] * M[6] - M[3] * M[8]) * invDet;
    inv[4] = (M[0] * M[8] - M[2] * M[6]) * invDet;
    inv[5] = (M[2] * M[3] - M[0] * M[5]) * invDet;

    inv[6] = (M[3] * M[7] - M[4] * M[6]) * invDet;
    inv[7] = (M[1] * M[6] - M[0] * M[7]) * invDet;
    inv[8] = (M[0] * M[4] - M[1] * M[3]) * invDet;

    for (int i = 0; i < 9; ++i)
        M[i] = inv[i];

    return det;
}
813
814             void warpAffine_gpu(const oclMat &src, oclMat &dst, F coeffs[2][3], int interpolation)
815             {
816                 CV_Assert( (src.oclchannels() == dst.oclchannels()) );
817                 int srcStep = src.step1();
818                 int dstStep = dst.step1();
819                 float float_coeffs[2][3];
820                 cl_mem coeffs_cm;
821
822                 Context *clCxt = src.clCxt;
823                 string s[3] = {"NN", "Linear", "Cubic"};
824                 string kernelName = "warpAffine" + s[interpolation];
825
826
827                 if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
828                 {
829                     cl_int st;
830                     coeffs_cm = clCreateBuffer( (cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(F) * 2 * 3, NULL, &st );
831                     openCLVerifyCall(st);
832                     openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(F) * 2 * 3, coeffs, 0, 0, 0));
833                 }
834                 else
835                 {
836                     cl_int st;
837                     for(int m = 0; m < 2; m++)
838                         for(int n = 0; n < 3; n++)
839                         {
840                             float_coeffs[m][n] = coeffs[m][n];
841                         }
842                         coeffs_cm = clCreateBuffer( (cl_context)clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(float) * 2 * 3, NULL, &st );
843                         openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 2 * 3, float_coeffs, 0, 0, 0));
844
845                 }
846                 //TODO: improve this kernel
847                 size_t blkSizeX = 16, blkSizeY = 16;
848                 size_t glbSizeX;
849                 size_t cols;
850                 //if(src.type() == CV_8UC1 && interpolation != 2)
851                 if(src.type() == CV_8UC1 && interpolation != 2)
852                 {
853                     cols = (dst.cols + dst.offset % 4 + 3) / 4;
854                     glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
855                 }
856                 else
857                 {
858                     cols = dst.cols;
859                     glbSizeX = dst.cols % blkSizeX == 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
860                 }
861                 size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
862                 size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
863                 size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
864
865                 vector< pair<size_t, const void *> > args;
866
867                 args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
868                 args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
869                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols));
870                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows));
871                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.cols));
872                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.rows));
873                 args.push_back(make_pair(sizeof(cl_int), (void *)&srcStep));
874                 args.push_back(make_pair(sizeof(cl_int), (void *)&dstStep));
875                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.offset));
876                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.offset));
877                 args.push_back(make_pair(sizeof(cl_mem), (void *)&coeffs_cm));
878                 args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
879
880                 openCLExecuteKernel(clCxt, &imgproc_warpAffine, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
881                 openCLSafeCall(clReleaseMemObject(coeffs_cm));
882             }
883
884
885             void warpPerspective_gpu(const oclMat &src, oclMat &dst, double coeffs[3][3], int interpolation)
886             {
887                 CV_Assert( (src.oclchannels() == dst.oclchannels()) );
888                 int srcStep = src.step1();
889                 int dstStep = dst.step1();
890                 float float_coeffs[3][3];
891                 cl_mem coeffs_cm;
892
893                 Context *clCxt = src.clCxt;
894                 string s[3] = {"NN", "Linear", "Cubic"};
895                 string kernelName = "warpPerspective" + s[interpolation];
896
897                 if(src.clCxt->supportsFeature(Context::CL_DOUBLE))
898                 {
899                     cl_int st;
900                     coeffs_cm = clCreateBuffer((cl_context) clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(double) * 3 * 3, NULL, &st );
901                     openCLVerifyCall(st);
902                     openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(double) * 3 * 3, coeffs, 0, 0, 0));
903                 }
904                 else
905                 {
906                     cl_int st;
907                     for(int m = 0; m < 3; m++)
908                         for(int n = 0; n < 3; n++)
909                             float_coeffs[m][n] = coeffs[m][n];
910
911                     coeffs_cm = clCreateBuffer((cl_context) clCxt->oclContext(), CL_MEM_READ_WRITE, sizeof(float) * 3 * 3, NULL, &st );
912                     openCLVerifyCall(st);
913                     openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)clCxt->oclCommandQueue(), (cl_mem)coeffs_cm, 1, 0, sizeof(float) * 3 * 3, float_coeffs, 0, 0, 0));
914                 }
915                 //TODO: improve this kernel
916                 size_t blkSizeX = 16, blkSizeY = 16;
917                 size_t glbSizeX;
918                 size_t cols;
919                 if(src.type() == CV_8UC1 && interpolation == 0)
920                 {
921                     cols = (dst.cols + dst.offset % 4 + 3) / 4;
922                     glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
923                 }
924                 else
925                     /*
926                     */
927                 {
928                     cols = dst.cols;
929                     glbSizeX = dst.cols % blkSizeX == 0 ? dst.cols : (dst.cols / blkSizeX + 1) * blkSizeX;
930                 }
931                 size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
932                 size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
933                 size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
934
935                 vector< pair<size_t, const void *> > args;
936
937                 args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
938                 args.push_back(make_pair(sizeof(cl_mem), (void *)&dst.data));
939                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols));
940                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows));
941                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.cols));
942                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.rows));
943                 args.push_back(make_pair(sizeof(cl_int), (void *)&srcStep));
944                 args.push_back(make_pair(sizeof(cl_int), (void *)&dstStep));
945                 args.push_back(make_pair(sizeof(cl_int), (void *)&src.offset));
946                 args.push_back(make_pair(sizeof(cl_int), (void *)&dst.offset));
947                 args.push_back(make_pair(sizeof(cl_mem), (void *)&coeffs_cm));
948                 args.push_back(make_pair(sizeof(cl_int), (void *)&cols));
949
950                 openCLExecuteKernel(clCxt, &imgproc_warpPerspective, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
951                 openCLSafeCall(clReleaseMemObject(coeffs_cm));
952             }
953         }
954
955         void warpAffine(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags)
956         {
957             int interpolation = flags & INTER_MAX;
958
959             CV_Assert((src.depth() == CV_8U  || src.depth() == CV_32F) && src.oclchannels() != 2 && src.oclchannels() != 3);
960             CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
961
962             dst.create(dsize, src.type());
963
964             CV_Assert(M.rows == 2 && M.cols == 3);
965
966             int warpInd = (flags & WARP_INVERSE_MAP) >> 4;
967             F coeffs[2][3];
968
969             double coeffsM[2*3];
970             Mat coeffsMat(2, 3, CV_64F, (void *)coeffsM);
971             M.convertTo(coeffsMat, coeffsMat.type());
972             if(!warpInd)
973             {
974                 convert_coeffs(coeffsM);
975             }
976
977             for(int i = 0; i < 2; ++i)
978                 for(int j = 0; j < 3; ++j)
979                     coeffs[i][j] = coeffsM[i*3+j];
980
981             warpAffine_gpu(src, dst, coeffs, interpolation);
982         }
983
984         void warpPerspective(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags)
985         {
986             int interpolation = flags & INTER_MAX;
987
988             CV_Assert((src.depth() == CV_8U  || src.depth() == CV_32F) && src.oclchannels() != 2 && src.oclchannels() != 3);
989             CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
990
991             dst.create(dsize, src.type());
992
993
994             CV_Assert(M.rows == 3 && M.cols == 3);
995
996             int warpInd = (flags & WARP_INVERSE_MAP) >> 4;
997             double coeffs[3][3];
998
999             double coeffsM[3*3];
1000             Mat coeffsMat(3, 3, CV_64F, (void *)coeffsM);
1001             M.convertTo(coeffsMat, coeffsMat.type());
1002             if(!warpInd)
1003             {
1004                 invert(coeffsM);
1005             }
1006
1007             for(int i = 0; i < 3; ++i)
1008                 for(int j = 0; j < 3; ++j)
1009                     coeffs[i][j] = coeffsM[i*3+j];
1010
1011             warpPerspective_gpu(src, dst, coeffs, interpolation);
1012         }
1013
1014         ////////////////////////////////////////////////////////////////////////
1015         // integral
1016         void integral(const oclMat &src, oclMat &sum, oclMat &sqsum)
1017         {
1018             CV_Assert(src.type() == CV_8UC1);
1019             if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
1020             {
1021                 CV_Error(CV_GpuNotSupported, "select device don't support double");
1022             }
1023             int vlen = 4;
1024             int offset = src.offset / vlen;
1025             int pre_invalid = src.offset % vlen;
1026             int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
1027
1028             oclMat t_sum , t_sqsum;
1029             int w = src.cols + 1, h = src.rows + 1;
1030             int depth;
1031             if( src.cols * src.rows <= 2901 * 2901 ) //2901 is the maximum size for int when all values are 255
1032             {
1033                 t_sum.create(src.cols, src.rows, CV_32SC1);
1034                 sum.create(h, w, CV_32SC1);
1035             }
1036             else
1037             {
1038                  //Use float to prevent overflow
1039                 t_sum.create(src.cols, src.rows, CV_32FC1);
1040                 sum.create(h, w, CV_32FC1);
1041              }
1042              t_sqsum.create(src.cols, src.rows, CV_32FC1);
1043              sqsum.create(h, w, CV_32FC1);
1044              depth = sum.depth();
1045              int sum_offset = sum.offset / vlen;
1046              int sqsum_offset = sqsum.offset / vlen;
1047
1048              vector<pair<size_t , const void *> > args;
1049              args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
1050              args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
1051              args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sqsum.data ));
1052              args.push_back( make_pair( sizeof(cl_int) , (void *)&offset ));
1053              args.push_back( make_pair( sizeof(cl_int) , (void *)&pre_invalid ));
1054              args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
1055              args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
1056              args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
1057              args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step));
1058              size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1};
1059              openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_cols", gt, lt, args, -1, depth);
1060              args.clear();
1061              args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
1062              args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sqsum.data ));
1063              args.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data ));
1064              args.push_back( make_pair( sizeof(cl_mem) , (void *)&sqsum.data ));
1065              args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.rows ));
1066              args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.cols ));
1067              args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step ));
1068              args.push_back( make_pair( sizeof(cl_int) , (void *)&sum.step));
1069              args.push_back( make_pair( sizeof(cl_int) , (void *)&sqsum.step));
1070              args.push_back( make_pair( sizeof(cl_int) , (void *)&sum_offset));
1071              args.push_back( make_pair( sizeof(cl_int) , (void *)&sqsum_offset));
1072              size_t gt2[3] = {t_sum.cols  * 32, 1, 1}, lt2[3] = {256, 1, 1};
1073              openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_rows", gt2, lt2, args, -1, depth);
1074         }
1075
1076         void integral(const oclMat &src, oclMat &sum)
1077         {
1078             CV_Assert(src.type() == CV_8UC1);
1079             int vlen = 4;
1080             int offset = src.offset / vlen;
1081             int pre_invalid = src.offset % vlen;
1082             int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
1083
1084             oclMat t_sum;
1085             int w = src.cols + 1, h = src.rows + 1;
1086             int depth;
1087             if(src.cols * src.rows <= 2901 * 2901)
1088             {
1089                 t_sum.create(src.cols, src.rows, CV_32SC1);
1090                 sum.create(h, w, CV_32SC1);
1091             }else
1092             {
1093                  t_sum.create(src.cols, src.rows, CV_32FC1);
1094                  sum.create(h, w, CV_32FC1);
1095              }
1096              depth = sum.depth();
1097              int sum_offset = sum.offset / vlen;
1098              vector<pair<size_t , const void *> > args;
1099              args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
1100              args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
1101              args.push_back( make_pair( sizeof(cl_int) , (void *)&offset ));
1102              args.push_back( make_pair( sizeof(cl_int) , (void *)&pre_invalid ));
1103              args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
1104              args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
1105              args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
1106              args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step));
1107              size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1};
1108              openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_cols", gt, lt, args, -1, depth);
1109              args.clear();
1110              args.push_back( make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
1111              args.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data ));
1112              args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.rows ));
1113              args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.cols ));
1114              args.push_back( make_pair( sizeof(cl_int) , (void *)&t_sum.step ));
1115              args.push_back( make_pair( sizeof(cl_int) , (void *)&sum.step));
1116              args.push_back( make_pair( sizeof(cl_int) , (void *)&sum_offset));
1117              size_t gt2[3] = {t_sum.cols  * 32, 1, 1}, lt2[3] = {256, 1, 1};
1118              openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_rows", gt2, lt2, args, -1, depth);         
1119         }
1120
1121         /////////////////////// corner //////////////////////////////
1122         static void extractCovData(const oclMat &src, oclMat &Dx, oclMat &Dy,
1123                             int blockSize, int ksize, int borderType)
1124         {
1125             CV_Assert(src.type() == CV_8UC1 || src.type() == CV_32FC1);
1126             double scale = static_cast<double>(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize;
1127             if (ksize < 0)
1128                 scale *= 2.;
1129
1130             if (src.depth() == CV_8U)
1131             {
1132                 scale *= 255.;
1133                 scale = 1. / scale;
1134             }
1135             else
1136             {
1137                 scale = 1. / scale;
1138             }
1139             if (ksize > 0)
1140             {
1141                 Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType);
1142                 Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType);
1143             }
1144             else
1145             {
1146                 Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType);
1147                 Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType);
1148             }
1149             CV_Assert(Dx.offset == 0 && Dy.offset == 0);
1150         }
1151
1152         static void corner_ocl(const char *src_str, string kernelName, int block_size, float k, oclMat &Dx, oclMat &Dy,
1153                         oclMat &dst, int border_type)
1154         {
1155             char borderType[30];
1156             switch (border_type)
1157             {
1158             case cv::BORDER_CONSTANT:
1159                 sprintf(borderType, "BORDER_CONSTANT");
1160                 break;
1161             case cv::BORDER_REFLECT101:
1162                 sprintf(borderType, "BORDER_REFLECT101");
1163                 break;
1164             case cv::BORDER_REFLECT:
1165                 sprintf(borderType, "BORDER_REFLECT");
1166                 break;
1167             case cv::BORDER_REPLICATE:
1168                 sprintf(borderType, "BORDER_REPLICATE");
1169                 break;
1170             default:
1171                 cout << "BORDER type is not supported!" << endl;
1172             }
1173             char build_options[150];
1174             sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s",
1175                     block_size / 2, block_size / 2, block_size, block_size, borderType);
1176
1177             size_t blockSizeX = 256, blockSizeY = 1;
1178             size_t gSize = blockSizeX - block_size / 2 * 2;
1179             size_t globalSizeX = (Dx.cols) % gSize == 0 ? Dx.cols / gSize * blockSizeX : (Dx.cols / gSize + 1) * blockSizeX;
1180             size_t rows_per_thread = 2;
1181             size_t globalSizeY = ((Dx.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ?
1182                                  ((Dx.rows + rows_per_thread - 1) / rows_per_thread) :
1183                                  (((Dx.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY;
1184
1185             size_t gt[3] = { globalSizeX, globalSizeY, 1 };
1186             size_t lt[3]  = { blockSizeX, blockSizeY, 1 };
1187             vector<pair<size_t , const void *> > args;
1188             args.push_back( make_pair( sizeof(cl_mem) , (void *)&Dx.data ));
1189             args.push_back( make_pair( sizeof(cl_mem) , (void *)&Dy.data));
1190             args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data));
1191             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dx.offset ));
1192             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dx.wholerows ));
1193             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dx.wholecols ));
1194             args.push_back( make_pair(sizeof(cl_int), (void *)&Dx.step));
1195             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dy.offset ));
1196             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dy.wholerows ));
1197             args.push_back( make_pair( sizeof(cl_int) , (void *)&Dy.wholecols ));
1198             args.push_back( make_pair(sizeof(cl_int), (void *)&Dy.step));
1199             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
1200             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
1201             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
1202             args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
1203             args.push_back( make_pair( sizeof(cl_float) , (void *)&k));
1204             openCLExecuteKernel(dst.clCxt, &src_str, kernelName, gt, lt, args, -1, -1, build_options);
1205         }
1206
1207         void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize,
1208                           double k, int borderType)
1209         {
1210             oclMat dx, dy;
1211             cornerHarris_dxdy(src, dst, dx, dy, blockSize, ksize, k, borderType);
1212         }
1213
1214         void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize,
1215                           double k, int borderType)
1216         {
1217             if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
1218             {
1219                 CV_Error(CV_GpuNotSupported, "select device don't support double");
1220             }
1221             CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
1222             CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
1223             extractCovData(src, dx, dy, blockSize, ksize, borderType);
1224             dst.create(src.size(), CV_32F);
1225             corner_ocl(imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), dx, dy, dst, borderType);
1226         }
1227
1228         void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
1229         {
1230             oclMat dx, dy;
1231             cornerMinEigenVal_dxdy(src, dst, dx, dy, blockSize, ksize, borderType);
1232         }
1233         
1234         void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize, int borderType)
1235         {
1236             if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
1237             {
1238                 CV_Error(CV_GpuNotSupported, "select device don't support double");
1239             }
1240             CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
1241             CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
1242             extractCovData(src, dx, dy, blockSize, ksize, borderType);
1243             dst.create(src.size(), CV_32F);
1244             corner_ocl(imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, dx, dy, dst, borderType);
1245         }
1246         /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
1247         static void meanShiftFiltering_gpu(const oclMat &src, oclMat dst, int sp, int sr, int maxIter, float eps)
1248         {
1249             CV_Assert( (src.cols == dst.cols) && (src.rows == dst.rows) );
1250             CV_Assert( !(dst.step & 0x3) );
1251             Context *clCxt = src.clCxt;
1252
1253             //Arrange the NDRange
1254             int col = src.cols, row = src.rows;
1255             int ltx = 16, lty = 8;
1256             if(src.cols % ltx != 0)
1257                 col = (col / ltx + 1) * ltx;
1258             if(src.rows % lty != 0)
1259                 row = (row / lty + 1) * lty;
1260
1261             size_t globalThreads[3] = {col, row, 1};
1262             size_t localThreads[3]  = {ltx, lty, 1};
1263
1264             //set args
1265             vector<pair<size_t , const void *> > args;
1266             args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
1267             args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.step ));
1268             args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
1269             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
1270             args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.offset ));
1271             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.offset ));
1272             args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols ));
1273             args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows ));
1274             args.push_back( make_pair( sizeof(cl_int) , (void *)&sp ));
1275             args.push_back( make_pair( sizeof(cl_int) , (void *)&sr ));
1276             args.push_back( make_pair( sizeof(cl_int) , (void *)&maxIter ));
1277             args.push_back( make_pair( sizeof(cl_float) , (void *)&eps ));
1278             openCLExecuteKernel(clCxt, &meanShift, "meanshift_kernel", globalThreads, localThreads, args, -1, -1);
1279         }
1280
1281         void meanShiftFiltering(const oclMat &src, oclMat &dst, int sp, int sr, TermCriteria criteria)
1282         {
1283             if( src.empty() )
1284                 CV_Error( CV_StsBadArg, "The input image is empty" );
1285
1286             if( src.depth() != CV_8U || src.oclchannels() != 4 )
1287                 CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
1288
1289             //            if(!src.clCxt->supportsFeature(Context::CL_DOUBLE))
1290             //            {
1291             //                CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
1292             //            }
1293
1294             dst.create( src.size(), CV_8UC4 );
1295
1296             if( !(criteria.type & TermCriteria::MAX_ITER) )
1297                 criteria.maxCount = 5;
1298
1299             int maxIter = std::min(std::max(criteria.maxCount, 1), 100);
1300
1301             float eps;
1302             if( !(criteria.type & TermCriteria::EPS) )
1303                 eps = 1.f;
1304             eps = (float)std::max(criteria.epsilon, 0.0);
1305
1306             meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps);
1307
1308         }
1309
1310         static void meanShiftProc_gpu(const oclMat &src, oclMat dstr, oclMat dstsp, int sp, int sr, int maxIter, float eps)
1311         {
1312             //sanity checks
1313             CV_Assert( (src.cols == dstr.cols) && (src.rows == dstr.rows) &&
1314                        (src.rows == dstsp.rows) && (src.cols == dstsp.cols));
1315             CV_Assert( !(dstsp.step & 0x3) );
1316             Context *clCxt = src.clCxt;
1317
1318             //Arrange the NDRange
1319             int col = src.cols, row = src.rows;
1320             int ltx = 16, lty = 8;
1321             if(src.cols % ltx != 0)
1322                 col = (col / ltx + 1) * ltx;
1323             if(src.rows % lty != 0)
1324                 row = (row / lty + 1) * lty;
1325
1326             size_t globalThreads[3] = {col, row, 1};
1327             size_t localThreads[3]  = {ltx, lty, 1};
1328
1329             //set args
1330             vector<pair<size_t , const void *> > args;
1331             args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
1332             args.push_back( make_pair( sizeof(cl_mem) , (void *)&dstr.data ));
1333             args.push_back( make_pair( sizeof(cl_mem) , (void *)&dstsp.data ));
1334             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
1335             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstr.step ));
1336             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstsp.step ));
1337             args.push_back( make_pair( sizeof(cl_int) , (void *)&src.offset ));
1338             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstr.offset ));
1339             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstsp.offset ));
1340             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstr.cols ));
1341             args.push_back( make_pair( sizeof(cl_int) , (void *)&dstr.rows ));
1342             args.push_back( make_pair( sizeof(cl_int) , (void *)&sp ));
1343             args.push_back( make_pair( sizeof(cl_int) , (void *)&sr ));
1344             args.push_back( make_pair( sizeof(cl_int) , (void *)&maxIter ));
1345             args.push_back( make_pair( sizeof(cl_float) , (void *)&eps ));
1346             openCLExecuteKernel(clCxt, &meanShift, "meanshiftproc_kernel", globalThreads, localThreads, args, -1, -1);
1347         }
1348
1349         void meanShiftProc(const oclMat &src, oclMat &dstr, oclMat &dstsp, int sp, int sr, TermCriteria criteria)
1350         {
1351             if( src.empty() )
1352                 CV_Error( CV_StsBadArg, "The input image is empty" );
1353
1354             if( src.depth() != CV_8U || src.oclchannels() != 4 )
1355                 CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
1356
1357             //            if(!src.clCxt->supportsFeature(Context::CL_DOUBLE))
1358             //            {
1359             //                CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
1360             //            }
1361
1362             dstr.create( src.size(), CV_8UC4 );
1363             dstsp.create( src.size(), CV_16SC2 );
1364
1365             if( !(criteria.type & TermCriteria::MAX_ITER) )
1366                 criteria.maxCount = 5;
1367
1368             int maxIter = std::min(std::max(criteria.maxCount, 1), 100);
1369
1370             float eps;
1371             if( !(criteria.type & TermCriteria::EPS) )
1372                 eps = 1.f;
1373             eps = (float)std::max(criteria.epsilon, 0.0);
1374
1375             meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps);
1376         }
1377
1378         ///////////////////////////////////////////////////////////////////////////////////////////////////
1379         ////////////////////////////////////////////////////hist///////////////////////////////////////////////
1380         /////////////////////////////////////////////////////////////////////////////////////////////////////
// Sizes shared by the histogram launchers in this file.
namespace histograms
{
    // Bins per histogram; also used as a work-group width by the launchers.
    const int HISTOGRAM256_BIN_COUNT = 256;

    // Row count of the scratch buffer calcHist allocates (one partial
    // histogram per row).
    const int PARTIAL_HISTOGRAM256_COUNT = 256;
}
1386         ///////////////////////////////calcHist/////////////////////////////////////////////////////////////////
1387         static void calc_sub_hist(const oclMat &mat_src, const oclMat &mat_sub_hist)
1388         {
1389             using namespace histograms;
1390
1391             Context  *clCxt = mat_src.clCxt;
1392             int depth = mat_src.depth();
1393
1394             string kernelName = "calc_sub_hist";
1395
1396             size_t localThreads[3]  = { HISTOGRAM256_BIN_COUNT, 1, 1 };
1397             size_t globalThreads[3] = { PARTIAL_HISTOGRAM256_COUNT *localThreads[0], 1, 1};
1398
1399             int dataWidth = 16;
1400             int dataWidth_bits = 4;
1401             int mask = dataWidth - 1;
1402
1403             int cols = mat_src.cols * mat_src.oclchannels();
1404             int src_offset = mat_src.offset;
1405             int hist_step = mat_sub_hist.step >> 2;
1406             int left_col = 0, right_col = 0;
1407
1408             if(cols >= dataWidth * 2 - 1)
1409             {
1410                 left_col = dataWidth - (src_offset & mask);
1411                 left_col &= mask;
1412                 src_offset += left_col;
1413                 cols -= left_col;
1414                 right_col = cols & mask;
1415                 cols -= right_col;
1416             }
1417             else
1418             {
1419                 left_col = cols;
1420                 right_col = 0;
1421                 cols = 0;
1422                 globalThreads[0] = 0;
1423             }
1424
1425             vector<pair<size_t , const void *> > args;
1426             if(globalThreads[0] != 0)
1427             {
1428                 int tempcols = cols >> dataWidth_bits;
1429                 int inc_x = globalThreads[0] % tempcols;
1430                 int inc_y = globalThreads[0] / tempcols;
1431                 src_offset >>= dataWidth_bits;
1432                 int src_step = mat_src.step >> dataWidth_bits;
1433                 int datacount = tempcols * mat_src.rows;
1434                 args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src.data));
1435                 args.push_back( make_pair( sizeof(cl_int), (void *)&src_step));
1436                 args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset));
1437                 args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_sub_hist.data));
1438                 args.push_back( make_pair( sizeof(cl_int), (void *)&datacount));
1439                 args.push_back( make_pair( sizeof(cl_int), (void *)&tempcols));
1440                 args.push_back( make_pair( sizeof(cl_int), (void *)&inc_x));
1441                 args.push_back( make_pair( sizeof(cl_int), (void *)&inc_y));
1442                 args.push_back( make_pair( sizeof(cl_int), (void *)&hist_step));
1443                 openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, depth);
1444             }
1445             if(left_col != 0 || right_col != 0)
1446             {
1447                 kernelName = "calc_sub_hist_border";
1448                 src_offset = mat_src.offset;
1449                 localThreads[0] = 1;
1450                 localThreads[1] = 256;
1451                 globalThreads[0] = left_col + right_col;
1452                 globalThreads[1] = (mat_src.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1];
1453
1454                 args.clear();
1455                 args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src.data));
1456                 args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.step));
1457                 args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset));
1458                 args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_sub_hist.data));
1459                 args.push_back( make_pair( sizeof(cl_int), (void *)&left_col));
1460                 args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
1461                 args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.rows));
1462                 args.push_back( make_pair( sizeof(cl_int), (void *)&hist_step));
1463                 openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, depth);
1464             }
1465         }
1466         static void merge_sub_hist(const oclMat &sub_hist, oclMat &mat_hist)
1467         {
1468             using namespace histograms;
1469
1470             Context  *clCxt = sub_hist.clCxt;
1471             string kernelName = "merge_hist";
1472
1473             size_t localThreads[3]  = { 256, 1, 1 };
1474             size_t globalThreads[3] = { HISTOGRAM256_BIN_COUNT *localThreads[0], 1, 1};
1475             int src_step = sub_hist.step >> 2;
1476             vector<pair<size_t , const void *> > args;
1477             args.push_back( make_pair( sizeof(cl_mem), (void *)&sub_hist.data));
1478             args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_hist.data));
1479             args.push_back( make_pair( sizeof(cl_int), (void *)&src_step));
1480             openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, -1);
1481         }
1482         void calcHist(const oclMat &mat_src, oclMat &mat_hist)
1483         {
1484             using namespace histograms;
1485             CV_Assert(mat_src.type() == CV_8UC1);
1486             mat_hist.create(1, 256, CV_32SC1);
1487
1488             oclMat buf(PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_BIN_COUNT, CV_32SC1);
1489             buf.setTo(0);
1490
1491             calc_sub_hist(mat_src, buf);
1492             merge_sub_hist(buf, mat_hist);
1493         }
1494         ///////////////////////////////////equalizeHist/////////////////////////////////////////////////////
1495         void equalizeHist(const oclMat &mat_src, oclMat &mat_dst)
1496         {
1497             mat_dst.create(mat_src.rows, mat_src.cols, CV_8UC1);
1498
1499             oclMat mat_hist(1, 256, CV_32SC1);
1500
1501             calcHist(mat_src, mat_hist);
1502
1503             Context *clCxt = mat_src.clCxt;
1504             string kernelName = "calLUT";
1505             size_t localThreads[3] = { 256, 1, 1};
1506             size_t globalThreads[3] = { 256, 1, 1};
1507             oclMat lut(1, 256, CV_8UC1);
1508             vector<pair<size_t , const void *> > args;
1509             int total = mat_src.rows * mat_src.cols;
1510             args.push_back( make_pair( sizeof(cl_mem), (void *)&lut.data));
1511             args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_hist.data));
1512             args.push_back( make_pair( sizeof(int), (void *)&total));
1513             openCLExecuteKernel(clCxt, &imgproc_histogram, kernelName, globalThreads, localThreads, args, -1, -1);
1514             LUT(mat_src, lut, mat_dst);
1515         }
1516
1517         ////////////////////////////////////////////////////////////////////////
1518         // CLAHE
1519         namespace clahe
1520         {
1521             inline int divUp(int total, int grain)
1522             {
1523                 return (total + grain - 1) / grain * grain;
1524             }
1525
1526             static void calcLut(const oclMat &src, oclMat &dst,
1527                 const int tilesX, const int tilesY, const cv::Size tileSize,
1528                 const int clipLimit, const float lutScale)
1529             {
1530                 cl_int2 tile_size;
1531                 tile_size.s[0] = tileSize.width;
1532                 tile_size.s[1] = tileSize.height;
1533
1534                 std::vector<pair<size_t , const void *> > args;
1535                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
1536                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
1537                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
1538                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
1539                 args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size ));
1540                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX ));
1541                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&clipLimit ));
1542                 args.push_back( std::make_pair( sizeof(cl_float), (void *)&lutScale ));
1543
1544                 String kernelName = "calcLut";
1545                 size_t localThreads[3]  = { 32, 8, 1 };
1546                 size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 };
1547                 bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
1548                 if (is_cpu)
1549                 {
1550                     openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, (char*)" -D CPU");
1551                 }
1552                 else
1553                 {
1554                     cl_kernel kernel = openCLGetKernelFromSource(Context::getContext(), &imgproc_clahe, kernelName);
1555                     int wave_size = queryDeviceInfo<WAVEFRONT_SIZE, int>(kernel);
1556                     openCLSafeCall(clReleaseKernel(kernel));
1557
1558                     static char opt[20] = {0};
1559                     sprintf(opt, " -D WAVE_SIZE=%d", wave_size);
1560                     openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1, opt);
1561                 }
1562             }
1563
1564             static void transform(const oclMat &src, oclMat &dst, const oclMat &lut,
1565                 const int tilesX, const int tilesY, const cv::Size tileSize)
1566             {
1567                 cl_int2 tile_size;
1568                 tile_size.s[0] = tileSize.width;
1569                 tile_size.s[1] = tileSize.height;
1570
1571                 std::vector<pair<size_t , const void *> > args;
1572                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data ));
1573                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
1574                 args.push_back( std::make_pair( sizeof(cl_mem), (void *)&lut.data ));
1575                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step ));
1576                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
1577                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&lut.step ));
1578                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.cols ));
1579                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.rows ));
1580                 args.push_back( std::make_pair( sizeof(cl_int2), (void *)&tile_size ));
1581                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesX ));
1582                 args.push_back( std::make_pair( sizeof(cl_int), (void *)&tilesY ));
1583
1584                 String kernelName = "transform";
1585                 size_t localThreads[3]  = { 32, 8, 1 };
1586                 size_t globalThreads[3] = { divUp(src.cols, localThreads[0]), divUp(src.rows, localThreads[1]), 1 };
1587
1588                 openCLExecuteKernel(Context::getContext(), &imgproc_clahe, kernelName, globalThreads, localThreads, args, -1, -1);
1589             }
1590         }
1591
1592         namespace
1593         {
1594             class CLAHE_Impl : public cv::ocl::CLAHE
1595             {
1596             public:
1597                 CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8);
1598
1599                 cv::AlgorithmInfo* info() const;
1600
1601                 void apply(const oclMat &src, oclMat &dst);
1602
1603                 void setClipLimit(double clipLimit);
1604                 double getClipLimit() const;
1605
1606                 void setTilesGridSize(cv::Size tileGridSize);
1607                 cv::Size getTilesGridSize() const;
1608
1609                 void collectGarbage();
1610
1611             private:
1612                 double clipLimit_;
1613                 int tilesX_;
1614                 int tilesY_;
1615
1616                 oclMat srcExt_;
1617                 oclMat lut_;
1618             };
1619
1620             CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
1621             clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY)
1622             {
1623             }
1624
1625             void CLAHE_Impl::apply(const oclMat &src, oclMat &dst)
1626             {
1627                 CV_Assert( src.type() == CV_8UC1 );
1628
1629                 dst.create( src.size(), src.type() );
1630
1631                 const int histSize = 256;
1632
1633                 ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_8UC1, lut_);
1634
1635                 cv::Size tileSize;
1636                 oclMat srcForLut;
1637
1638                 if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
1639                 {
1640                     tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
1641                     srcForLut = src;
1642                 }
1643                 else
1644                 {
1645                     cv::ocl::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101, cv::Scalar());
1646
1647                     tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
1648                     srcForLut = srcExt_;
1649                 }
1650
1651                 const int tileSizeTotal = tileSize.area();
1652                 const float lutScale = static_cast<float>(histSize - 1) / tileSizeTotal;
1653
1654                 int clipLimit = 0;
1655                 if (clipLimit_ > 0.0)
1656                 {
1657                     clipLimit = static_cast<int>(clipLimit_ * tileSizeTotal / histSize);
1658                     clipLimit = std::max(clipLimit, 1);
1659                 }
1660
1661                 clahe::calcLut(srcForLut, lut_, tilesX_, tilesY_, tileSize, clipLimit, lutScale);
1662                 //finish();
1663                 clahe::transform(src, dst, lut_, tilesX_, tilesY_, tileSize);
1664             }
1665
1666             void CLAHE_Impl::setClipLimit(double clipLimit)
1667             {
1668                 clipLimit_ = clipLimit;
1669             }
1670
1671             double CLAHE_Impl::getClipLimit() const
1672             {
1673                 return clipLimit_;
1674             }
1675
1676             void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize)
1677             {
1678                 tilesX_ = tileGridSize.width;
1679                 tilesY_ = tileGridSize.height;
1680             }
1681
1682             cv::Size CLAHE_Impl::getTilesGridSize() const
1683             {
1684                 return cv::Size(tilesX_, tilesY_);
1685             }
1686
1687             void CLAHE_Impl::collectGarbage()
1688             {
1689                 srcExt_.release();
1690                 lut_.release();
1691             }
1692         }
1693
1694         cv::Ptr<cv::ocl::CLAHE> createCLAHE(double clipLimit, cv::Size tileGridSize)
1695         {
1696             return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
1697         }
1698
1699         //////////////////////////////////bilateralFilter////////////////////////////////////////////////////
1700         static void
1701         oclbilateralFilter_8u( const oclMat &src, oclMat &dst, int d,
1702                                double sigma_color, double sigma_space,
1703                                int borderType )
1704         {
1705             int cn = src.channels();
1706             int i, j, maxk, radius;
1707
1708             CV_Assert( (src.channels() == 1 || src.channels() == 3) &&
1709                        src.type() == dst.type() && src.size() == dst.size() &&
1710                        src.data != dst.data );
1711
1712             if( sigma_color <= 0 )
1713                 sigma_color = 1;
1714             if( sigma_space <= 0 )
1715                 sigma_space = 1;
1716
1717             double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
1718             double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);
1719
1720             if( d <= 0 )
1721                 radius = cvRound(sigma_space * 1.5);
1722             else
1723                 radius = d / 2;
1724             radius = MAX(radius, 1);
1725             d = radius * 2 + 1;
1726
1727             oclMat temp;
1728             copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
1729
1730             vector<float> _color_weight(cn * 256);
1731             vector<float> _space_weight(d * d);
1732             vector<int> _space_ofs(d * d);
1733             float *color_weight = &_color_weight[0];
1734             float *space_weight = &_space_weight[0];
1735             int *space_ofs = &_space_ofs[0];
1736             int dst_step_in_pixel = dst.step / dst.elemSize();
1737             int dst_offset_in_pixel = dst.offset / dst.elemSize();
1738             int temp_step_in_pixel = temp.step / temp.elemSize();
1739             // initialize color-related bilateral filter coefficients
1740             for( i = 0; i < 256 * cn; i++ )
1741                 color_weight[i] = (float)std::exp(i * i * gauss_color_coeff);
1742
1743             // initialize space-related bilateral filter coefficients
1744             for( i = -radius, maxk = 0; i <= radius; i++ )
1745                 for( j = -radius; j <= radius; j++ )
1746                 {
1747                     double r = std::sqrt((double)i * i + (double)j * j);
1748                     if( r > radius )
1749                         continue;
1750                     space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff);
1751                     space_ofs[maxk++] = (int)(i * temp_step_in_pixel + j);
1752                 }
1753             oclMat oclcolor_weight(1, cn * 256, CV_32FC1, color_weight);
1754             oclMat oclspace_weight(1, d * d, CV_32FC1, space_weight);
1755             oclMat oclspace_ofs(1, d * d, CV_32SC1, space_ofs);
1756
1757             string kernelName = "bilateral";
1758             size_t localThreads[3]  = { 16, 16, 1 };
1759             size_t globalThreads[3] = { (dst.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0],
1760                                         (dst.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1],
1761                                         1
1762                                       };
1763             if((dst.type() == CV_8UC1) && ((dst.offset & 3) == 0) && ((dst.cols & 3) == 0))
1764             {
1765                 kernelName = "bilateral2";
1766                 globalThreads[0] = (dst.cols / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
1767             }
1768             vector<pair<size_t , const void *> > args;
1769             args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
1770             args.push_back( make_pair( sizeof(cl_mem), (void *)&temp.data ));
1771             args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows ));
1772             args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols ));
1773             args.push_back( make_pair( sizeof(cl_int), (void *)&maxk ));
1774             args.push_back( make_pair( sizeof(cl_int), (void *)&radius ));
1775             args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step_in_pixel ));
1776             args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset_in_pixel ));
1777             args.push_back( make_pair( sizeof(cl_int), (void *)&temp_step_in_pixel ));
1778             args.push_back( make_pair( sizeof(cl_int), (void *)&temp.rows ));
1779             args.push_back( make_pair( sizeof(cl_int), (void *)&temp.cols ));
1780             args.push_back( make_pair( sizeof(cl_mem), (void *)&oclcolor_weight.data ));
1781             args.push_back( make_pair( sizeof(cl_mem), (void *)&oclspace_weight.data ));
1782             args.push_back( make_pair( sizeof(cl_mem), (void *)&oclspace_ofs.data ));
1783             openCLExecuteKernel(src.clCxt, &imgproc_bilateral, kernelName, globalThreads, localThreads, args, dst.oclchannels(), dst.depth());
1784         }
1785         void bilateralFilter(const oclMat &src, oclMat &dst, int radius, double sigmaclr, double sigmaspc, int borderType)
1786         {
1787
1788             dst.create( src.size(), src.type() );
1789             if( src.depth() == CV_8U )
1790                 oclbilateralFilter_8u( src, dst, radius, sigmaclr, sigmaspc, borderType );
1791             else
1792                 CV_Error( CV_StsUnsupportedFormat,
1793                           "Bilateral filtering is only implemented for 8uimages" );
1794         }
1795
1796     }
1797 }
1798 //////////////////////////////////convolve////////////////////////////////////////////////////
// Integer division of `total` by `grain`, rounded up for non-negative
// `total` and positive `grain` (e.g. the number of blocks of size `grain`
// needed to cover `total` items).
inline int divUp(int total, int grain)
{
    const int biased = total + grain - 1;
    return biased / grain;
}
1803 static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, string kernelName, const char **kernelString)
1804 {
1805     CV_Assert(src.depth() == CV_32FC1);
1806     CV_Assert(temp1.depth() == CV_32F);
1807     CV_Assert(temp1.cols <= 17 && temp1.rows <= 17);
1808
1809     dst.create(src.size(), src.type());
1810
1811     CV_Assert(src.cols == dst.cols && src.rows == dst.rows);
1812     CV_Assert(src.type() == dst.type());
1813
1814     Context  *clCxt = src.clCxt;
1815     int channels = dst.oclchannels();
1816     int depth = dst.depth();
1817
1818     size_t vector_length = 1;
1819     int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1);
1820     int cols = divUp(dst.cols * channels + offset_cols, vector_length);
1821     int rows = dst.rows;
1822
1823     size_t localThreads[3]  = { 16, 16, 1 };
1824     size_t globalThreads[3] = { divUp(cols, localThreads[0]) *localThreads[0],
1825                                 divUp(rows, localThreads[1]) *localThreads[1],
1826                                 1
1827                               };
1828
1829     vector<pair<size_t , const void *> > args;
1830     args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data ));
1831     args.push_back( make_pair( sizeof(cl_mem), (void *)&temp1.data ));
1832     args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
1833     args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows ));
1834     args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
1835     args.push_back( make_pair( sizeof(cl_int), (void *)&src.step ));
1836     args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
1837     args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.step ));
1838     args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.rows ));
1839     args.push_back( make_pair( sizeof(cl_int), (void *)&temp1.cols ));
1840
1841     openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
1842 }
1843 void cv::ocl::convolve(const oclMat &x, const oclMat &t, oclMat &y)
1844 {
1845     CV_Assert(x.depth() == CV_32F);
1846     CV_Assert(t.depth() == CV_32F);
1847     CV_Assert(x.type() == y.type() && x.size() == y.size());
1848     y.create(x.size(), x.type());
1849     string kernelName = "convolve";
1850
1851     convolve_run(x, t, y, kernelName, &imgproc_convolve);
1852 }