Merge pull request #1263 from abidrahmank:pyCLAHE_24
[profile/ivi/opencv.git] / modules / ocl / src / initialization.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // @Authors
19 //    Guoping Long, longguoping@gmail.com
20 //    Niko Li, newlife20080214@gmail.com
21 //    Yao Wang, bitwangyaoyao@gmail.com
22 // Redistribution and use in source and binary forms, with or without modification,
23 // are permitted provided that the following conditions are met:
24 //
25 //   * Redistribution's of source code must retain the above copyright notice,
26 //     this list of conditions and the following disclaimer.
27 //
28 //   * Redistribution's in binary form must reproduce the above copyright notice,
29 //     this list of conditions and the following disclaimer in the documentation
30 //     and/or other oclMaterials provided with the distribution.
31 //
32 //   * The name of the copyright holders may not be used to endorse or promote products
33 //     derived from this software without specific prior written permission.
34 //
35 // This software is provided by the copyright holders and contributors "as is" and
36 // any express or implied warranties, including, but not limited to, the implied
37 // warranties of merchantability and fitness for a particular purpose are disclaimed.
38 // In no event shall the Intel Corporation or contributors be liable for any direct,
39 // indirect, incidental, special, exemplary, or consequential damages
40 // (including, but not limited to, procurement of substitute goods or services;
41 // loss of use, data, or profits; or business interruption) however caused
42 // and on any theory of liability, whether in contract, strict liability,
43 // or tort (including negligence or otherwise) arising in any way out of
44 // the use of this software, even if advised of the possibility of such damage.
45 //
46 //M*/
47
48 #include "precomp.hpp"
49 #include <iomanip>
50 #include <fstream>
51 #include "binarycaching.hpp"
52
53 using namespace cv;
54 using namespace cv::ocl;
55 using namespace std;
56 using std::cout;
57 using std::endl;
58
59 //#define PRINT_KERNEL_RUN_TIME
60 #define RUN_TIMES 100
61 #ifndef CL_MEM_USE_PERSISTENT_MEM_AMD
62 #define CL_MEM_USE_PERSISTENT_MEM_AMD 0
63 #endif
64 //#define AMD_DOUBLE_DIFFER
65
66 namespace cv
67 {
68     namespace ocl
69     {
70         extern void fft_teardown();
71         /*
72          * The binary caching system to eliminate redundant program source compilation.
73          * Strictly, this is not a cache because we do not implement evictions right now.
74          * We shall add such features to trade-off memory consumption and performance when necessary.
75          */
76         auto_ptr<ProgramCache> ProgramCache::programCache;
77         ProgramCache *programCache = NULL;
78         DevMemType gDeviceMemType = DEVICE_MEM_DEFAULT;
79         DevMemRW gDeviceMemRW = DEVICE_MEM_R_W;
80         int gDevMemTypeValueMap[5] = {0,
81                                       CL_MEM_ALLOC_HOST_PTR,
82                                       CL_MEM_USE_HOST_PTR,
83                                       CL_MEM_COPY_HOST_PTR,
84                                       CL_MEM_USE_PERSISTENT_MEM_AMD};
85         int gDevMemRWValueMap[3] = {CL_MEM_READ_WRITE, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY};
86
87         ProgramCache::ProgramCache()
88         {
89             codeCache.clear();
90             cacheSize = 0;
91         }
92
93         ProgramCache::~ProgramCache()
94         {
95             releaseProgram();
96         }
97
98         cl_program ProgramCache::progLookup(string srcsign)
99         {
100             map<string, cl_program>::iterator iter;
101             iter = codeCache.find(srcsign);
102             if(iter != codeCache.end())
103                 return iter->second;
104             else
105                 return NULL;
106         }
107
108         void ProgramCache::addProgram(string srcsign , cl_program program)
109         {
110             if(!progLookup(srcsign))
111             {
112                 codeCache.insert(map<string, cl_program>::value_type(srcsign, program));
113             }
114         }
115
116         void ProgramCache::releaseProgram()
117         {
118             map<string, cl_program>::iterator iter;
119             for(iter = codeCache.begin(); iter != codeCache.end(); iter++)
120             {
121                 openCLSafeCall(clReleaseProgram(iter->second));
122             }
123             codeCache.clear();
124             cacheSize = 0;
125         }
126         struct Info::Impl
127         {
128             cl_platform_id oclplatform;
129             std::vector<cl_device_id> devices;
130             std::vector<std::string> devName;
131             std::string clVersion;
132
133             cl_context oclcontext;
134             cl_command_queue clCmdQueue;
135             int devnum;
136             size_t maxWorkGroupSize;
137             cl_uint maxDimensions; // == maxWorkItemSizes.size()
138             std::vector<size_t> maxWorkItemSizes;
139             cl_uint maxComputeUnits;
140             char extra_options[512];
141             int  double_support;
142             int unified_memory; //1 means integrated GPU, otherwise this value is 0
143             int refcounter;
144
145             Impl();
146
147             void setDevice(void *ctx, void *q, int devnum);
148
149             void release()
150             {
151                 if(1 == CV_XADD(&refcounter, -1))
152                 {
153                     releaseResources();
154                     delete this;
155                 }
156             }
157
158             Impl* copy()
159             {
160                 CV_XADD(&refcounter, 1);
161                 return this;
162             }
163
164         private:
165             Impl(const Impl&);
166             Impl& operator=(const Impl&);
167             void releaseResources();
168         };
169
170         // global variables to hold binary cache properties
171         static int enable_disk_cache = 
172 #ifdef _DEBUG
173             false;
174 #else
175             true;
176 #endif
177         static int update_disk_cache = false;
178         static String binpath = "";
179
180         Info::Impl::Impl()
181             :oclplatform(0),
182             oclcontext(0),
183             clCmdQueue(0),
184             devnum(-1),
185             maxWorkGroupSize(0),
186             maxDimensions(0),
187             maxComputeUnits(0),
188             double_support(0),
189             unified_memory(0),
190             refcounter(1)
191         {
192             memset(extra_options, 0, 512);
193         }
194
195         void Info::Impl::releaseResources()
196         {
197             devnum = -1;
198
199             if(clCmdQueue)
200             {
201                 //temporarily disable command queue release as it causes program hang at exit
202                 //openCLSafeCall(clReleaseCommandQueue(clCmdQueue));
203                 clCmdQueue = 0;
204             }
205
206             if(oclcontext)
207             {
208                 openCLSafeCall(clReleaseContext(oclcontext));
209                 oclcontext = 0;
210             }
211         }
212
213         void Info::Impl::setDevice(void *ctx, void *q, int dnum)
214         {
215             if((ctx && q) || devnum != dnum)
216                 releaseResources();
217
218             CV_Assert(dnum >= 0 && dnum < (int)devices.size());
219             devnum = dnum;
220             if(ctx && q)
221             {
222                 oclcontext = (cl_context)ctx;
223                 clCmdQueue = (cl_command_queue)q;
224                 clRetainContext(oclcontext);
225                 clRetainCommandQueue(clCmdQueue);
226             }
227             else
228             {
229                 cl_int status = 0;
230                 cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(oclplatform), 0 };
231                 oclcontext = clCreateContext(cps, 1, &devices[devnum], 0, 0, &status);
232                 openCLVerifyCall(status);
233                 clCmdQueue = clCreateCommandQueue(oclcontext, devices[devnum], CL_QUEUE_PROFILING_ENABLE, &status);
234                 openCLVerifyCall(status);
235             }
236
237             openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&maxWorkGroupSize, 0));
238             openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void *)&maxDimensions, 0));
239             maxWorkItemSizes.resize(maxDimensions);
240             openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDimensions, (void *)&maxWorkItemSizes[0], 0));
241             openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), (void *)&maxComputeUnits, 0));
242
243             cl_bool unfymem = false;
244             openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), (void *)&unfymem, 0));
245             unified_memory = unfymem ? 1 : 0;
246
247             //initialize extra options for compilation. Currently only fp64 is included.
248             //Assume 4KB is enough to store all possible extensions.
249             const int EXT_LEN = 4096 + 1 ;
250             char extends_set[EXT_LEN];
251             size_t extends_size;
252             openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_EXTENSIONS, EXT_LEN, (void *)extends_set, &extends_size));
253             extends_set[EXT_LEN - 1] = 0;
254             size_t fp64_khr = std::string(extends_set).find("cl_khr_fp64");
255
256             if(fp64_khr != std::string::npos)
257             {
258                 sprintf(extra_options, "-D DOUBLE_SUPPORT");
259                 double_support = 1;
260             }
261             else
262             {
263                 memset(extra_options, 0, 512);
264                 double_support = 0;
265             }
266         }
267
268         ////////////////////////Common OpenCL specific calls///////////////
269         int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type)
270         {
271             rw_type = gDeviceMemRW;
272             mem_type = gDeviceMemType;
273             return Context::getContext()->impl->unified_memory;
274         }
275
276         int setDevMemType(DevMemRW rw_type, DevMemType mem_type)
277         {
278             if( (mem_type == DEVICE_MEM_PM && Context::getContext()->impl->unified_memory == 0) ||
279                  mem_type == DEVICE_MEM_UHP ||
280                  mem_type == DEVICE_MEM_CHP )
281                 return -1;
282             gDeviceMemRW = rw_type;
283             gDeviceMemType = mem_type;
284             return 0;
285         }
286
287         inline int divUp(int total, int grain)
288         {
289             return (total + grain - 1) / grain;
290         }
291
292         int getDevice(std::vector<Info> &oclinfo, int devicetype)
293         {
294             //TODO: cache oclinfo vector
295             oclinfo.clear();
296
297             switch(devicetype)
298             {
299             case CVCL_DEVICE_TYPE_DEFAULT:
300             case CVCL_DEVICE_TYPE_CPU:
301             case CVCL_DEVICE_TYPE_GPU:
302             case CVCL_DEVICE_TYPE_ACCELERATOR:
303             case CVCL_DEVICE_TYPE_ALL:
304                 break;
305             default:
306                 return 0;
307             }
308
309             // Platform info
310             cl_uint numPlatforms;
311             openCLSafeCall(clGetPlatformIDs(0, 0, &numPlatforms));
312             if(numPlatforms < 1) return 0;
313
314             std::vector<cl_platform_id> platforms(numPlatforms);
315             openCLSafeCall(clGetPlatformIDs(numPlatforms, &platforms[0], 0));
316
317             char deviceName[256];
318             int devcienums = 0;
319             char clVersion[256];
320             for (unsigned i = 0; i < numPlatforms; ++i)
321             {
322                 cl_uint numsdev = 0;
323                 cl_int status = clGetDeviceIDs(platforms[i], devicetype, 0, NULL, &numsdev);
324                 if(status != CL_DEVICE_NOT_FOUND)
325                     openCLVerifyCall(status);
326
327                 if(numsdev > 0)
328                 {
329                     devcienums += numsdev;
330                     std::vector<cl_device_id> devices(numsdev);
331                     openCLSafeCall(clGetDeviceIDs(platforms[i], devicetype, numsdev, &devices[0], 0));
332
333                     Info ocltmpinfo;
334                     ocltmpinfo.impl->oclplatform = platforms[i];
335                     openCLSafeCall(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(clVersion), clVersion, NULL));
336                     ocltmpinfo.impl->clVersion = clVersion;
337                     for(unsigned j = 0; j < numsdev; ++j)
338                     {
339                         ocltmpinfo.impl->devices.push_back(devices[j]);
340                         openCLSafeCall(clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(deviceName), deviceName, 0));
341                         ocltmpinfo.impl->devName.push_back(deviceName);
342                         ocltmpinfo.DeviceName.push_back(deviceName);
343                     }
344                     oclinfo.push_back(ocltmpinfo);
345                 }
346             }
347             if(devcienums > 0)
348             {
349                 setDevice(oclinfo[0]);
350             }
351             return devcienums;
352         }
353
354         void setDevice(Info &oclinfo, int devnum)
355         {
356             oclinfo.impl->setDevice(0, 0, devnum);
357             Context::setContext(oclinfo);
358         }
359
360         void setDeviceEx(Info &oclinfo, void *ctx, void *q, int devnum)
361         {
362             oclinfo.impl->setDevice(ctx, q, devnum);
363             Context::setContext(oclinfo);
364          }
365
366         void *getoclContext()
367         {
368             return &(Context::getContext()->impl->oclcontext);
369         }
370
371         void *getoclCommandQueue()
372         {
373             return &(Context::getContext()->impl->clCmdQueue);
374         }
375
376         void finish()
377         {
378             clFinish(Context::getContext()->impl->clCmdQueue);
379         }
380
381         //template specializations of queryDeviceInfo
382         template<>
383         bool queryDeviceInfo<IS_CPU_DEVICE, bool>(cl_kernel)
384         {
385             Info::Impl* impl = Context::getContext()->impl;
386             cl_device_type devicetype;
387             openCLSafeCall(clGetDeviceInfo(impl->devices[impl->devnum],
388                 CL_DEVICE_TYPE, sizeof(cl_device_type),
389                 &devicetype, NULL));
390             return (devicetype == CVCL_DEVICE_TYPE_CPU);
391         }
392
393         template<typename _ty>
394         static _ty queryWavesize(cl_kernel kernel)
395         {
396             size_t info = 0;
397             Info::Impl* impl = Context::getContext()->impl;
398             bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
399             if(is_cpu)
400             {
401                 return 1;
402             }
403             CV_Assert(kernel != NULL);
404             openCLSafeCall(clGetKernelWorkGroupInfo(kernel, impl->devices[impl->devnum],
405                 CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &info, NULL));
406             return static_cast<_ty>(info);
407         }
408
409         template<>
410         size_t queryDeviceInfo<WAVEFRONT_SIZE, size_t>(cl_kernel kernel)
411         {
412             return queryWavesize<size_t>(kernel);
413         }
414         template<>
415         int queryDeviceInfo<WAVEFRONT_SIZE, int>(cl_kernel kernel)
416         {
417             return queryWavesize<int>(kernel);
418         }
419
420         void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size)
421         {
422             cl_int status;
423             status = clEnqueueReadBuffer(clCxt->impl->clCmdQueue, dst_buffer, CL_TRUE, 0,
424                                          size, host_buffer, 0, NULL, NULL);
425             openCLVerifyCall(status);
426         }
427
428         cl_mem openCLCreateBuffer(Context *clCxt, size_t flag , size_t size)
429         {
430             cl_int status;
431             cl_mem buffer = clCreateBuffer(clCxt->impl->oclcontext, (cl_mem_flags)flag, size, NULL, &status);
432             openCLVerifyCall(status);
433             return buffer;
434         }
435
436         void openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
437                                size_t widthInBytes, size_t height)
438         {
439             openCLMallocPitchEx(clCxt, dev_ptr, pitch, widthInBytes, height, gDeviceMemRW, gDeviceMemType);
440         }
441
442         void openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
443                                size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type)
444         {
445             cl_int status;
446             *dev_ptr = clCreateBuffer(clCxt->impl->oclcontext, gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
447                                       widthInBytes * height, 0, &status);
448             openCLVerifyCall(status);
449             *pitch = widthInBytes;
450         }
451
452         void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
453                             const void *src, size_t spitch,
454                             size_t width, size_t height, openCLMemcpyKind kind, int channels)
455         {
456             size_t buffer_origin[3] = {0, 0, 0};
457             size_t host_origin[3] = {0, 0, 0};
458             size_t region[3] = {width, height, 1};
459             if(kind == clMemcpyHostToDevice)
460             {
461                 if(dpitch == width || channels == 3 || height == 1)
462                 {
463                     openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE,
464                                                         0, width * height, src, 0, NULL, NULL));
465                 }
466                 else
467                 {
468                     openCLSafeCall(clEnqueueWriteBufferRect(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE,
469                                                             buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0));
470                 }
471             }
472             else if(kind == clMemcpyDeviceToHost)
473             {
474                 if(spitch == width || channels == 3 || height == 1)
475                 {
476                     openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE,
477                                                        0, width * height, dst, 0, NULL, NULL));
478                 }
479                 else
480                 {
481                     openCLSafeCall(clEnqueueReadBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE,
482                                                            buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0));
483                 }
484             }
485         }
486
487         void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
488                                 const void *src, size_t spitch,
489                                 size_t width, size_t height, int src_offset)
490         {
491             size_t src_origin[3] = {src_offset % spitch, src_offset / spitch, 0};
492             size_t dst_origin[3] = {dst_offset % dpitch, dst_offset / dpitch, 0};
493             size_t region[3] = {width, height, 1};
494
495             openCLSafeCall(clEnqueueCopyBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, (cl_mem)dst, src_origin, dst_origin,
496                                                    region, spitch, 0, dpitch, 0, 0, 0, 0));
497         }
498
499         void openCLFree(void *devPtr)
500         {
501             openCLSafeCall(clReleaseMemObject((cl_mem)devPtr));
502         }
503         cl_kernel openCLGetKernelFromSource(const Context *clCxt, const char **source, string kernelName)
504         {
505             return openCLGetKernelFromSource(clCxt, source, kernelName, NULL);
506         }
507
508         void setBinaryDiskCache(int mode, String path)
509         {
510             if(mode == CACHE_NONE)
511             {
512                 update_disk_cache = 0;
513                 enable_disk_cache = 0;
514                 return;
515             }
516             update_disk_cache |= (mode & CACHE_UPDATE) == CACHE_UPDATE;
517             enable_disk_cache |= 
518 #ifdef _DEBUG 
519                 (mode & CACHE_DEBUG)   == CACHE_DEBUG;
520 #else
521                 (mode & CACHE_RELEASE) == CACHE_RELEASE;
522 #endif
523             if(enable_disk_cache && !path.empty())
524             {
525                 binpath = path;
526             }
527         }
528
529         void setBinpath(const char *path)
530         {
531             binpath = path;
532         }
533
534         int savetofile(const Context*,  cl_program &program, const char *fileName)
535         {
536             size_t binarySize;
537             openCLSafeCall(clGetProgramInfo(program,
538                                     CL_PROGRAM_BINARY_SIZES,
539                                     sizeof(size_t),
540                                     &binarySize, NULL));
541             char* binary = (char*)malloc(binarySize);
542             if(binary == NULL)
543             {
544                 CV_Error(CV_StsNoMem, "Failed to allocate host memory.");
545             }
546             openCLSafeCall(clGetProgramInfo(program,
547                                     CL_PROGRAM_BINARIES,
548                                     sizeof(char *),
549                                     &binary,
550                                     NULL));
551
552             FILE *fp = fopen(fileName, "wb+");
553             if(fp != NULL)
554             {
555                 fwrite(binary, binarySize, 1, fp);
556                 free(binary);
557                 fclose(fp);
558             }
559             return 1;
560         }
561
562         cl_kernel openCLGetKernelFromSource(const Context *clCxt, const char **source, string kernelName,
563                                             const char *build_options)
564         {
565             cl_kernel kernel;
566             cl_program program ;
567             cl_int status = 0;
568             stringstream src_sign;
569             string srcsign;
570             string filename;
571             CV_Assert(programCache != NULL);
572
573             if(NULL != build_options)
574             {
575                 src_sign << (int64)(*source) << clCxt->impl->oclcontext << "_" << build_options;
576             }
577             else
578             {
579                 src_sign << (int64)(*source) << clCxt->impl->oclcontext;
580             }
581             srcsign = src_sign.str();
582
583             program = NULL;
584             program = programCache->progLookup(srcsign);
585
586             if(!program)
587             {
588                 //config build programs
589                 char all_build_options[1024];
590                 memset(all_build_options, 0, 1024);
591                 char zeromem[512] = {0};
592                 if(0 != memcmp(clCxt -> impl->extra_options, zeromem, 512))
593                     strcat(all_build_options, clCxt -> impl->extra_options);
594                 strcat(all_build_options, " ");
595                 if(build_options != NULL)
596                     strcat(all_build_options, build_options);
597                 if(all_build_options != NULL)
598                 {
599                     filename = binpath + kernelName + "_" + clCxt->impl->devName[clCxt->impl->devnum] + all_build_options + ".clb";
600                 }
601                 else
602                 {
603                     filename = binpath + kernelName + "_" + clCxt->impl->devName[clCxt->impl->devnum] + ".clb";
604                 }
605
606                 FILE *fp = enable_disk_cache ? fopen(filename.c_str(), "rb") : NULL;
607                 if(fp == NULL || update_disk_cache)
608                 {
609                     if(fp != NULL)
610                         fclose(fp);
611
612                     program = clCreateProgramWithSource(
613                                   clCxt->impl->oclcontext, 1, source, NULL, &status);
614                     openCLVerifyCall(status);
615                     status = clBuildProgram(program, 1, &(clCxt->impl->devices[clCxt->impl->devnum]), all_build_options, NULL, NULL);
616                     if(status == CL_SUCCESS && enable_disk_cache)
617                         savetofile(clCxt, program, filename.c_str());
618                 }
619                 else
620                 {
621                     fseek(fp, 0, SEEK_END);
622                     size_t binarySize = ftell(fp);
623                     fseek(fp, 0, SEEK_SET);
624                     char *binary = new char[binarySize];
625                     CV_Assert(1 == fread(binary, binarySize, 1, fp));
626                     fclose(fp);
627                     cl_int status = 0;
628                     program = clCreateProgramWithBinary(clCxt->impl->oclcontext,
629                                                         1,
630                                                         &(clCxt->impl->devices[clCxt->impl->devnum]),
631                                                         (const size_t *)&binarySize,
632                                                         (const unsigned char **)&binary,
633                                                         NULL,
634                                                         &status);
635                     openCLVerifyCall(status);
636                     status = clBuildProgram(program, 1, &(clCxt->impl->devices[clCxt->impl->devnum]), all_build_options, NULL, NULL);
637                     delete[] binary;
638                 }
639
640                 if(status != CL_SUCCESS)
641                 {
642                     if(status == CL_BUILD_PROGRAM_FAILURE)
643                     {
644                         cl_int logStatus;
645                         char *buildLog = NULL;
646                         size_t buildLogSize = 0;
647                         logStatus = clGetProgramBuildInfo(program,
648                                                           clCxt->impl->devices[clCxt->impl->devnum], CL_PROGRAM_BUILD_LOG, buildLogSize,
649                                                           buildLog, &buildLogSize);
650                         if(logStatus != CL_SUCCESS)
651                             cout << "Failed to build the program and get the build info." << endl;
652                         buildLog = new char[buildLogSize];
653                         CV_DbgAssert(!!buildLog);
654                         memset(buildLog, 0, buildLogSize);
655                         openCLSafeCall(clGetProgramBuildInfo(program, clCxt->impl->devices[clCxt->impl->devnum],
656                                                              CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL));
657                         cout << "\n\t\t\tBUILD LOG\n";
658                         cout << buildLog << endl;
659                         delete [] buildLog;
660                     }
661                     openCLVerifyCall(status);
662                 }
663                 //Cache the binary for future use if build_options is null
664                 if( (programCache->cacheSize += 1) < programCache->MAX_PROG_CACHE_SIZE)
665                     programCache->addProgram(srcsign, program);
666                 else
667                     cout << "Warning: code cache has been full.\n";
668             }
669             kernel = clCreateKernel(program, kernelName.c_str(), &status);
670             openCLVerifyCall(status);
671             return kernel;
672         }
673
674         void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads)
675         {
676             size_t kernelWorkGroupSize;
677             openCLSafeCall(clGetKernelWorkGroupInfo(kernel, clCxt->impl->devices[clCxt->impl->devnum],
678                                                     CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
679             CV_Assert( localThreads[0] <= clCxt->impl->maxWorkItemSizes[0] );
680             CV_Assert( localThreads[1] <= clCxt->impl->maxWorkItemSizes[1] );
681             CV_Assert( localThreads[2] <= clCxt->impl->maxWorkItemSizes[2] );
682             CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= kernelWorkGroupSize );
683             CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= clCxt->impl->maxWorkGroupSize );
684         }
685
686 #ifdef PRINT_KERNEL_RUN_TIME
687         static double total_execute_time = 0;
688         static double total_kernel_time = 0;
689 #endif
690         void openCLExecuteKernel_(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
691                                   size_t localThreads[3],  vector< pair<size_t, const void *> > &args, int channels,
692                                   int depth, const char *build_options)
693         {
694             //construct kernel name
695             //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
696             //for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char)
697             stringstream idxStr;
698             if(channels != -1)
699                 idxStr << "_C" << channels;
700             if(depth != -1)
701                 idxStr << "_D" << depth;
702             kernelName += idxStr.str();
703
704             cl_kernel kernel;
705             kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options);
706
707             if ( localThreads != NULL)
708             {
709                 globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0];
710                 globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
711                 globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
712
713                 //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
714                 cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
715             }
716             for(size_t i = 0; i < args.size(); i ++)
717                 openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
718
719 #ifndef PRINT_KERNEL_RUN_TIME
720             openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
721                                                   localThreads, 0, NULL, NULL));
722 #else
723             cl_event event = NULL;
724             openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
725                                                   localThreads, 0, NULL, &event));
726
727             cl_ulong start_time, end_time, queue_time;
728             double execute_time = 0;
729             double total_time   = 0;
730
731             openCLSafeCall(clWaitForEvents(1, &event));
732             openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
733                                                    sizeof(cl_ulong), &start_time, 0));
734
735             openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
736                                                    sizeof(cl_ulong), &end_time, 0));
737
738             openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
739                                                    sizeof(cl_ulong), &queue_time, 0));
740
741             execute_time = (double)(end_time - start_time) / (1000 * 1000);
742             total_time = (double)(end_time - queue_time) / (1000 * 1000);
743
744             //  cout << setiosflags(ios::left) << setw(15) << execute_time;
745             //  cout << setiosflags(ios::left) << setw(15) << total_time - execute_time;
746             //  cout << setiosflags(ios::left) << setw(15) << total_time << endl;
747
748             total_execute_time += execute_time;
749             total_kernel_time += total_time;
750             clReleaseEvent(event);
751 #endif
752
753             clFlush(clCxt->impl->clCmdQueue);
754             openCLSafeCall(clReleaseKernel(kernel));
755         }
756
757         void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName,
758                                  size_t globalThreads[3], size_t localThreads[3],
759                                  vector< pair<size_t, const void *> > &args, int channels, int depth)
760         {
761             openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args,
762                                 channels, depth, NULL);
763         }
764         void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName,
765                                  size_t globalThreads[3], size_t localThreads[3],
766                                  vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options)
767
768         {
769 #ifndef PRINT_KERNEL_RUN_TIME
770             openCLExecuteKernel_(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth,
771                                  build_options);
772 #else
773             string data_type[] = { "uchar", "char", "ushort", "short", "int", "float", "double"};
774             cout << endl;
775             cout << "Function Name: " << kernelName;
776             if(depth >= 0)
777                 cout << " |data type: " << data_type[depth];
778             cout << " |channels: " << channels;
779             cout << " |Time Unit: " << "ms" << endl;
780
781             total_execute_time = 0;
782             total_kernel_time = 0;
783             cout << "-------------------------------------" << endl;
784
785             cout << setiosflags(ios::left) << setw(15) << "excute time";
786             cout << setiosflags(ios::left) << setw(15) << "lauch time";
787             cout << setiosflags(ios::left) << setw(15) << "kernel time" << endl;
788             int i = 0;
789             for(i = 0; i < RUN_TIMES; i++)
790                 openCLExecuteKernel_(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth,
791                                      build_options);
792
793             cout << "average kernel excute time: " << total_execute_time / RUN_TIMES << endl; // "ms" << endl;
794             cout << "average kernel total time:  " << total_kernel_time / RUN_TIMES << endl; // "ms" << endl;
795 #endif
796         }
797
798        double openCLExecuteKernelInterop(Context *clCxt , const char **source, string kernelName,
799                                  size_t globalThreads[3], size_t localThreads[3],
800                                  vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
801                                  bool finish, bool measureKernelTime, bool cleanUp)
802
803         {
804             //construct kernel name
805             //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
806             //for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char)
807             stringstream idxStr;
808             if(channels != -1)
809                 idxStr << "_C" << channels;
810             if(depth != -1)
811                 idxStr << "_D" << depth;
812             kernelName += idxStr.str();
813
814             cl_kernel kernel;
815             kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options);
816
817             double kernelTime = 0.0;
818
819             if( globalThreads != NULL)
820             {
821                 if ( localThreads != NULL)
822                 {
823                     globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0];
824                     globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
825                     globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
826
827                     //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
828                     cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
829                 }
830                 for(size_t i = 0; i < args.size(); i ++)
831                     openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
832
833                 if(measureKernelTime == false)
834                 {
835                     openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
836                                     localThreads, 0, NULL, NULL));
837                 }
838                 else
839                 {
840                     cl_event event = NULL;
841                     openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
842                                     localThreads, 0, NULL, &event));
843
844                     cl_ulong end_time, queue_time;
845
846                     openCLSafeCall(clWaitForEvents(1, &event));
847
848                     openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
849                                     sizeof(cl_ulong), &end_time, 0));
850
851                     openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
852                                     sizeof(cl_ulong), &queue_time, 0));
853
854                     kernelTime = (double)(end_time - queue_time) / (1000 * 1000);
855
856                     clReleaseEvent(event);
857                 }
858             }
859
860             if(finish)
861             {
862                 clFinish(clCxt->impl->clCmdQueue);
863             }
864
865             if(cleanUp)
866             {
867                 openCLSafeCall(clReleaseKernel(kernel));
868             }
869
870             return kernelTime;
871         }
872
873         // Converts the contents of a file into a string
874         static int convertToString(const char *filename, std::string& s)
875         {
876             size_t size;
877             char*  str;
878
879             std::fstream f(filename, (std::fstream::in | std::fstream::binary));
880             if(f.is_open())
881             {
882                 size_t fileSize;
883                 f.seekg(0, std::fstream::end);
884                 size = fileSize = (size_t)f.tellg();
885                 f.seekg(0, std::fstream::beg);
886
887                 str = new char[size+1];
888                 if(!str)
889                 {
890                     f.close();
891                     return -1;
892                 }
893
894                 f.read(str, fileSize);
895                 f.close();
896                 str[size] = '\0';
897
898                 s = str;
899                 delete[] str;
900                 return 0;
901             }
902             printf("Error: Failed to open file %s\n", filename);
903             return -1;
904         }
905
906         double openCLExecuteKernelInterop(Context *clCxt , const char **fileName, const int numFiles, string kernelName,
907                                  size_t globalThreads[3], size_t localThreads[3],
908                                  vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
909                                  bool finish, bool measureKernelTime, bool cleanUp)
910
911         {
912             std::vector<std::string> fsource;
913             for (int i = 0 ; i < numFiles ; i++)
914             {
915                 std::string str;
916                 if (convertToString(fileName[i], str) >= 0)
917                     fsource.push_back(str);
918             }
919             const char **source = new const char *[numFiles];
920             for (int i = 0 ; i < numFiles ; i++)
921                 source[i] = fsource[i].c_str();
922             double kernelTime = openCLExecuteKernelInterop(clCxt ,source, kernelName, globalThreads, localThreads,
923                                  args, channels, depth, build_options, finish, measureKernelTime, cleanUp);
924             fsource.clear();
925             delete []source;
926             return kernelTime;
927         }
928
929         cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
930                              const size_t size)
931         {
932             int status;
933             cl_mem con_struct;
934
935             con_struct = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &status);
936             openCLSafeCall(status);
937
938             openCLSafeCall(clEnqueueWriteBuffer(command_queue, con_struct, 1, 0, size,
939                                                 value, 0, 0, 0));
940
941             return con_struct;
942
943         }
944
945         /////////////////////////////OpenCL initialization/////////////////
946         auto_ptr<Context> Context::clCxt;
947         int Context::val = 0;
948         static Mutex cs;
949         static volatile int context_tear_down = 0;
950
951         bool initialized()
952         {
953             return *((volatile int*)&Context::val) != 0 && 
954                 Context::clCxt->impl->clCmdQueue != NULL&& 
955                 Context::clCxt->impl->oclcontext != NULL;
956         }
957
958         Context* Context::getContext()
959         {
960             if(*((volatile int*)&val) != 1)
961             {
962                 AutoLock al(cs);
963                 if(*((volatile int*)&val) != 1)
964                 {
965                     if (context_tear_down)
966                         return clCxt.get();
967                     if( 0 == clCxt.get())
968                         clCxt.reset(new Context);
969                     std::vector<Info> oclinfo;
970                     CV_Assert(getDevice(oclinfo, CVCL_DEVICE_TYPE_ALL) > 0);
971
972                     *((volatile int*)&val) = 1;
973                 }
974             }
975             return clCxt.get();
976         }
977
978         void Context::setContext(Info &oclinfo)
979         {
980             AutoLock guard(cs);
981             if(*((volatile int*)&val) != 1)
982             {
983                 if( 0 == clCxt.get())
984                     clCxt.reset(new Context);
985
986                 clCxt.get()->impl = oclinfo.impl->copy();
987
988                 *((volatile int*)&val) = 1;
989             }
990             else
991             {
992                 clCxt.get()->impl->release();
993                 clCxt.get()->impl = oclinfo.impl->copy();
994             }
995         }
996
997         Context::Context()
998         {
999             impl = 0;
1000             programCache = ProgramCache::getProgramCache();
1001         }
1002
1003         Context::~Context()
1004         {
1005             release();
1006         }
1007
1008         void Context::release()
1009         {
1010             if (impl)
1011                 impl->release();
1012             programCache->releaseProgram();
1013         }
1014
1015         bool Context::supportsFeature(int ftype)
1016         {
1017             switch(ftype)
1018             {
1019             case CL_DOUBLE:
1020                 return impl->double_support == 1;
1021             case CL_UNIFIED_MEM:
1022                 return impl->unified_memory == 1;
1023             case CL_VER_1_2:
1024                 return impl->clVersion.find("OpenCL 1.2") != string::npos;
1025             default:
1026                 return false;
1027             }
1028         }
1029
1030         size_t Context::computeUnits()
1031         {
1032             return impl->maxComputeUnits;
1033         }
1034
1035         void* Context::oclContext()
1036         {
1037             return impl->oclcontext;
1038         }
1039
1040         void* Context::oclCommandQueue()
1041         {
1042             return impl->clCmdQueue;
1043         }
1044
1045         Info::Info()
1046         {
1047             impl = new Impl;
1048         }
1049
1050         void Info::release()
1051         {
1052             fft_teardown();
1053             impl->release();
1054             impl = new Impl;
1055             DeviceName.clear();
1056         }
1057
1058         Info::~Info()
1059         {
1060             fft_teardown();
1061             impl->release();
1062         }
1063
1064         Info &Info::operator = (const Info &m)
1065         {
1066             impl->release();
1067             impl = m.impl->copy();
1068             DeviceName = m.DeviceName;
1069             return *this;
1070         }
1071
1072         Info::Info(const Info &m)
1073         {
1074             impl = m.impl->copy();
1075             DeviceName = m.DeviceName;
1076         }
1077     }//namespace ocl
1078
1079 }//namespace cv